# Enrichment Analysis using KnetMiner SPARQL endpoint with Jupyter

## Import the libraries used in this script.

In [1]:
from enrichment_analysis_functions import *

# Import ipywidgets for interactive interface
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

## Choose the tax ID, study (or list of genes), and get the trait enrichment table

### Please note, for the list of studies:

1. It takes a couple of seconds to get the list of studies.
2. The only tax IDs that will generate a table of studies are:
    - 4565: Triticum aestivum (wheat)
    - 3702: Arabidopsis thaliana

In [2]:
# Never truncate rows when a dataframe is displayed
pd.options.display.max_rows = None

# Lookup table of species; its 'Tax Names' and 'Tax IDs' columns are read
# by the species-selection widget
dframe_taxID = df_taxID()

# Shared notebook state. The interactive callbacks overwrite these through
# `global` statements, and the later display cells read them.
taxID, total_db_genes = '', 0
total_DEXgenes = set()
dframe_GeneTrait, dframe_GeneTrait_filtered, df_Ftest_sorted = (
    pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
)

# view the interactive interface
@interact_manual
def choose_taxid(Species = dframe_taxID['Tax Names']):
    """Pick a species, load its gene-trait table, then ask for a gene list.

    Runs as an ``interact_manual`` callback: ``Species`` is offered as a
    dropdown of the values in ``dframe_taxID['Tax Names']``. After the
    species data is loaded, a nested widget lets the user choose between
    taking the genes from a study or pasting their own list; either path
    ends by calling ``get_df_Ftest_sorted``.

    Parameters
    ----------
    Species
        Species name; must match exactly one row of ``dframe_taxID``.

    Globals written
    ---------------
    taxID, total_db_genes, dframe_GeneTrait
        Set directly by this function.
    total_DEXgenes, dframe_GeneTrait_filtered, df_Ftest_sorted
        Set by the nested study / gene-list callbacks.

    Raises
    ------
    ValueError
        From ``.item()`` when the species (or study title) does not match
        exactly one row.
    """
    global taxID
    global total_db_genes
    global dframe_GeneTrait

    taxID = dframe_taxID[dframe_taxID['Tax Names'] == Species]['Tax IDs'].item()
    # f-string instead of "+" so this also works if the tax ID is not a str
    print(f"Tax ID is: {taxID}")

    # get total number of genes in the database for the selected tax ID
    total_db_genes = get_gene_count(taxID)

    # import csv for genes and related traits (for the selected tax ID)
    dframe_GeneTrait = pd.read_csv (f'GeneTraitTable_{taxID}.csv',
                                usecols= ["Gene Accession", "Gene Name", "Trait Accession", "Trait Name",
                                          "Evidence", "Network URL"])

    print("\nDo you want to get the list of genes from a study or use your own list?")

    @interact_manual
    def choose_gene_list(Choice = ["Study", "List of Genes"]):

        if Choice == "Study":
            dframe_study_list = get_study_list(taxID)

            if dframe_study_list.shape[0] != 0:
                print("\nChoose from the list of studies related to the chosen Tax ID:")

                @interact_manual
                def get_study_list_for_triat(Study_Title = dframe_study_list['Study Title']):
                    global total_DEXgenes
                    global dframe_GeneTrait_filtered
                    global df_Ftest_sorted

                    # get study accession number
                    studyAcc = dframe_study_list[dframe_study_list['Study Title'] == Study_Title]['Study Accession'].item()
                    # f-string so a non-str accession does not raise TypeError
                    print(f" Study Accession is: {studyAcc}")

                    # get unique set of genes
                    total_DEXgenes = get_study_DEXgenes(studyAcc)
                    # get final tables
                    dframe_GeneTrait_filtered, df_Ftest_sorted = get_df_Ftest_sorted(dframe_GeneTrait,
                                                                                     total_DEXgenes, total_db_genes)

            else:
                print("No studies in the databse for the selected tax ID. Please provide your list of genes.")

        # if Choice == "List of Genes"
        else:
            print ("\nPlease paste the list of genes (separated by spaces).")

            @interact_manual
            def input_genes_list(genes = ''):
                global total_DEXgenes
                global dframe_GeneTrait_filtered
                global df_Ftest_sorted

                # De-duplicate while keeping the order the genes were typed
                # in, so the printed list agrees with the printed count.
                unique_genes = list(dict.fromkeys(genes.split()))

                # Nothing to analyse: leave the previous results untouched
                # rather than running the enrichment on an empty gene set.
                if not unique_genes:
                    print("\nNo genes provided. Please paste at least one gene.")
                    return

                # get unique set of genes
                total_DEXgenes = set(unique_genes)

                print("\n" + str(len(total_DEXgenes)) + " genes provided:")
                for g in unique_genes:
                    print(g)

                # get final tables
                dframe_GeneTrait_filtered, df_Ftest_sorted = get_df_Ftest_sorted(dframe_GeneTrait,
                                                                                 total_DEXgenes, total_db_genes)
                

interactive(children=(Dropdown(description='Species', options=('Triticum aestivum (wheat)', 'Arabidopsis thali…

## Show the gene-trait table containing the genes list and their related traits and evidence

###  Please note, you can filter the gene-trait table using the cells below by either:
- choosing a trait to display the related genes
- or choosing a gene to display the related traits

In [6]:
@interact
def show_GeneTrait_table():
    """Render the filtered gene-trait table as HTML with clickable links.

    Reads the global ``dframe_GeneTrait_filtered`` and works on a copy, so
    the stored URLs are not replaced by anchor tags.
    """
    link_text = "View Network"

    def as_anchor(url):
        # wrap the raw URL in an <a> tag labelled with the fixed link text
        return f'<a href="{url}">{link_text}</a>'

    table = dframe_GeneTrait_filtered.copy()
    table['Network URL'] = table['Network URL'].apply(as_anchor)

    # escape=False keeps the anchor tags as markup instead of literal text
    rendered = table.to_html(render_links=True, escape=False)
    return HTML(rendered)

interactive(children=(Output(),), _dom_classes=('widget-interact',))

## The table below shows the meaning of the evidence codes
- A homologous gene (or homolog) is a gene inherited in two species from a common ancestor.
- Genetic interaction networks represent the functional interactions between pairs of genes.


In [12]:
# Show the evidence-code legend as this cell's output.
# NOTE(review): df_evidence is not defined in this notebook; presumably it
# comes from the enrichment_analysis_functions star import — confirm.
df_evidence()

Unnamed: 0,Evidence Code,Evidence Type,Homology,Interaction
0,TM_0-0,Text Mining (TM),0,0
1,TM_0-1,Text Mining,0,1
2,TM_1-0,Text Mining,1,0
3,TM_1-1,Text Mining,1,1
4,GWAS_0-0,Genetic Study (GWAS),0,0
5,GWAS_0-1,Genetic Study,0,1
6,GWAS_1-0,Genetic Study,1,0
7,GWAS_1-1,Genetic Study,1,1


## Choose a trait to display related genes

In [7]:
@interact
def get_gene_list_for_triat(Trait_Name = sorted(df_Ftest_sorted['Trait Name'].unique())):
    """Show the enrichment figures and related genes for one trait.

    ``Trait_Name`` is offered as a dropdown of the sorted, unique trait
    names in the global ``df_Ftest_sorted``. Prints the trait accession,
    adjusted p-value and related-gene count from that table, then returns
    the matching rows of ``dframe_GeneTrait_filtered`` as an HTML table.

    Each ``.item()`` call needs exactly one enrichment row for the trait
    and raises ``ValueError`` otherwise.
    """
    # enrichment-table row for the selected trait, looked up once
    trait_stats = df_Ftest_sorted[df_Ftest_sorted['Trait Name'] == Trait_Name]

    print(f" Trait Accession is: {trait_stats['Trait Accession'].item()}")
    print(f" Adjusted p-value is: {trait_stats['adj p-value'].item()}")

    # genes linked to this trait, restricted to the columns worth showing
    shown_columns = ["Gene Accession", "Gene Name", "Evidence", "Network URL"]
    is_selected_trait = dframe_GeneTrait_filtered['Trait Name'] == Trait_Name
    genes_table = dframe_GeneTrait_filtered.loc[is_selected_trait, shown_columns]
    genes_table = genes_table.reset_index(drop=True)

    link_text = "View Network"
    genes_table['Network URL'] = genes_table['Network URL'].apply(
        lambda url: f'<a href="{url}">{link_text}</a>'
    )

    related_count = trait_stats['Number of related genes in user/study list'].item()
    print(f"\n Total number of related unique genes from \n user/study list of genes is: {related_count}")

    # escape=False keeps the anchor tags as markup instead of literal text
    return HTML(genes_table.to_html(render_links=True, escape=False))

interactive(children=(Dropdown(description='Trait_Name', options=('abscisic acid content', 'arsenic concentrat…

## Choose a gene to display related traits

In [8]:
@interact
def get_gene_list_for_triat(Gene_Name = sorted(dframe_GeneTrait_filtered['Gene Name'].unique())):
    """Show the traits related to one gene.

    ``Gene_Name`` is offered as a dropdown of the sorted, unique gene names
    in the global ``dframe_GeneTrait_filtered``. Prints how many distinct
    trait accessions the gene has and returns its rows as an HTML table.

    NOTE(review): this reuses the name of the trait-selection callback in
    the previous cell and therefore rebinds it; the name is kept unchanged
    here so nothing that refers to it breaks.
    """
    shown_columns = ["Gene Accession", "Trait Accession", "Trait Name", "Evidence", "Network URL"]
    is_selected_gene = dframe_GeneTrait_filtered['Gene Name'] == Gene_Name
    traits_table = dframe_GeneTrait_filtered.loc[is_selected_gene, shown_columns]
    traits_table = traits_table.reset_index(drop=True)

    link_text = "View Network"
    traits_table['Network URL'] = traits_table['Network URL'].apply(
        lambda url: f'<a href="{url}">{link_text}</a>'
    )

    distinct_traits = traits_table['Trait Accession'].unique()
    print(f"\n Total number of related unique traits is: {len(distinct_traits)}")

    # escape=False keeps the anchor tags as markup instead of literal text
    return HTML(traits_table.to_html(render_links=True, escape=False))

interactive(children=(Dropdown(description='Gene_Name', options=('CCD1', 'E1-BETA-2', 'HAIKU2', 'LEA', 'NCED3'…