# Enrichment Analysis using KnetMiner SPARQL endpoint with Jupyter

## Step 1: Import the libraries used in this script.

In [1]:
from enrichment_analysis_functions import *

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

from IPython.display import HTML

## Step 2: Choose the tax ID, study (or list of genes), and get the trait enrichment table

### Please note for the list of studies:

1. It takes a couple of seconds to get the list of studies.
2. The only tax IDs that will generate a table of studies are:
    - 4565: Triticum aestivum (wheat)
    - 3702: Arabidopsis thaliana

In [6]:
# Lookup table of the selectable species: tax IDs together with their names.
dframe_taxID = df_taxID()

# Notebook-wide state. The interactive callbacks below rebind these names
# through `global` once the user has made a selection; until then they hold
# empty placeholders.
taxID, total_db_genes, total_DEXgenes = '', 0, set()
dframe_GeneTrait, dframe_GeneTrait_filtered, df_Ftest_sorted = (
    pd.DataFrame(),
    pd.DataFrame(),
    pd.DataFrame(),
)

# view the interactive interface
@interact_manual
def show_taxid(Species = dframe_taxID['Tax Names']):
    """Load the data for the chosen species and ask where the genes come from.

    Looks up the tax ID that belongs to ``Species`` in ``dframe_taxID``, stores
    it together with the database gene count and the gene/trait table in the
    notebook globals ``taxID``, ``total_db_genes`` and ``dframe_GeneTrait``,
    and then shows a second widget that lets the user pick between a study and
    a pasted list of genes. Either choice ends by calling
    ``get_df_Ftest_sorted`` and storing its two results in the globals
    ``dframe_GeneTrait_filtered`` and ``df_Ftest_sorted``.

    Parameters
    ----------
    Species
        Species name selected in the dropdown; must match exactly one entry of
        ``dframe_taxID['Tax Names']``.

    Raises
    ------
    ValueError
        From ``Series.item()`` when ``Species`` does not match exactly one row.
    FileNotFoundError
        From ``pd.read_csv`` when ``GeneTraitTable_<taxID>.csv`` is missing.
    """
    global taxID
    global total_db_genes
    global dframe_GeneTrait

    taxID = dframe_taxID[dframe_taxID['Tax Names'] == Species]['Tax IDs'].item()
    # f-string instead of "+" so a non-string tax ID cannot raise a TypeError
    print(f"Tax ID is: {taxID}")

    # get total number of genes in the database for the selected tax ID
    total_db_genes = get_gene_count(taxID)

    # import csv for genes and related traits (for the selected tax ID)
    dframe_GeneTrait = pd.read_csv(
        f'GeneTraitTable_{taxID}.csv',
        usecols=["Gene Accession", "Gene Name", "Trait Accession", "Trait Name",
                 "Evidence", "Network URL"])

    # NOTE(review): the original cell carried a disabled drop_duplicates() call
    # here, so duplicate rows are kept - confirm that this is intended.

    print("\nDo you want to get the list of genes from a study or use your own list?")

    @interact_manual
    def show_study_list(Choice = ["Study", "List of Genes"]):
        """Show the follow-up widget that matches the selected gene source."""

        if Choice == "Study":
            dframe_study_list = get_study_list(taxID)

            if dframe_study_list.shape[0] != 0:
                print("\nChoose from the list of studies related to the chosen Tax ID:")

                @interact_manual
                def get_study_list_for_triat(Study_Title = dframe_study_list['Study Title']):
                    """Run the enrichment analysis on the genes of one study."""
                    global total_DEXgenes
                    global dframe_GeneTrait_filtered
                    global df_Ftest_sorted

                    # get study accession number
                    studyAcc = dframe_study_list[
                        dframe_study_list['Study Title'] == Study_Title]['Study Accession'].item()
                    print(f" Study Accession is: {studyAcc}")

                    # get unique set of genes
                    total_DEXgenes = get_study_DEXgenes(studyAcc)
                    # get final tables
                    dframe_GeneTrait_filtered, df_Ftest_sorted = get_df_Ftest_sorted(
                        dframe_GeneTrait, total_DEXgenes, total_db_genes)

            else:
                print("No studies in the databse for the selected tax ID. Please provide your list of genes.")

        else:
            print ("\nPlease paste the list of genes (separated by spaces).")

            @interact_manual
            def input_genes_list(genes = ''):
                """Run the enrichment analysis on a whitespace-separated gene list."""
                global total_DEXgenes
                global dframe_GeneTrait_filtered
                global df_Ftest_sorted

                # De-duplicate while keeping the order in which the genes were
                # pasted, so the listing below agrees with the reported count.
                genes_list = list(dict.fromkeys(genes.split()))

                # Nothing to analyse: leave the previous results untouched
                # instead of running the test on an empty gene set.
                if not genes_list:
                    print("\nNo genes provided. Please paste at least one gene.")
                    return

                # get unique set of genes
                total_DEXgenes = set(genes_list)

                print("\n" + str(len(total_DEXgenes)) + " genes provided:")
                for g in genes_list:
                    print(g)

                # get final tables
                dframe_GeneTrait_filtered, df_Ftest_sorted = get_df_Ftest_sorted(
                    dframe_GeneTrait, total_DEXgenes, total_db_genes)
                

interactive(children=(Dropdown(description='Species', options=('Triticum aestivum (wheat)', 'Arabidopsis thali…

In [3]:
# Lift pandas' row limit so that every row of a DataFrame is rendered instead
# of a truncated preview.
pd.set_option('display.max_rows', None)

# Display the trait enrichment analysis table. It is the last expression of the
# cell, so the notebook renders it as the cell output. The variable starts out
# as an empty DataFrame and is only filled in once one of the interactive
# callbacks of the previous cell has been run.
df_Ftest_sorted

Unnamed: 0,Trait Accession,Trait Name,odds ratio,exact p-value,adj p-value,Total number of related genes in database,Number of related genes in user/study list
0,TO_0000430,germination rate,5.599665,1.665317e-128,4.7794590000000003e-126,5626,364
1,TO_0000190,seed coat color,11.718694,9.178751e-102,1.317151e-99,1150,159
2,TO_0000276,drought tolerance,3.098846,2.814126e-90,2.69218e-88,16360,568
3,TO_0006002,proline content,3.542854,1.303954e-80,9.355871e-79,8960,382
4,TO_0002661,seed maturation,3.526011,2.053047e-61,1.178449e-59,6291,280
5,TO_0000253,seed dormancy,3.760688,6.068132e-61,2.90259e-59,5296,253
6,TO_0000344,days to flowering trait,2.202993,2.428542e-20,9.957020999999998e-19,6362,191
7,TO_0000112,disease resistance,1.716402,1.297589e-17,4.6551e-16,15631,358
8,TO_0006001,salt tolerance,2.028255,3.303867e-17,1.053567e-15,7015,195
9,TO_0006019,floral organ identity,2.486894,6.52359e-17,1.87227e-15,3426,118


## The table below shows the meaning of the evidence codes
- A homologous gene (or homolog) is a gene that two species have inherited from a common ancestor.
- Genetic interaction networks represent the functional interactions between pairs of genes.


In [6]:
# Show the evidence-code lookup table as the cell output (the call is the last
# expression of the cell). df_evidence is presumably supplied by the star import
# of enrichment_analysis_functions - confirm.
df_evidence()

Unnamed: 0,Evidence Code,Evidence Type,Homology,Interaction
0,TM_0-0,Text Mining (TM),0,0
1,TM_0-1,Text Mining,0,1
2,TM_1-0,Text Mining,1,0
3,TM_1-1,Text Mining,1,1
4,GWAS_0-0,Genetic Study (GWAS),0,0
5,GWAS_0-1,Genetic Study,0,1
6,GWAS_1-0,Genetic Study,1,0
7,GWAS_1-1,Genetic Study,1,1


## Choose a trait to display related genes

In [8]:
@interact
def get_gene_list_for_triat(Trait_Name = sorted(df_Ftest_sorted['Trait Name'].unique())):
    """Print the enrichment summary of one trait and list its related genes.

    Parameters
    ----------
    Trait_Name
        Trait selected in the dropdown; the options are the sorted unique
        trait names of ``df_Ftest_sorted`` at the time the cell is run.

    Returns
    -------
    IPython.display.HTML
        Table of the rows of ``dframe_GeneTrait_filtered`` for this trait, with
        the network URL turned into a "View Network" link.
    """
    # Enrichment-table row(s) for the selected trait; .item() below requires
    # exactly one of them.
    summary = df_Ftest_sorted[df_Ftest_sorted['Trait Name'] == Trait_Name]

    print(" Trait Accession is: " + str(summary['Trait Accession'].item()))

    print(" Adjusted p-value is: " + str(summary['adj p-value'].item()))

    # Gene rows for the trait, limited to the columns that are displayed.
    shown_columns = ["Gene Accession", "Gene Name", "Evidence", "Network URL"]
    is_selected = dframe_GeneTrait_filtered['Trait Name'] == Trait_Name
    gene_table = dframe_GeneTrait_filtered.loc[is_selected, shown_columns].reset_index(drop=True)

    link_text = "View Network"

    def as_link(url):
        # Wrap the raw URL in an anchor; to_html is called with escape=False so
        # that the markup survives.
        return f'<a href="{url}">{link_text}</a>'

    gene_table['Network URL'] = gene_table['Network URL'].apply(as_link)

    related_count = summary['Number of related genes in user/study list'].item()
    print("\n Total number of related unique genes from \n user/study list of genes is: " +
          str(related_count))

    return HTML(gene_table.to_html(render_links=True, escape=False))

interactive(children=(Dropdown(description='Trait_Name', options=('1000-grain weight', 'Common bunt spike inci…

## Choose a gene to display related traits

In [9]:
@interact
def get_gene_list_for_triat(Gene_Name = sorted(dframe_GeneTrait_filtered['Gene Name'].unique())):
    """List the traits related to one gene.

    NOTE(review): this callback reuses the name of the trait-selection callback
    defined in the preceding cell and therefore rebinds it; a distinct name
    would be clearer.

    Parameters
    ----------
    Gene_Name
        Gene selected in the dropdown; the options are the sorted unique gene
        names of ``dframe_GeneTrait_filtered`` at the time the cell is run.

    Returns
    -------
    IPython.display.HTML
        Table of the rows of ``dframe_GeneTrait_filtered`` for this gene, with
        the network URL turned into a "View Network" link.
    """
    shown_columns = ["Gene Accession", "Trait Accession", "Trait Name", "Evidence", "Network URL"]
    is_selected = dframe_GeneTrait_filtered['Gene Name'] == Gene_Name
    # Rows are shown as stored; no de-duplication is applied.
    trait_table = dframe_GeneTrait_filtered.loc[is_selected, shown_columns].reset_index(drop=True)

    link_text = "View Network"

    def as_link(url):
        # Wrap the raw URL in an anchor; to_html is called with escape=False so
        # that the markup survives.
        return f'<a href="{url}">{link_text}</a>'

    trait_table['Network URL'] = trait_table['Network URL'].apply(as_link)

    distinct_traits = trait_table['Trait Accession'].unique()
    print("\n Total number of related unique traits is: " + str(len(distinct_traits)))

    return HTML(trait_table.to_html(render_links=True, escape=False))

interactive(children=(Dropdown(description='Gene_Name', options=('4CLL9', 'AAC1', 'AAE14', 'AAO1', 'AAP2', 'AB…