# Enrichment Analysis using KnetMiner SPARQL endpoint with Jupyter

This Jupyter Notebook uses the KnetMiner SPARQL endpoint to extract gene expression data from the RDF database and perform enrichment analysis.

## Choose the tax ID, concept and study or enter list of genes

### Steps:
1. Run the first cell to get the radio buttons for selection of species, concept and study or list.
2. Then run the second cell to get the results for the selected species and concept.

Note: You only need to run the first cell once. If you want to change your choices, make the selections, then run the second cell again.

### The results generated are:
1. The gene-concept table containing the genes list and their related ontology terms and evidence
2. The enrichment table

### Please note:
1. The only tax IDs that will generate a table of studies are:
    - 4565: Triticum aestivum (wheat)
    - 3702: Arabidopsis thaliana


2. You can filter the gene-concept table using the 2 cells at the bottom by either:
    - choosing the ontology term to display the related genes
    - or choosing a gene to display the related ontology terms

In [None]:
# Import the libraries and functions
from enrichment_analysis_functions import *

# Show every row when a dataframe is rendered instead of truncating it.
pd.options.display.max_rows = None

# Lookup data for the selection widgets: the tax ID / tax name table and the
# list of concepts.
dframe_taxID = df_taxID()
concepts = get_concepts()

# Species selector, populated from the 'Tax Names' column.
print("Select the desired species:")
species_names = dframe_taxID['Tax Names'].tolist()
radiobuttons1 = display_radiobuttons(data=species_names)

# Concept selector.
print("Select the desired concept:")
radiobuttons2 = display_radiobuttons(data=concepts)

# Source of the gene list: a study from the database or a user-supplied list.
print("Do you want to get the list of genes from a study or use your own list?")
gene_source_options = ["Study", "List of Genes"]
radiobuttons3 = display_radiobuttons(data=gene_source_options)

In [None]:
# Read the current selections from the three radio-button widgets.
species, concept, studyOrList = (
    widget.get_interact_value()
    for widget in (radiobuttons1, radiobuttons2, radiobuttons3)
)

# Resolve the tax ID that belongs to the selected species name.
is_selected_species = dframe_taxID['Tax Names'] == species
taxID = dframe_taxID.loc[is_selected_species, 'Tax IDs'].item()
print(f'Tax ID for {species} is: {taxID}')

# Total number of genes in the database for the selected tax ID.
total_db_genes = get_gene_count(taxID)
print('Total Number of Genes = ' + str(total_db_genes))

# Load the gene/concept table for the selected concept and tax ID from its
# CSV file, keeping only the columns used by the rest of the notebook.
gene_table_columns = [
    "Gene Accession",
    "Gene Name",
    "Ontology Term",
    "Preferred Name",
    "Evidence",
    "Network URL",
]
dframe_GeneTrait = pd.read_csv(
    f'Gene{concept}Table_{taxID}.csv',
    usecols=gene_table_columns,
)

# Module-level placeholders that the @interact_manual callbacks overwrite
# through `global`; they start out empty on every run of this cell.
total_DEXgenes = set()
dframe_GeneTrait_filtered = pd.DataFrame()
df_Ftest_sorted = pd.DataFrame()


# Obtain the genes of interest either from a study in the database or from a
# list pasted by the user. Each branch registers an @interact_manual widget;
# its callback stores the results in the module-level variables
# total_DEXgenes, dframe_GeneTrait_filtered and df_Ftest_sorted so that the
# cells further down can read them.
if studyOrList == "Study":
    print("\nLoading studies ... Please wait.")

    # get the dataframe of the studies and their accession numbers
    dframe_study_list = get_study_list(taxID)

    if dframe_study_list.shape[0] != 0:
        print("\nChoose from the list of studies related to the chosen Tax ID:")

        @interact_manual
        def get_study_list_for_triat(Study_Title = dframe_study_list['Study Title']):
            """Run the enrichment analysis for the study chosen in the widget.

            Study_Title: one of the titles in dframe_study_list['Study Title'].

            Updates the globals total_DEXgenes, dframe_GeneTrait_filtered and
            df_Ftest_sorted. `.item()` raises ValueError unless exactly one
            study row matches the title.
            """
            global total_DEXgenes
            global dframe_GeneTrait_filtered
            global df_Ftest_sorted

            # get study accession number
            studyAcc = dframe_study_list[dframe_study_list['Study Title'] == Study_Title]['Study Accession'].item()
            print("Study Accession is: " + studyAcc)

            # genes of the study (expected to be a unique set -- confirm in
            # get_study_DEXgenes)
            total_DEXgenes = get_study_DEXgenes(studyAcc)

            print("\nLoading results ... Please wait.")
            # get final tables
            dframe_GeneTrait_filtered, df_Ftest_sorted = get_df_Ftest_sorted(dframe_GeneTrait, total_DEXgenes, total_db_genes)

    else:
        print("\nNo studies in the databse for the selected tax ID. Please provide your list of genes.")


# if studyOrList == "List of Genes"
else:
    print ("\nPlease paste the list of genes (separated by spaces).")

    @interact_manual
    def input_genes_list(genes = ''):
        """Run the enrichment analysis for a whitespace-separated gene list.

        genes: text typed/pasted into the widget; split on any whitespace.

        Updates the globals total_DEXgenes, dframe_GeneTrait_filtered and
        df_Ftest_sorted.
        """
        global total_DEXgenes
        global dframe_GeneTrait_filtered
        global df_Ftest_sorted

        # get user input genes as list
        genes_list = genes.split()
        # get unique set of genes
        total_DEXgenes = set(genes_list)
        # De-duplicate while keeping the order in which the genes were pasted,
        # so the listing below has exactly as many lines as the printed count.
        unique_genes = list(dict.fromkeys(genes_list))

        print("\n" + str(len(total_DEXgenes)) + " genes provided:")
        for g in unique_genes:
            print(g)

        print("\nLoading results ... Please wait.")
        # get final tables
        dframe_GeneTrait_filtered, df_Ftest_sorted = get_df_Ftest_sorted(dframe_GeneTrait, total_DEXgenes, total_db_genes)

## View whole tables section
If you want to display the whole tables in the notebook, run each of the two cells below.

Gene-concept Table:

In [None]:
# Work on a copy so the link markup added below never ends up in the
# original table.
df_GeneTrait_filtered = dframe_GeneTrait_filtered.copy()

# Wrap every network URL in an anchor tag so that it renders as a clickable
# link with a fixed label.
s = "View Network"
df_GeneTrait_filtered['Network URL'] = df_GeneTrait_filtered['Network URL'].map(
    lambda url: f'<a href="{url}">{s}</a>'
)

# escape=False keeps the anchor tags as real HTML in the rendered table.
HTML(df_GeneTrait_filtered.to_html(render_links=True, escape=False))

Enrichment Table:

In [None]:
# Display the enrichment table filled in by the widget callback of the
# selection cell (it is an empty dataframe until that callback has run).
df_Ftest_sorted

## The table below shows the meaning of the evidence codes
- A homologous gene (or homolog) is a gene inherited in two species by a common ancestor.
- Genetic interaction networks represent the functional interactions between pairs of genes.


In [3]:
# Display the table that explains the evidence codes.
df_evidence()

Unnamed: 0,Evidence Code,Evidence Type,Homology,Interaction
0,TM_0-0,Text Mining (TM),0,0
1,TM_0-1,Text Mining,0,1
2,TM_1-0,Text Mining,1,0
3,TM_1-1,Text Mining,1,1
4,GWAS_0-0,Genetic Study (GWAS),0,0
5,GWAS_0-1,Genetic Study,0,1
6,GWAS_1-0,Genetic Study,1,0
7,GWAS_1-1,Genetic Study,1,1


## Choose an ontology term to display related genes

In [None]:
@interact
def get_gene_list_for_triat(Ontology = sorted(df_Ftest_sorted['Preferred Name'].unique())):
    """Show the genes related to the ontology term chosen in the widget.

    Ontology: a value of the 'Preferred Name' column of df_Ftest_sorted.

    Prints the ontology term, its adjusted p-value and the number of related
    user/study genes, and returns the matching rows of
    dframe_GeneTrait_filtered as an HTML table with clickable network links.
    Each `.item()` call raises ValueError unless exactly one enrichment row
    matches the chosen name.
    """
    # Enrichment-table row(s) for the chosen preferred name, selected once
    # and reused for all three printed values.
    enrichment_row = df_Ftest_sorted[df_Ftest_sorted['Preferred Name'] == Ontology]

    print(" Ontology Term is: " +
          str(enrichment_row['Ontology Term'].item()))

    print(" Adjusted p-value is: " +
          str(enrichment_row['adj p-value'].item()))

    # Genes annotated with the chosen term, reduced to the displayed columns
    # and renumbered from zero.
    matches_term = dframe_GeneTrait_filtered['Preferred Name'] == Ontology
    shown_columns = ["Gene Accession", "Gene Name", "Evidence", "Network URL"]
    related_genes = dframe_GeneTrait_filtered.loc[matches_term, shown_columns]
    related_genes = related_genes.reset_index(drop=True)

    # Turn each network URL into an anchor tag with a fixed label.
    link_label = "View Network"
    related_genes['Network URL'] = related_genes['Network URL'].map(
        lambda url: f'<a href="{url}">{link_label}</a>'
    )

    print("\n Total number of related unique genes from \n user/study list of genes is: " +
          str(enrichment_row['User/Study Genes'].item()))

    # escape=False keeps the anchor tags as real HTML in the rendered table.
    return HTML(related_genes.to_html(render_links=True, escape=False))

## Choose a gene to display related ontology terms

### Please Note:
A gene name can have multiple accession numbers, which will be displayed in the printed table.

In [None]:
@interact
def get_gene_list_for_triat(Gene_Name = sorted(dframe_GeneTrait_filtered['Gene Name'].unique())):
    """Show the ontology terms related to the gene chosen in the widget.

    Gene_Name: a value of the 'Gene Name' column of dframe_GeneTrait_filtered.

    Prints the number of distinct ontology terms found for that gene name and
    returns the matching rows as an HTML table with clickable network links.
    """
    # Rows for the chosen gene name, reduced to the displayed columns and
    # renumbered from zero.
    matches_gene = dframe_GeneTrait_filtered['Gene Name'] == Gene_Name
    shown_columns = ["Gene Accession", "Ontology Term", "Preferred Name", "Evidence", "Network URL"]
    related_terms = dframe_GeneTrait_filtered.loc[matches_gene, shown_columns]
    related_terms = related_terms.reset_index(drop=True)

    # Turn each network URL into an anchor tag with a fixed label.
    link_label = "View Network"
    related_terms['Network URL'] = related_terms['Network URL'].map(
        lambda url: f'<a href="{url}">{link_label}</a>'
    )

    distinct_term_count = len(related_terms['Ontology Term'].unique())
    print("\n Total number of related unique ontology terms is: " + str(distinct_term_count))

    # escape=False keeps the anchor tags as real HTML in the rendered table.
    return HTML(related_terms.to_html(render_links=True, escape=False))