# Use Case 5: Gene set enrichment analysis

<b>Import standard data analysis imports, as well as the gseapy which will allow us to perform a Gene set enrichment analysis</b>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import gseapy as gp

<b>Import the CPTAC data</b>

In [2]:
import CPTAC

Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Data...

 ******PLEASE READ******


<b>Retrieve the clinical and proteomics dataframes</b>

In [3]:
clinical = CPTAC.get_clinical()
proteomics = CPTAC.get_proteomics()

<b>For this example we will be separating the protein abudance based on the clinical MSI. Our first step is to combine the MSI information into the proteomics dataframe utilizing the <code>CPTAC.compare_clinical()</code> function</b>

In [4]:
msiProt = CPTAC.compare_clinical(clinical, proteomics, 'MSI')

<b>Separate the proteomics into two groups based on whether MSI is MSI-H or other </b>

In [5]:
high = msiProt['MSI'] == "MSI-H"
other = msiProt['MSI'] != "MSI-H"
highMSI = msiProt[high]
otherMSI = msiProt[other]

<b>Find which genes are upregulated in each partition</b>

In [18]:
genes_passing = []
genes = highMSI.columns[1:]
threshold = .05 / len(genes)
for gene in genes:
    high_abundance = highMSI[gene]
    other_abundance = otherMSI[gene]
    pvalue = stats.ttest_ind(high_abundance, other_abundance, nan_policy='omit').pvalue
    if pvalue < threshold:
        genes_passing.append(gene)
print(genes_passing)

['ACLY', 'ACY3', 'ADD1', 'ADD3', 'ADO', 'AGO2', 'AGR2', 'ALDH18A1', 'ALKBH4', 'AP1G2', 'AP1M2', 'ARFIP1', 'ARFIP2', 'ARHGAP23', 'ARHGEF40', 'ARMCX1', 'ARMCX2', 'ARPC1A', 'BAG3', 'CACYBP', 'CAND2', 'CBFB', 'CCNA2', 'CCT3', 'CCT5', 'CD109', 'CD2AP', 'CDC42BPB', 'CDC42EP4', 'CELF2', 'CEP112', 'CFI', 'CHAF1A', 'CHAF1B', 'CIC', 'CLPP', 'CLPX', 'CRABP2', 'DCK', 'DDAH1', 'DDAH2', 'DHFR', 'DIP2C', 'DPF2', 'DST', 'DTYMK', 'EFL1', 'EHBP1', 'EIF4E3', 'EML1', 'EPM2AIP1', 'EXOC6B', 'FABP3', 'FANCI', 'FEN1', 'FGD5', 'FNBP1L', 'FOXJ2', 'GALNT7', 'GCC2', 'GCHFR', 'GNPNAT1', 'GOLM1', 'GOLPH3L', 'GPRASP2', 'GSN', 'GSTM2', 'HDAC5', 'HDGFL3', 'HECA', 'HMBS', 'HMGCS1', 'HSPA12A', 'HSPA12B', 'HSPB8', 'IDI1', 'IQSEC1', 'IREB2', 'IRF2BPL', 'IRS1', 'JAK1', 'KIF13A', 'KIF4A', 'KIF7', 'L3MBTL3', 'LAD1', 'LAMB2', 'LDAH', 'LDHA', 'LIG1', 'LLGL1', 'LRBA', 'LRP1', 'LTN1', 'MAOB', 'MAP1B', 'MAP1S', 'MAP4', 'MAP7D1', 'MATN2', 'MBD2', 'MLH1', 'MMP15', 'MMRN2', 'MRE11', 'MVD', 'NCAPG2', 'NCOA1', 'NEK6', 'NES', 'NHSL2', 

In [12]:
#TODO: how to pass otherMSI column into apply function? otherMSI[x] doesn't work. Should I combine the dataframes?
passing2 = highMSI[highMSI.columns[1:]].apply(lambda x:stats.ttest_ind(x, otherMSI[x])) 
passing2

KeyError: ('[-1.01 -0.51 -0.93 -0.03 -1.52 -0.05 -0.53  0.28 -0.4   1.55 -0.45  0.41\n -0.42 -0.39 -0.67  0.19  0.48  1.44 -0.05 -0.79 -0.94 -0.53 -0.51 -0.47\n  0.14 -0.06 -0.89] not in index', 'occurred at index A1BG')

<b>Then use the genes that are up-regulated in these partitions to do a gene set enrichment analysis</b>

In [7]:
enr = gp.enrichr(gene_list = genes_passing, description='MSI partitions', gene_sets='KEGG_2016', outdir='test/enrichr_kegg',cutoff=.5)
enr.res2d

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Z-score,Combined Score,Genes,Gene_set
0,Terpenoid backbone biosynthesis_Homo sapiens_h...,4/22,0.000056,0.008267,0.000169,0.024984,-1.811406,17.738499,IDI1;HMGCS1;PDSS2;MVD,KEGG_2016
1,Pathogenic Escherichia coli infection_Homo sap...,5/55,0.000196,0.014509,0.000483,0.035724,-1.823937,15.571069,OCLN;ROCK1;YWHAQ;ROCK2;ARPC1A,KEGG_2016
2,Mismatch repair_Homo sapiens_hsa03430,3/23,0.001400,0.069059,0.002778,0.137051,-1.499972,9.856899,LIG1;PMS2;MLH1,KEGG_2016
3,Metabolic pathways_Homo sapiens_hsa01100,21/1239,0.009633,0.248834,0.028499,0.527227,-1.912256,8.877714,GALNT7;IDI1;RRM1;DTYMK;HMGCS1;MAOB;UAP1;NMNAT3...,KEGG_2016
4,Amino sugar and nucleotide sugar metabolism_Ho...,3/48,0.011442,0.248834,0.018288,0.495007,-1.621962,7.250948,UGDH;GNPNAT1;UAP1,KEGG_2016
5,Tight junction_Homo sapiens_hsa04530,5/139,0.011769,0.248834,0.020830,0.495007,-1.554297,6.904605,OCLN;TJAP1;LLGL1;TJP3;TJP2,KEGG_2016
6,Fanconi anemia pathway_Homo sapiens_hsa03460,3/53,0.014959,0.276738,0.023412,0.495007,-1.485949,6.244631,FANCI;PMS2;MLH1,KEGG_2016
7,Non-homologous end-joining_Homo sapiens_hsa03450,2/13,0.006873,0.248834,0.011441,0.423323,-1.101071,5.483427,FEN1;RAD50,KEGG_2016
8,Shigellosis_Homo sapiens_hsa05131,3/65,0.025633,0.421519,0.038566,0.634195,-1.454109,5.327678,ROCK1;ROCK2;ARPC1A,KEGG_2016
9,AMPK signaling pathway_Homo sapiens_hsa04152,4/124,0.033300,0.451568,0.052046,0.676213,-1.561928,5.314006,CCNA2;IRS1;PPP2R3A;PFKM,KEGG_2016
