In [1]:
import pandas as pd
from gprofiler import GProfiler

In [2]:
# Load the proteomics and transcriptomics t-test tables and lower-case their
# cancer labels (presumably so they line up with the labels used in
# delta_corr_pvals -- confirm against the CSV contents).
# `.str.lower()` is vectorised and leaves missing labels as NaN instead of
# raising, unlike a Python-level list comprehension.
proteomics_t_test = pd.read_csv('data/proteomics_t_tests.csv')
proteomics_t_test['Cancer'] = proteomics_t_test['Cancer'].str.lower()
transcriptomics_t_test = pd.read_csv('data/transcriptomics_t_tests.csv')
transcriptomics_t_test['Cancer'] = transcriptomics_t_test['Cancer'].str.lower()

# The permutation table is wide: a `Cancer` column plus one column per gene.
# It is reshaped to long form (Cancer, Gene, perm_pval) and rows with missing
# values are dropped.
delta_corr_pvals = (
    pd.read_csv('data/full_10k_permutation_corrected.csv')
    # Only the cancer label should be expanded; restricting the replacement to
    # the `Cancer` column avoids scanning (or accidentally touching) every
    # gene column.
    .replace({'Cancer': {'en': 'endometrial'}})
    .melt(id_vars='Cancer', var_name='Gene', value_name='perm_pval')
    .dropna()
)

In [3]:
# Background gene lists used as the statistical universe for each enrichment
# query. The expression background is the set of genes present in both the
# transcriptomics and the proteomics tables; the others are the distinct genes
# of a single table.
exp_background = list(
    set(transcriptomics_t_test.Gene).intersection(proteomics_t_test.Gene)
)
corr_background = delta_corr_pvals.Gene.unique().tolist()
trans_background = transcriptomics_t_test.Gene.unique().tolist()
prot_background = proteomics_t_test.Gene.unique().tolist()

# Report the size of each background, in the order: expression, correlation,
# transcriptomics, proteomics.
for gene_universe in (exp_background, corr_background, trans_background, prot_background):
    print(len(gene_universe))

13700
13026
37921
13709


In [4]:
# For every cancer type in delta_corr_pvals, collect the genes that pass the
# 0.05 cutoff in each analysis, then count in how many cancer types each gene
# is significant per group and keep the genes seen in more than two.
dfs = []
for cancer in delta_corr_pvals.Cancer.unique():
    prot_hits = set(
        proteomics_t_test.loc[
            (proteomics_t_test.Cancer == cancer) & (proteomics_t_test.qval < 0.05),
            'Gene',
        ]
    )
    trans_hits = set(
        transcriptomics_t_test.loc[
            (transcriptomics_t_test.Cancer == cancer) & (transcriptomics_t_test.qval < 0.05),
            'Gene',
        ]
    )
    corr_hits = set(
        delta_corr_pvals.loc[
            (delta_corr_pvals.Cancer == cancer) & (delta_corr_pvals.perm_pval < 0.05),
            'Gene',
        ]
    )

    # 'expression' = genes significant in both proteomics and transcriptomics.
    gene_sets = {
        'expression': prot_hits & trans_hits,
        'correlation': corr_hits,
        'transcriptomics': trans_hits,
        'proteomics': prot_hits,
    }
    for group_name, genes in gene_sets.items():
        n_genes = len(genes)
        dfs.append(
            pd.DataFrame({
                'Cancer': [cancer] * n_genes,
                'Gene': list(genes),
                'Group': [group_name] * n_genes,
            })
        )

# One row per (Group, Gene); the unnamed size column is labelled 0 and holds
# the number of cancer types in which the gene was significant.
df = (
    pd.concat(dfs)
    .groupby(['Group', 'Gene'])
    .size()
    .to_frame()
    .reset_index()
)
df = df[df[0] > 2]
df

Unnamed: 0,Group,Gene,0
3,correlation,AAAS,3
7,correlation,AAGAB,4
8,correlation,AAK1,4
10,correlation,AAMP,3
12,correlation,AARS,4
...,...,...,...
64040,transcriptomics,ZXDC,3
64041,transcriptomics,ZYG11A,3
64042,transcriptomics,ZYG11B,5
64043,transcriptomics,ZYX,5


In [6]:
# Split the filtered table into one Series of gene names per group.
corr = df.loc[df.Group == 'correlation', 'Gene']
exp = df.loc[df.Group == 'expression', 'Gene']
prot = df.loc[df.Group == 'proteomics', 'Gene']
trans = df.loc[df.Group == 'transcriptomics', 'Gene']

In [7]:
# g:Profiler client shared by the enrichment queries; results are requested
# as DataFrames.
gp = GProfiler(
    return_dataframe=True,
)

In [8]:
# KEGG enrichment for the genes in `corr`, using corr_background as the
# background gene list.
corr_pathways = gp.profile(
    query=corr.tolist(),
    organism='hsapiens',
    sources=['KEGG'],
    background=corr_background,
    no_iea=True,
    no_evidences=False,
    ordered=False,
)

In [9]:
# KEGG enrichment for the genes in `exp`, using exp_background as the
# background gene list.
exp_pathways = gp.profile(
    query=exp.tolist(),
    organism='hsapiens',
    sources=['KEGG'],
    background=exp_background,
    no_iea=True,
    no_evidences=False,
    ordered=False,
)

In [10]:
# KEGG enrichment for the genes in `prot`, using prot_background as the
# background gene list.
prot_pathways = gp.profile(
    query=prot.tolist(),
    organism='hsapiens',
    sources=['KEGG'],
    background=prot_background,
    no_iea=True,
    no_evidences=False,
    ordered=False,
)

In [11]:
# KEGG enrichment for the genes in `trans`, using trans_background as the
# background gene list.
trans_pathways = gp.profile(
    query=trans.tolist(),
    organism='hsapiens',
    sources=['KEGG'],
    background=trans_background,
    no_iea=True,
    no_evidences=False,
    ordered=False,
)

In [12]:
# Keep only terms whose size is strictly between 15 and 500, then display.
corr_pathways = corr_pathways.loc[
    (corr_pathways.term_size > 15) & (corr_pathways.term_size < 500)
]
corr_pathways

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences
1,KEGG,KEGG:01240,Biosynthesis of cofactors,1.783777e-07,True,Biosynthesis of cofactors,134,3746,72,12945,0.019221,0.537313,query_1,[KEGG:00000],"[ADSL, ADSS, AK1, AK2, AK3, AK4, AK6, ALAD, AL...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
2,KEGG,KEGG:00240,Pyrimidine metabolism,6.378181e-07,True,Pyrimidine metabolism,51,3746,35,12945,0.009343,0.686275,query_1,[KEGG:00000],"[CAD, CANT1, CMPK1, CMPK2, CTPS1, CTPS2, DCK, ...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
3,KEGG,KEGG:01200,Carbon metabolism,6.398922e-07,True,Carbon metabolism,110,3746,61,12945,0.016284,0.554545,query_1,[KEGG:00000],"[ACADS, ACAT1, ACOX3, ACSS1, ACSS2, ALDH6A1, A...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
4,KEGG,KEGG:00970,Aminoacyl-tRNA biosynthesis,7.770558e-05,True,Aminoacyl-tRNA biosynthesis,28,3746,21,12945,0.005606,0.75,query_1,[KEGG:00000],"[AARS2, CARS, DARS2, EARS2, FARS2, HARS2, IARS...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
5,KEGG,KEGG:00280,"Valine, leucine and isoleucine degradation",0.0001777308,True,"Valine, leucine and isoleucine degradation",48,3746,30,12945,0.008009,0.625,query_1,[KEGG:00000],"[ABAT, ACAA2, ACAD8, ACADM, ACADS, ACADSB, ACA...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
7,KEGG,KEGG:05164,Influenza A,0.001425699,True,Influenza A,133,3746,62,12945,0.016551,0.466165,query_1,[KEGG:00000],"[ADAR, AKT2, AKT3, APAF1, BAK1, BID, CASP1, CA...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
8,KEGG,KEGG:04621,NOD-like receptor signaling pathway,0.002627233,True,NOD-like receptor signaling pathway,146,3746,66,12945,0.017619,0.452055,query_1,[KEGG:00000],"[ANTXR1, ATG16L1, CARD16, CARD6, CARD8, CARD9,...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
9,KEGG,KEGG:00760,Nicotinate and nicotinamide metabolism,0.002648745,True,Nicotinate and nicotinamide metabolism,30,3746,20,12945,0.005339,0.666667,query_1,[KEGG:00000],"[AOX1, ENPP1, NADK2, NADSYN1, NAMPT, NAPRT, NM...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
10,KEGG,KEGG:05160,Hepatitis C,0.004404378,True,Hepatitis C,118,3746,55,12945,0.014682,0.466102,query_1,[KEGG:00000],"[AKT2, AKT3, APAF1, BAK1, BID, BRAF, CASP8, CA...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
11,KEGG,KEGG:05162,Measles,0.006026793,True,Measles,111,3746,52,12945,0.013881,0.468468,query_1,[KEGG:00000],"[ADAR, AKT2, AKT3, APAF1, BAK1, BID, CASP8, CA...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."


In [13]:
# Keep only terms whose size is strictly between 15 and 500, then display.
exp_pathways = exp_pathways.loc[
    (exp_pathways.term_size > 15) & (exp_pathways.term_size < 500)
]
exp_pathways

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences
1,KEGG,KEGG:03010,Ribosome,2.454217e-13,True,Ribosome,132,6597,108,13598,0.016371,0.818182,query_1,[KEGG:00000],"[FAU, MRPL1, MRPL10, MRPL11, MRPL12, MRPL13, M...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
2,KEGG,KEGG:03013,RNA transport,6.860733e-09,True,RNA transport,155,6597,115,13598,0.017432,0.741935,query_1,[KEGG:00000],"[AAAS, ACIN1, ALYREF, CLNS1A, CYFIP1, DDX19A, ...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
3,KEGG,KEGG:05171,Coronavirus disease - COVID-19,1.503773e-06,True,Coronavirus disease - COVID-19,201,6597,137,13598,0.020767,0.681592,query_1,[KEGG:00000],"[ACE, ADAM17, ADAR, C1QA, C1QB, C1QC, C2, C3, ...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
4,KEGG,KEGG:03008,Ribosome biogenesis in eukaryotes,2.672239e-05,True,Ribosome biogenesis in eukaryotes,73,6597,57,13598,0.00864,0.780822,query_1,[KEGG:00000],"[BMS1, CSNK2A1, DKC1, DROSHA, EIF6, EMG1, FBL,...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
5,KEGG,KEGG:04144,Endocytosis,2.86486e-05,True,Endocytosis,232,6597,151,13598,0.022889,0.650862,query_1,[KEGG:00000],"[ACAP1, ACAP2, ACAP3, ACTR2, ACTR3, ACTR3B, AG...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
7,KEGG,KEGG:01200,Carbon metabolism,0.0001583017,True,Carbon metabolism,113,6597,80,13598,0.012127,0.707965,query_1,[KEGG:00000],"[ACAT1, ACAT2, ACO1, ACO2, ACOX1, ACOX3, ACSS1...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
8,KEGG,KEGG:03040,Spliceosome,0.000159567,True,Spliceosome,135,6597,93,13598,0.014097,0.688889,query_1,[KEGG:00000],"[ACIN1, ALYREF, BCAS2, BUD31, CCDC12, CDC5L, C...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
9,KEGG,KEGG:05169,Epstein-Barr virus infection,0.0003142469,True,Epstein-Barr virus infection,171,6597,113,13598,0.017129,0.660819,query_1,[KEGG:00000],"[ADRM1, AKT1, AKT3, APAF1, B2M, BAK1, BAX, BID...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
10,KEGG,KEGG:03030,DNA replication,0.0006764488,True,DNA replication,35,6597,30,13598,0.004548,0.857143,query_1,[KEGG:00000],"[FEN1, LIG1, MCM2, MCM3, MCM4, MCM5, MCM6, MCM...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
11,KEGG,KEGG:05014,Amyotrophic lateral sclerosis,0.001090329,True,Amyotrophic lateral sclerosis,326,6597,197,13598,0.029862,0.604294,query_1,[KEGG:00000],"[ACTB, ACTG1, ACTR10, ACTR1A, ADRM1, ALYREF, A...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."


In [14]:
# Keep only terms whose size is strictly between 15 and 500, then display.
prot_pathways = prot_pathways.loc[
    (prot_pathways.term_size > 15) & (prot_pathways.term_size < 500)
]
prot_pathways

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences
1,KEGG,KEGG:05171,Coronavirus disease - COVID-19,1.745898e-13,True,Coronavirus disease - COVID-19,201,8699,178,13603,0.020462,0.885572,query_1,[KEGG:00000],"[ACE, ADAM17, ADAR, C1QA, C1QB, C1QC, C1R, C1S...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
2,KEGG,KEGG:03010,Ribosome,3.565362e-13,True,Ribosome,132,8699,123,13603,0.01414,0.931818,query_1,[KEGG:00000],"[FAU, MRPL1, MRPL10, MRPL11, MRPL12, MRPL13, M...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
3,KEGG,KEGG:04144,Endocytosis,2.427947e-11,True,Endocytosis,232,8699,198,13603,0.022761,0.853448,query_1,[KEGG:00000],"[ACAP1, ACAP2, ACAP3, ACTR2, ACTR3, ACTR3B, AG...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
5,KEGG,KEGG:03013,RNA transport,7.783271e-09,True,RNA transport,155,8699,135,13603,0.015519,0.870968,query_1,[KEGG:00000],"[AAAS, ACIN1, ALYREF, CASC3, CLNS1A, CYFIP1, C...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
6,KEGG,KEGG:04610,Complement and coagulation cascades,8.030456e-09,True,Complement and coagulation cascades,79,8699,75,13603,0.008622,0.949367,query_1,[KEGG:00000],"[A2M, C1QA, C1QB, C1QC, C1R, C1S, C2, C3, C4A,...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
7,KEGG,KEGG:03040,Spliceosome,1.864217e-08,True,Spliceosome,135,8699,119,13603,0.01368,0.881481,query_1,[KEGG:00000],"[ACIN1, ALYREF, AQR, BCAS2, BUD31, CCDC12, CDC...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
8,KEGG,KEGG:05415,Diabetic cardiomyopathy,1.035857e-07,True,Diabetic cardiomyopathy,191,8699,160,13603,0.018393,0.837696,query_1,[KEGG:00000],"[ACE, AGT, AKT1, AKT3, ATP2A2, ATP5F1A, ATP5F1...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
9,KEGG,KEGG:05014,Amyotrophic lateral sclerosis,4.908653e-07,True,Amyotrophic lateral sclerosis,331,8699,260,13603,0.029888,0.785498,query_1,[KEGG:00000],"[ACTB, ACTG1, ACTR10, ACTR1A, ACTR1B, ADRM1, A...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
10,KEGG,KEGG:05132,Salmonella infection,4.17561e-06,True,Salmonella infection,240,8699,192,13603,0.022072,0.8,query_1,[KEGG:00000],"[ABI1, ACBD3, ACTB, ACTG1, ACTR10, ACTR1A, ACT...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
11,KEGG,KEGG:04141,Protein processing in endoplasmic reticulum,7.239206e-06,True,Protein processing in endoplasmic reticulum,160,8699,133,13603,0.015289,0.83125,query_1,[KEGG:00000],"[AMFR, ATF6, ATXN3, BAG1, BAG2, BAK1, BAX, BCA...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."


In [15]:
# Keep only terms whose size is strictly between 15 and 500, then display.
trans_pathways = trans_pathways.loc[
    (trans_pathways.term_size > 15) & (trans_pathways.term_size < 500)
]
trans_pathways

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences
3,KEGG,KEGG:05168,Herpes simplex virus 1 infection,2.890048e-24,True,Herpes simplex virus 1 infection,493,15521,403,26001,0.025965,0.817444,query_1,[KEGG:00000],"[AKT1, AKT2, AKT3, ALYREF, APAF1, B2M, BAD, BA...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
4,KEGG,KEGG:05022,Pathways of neurodegeneration - multiple diseases,1.003099e-22,True,Pathways of neurodegeneration - multiple diseases,475,15521,387,26001,0.024934,0.814737,query_1,[KEGG:00000],"[ACTR10, ACTR1A, ADRM1, AGER, APAF1, APC, APC2...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
5,KEGG,KEGG:04144,Endocytosis,9.277320e-21,True,Endocytosis,252,15521,221,26001,0.014239,0.876984,query_1,[KEGG:00000],"[ACAP1, ACAP2, ACAP3, ACTR2, ACTR3, ACTR3B, AG...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
6,KEGG,KEGG:05014,Amyotrophic lateral sclerosis,8.672610e-19,True,Amyotrophic lateral sclerosis,363,15521,299,26001,0.019264,0.823691,query_1,[KEGG:00000],"[ACTB, ACTG1, ACTR10, ACTR1A, ADRM1, ALYREF, A...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
7,KEGG,KEGG:05166,Human T-cell leukemia virus 1 infection,8.712541e-19,True,Human T-cell leukemia virus 1 infection,216,15521,191,26001,0.012306,0.884259,query_1,[KEGG:00000],"[ADCY2, ADCY3, ADCY4, ADCY5, ADCY6, ADCY7, ADC...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,KEGG,KEGG:00480,Glutathione metabolism,3.547639e-02,True,Glutathione metabolism,56,15521,46,26001,0.002964,0.821429,query_1,[KEGG:00000],"[ANPEP, CHAC1, CHAC2, G6PD, GCLC, GCLM, GGCT, ...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
238,KEGG,KEGG:00770,Pantothenate and CoA biosynthesis,3.769693e-02,True,Pantothenate and CoA biosynthesis,21,15521,20,26001,0.001289,0.952381,query_1,[KEGG:00000],"[AASDHPPT, ALDH1B1, ALDH2, ALDH3A2, BCAT1, BCA...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
239,KEGG,KEGG:03420,Nucleotide excision repair,4.018691e-02,True,Nucleotide excision repair,45,15521,38,26001,0.002448,0.844444,query_1,[KEGG:00000],"[CCNH, CDK7, CUL4A, CUL4B, DDB1, DDB2, ERCC2, ...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
240,KEGG,KEGG:04978,Mineral absorption,4.100770e-02,True,Mineral absorption,59,15521,48,26001,0.003093,0.813559,query_1,[KEGG:00000],"[ATOX1, ATP1A1, ATP1A2, ATP1A3, ATP1A4, ATP1B1...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
