In [1]:
import cptac
import cptac.utils as ut
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from scipy import stats
import gseapy as gp
from gseapy.plot import barplot, dotplot
import statsmodels.stats.multitest as ssm

In [2]:
perm_df = pd.read_csv("data/full_10k_permutation.csv")
perm_df

Unnamed: 0,Cancer,A1BG,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADAT,...,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDA,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,ccrcc,0.05281,0.000838,0.029716,,0.337474,0.728086,0.06413643,0.350422,0.0577,...,,0.1506673,0.03202242,0.160367,,0.033585,0.5243498,0.08419726,0.3727352,0.202787
1,en,0.78172,,0.911887,0.01389663,0.332572,0.005615,6.073417e-09,,0.309108,...,0.601641,0.7465966,7.258794e-06,5.565692e-09,,0.285104,0.08265039,0.5077769,0.2323887,0.196638
2,luad,0.758399,,0.010615,,,0.139531,1.567981e-36,0.022448,0.017033,...,,4.054898e-10,5.038784e-20,1.490761e-05,,0.577845,1.056011e-10,4.521974e-08,2.750287e-15,1.3e-05
3,hnscc,0.996778,0.776459,0.228337,0.0005558182,0.040184,0.014557,0.8203827,0.00028,,...,,0.0006634511,0.1484058,0.001437383,,0.002749,0.0008339845,0.8879081,0.1516343,0.762835
4,lscc,0.137994,,0.097044,2.033188e-22,0.154022,0.008081,0.07086254,0.456664,,...,0.001427,1.917384e-13,7.815858999999999e-51,2.8451910000000003e-33,,0.011741,0.007526734,3.401658e-09,1.794113e-06,0.018557


In [3]:
delta_corr_df = pd.read_csv('data/corr_diff.csv')
delta_corr_df

Unnamed: 0,Cancer,A1BG,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADAT,...,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDA,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,ccrcc,-0.388745,0.146993,-0.277035,,-0.795957,-0.044291,-0.128218,1.196756,-0.174925,...,,0.269728,0.329727,-0.337301,,0.33747,0.109414,-0.223934,-0.138352,-0.220727
1,endometrial,0.061654,,-0.025535,0.40221,0.399061,0.663703,0.593777,,0.302879,...,0.177842,0.088061,0.759429,1.006282,,-0.267078,0.400272,0.147305,0.367791,0.342423
2,luad,-0.039149,,0.097233,,,0.206094,0.683092,-0.119746,0.630155,...,,0.557832,0.904669,0.645076,,0.087794,0.707991,0.472139,0.418589,0.661323
3,hnscc,-0.002651,0.407295,-0.194186,0.117838,0.555675,0.462864,0.024965,-0.333934,-0.294627,...,1.039965,0.395189,0.119698,1.079145,,0.591841,-0.453269,-0.010672,0.255505,-0.060708
4,lscc,0.196946,,-0.06263,0.649123,0.246831,0.304418,0.150665,-0.068258,,...,0.613891,0.491043,0.64643,0.838981,,0.33235,0.231167,0.315679,0.268239,0.446553


In [4]:
perm_df = perm_df.melt(id_vars='Cancer', var_name = 'Gene', value_name = 'pval')
delta_corr_df = delta_corr_df.melt(id_vars='Cancer', var_name='Gene', value_name='delta_corr')

In [5]:
df = pd.merge(perm_df, delta_corr_df)

In [6]:
df = df.dropna()
df

Unnamed: 0,Cancer,Gene,pval,delta_corr
0,ccrcc,A1BG,0.052810,-0.388745
1,luad,A1BG,0.758399,-0.039149
2,hnscc,A1BG,0.996778,-0.002651
3,lscc,A1BG,0.137994,0.196946
4,ccrcc,A1CF,0.000838,0.146993
...,...,...,...,...
54671,lscc,ZZEF1,0.000002,0.268239
54672,ccrcc,ZZZ3,0.202787,-0.220727
54673,luad,ZZZ3,0.000013,0.661323
54674,hnscc,ZZZ3,0.762835,-0.060708


In [7]:
background_genes = list(pd.unique(df.Gene))

In [8]:
cancer_dfs = []
for cancer in pd.unique(df.Cancer):
    cancer_df = df[df.Cancer == cancer]
    pvals = list(cancer_df.pval)
    corrected_pvals = list(ssm.fdrcorrection(pvals)[1])
    cancer_df.pval = corrected_pvals
    cancer_dfs.append(cancer_df)
df = pd.concat(cancer_dfs)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [9]:
sig_perm_df = df[df.pval <= 0.05]

In [10]:
sig_genes = pd.DataFrame(sig_perm_df.groupby('Gene').size())
sig_genes = sig_genes[sig_genes[0] > 2]
sig_genes = list(sig_genes.index)
avg_pvals = []
avg_corrs = []
for gene in sig_genes:
    gene_df = sig_perm_df[sig_perm_df.Gene==gene]
    avg_pval = np.average(gene_df.pval)
    avg_corr = np.average(gene_df.delta_corr)
    avg_corrs.append(avg_corr)
    avg_pvals.append(avg_pval)
sig_genes = pd.DataFrame({'Gene': sig_genes, 'avg_pval': avg_pvals, 'avg_delta_corr': avg_corrs})
sig_genes

Unnamed: 0,Gene,avg_pval,avg_delta_corr
0,AAGAB,1.191913e-05,0.513239
1,AAK1,2.187208e-09,0.539893
2,AAMP,7.805338e-03,0.422332
3,AARS,1.022777e-02,0.443220
4,AARS2,1.766085e-05,0.563538
...,...,...,...
3044,ZPR1,4.507710e-03,0.552338
3045,ZSCAN18,9.995611e-03,0.461501
3046,ZW10,1.301439e-03,0.481355
3047,ZWINT,2.499200e-03,0.854401


In [11]:
sig_genes = sig_genes.sort_values('avg_pval')
sig_genes = sig_genes.reset_index()
ranked_gene_list = list(sig_genes.Gene)

In [12]:
from gprofiler import GProfiler
gp = GProfiler(return_dataframe=True)
results_df = gp.profile(organism='hsapiens', query = ranked_gene_list, no_iea=True,
                       ordered=True, no_evidences=False, background= background_genes)

In [13]:
filtered_results = results_df[results_df.term_size <= 500]
filtered_results = filtered_results[filtered_results.term_size >=15]

In [14]:
kegg_results = filtered_results[filtered_results.source == 'KEGG']
kegg_results

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences
219,KEGG,KEGG:00280,"Valine, leucine and isoleucine degradation",0.000101,True,"Valine, leucine and isoleucine degradation",48,2397,24,12939,0.010013,0.5,query_1,[KEGG:00000],"[ALDH6A1, OXCT1, ACAT1, IL4I1, ALDH7A1, MUT, D...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
232,KEGG,KEGG:01200,Carbon metabolism,0.000552,True,Carbon metabolism,112,2596,45,12939,0.017334,0.401786,query_1,[KEGG:00000],"[ALDH6A1, ALDOC, ACAT1, PKM, RPIA, PFKM, RGN, ...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
240,KEGG,KEGG:00240,Pyrimidine metabolism,0.001212,True,Pyrimidine metabolism,51,2738,27,12939,0.009861,0.529412,query_1,[KEGG:00000],"[CAD, ENTPD1, TK1, NME1, UPP1, TYMS, CMPK2, CA...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
255,KEGG,KEGG:01230,Biosynthesis of amino acids,0.003258,True,Biosynthesis of amino acids,72,2757,32,12939,0.011607,0.444444,query_1,[KEGG:00000],"[ALDOC, PKM, RPIA, PFKM, ASS1, PGAM1, PFKP, EN...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
260,KEGG,KEGG:01240,Biosynthesis of cofactors,0.004217,True,Biosynthesis of cofactors,136,2808,56,12939,0.019943,0.411765,query_1,[KEGG:00000],"[FLAD1, TPK1, GCLC, AK4, CAD, ALPL, PKM, NADK2...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
261,KEGG,KEGG:00270,Cysteine and methionine metabolism,0.004425,True,Cysteine and methionine metabolism,46,1564,17,12939,0.01087,0.369565,query_1,[KEGG:00000],"[GCLC, SMS, MTAP, IL4I1, TST, MPST, DNMT3A, LD...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
307,KEGG,KEGG:00010,Glycolysis / Gluconeogenesis,0.01844,True,Glycolysis / Gluconeogenesis,62,565,11,12939,0.019469,0.177419,query_1,[KEGG:00000],"[ALDOC, PKM, PFKM, ALDH7A1, PGAM1, PFKP, LDHB,...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
316,KEGG,KEGG:03030,DNA replication,0.022233,True,DNA replication,34,605,8,12939,0.013223,0.235294,query_1,[KEGG:00000],"[RFC5, PRIM1, MCM5, MCM6, POLA2, LIG1, POLD2, ...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
317,KEGG,KEGG:00640,Propanoate metabolism,0.022739,True,Propanoate metabolism,34,2725,17,12939,0.006239,0.5,query_1,[KEGG:00000],"[ALDH6A1, ACAT1, LDHB, LDHA, MUT, DBT, ABAT, A...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
333,KEGG,KEGG:00250,"Alanine, aspartate and glutamate metabolism",0.032579,True,"Alanine, aspartate and glutamate metabolism",32,951,10,12939,0.010515,0.3125,query_1,[KEGG:00000],"[CAD, PPAT, IL4I1, ASS1, GFPT1, GPT2, ALDH5A1,...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."


In [15]:
sig_genes

Unnamed: 0,index,Gene,avg_pval,avg_delta_corr
0,1744,NSUN2,4.249096e-68,0.815741
1,2582,SULF1,2.602251e-48,0.449835
2,2761,TPX2,5.535236e-30,0.848989
3,1286,ITGB6,4.277663e-28,0.671351
4,1447,MAOA,4.938832e-28,0.517566
...,...,...,...,...
3044,639,DAPK3,3.277154e-02,0.068519
3045,2200,RBM33,3.333064e-02,0.345110
3046,195,ARL15,3.440768e-02,0.314849
3047,2599,TAB3,3.588943e-02,0.392011


Make KEGG Mapper scripts
https://www.kegg.jp/kegg/tool/map_pathway2.html
Search mode: 
    organism-specific: hsa
    outside ID: UniProt

In [16]:
gene_ids = []
for index, row in kegg_results.iterrows():
    pathway = row.description
    print('\n' + str(pathway))
    old_genes = (row.intersections)
    new_genes = list(gp.convert(query = old_genes, organism = 'hsapiens', target_namespace= 'uniprotswissprot_acc').converted)
    for gene, ID in zip(old_genes, new_genes):
        s = ID
        delta_corr = list(sig_genes[sig_genes.Gene == gene].avg_delta_corr)[0]
        if delta_corr > 0:
            s += ' green'
        else:
            s+= ' red'
        print(s)
        
    


Valine, leucine and isoleucine degradation
Q02252 green
P55809 green
P24752 green
Q96RQ9 green
P49419 green
P22033 green
P11182 green
P26440 green
P80404 green
P16219 green
Q9UKU7 green
Q06278 green
P45954 green
Q6NVY1 green
P35914 green
P42765 green
P11310 green
Q96RQ3 green
P30084 green
P31937 green
P51648 green
Q01581 green
P05165 green
Q99714 green

Carbon metabolism
Q02252 green
P09972 green
P24752 green
P14618 green
P49247 green
P08237 green
Q15493 green
P18669 green
Q01813 green
P10768 green
P13929 green
Q8TD30 green
P60174 green
Q5T6J7 green
P78330 green
P22033 green
P36957 green
P11498 green
P17174 green
P11413 green
P16219 green
P00558 green
Q9P2R7 green
P34896 green
P48735 green
Q6NVY1 green
Q9Y617 green
P04075 green
P07954 green
P17858 green
Q3LXA3 green
P06744 green
P08559 green
P30084 green
O43175 green
P00367 green
P23378 green
P09467 red
P53597 red
P31327 green
P05165 green
P19367 green
P20132 green
O75390 green
P06733 green

Pyrimidine metabolism
P27708 green
P49961 g