In [1]:
import pandas as pd
from gprofiler import GProfiler
# Shared g:Profiler client; get_enriched_pathways() below calls gp.profile() on it.
# NOTE(review): return_dataframe=True presumably makes profile() return a pandas
# DataFrame (the result is chained with .assign() later) — confirm in the gprofiler docs.
gp = GProfiler(return_dataframe=True)

In [2]:
# Load the per-gene delta-correlation table from the working directory.
# The rendered output shows the columns Gene, Delta_Correlation, P_Value, FDR and Cancer.
delta_correlation_df = pd.read_csv('delta_correlation_df.csv')
# Bare last expression so the notebook renders the frame.
delta_correlation_df

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.268533,5.703182e-02,1.320375e-01,CCRCC
1,A1CF,0.192038,1.063340e-04,6.401858e-04,CCRCC
2,A2M,-0.191619,1.277644e-01,2.439276e-01,CCRCC
3,AAAS,0.019654,8.963138e-01,9.409267e-01,CCRCC
4,AACS,-0.169937,6.007042e-02,1.375402e-01,CCRCC
5,AADAT,-0.263372,1.531939e-02,4.572896e-02,CCRCC
6,AAED1,0.190506,2.882440e-01,4.385777e-01,CCRCC
7,AAGAB,0.364999,7.633656e-04,3.646252e-03,CCRCC
8,AAK1,0.356932,6.347464e-14,1.385608e-12,CCRCC
9,AAMP,0.281272,2.536223e-02,6.920503e-02,CCRCC


In [9]:
def get_enriched_pathways(delta_corr, source, fdr_cutoff=0.05):
    """Run one g:Profiler enrichment per cancer type and stack the results.

    For every distinct value in ``delta_corr.Cancer`` the genes with
    ``FDR < fdr_cutoff`` form the query, and every gene listed for that cancer
    forms the background passed to ``gp.profile``.

    Parameters
    ----------
    delta_corr : pandas.DataFrame
        Table providing the columns ``Cancer``, ``Gene`` and ``FDR``.
    source : str
        Single data-source identifier, forwarded as ``sources=[source]``
        (this notebook uses ``"KEGG"`` and ``"WP"``).
    fdr_cutoff : float, default 0.05
        Genes whose FDR is strictly below this value count as significant.

    Returns
    -------
    pandas.DataFrame
        The per-cancer profiles concatenated row-wise (original row labels
        are kept), each tagged with a ``cancer_type`` column. An empty frame
        is returned when no cancer type has any significant gene.

    Notes
    -----
    Uses the module-level ``gp`` client rather than taking it as an argument.
    """
    # Collect the per-cancer frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside
    # a loop re-copies all accumulated rows on every iteration.
    profiles = []

    for cancer in pd.unique(delta_corr.Cancer):
        cancer_df = delta_corr[delta_corr.Cancer == cancer]
        background_genes = list(pd.unique(cancer_df.Gene))
        sig_genes = list(pd.unique(cancer_df.loc[cancer_df.FDR < fdr_cutoff, "Gene"]))

        # Nothing to test for enrichment when no gene passes the cutoff.
        if not sig_genes:
            continue

        # NOTE(review): ordered=True asks g:Profiler to treat the query as a
        # ranked list, but sig_genes is in the row order of delta_corr, not
        # sorted by significance — confirm this is intended.
        cancer_profile = gp.profile(
            organism='hsapiens',
            query=sig_genes,
            no_iea=True,
            sources=[source],
            ordered=True,
            no_evidences=False,
            background=background_genes
        ).assign(cancer_type=cancer)

        profiles.append(cancer_profile)

    # pd.concat raises on an empty list, so keep the original "empty frame" result.
    if not profiles:
        return pd.DataFrame()

    return pd.concat(profiles)
        

In [10]:
# Run the per-cancer enrichment once for each pathway source.
# The calls are kept as separate statements so that the first result
# stays bound even if the second call fails.
enriched_kegg = get_enriched_pathways(delta_correlation_df, "KEGG")
enriched_wp = get_enriched_pathways(delta_correlation_df, "WP")

## Which pathways are found in all cancers?

In [11]:
def summarize_pathways(all_profiles):
    """Group pathways by the exact set of cancer types they are enriched in.

    Parameters
    ----------
    all_profiles : pandas.DataFrame
        Stacked enrichment results with at least the columns ``name``
        (pathway name) and ``cancer_type``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of cancer types, with columns

        * ``cancers`` – sorted list of the cancer types in the combination,
        * ``pathways`` – sorted list of pathway names enriched in exactly
          those cancer types,
        * ``pathways_ct`` – number of pathways in that list.

        Rows are ordered by the number of cancer types (largest first) and
        then alphabetically by the ``"_"``-joined cancer names.
    """
    # Key each pathway by the tuple of cancer types it appears in. A tuple is
    # used instead of a "_"-joined string that is split again later, so cancer
    # labels that contain an underscore are not broken apart.
    pathways_by_cancers = {}
    for pathway, cancers in all_profiles.groupby("name")["cancer_type"]:
        combo = tuple(sorted(set(cancers)))
        pathways_by_cancers.setdefault(combo, []).append(pathway)

    # Largest combinations first; ties are broken on the joined names, which
    # is the same sort key the string-based implementation used.
    ordered = sorted(
        pathways_by_cancers.items(),
        key=lambda item: (-len(item[0]), "_".join(item[0])),
    )

    summ = pd.DataFrame(
        {
            "cancers": [list(combo) for combo, _ in ordered],
            "pathways": [sorted(pathways) for _, pathways in ordered],
            "pathways_ct": [len(pathways) for _, pathways in ordered],
        }
    )

    # Keep an integer count column even when there are no rows at all.
    return summ.astype({"pathways_ct": "int64"})

In [15]:
# Summarise the KEGG enrichment by the combination of cancer types each pathway appears in.
kegg_pw_summ = summarize_pathways(enriched_kegg)
# Total number of distinct KEGG pathways across all cancer-type combinations.
print(kegg_pw_summ["pathways_ct"].sum())
# Bare last expression so the notebook renders the summary table.
kegg_pw_summ

67


Unnamed: 0,cancers,pathways,pathways_ct
0,"[CCRCC, Endometrial, HNSCC, LSCC, LUAD]","[Alanine, aspartate and glutamate metabolism, ...",10
1,"[CCRCC, Endometrial, LSCC, LUAD]","[Arginine and proline metabolism, Butanoate me...",4
2,"[CCRCC, HNSCC, LSCC, LUAD]","[ABC transporters, Glycerolipid metabolism]",2
3,"[Endometrial, HNSCC, LSCC, LUAD]","[Fatty acid biosynthesis, Riboflavin metabolism]",2
4,"[CCRCC, Endometrial, LUAD]",[Biosynthesis of cofactors],1
5,"[CCRCC, HNSCC, LSCC]",[Vibrio cholerae infection],1
6,"[CCRCC, LSCC, LUAD]","[Carbon metabolism, Glyoxylate and dicarboxyla...",3
7,"[Endometrial, LSCC, LUAD]",[Thiamine metabolism],1
8,"[HNSCC, LSCC, LUAD]",[Nitrogen metabolism],1
9,"[CCRCC, HNSCC]","[Human immunodeficiency virus 1 infection, Oxi...",2


In [14]:
# Same summary for the "WP" source results.
wp_pw_summ = summarize_pathways(enriched_wp)
# Total number of distinct pathways across all cancer-type combinations.
print(wp_pw_summ["pathways_ct"].sum())
# Bare last expression so the notebook renders the summary table.
wp_pw_summ

70


Unnamed: 0,cancers,pathways,pathways_ct
0,"[CCRCC, Endometrial, HNSCC, LSCC, LUAD]",[Fatty Acid Biosynthesis],1
1,"[CCRCC, Endometrial, LSCC, LUAD]","[7-oxo-C and 7beta-HC pathways, Amino Acid met...",7
2,"[CCRCC, LSCC, LUAD]","[Ethanol effects on histone modifications, Fat...",6
3,"[Endometrial, LSCC, LUAD]","[Nanomaterial induced apoptosis, Prostaglandin...",2
4,"[HNSCC, LSCC, LUAD]","[DNA Damage Response, Integrated Cancer Pathwa...",3
5,"[CCRCC, HNSCC]","[Ebola Virus Pathway on Host, Mitochondrial co...",3
6,"[CCRCC, LUAD]","[Drug Induction of Bile Acid Pathway, Oxidativ...",2
7,"[HNSCC, LSCC]","[Apoptosis, Small cell lung cancer]",2
8,"[LSCC, LUAD]","[Apoptosis Modulation by HSP70, Benzo(a)pyrene...",8
9,[CCRCC],"[Butyrate-induced histone acetylation, Codeine...",9


Next step: heatmap
- One axis is cancer types
- Other axis is pathways, grouped by hallmark
- Color is the proportion of genes in each pathway that have a significant delta correlation