In [2]:
import pandas as pd
from gprofiler import GProfiler
gp = GProfiler(return_dataframe=True)

In [3]:
delta_correlation_df = pd.read_csv('delta_correlation_df.csv')
delta_correlation_df

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.268533,5.703182e-02,1.320375e-01,CCRCC
1,A1CF,0.192038,1.063340e-04,6.401858e-04,CCRCC
2,A2M,-0.191619,1.277644e-01,2.439276e-01,CCRCC
3,AAAS,0.019654,8.963138e-01,9.409267e-01,CCRCC
4,AACS,-0.169937,6.007042e-02,1.375402e-01,CCRCC
...,...,...,...,...,...
50303,ZXDC,0.073854,6.395306e-01,6.962162e-01,LUAD
50304,ZYG11B,0.734569,1.583743e-10,1.504331e-09,LUAD
50305,ZYX,0.440861,2.253307e-06,9.499585e-06,LUAD
50306,ZZEF1,0.422001,3.731185e-11,3.889196e-10,LUAD


In [None]:
# Run one KEGG enrichment per cancer type; all_profiles[i] belongs to cancer_types[i].
cancer_types = pd.unique(delta_correlation_df.Cancer)  # ['CCRCC', 'Endometrial', 'HNSCC', 'LSCC', 'LUAD']
all_profiles = []
for cancer in cancer_types:
    # Every gene measured for this cancer type is the statistical background ...
    cancer_df = delta_correlation_df[delta_correlation_df.Cancer == cancer]
    background_genes = list(pd.unique(cancer_df.Gene))

    # ... and the genes passing the FDR cutoff form the query.
    sig_genes = list(pd.unique(cancer_df.loc[cancer_df.FDR < 0.05, "Gene"]))

    # NOTE(review): ordered=True is passed, but sig_genes keeps the row order of the
    # input table rather than a ranking by significance or effect size. The gprofiler
    # docs describe `ordered` as treating the query as a ranked list - confirm this
    # is intended.
    cancer_profile = gp.profile(
        organism='hsapiens',
        query=sig_genes,
        no_iea=True,
        sources=["KEGG"],
        ordered=True,
        no_evidences=False,
        background=background_genes,
    )
    all_profiles.append(cancer_profile)

In [None]:
cancer_types

In [None]:
# CCRCC: enrichment profile at position 0 of all_profiles (same order as cancer_types)
all_profiles[0]

In [None]:
# Endometrial: enrichment profile at position 1 of all_profiles (same order as cancer_types)
all_profiles[1]

In [None]:
# HNSCC: enrichment profile at position 2 of all_profiles (same order as cancer_types)
all_profiles[2]

In [None]:
# LSCC: enrichment profile at position 3 of all_profiles (same order as cancer_types)
all_profiles[3]

In [None]:
# LUAD: enrichment profile at position 4 of all_profiles (same order as cancer_types)
all_profiles[4]

# Exploration: Sustaining and Proliferating Hallmark Pathways

In [10]:
# These are the Sustaining and Proliferating Hallmark Pathways.
# The names are compared to the "name" column of the enrichment profiles by exact string
# equality, so spelling, capitalization and punctuation must match the enrichment output.
# NOTE(review): the enrichment call in this notebook requests only the KEGG source, while
# the title-cased entries towards the end of this list (e.g. 'Translation Factors',
# 'Statin Pathway') look like names from a different pathway database - confirm whether
# they can ever match.
sus_prof_pathways = [
    'ABC transporters',
    'Fatty acid degradation',
    'Metabolic pathways',
    'Fatty acid metabolism',
    'Pyruvate metabolism',
    'Valine, leucine and isoleucine degradation',
    'Butanoate metabolism',
    'Glycolysis / Gluconeogenesis',
    'Lysosome',
    'Peroxisome',
    'Biosynthesis of unsaturated fatty acids',
    'Thiamine metabolism',
    'Purine metabolism',
    'Glyoxylate and dicarboxylate metabolism',
    'Tyrosine metabolism',
    'Carbon metabolism',
    'Various types of N-glycan biosynthesis',
    'Lysine degradation',
    'Chemical carcinogenesis',
    'Arginine biosynthesis',
    'Cysteine and methionine metabolism',
    'Proteasome',
    'Drug metabolism - cytochrome P450',
    'Arginine and proline metabolism',
    'Synthesis and degradation of ketone bodies',
    'Citrate cycle (TCA cycle)',
    'Pantothenate and CoA biosynthesis',
    'Oxidative phosphorylation',
    'Fructose and mannose metabolism',
    'Biosynthesis of amino acids',
    'DNA replication',
    'D-Glutamine and D-glutamate metabolism',
    'Phenylalanine metabolism',
    'Translation Factors',
    'Mitochondrial LC-Fatty Acid Beta-Oxidation',
    'Fatty Acid Beta Oxidation',
    'Fatty Acid Biosynthesis',
    'Nuclear Receptors in Lipid Metabolism and Toxicity',
    'Fatty Acid Omega Oxidation',
    'Fluoroacetic acid toxicity',
    'Statin Pathway',
    'Pregnane X Receptor pathway',
    'Integrated Cancer Pathway',
    'G Protein Signaling Pathways',
    'Cytoplasmic Ribosomal Proteins',
    'Mitochondrial complex I assembly model OXPHOS system',
    'Nanoparticle triggered autophagic cell death',
    'Nanomaterial induced apoptosis',
    'Composition of Lipid Particles'
]

## Pathway searching

First we'll make one dataframe that has all the enriched pathways for all cancer types, adding a column that identifies which cancer type each pathway came from. Note that if one pathway is enriched in multiple cancer types, it will be in the table multiple times, once for each cancer type.

In [11]:
# Tag every per-cancer profile with the cancer type it came from, then stack them into
# one long table. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop re-copies the growing frame on every iteration, so the frames
# are collected first and concatenated once. Row labels of the individual profiles are
# kept as they are (no ignore_index), exactly as the append-based loop did.
tagged_profiles = [
    profile.assign(cancer_type=cancer_type)
    for profile, cancer_type in zip(all_profiles, cancer_types)
]
# pd.concat raises on an empty list; fall back to the empty frame the old loop produced.
all_profiles_df = pd.concat(tagged_profiles) if tagged_profiles else pd.DataFrame()

Now we'll write a function that will take a list of pathways we're interested in, and return a table showing which cancer types those pathways are enriched in.

In [12]:
def summarize_pathways(all_profiles, selected_pathways):
    """Group the selected pathways by the combination of cancer types they are enriched in.

    Parameters
    ----------
    all_profiles : pandas.DataFrame
        Long-format enrichment table with a "name" column (pathway name) and a
        "cancer_type" column. A pathway enriched in several cancer types appears
        once per cancer type.
    selected_pathways : list-like of str
        Pathway names to look up. Names that never occur in ``all_profiles["name"]``
        are omitted silently.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of cancer types, with the columns
        "cancers" (sorted list of cancer types), "pathways" (sorted list of the
        selected pathways enriched in exactly that combination) and "pathways_ct"
        (number of pathways in the row). Rows are ordered by the number of cancer
        types, largest first; ties are ordered by the "_"-joined cancer names.
    """
    # Pull out the profiles for our selected pathways
    selected_profiles = all_profiles[all_profiles["name"].isin(selected_pathways)]

    # pathway -> set of cancer types it is enriched in
    cancers_by_pathway = {}
    for pathway, cancer in zip(selected_profiles["name"], selected_profiles["cancer_type"]):
        cancers_by_pathway.setdefault(pathway, set()).add(cancer)

    # combination of cancer types -> pathways enriched in exactly that combination.
    # The combination is kept as a tuple instead of being joined with "_" and split
    # again later: that round trip broke any cancer label that itself contains "_".
    pathways_by_cancers = {}
    for pathway, cancers in cancers_by_pathway.items():
        pathways_by_cancers.setdefault(tuple(sorted(cancers)), []).append(pathway)

    # Largest combinations first; ties keep the previous ordering by "_"-joined names.
    # The tuple itself is the last key so the order stays deterministic even when two
    # different combinations happen to share the same joined string.
    ordered_combos = sorted(
        pathways_by_cancers,
        key=lambda combo: (-len(combo), "_".join(combo), combo),
    )

    return pd.DataFrame(
        {
            "cancers": [list(combo) for combo in ordered_combos],
            "pathways": [sorted(pathways_by_cancers[combo]) for combo in ordered_combos],
            "pathways_ct": [len(pathways_by_cancers[combo]) for combo in ordered_combos],
        },
        columns=["cancers", "pathways", "pathways_ct"],
    )

sus_prof_sum = summarize_pathways(all_profiles_df, sus_prof_pathways)
sus_prof_sum

Unnamed: 0,cancers,pathways,pathways_ct
0,"[CCRCC, Endometrial, LSCC, LUAD]",[Fatty acid metabolism],1
1,"[CCRCC, LSCC, LUAD]","[ABC transporters, Fatty acid degradation]",2
2,"[LSCC, LUAD]","[Glycolysis / Gluconeogenesis, Pyruvate metabo...",3
3,[CCRCC],[Biosynthesis of unsaturated fatty acids],1
4,[LSCC],[Peroxisome],1


In [13]:
sus_prof_sum.loc[0, "pathways"]

['Fatty acid metabolism']

This list was copied from the output of the cell above. The descriptions come from KEGG and other searches.

- ABC transporters
    - Use ATP hydrolysis for active transport of many substrates, including ions, sugars, lipids, sterols, peptides, proteins, and drugs.
- Arginine and proline metabolism
    - These reactions are bidirectional, so the pathway can adjust to the current needs of the cell
    - Metabolic needs shift to support metastasis or other activities [[28492237]](https://pubmed.ncbi.nlm.nih.gov/28492237/)
- Biosynthesis of unsaturated fatty acids
    - Important for cancer to change growth patterns [[31819192]](https://pubmed.ncbi.nlm.nih.gov/31819192/)
- Butanoate metabolism
    - Type of fatty acid metabolism
    - Involved in gut microbiome homeostasis?
- Carbon metabolism
- Citrate cycle (TCA cycle)
- Fatty acid degradation
- Fatty acid metabolism
- Glycolysis / Gluconeogenesis
- Glyoxylate and dicarboxylate metabolism
- Lysine degradation
- Lysosome
- Metabolic pathways
- Oxidative phosphorylation
- Pantothenate and CoA biosynthesis
- Peroxisome
- Purine metabolism
- Pyruvate metabolism
- Synthesis and degradation of ketone bodies
- Thiamine metabolism
- Tyrosine metabolism
- Valine, leucine and isoleucine degradation
- Various types of N-glycan biosynthesis

### Let's explore which pathways are found in which cancer types

Why are some pathways not showing up in any cancers in our table?

In [14]:
# Three views of the same lookup:
#   not_found   - selected pathways that are enriched in no cancer type
#   found       - pathway -> cancer types it is enriched in
#   cancers_map - cancer type -> selected pathways enriched in it
not_found = []
found = {}
cancers_map = {cancer: [] for cancer in cancer_types}

for pw in sus_prof_pathways:
    hits = all_profiles_df.loc[all_profiles_df["name"] == pw, "cancer_type"].tolist()
    if not hits:
        not_found.append(pw)
        continue

    found[pw] = hits
    for cancer in hits:
        cancers_map[cancer].append(pw)

In [15]:
not_found

['Metabolic pathways',
 'Butanoate metabolism',
 'Lysosome',
 'Thiamine metabolism',
 'Purine metabolism',
 'Glyoxylate and dicarboxylate metabolism',
 'Tyrosine metabolism',
 'Carbon metabolism',
 'Various types of N-glycan biosynthesis',
 'Lysine degradation',
 'Chemical carcinogenesis',
 'Arginine biosynthesis',
 'Cysteine and methionine metabolism',
 'Proteasome',
 'Drug metabolism - cytochrome P450',
 'Arginine and proline metabolism',
 'Synthesis and degradation of ketone bodies',
 'Citrate cycle (TCA cycle)',
 'Pantothenate and CoA biosynthesis',
 'Oxidative phosphorylation',
 'Fructose and mannose metabolism',
 'Biosynthesis of amino acids',
 'DNA replication',
 'D-Glutamine and D-glutamate metabolism',
 'Phenylalanine metabolism',
 'Translation Factors',
 'Mitochondrial LC-Fatty Acid Beta-Oxidation',
 'Fatty Acid Beta Oxidation',
 'Fatty Acid Biosynthesis',
 'Nuclear Receptors in Lipid Metabolism and Toxicity',
 'Fatty Acid Omega Oxidation',
 'Fluoroacetic acid toxicity',
 'St

In [16]:
def print_map(dmap):
    """Print a dict of lists as an indented outline.

    Keys are printed in sorted order, one per line; under each key its values are
    printed in sorted order, one per line, each prefixed with a tab.
    """
    for key, values in sorted(dmap.items(), key=lambda item: item[0]):
        print(key)
        for value in sorted(values):
            print(f"\t{value}")

In [17]:
print_map(found)

ABC transporters
	CCRCC
	LSCC
	LUAD
Biosynthesis of unsaturated fatty acids
	CCRCC
Fatty acid degradation
	CCRCC
	LSCC
	LUAD
Fatty acid metabolism
	CCRCC
	Endometrial
	LSCC
	LUAD
Glycolysis / Gluconeogenesis
	LSCC
	LUAD
Peroxisome
	LSCC
Pyruvate metabolism
	LSCC
	LUAD
Valine, leucine and isoleucine degradation
	LSCC
	LUAD


In [18]:
print_map(cancers_map)

CCRCC
	ABC transporters
	Biosynthesis of unsaturated fatty acids
	Fatty acid degradation
	Fatty acid metabolism
Endometrial
	Fatty acid metabolism
HNSCC
LSCC
	ABC transporters
	Fatty acid degradation
	Fatty acid metabolism
	Glycolysis / Gluconeogenesis
	Peroxisome
	Pyruvate metabolism
	Valine, leucine and isoleucine degradation
LUAD
	ABC transporters
	Fatty acid degradation
	Fatty acid metabolism
	Glycolysis / Gluconeogenesis
	Pyruvate metabolism
	Valine, leucine and isoleucine degradation
