# Run a Gene Set Enrichment Analysis using Reactome_2016

This notebook examines genes that are significant in at least one cancer. GSEA (run through gseapy's Enrichr interface) is used to identify pathways that contain multiple significant genes.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p

# Step 1: Run GSEA for significant genes in at least 1 cancer

First, read sig_pval_heatmap.csv into a DataFrame. This CSV file contains only genes with a significant p-value in at least one cancer. Then run GSEA on the list of unique genes taken from that DataFrame.

In [2]:
# Directory holding the Step 3 trans-effect CSV files. Forward slashes are accepted
# by Python on Windows as well as on macOS/Linux, so the path no longer depends on
# '\' separators. The leading '~' is still expanded to the user's home directory.
root = '~/Github/WhenMutationsDontMatter/PTEN/Step_3_trans_effect/csv'
df = pd.read_csv(root + '/sig_pval_heatmap.csv')

# Unique gene names from the Proteomics column (genes with a sig pval in >= 1 cancer).
prot_list = list(df.Proteomics.unique())

# Submit the gene list to Enrichr against the Reactome_2016 gene-set library.
# NOTE(review): outdir='/Enrichr' is an absolute path at the filesystem (or drive)
# root -- confirm this is intended rather than a relative 'Enrichr' folder.
prot_enr = gp.enrichr(gene_list=prot_list, description='Tumor_partition', gene_sets='Reactome_2016',
                      outdir='/Enrichr')

In [8]:
# Show the first 30 rows of the enrichment results table stored on `prot_enr`.
# NOTE(review): `prot_enr` is reassigned by the multiple-cancer enrichment cell further
# down; if cells are executed out of order this displays those results instead.
# Restart the kernel and run all cells to make sure the output matches this step.
prot_enr.res2d.head(30)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Gene Expression Homo sapiens R-HSA-74160,101/1631,1.775164e-24,2.7160009999999998e-21,0,0,3.013392,164.796815,EHMT1;EPRS;PWP2;EFTUD2;PSMD9;SNRPD2;KHSRP;TXNL...,Reactome_2016
1,Processing of Capped Intron-Containing Pre-mRN...,36/193,3.337592e-24,2.553258e-21,0,0,9.076813,490.663374,DDX5;DHX9;DDX23;USP39;PRPF8;ELAVL1;NUP160;EFTU...,Reactome_2016
2,"Cell Cycle, Mitotic Homo sapiens R-HSA-69278",47/462,1.0828829999999999e-19,5.5227020000000006e-17,0,0,4.950443,216.183318,TOP2A;AHCTF1;PCNA;MCM7;DCTN1;NCAPG;BUB1B;SMC4;...,Reactome_2016
3,Cell Cycle Homo sapiens R-HSA-1640170,48/566,7.248901e-17,2.772705e-14,0,0,4.126796,153.364511,TOP2A;AHCTF1;PCNA;MCM7;DCTN1;NCAPG;BUB1B;SMC4;...,Reactome_2016
4,mRNA Splicing - Major Pathway Homo sapiens R-H...,24/134,4.208022e-16,1.287655e-13,0,0,8.715546,308.568418,DDX5;CPSF1;NCBP1;DHX9;NCBP2;CPSF3;CPSF2;DDX23;...,Reactome_2016
5,DNA strand elongation Homo sapiens R-HSA-69190,14/32,6.472012e-16,1.650363e-13,0,0,21.289538,744.577619,RFC5;GINS2;RFC3;PCNA;RFC4;MCM7;RFC1;RFC2;GINS4...,Reactome_2016
6,mRNA Splicing Homo sapiens R-HSA-72172,24/144,2.305548e-15,5.039269e-13,0,0,8.1103,273.345159,DDX5;CPSF1;NCBP1;DHX9;NCBP2;CPSF3;CPSF2;DDX23;...,Reactome_2016
7,Transport of Mature Transcript to Cytoplasm Ho...,18/74,7.476537e-15,1.429888e-12,0,0,11.836654,385.010929,RANBP2;NCBP1;NUP210;NUP133;CPSF1;NUP155;NCBP2;...,Reactome_2016
8,Transport of Mature mRNA Derived from an Intro...,13/37,2.198546e-13,3.737527e-11,0,0,17.097389,498.317265,RANBP2;CPSF1;NCBP1;NUP210;NUP133;NUP155;NCBP2;...,Reactome_2016
9,Transport of Mature mRNAs Derived from Intronl...,13/38,3.280112e-13,5.018571e-11,0,0,16.647458,478.543314,RANBP2;CPSF1;NCBP1;NUP210;NUP133;NUP155;NCBP2;...,Reactome_2016


In [4]:
# Pull the pathway names and their member genes out of the enrichment table
# as two single-column DataFrames that share the table's index.
pathway = prot_enr.res2d[['Term']]
genes = prot_enr.res2d[['Genes']]

In [5]:
# Put each pathway term next to its gene list by joining the two frames on their index.
sig = pathway.join(genes, how='left')
sig

Unnamed: 0,Term,Genes
0,Processing of Capped Intron-Containing Pre-mRN...,NUP107;NUP188;EIF4A3;HNRNPU;EFTUD2;SNRPD2;SNRP...
1,Gene Expression Homo sapiens R-HSA-74160,TDRKH;RPL4;ATF2;MDC1;NUP107;HNRNPU;EHMT1;PHAX;...
2,mRNA Splicing - Major Pathway Homo sapiens R-H...,EIF4A3;HNRNPU;YBX1;PRPF19;USP39;ELAVL1;EFTUD2;...
3,mRNA Splicing Homo sapiens R-HSA-72172,EIF4A3;HNRNPU;YBX1;PRPF19;USP39;ELAVL1;EFTUD2;...
4,Infectious disease Homo sapiens R-HSA-5663205,RPL4;NUP107;NUP188;RPL10A;RPL9;PSMD8;RPS15;PSM...
...,...,...
1311,Signaling by GPCR Homo sapiens R-HSA-372790,CHRM1;RASGRF2;PIK3CD;PEBP1;ARRB1;WDR83;ARRB2;P...
1312,GPCR downstream signaling Homo sapiens R-HSA-3...,CHRM1;RASGRF2;PIK3CD;ARRB1;ARRB2;PIK3CG;ARHGAP...
1313,GPCR ligand binding Homo sapiens R-HSA-500792,P2RY12;CCR1;P2RY13;CHRM1;C3;HEBP1;XK;GNG2;GNG7...
1314,Class A/1 (Rhodopsin-like receptors) Homo sapi...,C3;CCR1;P2RY12;HEBP1;P2RY13;XK;CHRM1;CCRL2;C3AR1


# Step 2: Run GSEA for significant genes in multiple cancers

In [6]:
# Load the genes for this section from the same CSV directory used above.
# '/' is a valid path separator on Windows, macOS and Linux.
df = pd.read_csv(root + '/mult_sig_pval_heatmap.csv')

# Unique gene names from the Proteomics column (genes with a sig pval in multiple cancers).
mult_sig_list = list(df.Proteomics.unique())

# Submit the gene list to Enrichr against the Reactome_2016 gene-set library.
# NOTE: this reassigns `df` and `prot_enr`, so the results of the earlier enrichment
# are no longer reachable through those names after this cell runs.
prot_enr = gp.enrichr(gene_list=mult_sig_list, description='Tumor_partition', gene_sets='Reactome_2016',
                      outdir='/Enrichr')

In [7]:
# Preview the first rows of the enrichment results for the multiple-cancer gene list
# (`prot_enr` was reassigned in the preceding cell).
prot_enr.res2d.head()

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Gene Expression Homo sapiens R-HSA-74160,101/1631,1.775164e-24,2.7160009999999998e-21,0,0,3.013392,164.796815,EHMT1;EPRS;PWP2;EFTUD2;PSMD9;SNRPD2;KHSRP;TXNL...,Reactome_2016
1,Processing of Capped Intron-Containing Pre-mRN...,36/193,3.337592e-24,2.553258e-21,0,0,9.076813,490.663374,DDX5;DHX9;DDX23;USP39;PRPF8;ELAVL1;NUP160;EFTU...,Reactome_2016
2,"Cell Cycle, Mitotic Homo sapiens R-HSA-69278",47/462,1.0828829999999999e-19,5.5227020000000006e-17,0,0,4.950443,216.183318,TOP2A;AHCTF1;PCNA;MCM7;DCTN1;NCAPG;BUB1B;SMC4;...,Reactome_2016
3,Cell Cycle Homo sapiens R-HSA-1640170,48/566,7.248901e-17,2.772705e-14,0,0,4.126796,153.364511,TOP2A;AHCTF1;PCNA;MCM7;DCTN1;NCAPG;BUB1B;SMC4;...,Reactome_2016
4,mRNA Splicing - Major Pathway Homo sapiens R-H...,24/134,4.208022e-16,1.287655e-13,0,0,8.715546,308.568418,DDX5;CPSF1;NCBP1;DHX9;NCBP2;CPSF3;CPSF2;DDX23;...,Reactome_2016
