# Run a Gene Set Enrichment Analysis using NCI_Nature

This notebook examines genes that are significant in at least one cancer. GSEA is used to identify pathways that contain multiple significant genes.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p

# Step 1: Run GSEA for significant genes in at least 1 cancer

First, read sig_pval_heatmap.csv into a DataFrame. This CSV file contains only genes with a significant p-value in at least one cancer. Then run GSEA using the list of genes from the DataFrame.

In [2]:
# Load the table of genes that have a significant p-value in at least one cancer.
df = pd.read_csv('../../Make_Tables/csv/sig_pval_heatmap.csv')

# Unique gene names from the Proteomics column; the count is printed as a sanity check.
prot_list = list(df['Proteomics'].unique())
print(len(prot_list))

# Query Enrichr for pathway enrichment against the NCI-Nature 2016 library.
# Other libraries can be listed with gp.get_library_name(), e.g.
# 'Cancer_Cell_Line_Encyclopedia' or 'NCI-60_Cancer_Cell_Lines'.
# NOTE(review): outdir='/Enrichr' is an absolute path at the filesystem root --
# confirm this is intended rather than a relative 'Enrichr' directory.
prot_enr = gp.enrichr(
    gene_list=prot_list,
    description='Tumor_partition',
    gene_sets='NCI-Nature_2016',
    outdir='/Enrichr',
)

2630


In [3]:
# Preview the top rows of the enrichment results for genes significant in >= 1 cancer.
prot_enr.res2d.head()

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Regulation of RAC1 activity Homo sapiens 351aa...,17/38,2e-06,0.000384,0,0,3.402041,44.928225,DOCK6;ARHGAP9;RASGRF2;RAP1GDS1;VAV1;DEF6;EPS8;...,NCI-Nature_2016
1,Class I PI3K signaling events Homo sapiens 12b...,18/48,1.9e-05,0.001949,0,0,2.851711,31.053913,HSP90AA1;SYK;PLEKHA1;PDPK1;PIK3CD;PIK3R1;ADAP1...,NCI-Nature_2016
2,Fanconi anemia pathway Homo sapiens 6befb873-6...,17/47,5.4e-05,0.003782,0,0,2.750587,27.014064,FANCI;RFC5;RFC3;WDR48;RFC4;RFC2;RMI1;TOP3A;RPA...,NCI-Nature_2016
3,Signaling events mediated by TCPTP Homo sapien...,15/42,0.000174,0.009094,0,0,2.715915,23.509534,CSF1R;STAT1;PIK3CD;EIF2AK2;PIK3R1;EGFR;VEGFA;P...,NCI-Nature_2016
4,FAS (CD95) signaling pathway Homo sapiens 79cc...,14/38,0.000195,0.008151,0,0,2.801681,23.9333,SYK;CHUK;RFC1;PDPK1;CLTC;PIK3CD;PIK3R1;MAPK10;...,NCI-Nature_2016


# Step 2: Run GSEA for significant genes in multiple cancers

In [4]:
# Load the table of genes that are significant in multiple cancers.
# Note: this reuses (and overwrites) the name df from the earlier single-cancer load.
df = pd.read_csv('../../Make_Tables/csv/mult_sig_pval_heatmap.csv')

# Unique gene names from the Proteomics column (genes with a sig pval in multiple cancers).
mult_sig_list = list(df['Proteomics'].unique())

# Query Enrichr against the NCI-Nature 2016 library; overwrites the earlier prot_enr result.
prot_enr = gp.enrichr(
    gene_list=mult_sig_list,
    description='Tumor_partition',
    gene_sets='NCI-Nature_2016',
    outdir='/Enrichr',
)

In [5]:
# Show the first six enriched terms for genes significant in multiple cancers.
prot_enr.res2d.head(6)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,BCR signaling pathway Homo sapiens acbf44e2-61...,9/64,1e-06,0.000222,0,0,8.471386,116.542158,MAP4K1;PPP3CB;PTPRC;SYK;CHUK;PIK3CA;BLNK;BTK;P...,NCI-Nature_2016
1,Class I PI3K signaling events Homo sapiens 12b...,8/48,1e-06,0.000117,0,0,10.040161,137.556168,ZAP70;SYK;PIK3CA;BLNK;PLCG2;BTK;PIK3CD;CYTH1,NCI-Nature_2016
2,IL8- and CXCR2-mediated signaling events Homo ...,6/34,1.8e-05,0.001266,0,0,10.630758,116.04232,GNG2;PRKCB;RAC2;ELMO1;DOCK2;GNAI2,NCI-Nature_2016
3,ATR signaling pathway Homo sapiens 8991cbac-61...,6/39,4.1e-05,0.002148,0,0,9.267841,93.598766,RFC3;RFC4;MCM7;RFC2;TOPBP1;MCM2,NCI-Nature_2016
4,CXCR3-mediated signaling events Homo sapiens 3...,6/43,7.3e-05,0.003037,0,0,8.405716,80.10565,GNG2;PIK3CA;ITGB2;PIK3CD;ITGAL;GNAI2,NCI-Nature_2016
5,TCR signaling in naive CD4+ T cells Homo sapie...,7/64,9e-05,0.003134,0,0,6.588855,61.381742,MAP4K1;ZAP70;CD4;PTPRC;CHUK;PRKCB;WAS,NCI-Nature_2016


In [23]:
# Inspect one enriched term from the multiple-cancer NCI-Nature results.
# This relies on prot_enr still holding that run; a later cell reassigns prot_enr.
i = 2  # row of the results table to inspect
print(prot_enr.res2d['Term'][i])
test = prot_enr.res2d['Genes'][i]

# The overlapping genes are stored as a single ';'-separated string.
t_list = test.split(';')
print(f'{len(t_list)} genes')
t_list

IL8- and CXCR2-mediated signaling events Homo sapiens fe78e284-6193-11e5-8ac5-06603eb7f303
6 genes


['GNG2', 'PRKCB', 'RAC2', 'ELMO1', 'DOCK2', 'GNAI2']

In [6]:
# Hard-coded, comma-separated gene symbols used as input for the next enrichment run.
# NOTE(review): the origin of this gene list is not recorded in the notebook -- add provenance.
all_genes = "CD3E,CD3G,CD4,CD8A,LCK,ZAP70,GRAP2,VAV1,GRB2,FYB1,PLCG1,NFATC2,PPP3CC,RAC2,CD79A,SYK,BTK,CD5,PTPN6,LYN,PTPRC,PIK3CG,INPP5D,PRKCQ,BLNK,BLK,HLA-DMA,HLA-DMB,NFKB2,PRKCB,PLCG2,ELMO1,DOCK2,WAS"

In [7]:
# Split the comma-separated string into a list of gene symbols.
# Note: this replaces the prot_list built earlier from sig_pval_heatmap.csv.
prot_list = all_genes.split(',')

In [8]:
# Run Enrichr on the hard-coded gene list, this time against the Reactome 2016
# library (the earlier runs used 'NCI-Nature_2016'). Overwrites prot_enr.
prot_enr = gp.enrichr(
    gene_list=prot_list,
    description='Tumor_partition',
    gene_sets='Reactome_2016',
    outdir='/Enrichr',
)

In [9]:
# Show the first six enriched Reactome terms for the hard-coded gene list.
prot_enr.res2d.head(6)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Adaptive Immune System Homo sapiens R-HSA-1280218,25/762,8.60009e-29,1.315814e-25,0,0,19.299058,1247.166803,BLK;WAS;CD3G;CD3E;CD79A;HLA-DMA;HLA-DMB;INPP5D...,Reactome_2016
1,Immune System Homo sapiens R-HSA-168256,28/1547,5.10057e-26,3.901936e-23,0,0,10.646793,620.046422,BLK;WAS;CD3G;CD3E;CD79A;HLA-DMA;HLA-DMB;INPP5D...,Reactome_2016
2,Antigen activates B Cell Receptor (BCR) leadin...,11/47,9.365155e-22,4.776229e-19,0,0,137.67209,6666.065553,LYN;BLK;CD79A;SYK;BTK;BLNK;PLCG2;GRB2;PTPN6;PL...,Reactome_2016
3,Signaling by the B Cell Receptor (BCR) Homo sa...,13/233,3.8853160000000003e-17,1.486134e-14,0,0,32.819995,1240.160687,LYN;BLK;SYK;PRKCB;VAV1;CD79A;LCK;BTK;BLNK;PLCG...,Reactome_2016
4,TCR signaling Homo sapiens R-HSA-202403,11/118,4.7745900000000004e-17,1.461025e-14,0,0,54.835494,2060.752855,ZAP70;CD4;PTPRC;LCK;INPP5D;GRAP2;WAS;PRKCQ;CD3...,Reactome_2016
5,GPVI-mediated activation cascade Homo sapiens ...,9/53,1.569577e-16,4.002423e-14,0,0,99.889012,3635.016596,LYN;SYK;LCK;RAC2;PLCG2;GRB2;PTPN6;VAV1;PIK3CG,Reactome_2016


In [11]:
# Pull out the genes overlapping the term at row 4 of the Reactome results
# (printed below so the selected term is visible alongside its genes).
i = 4  # row of the results table to inspect
print(prot_enr.res2d['Term'][i])
t_cell = prot_enr.res2d['Genes'][i]

# The overlapping genes are stored as a single ';'-separated string.
t_cell_list = t_cell.split(';')
print(f'{len(t_cell_list)} genes')
t_cell_list

TCR signaling Homo sapiens R-HSA-202403
11 genes


['ZAP70',
 'CD4',
 'PTPRC',
 'LCK',
 'INPP5D',
 'GRAP2',
 'WAS',
 'PRKCQ',
 'CD3G',
 'PLCG1',
 'CD3E']

In [12]:
# Pull out the genes overlapping the term at row 3 of the Reactome results
# (printed below so the selected term is visible alongside its genes).
i = 3  # row of the results table to inspect
print(prot_enr.res2d['Term'][i])
b_cell = prot_enr.res2d['Genes'][i]

# The overlapping genes are stored as a single ';'-separated string.
b_cell_list = b_cell.split(';')
print(f'{len(b_cell_list)} genes')
b_cell_list

Signaling by the B Cell Receptor (BCR) Homo sapiens R-HSA-983705
13 genes


['LYN',
 'BLK',
 'SYK',
 'PRKCB',
 'VAV1',
 'CD79A',
 'LCK',
 'BTK',
 'BLNK',
 'PLCG2',
 'GRB2',
 'PTPN6',
 'PLCG1']

In [13]:
# Genes that appear in both t_cell_list and b_cell_list, kept in t_cell_list order.
# A set gives constant-time membership checks instead of rescanning b_cell_list
# for every gene.
b_cell_genes = set(b_cell_list)
both = [gene for gene in t_cell_list if gene in b_cell_genes]

In [14]:
# Display the genes shared by the two pathway gene lists.
both

['LCK', 'PLCG1']

# 