# Create Heatmaps for significant ATR Signaling genes

Pancancer heatmaps are created with circle size showing significance and color showing differences in median.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p

# Step 1: Run GSEA for significant genes in at least 1 cancer

First read sig_pval_heatmap.csv into a df. This csv file contains only genes with a significant p-value in at least one cancer. Then run an Enrichr gene set enrichment analysis (via gseapy) using the list of genes from the df.

In [2]:
# Folder holding the csv files used throughout this notebook.
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'
sig_df = pd.read_csv(root + R'\sig_pval_heatmap.csv')

# Gene names from the Proteomics column (sig pval in >= 1 cancer), submitted
# to Enrichr against the NCI-Nature_2016 gene-set library.
prot_list = sig_df['Proteomics'].tolist()
prot_enr = gp.enrichr(
    gene_list=prot_list,
    description='Tumor_partition',
    gene_sets='NCI-Nature_2016',
    outdir='/Enrichr',
)

In [47]:
# Show up to the first 30 rows of the Enrichr results table.
top_pathways = prot_enr.res2d.head(30)
top_pathways

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Regulation of RAC1 activity Homo sapiens 351aa...,17/38,2e-06,0.000493,0,0,3.342312,43.306082,DOCK6;ARHGAP9;RASGRF2;RAP1GDS1;VAV1;DEF6;EPS8;...,NCI-Nature_2016
1,Class I PI3K signaling events Homo sapiens 12b...,18/48,2.4e-05,0.002489,0,0,2.801644,29.823357,HSP90AA1;SYK;PLEKHA1;PDPK1;PTEN;PIK3CD;PIK3R1;...,NCI-Nature_2016
2,E2F transcription factor network Homo sapiens ...,22/72,0.000119,0.008305,0,0,2.282821,20.624449,PRMT5;RANBP1;RRM1;CDKN1B;CBX5;APAF1;HDAC1;TFE3...,NCI-Nature_2016
3,Signaling events mediated by TCPTP Homo sapien...,15/42,0.000213,0.011105,0,0,2.668232,22.563751,CSF1R;STAT1;PIK3CD;EIF2AK2;PIK3R1;EGFR;VEGFA;P...,NCI-Nature_2016
4,FAS (CD95) signaling pathway Homo sapiens 79cc...,14/38,0.000236,0.009853,0,0,2.752492,22.991088,SYK;CHUK;RFC1;PDPK1;CLTC;PIK3CD;PIK3R1;MAPK10;...,NCI-Nature_2016
5,Fanconi anemia pathway Homo sapiens 6befb873-6...,16/47,0.00025,0.008721,0,0,2.543336,21.090858,FANCI;RFC5;RFC3;WDR48;RFC4;RFC2;RMI1;TOP3A;RPA...,NCI-Nature_2016
6,IL8- and CXCR2-mediated signaling events Homo ...,13/34,0.000256,0.007629,0,0,2.856578,23.630412,PRKCG;PDPK1;PRKCB;ARRB1;ARRB2;PIK3CG;GNAI2;HCK...,NCI-Nature_2016
7,Netrin-mediated signaling events Homo sapiens ...,12/30,0.000268,0.007008,0,0,2.98842,24.575391,PAK1;MAP2K1;MYO10;PIK3CA;UNC5B;MAP1B;CAMK2A;EL...,NCI-Nature_2016
8,ATR signaling pathway Homo sapiens 8991cbac-61...,14/39,0.000323,0.007496,0,0,2.681915,21.558545,RFC5;RFC3;TIPIN;RFC4;MCM7;RFC2;PLK1;RPA1;FANCD...,NCI-Nature_2016
9,BCR signaling pathway Homo sapiens acbf44e2-61...,19/64,0.000508,0.010628,0,0,2.217968,16.821174,MAP4K1;MAP2K1;SYK;CHUK;PDPK1;PTEN;PIK3R1;POU2F...,NCI-Nature_2016


# Step 2: Get the list of significant genes 

In [48]:
# Locate the pathway row by its Term name instead of a hard-coded row number:
# the position of a term in the Enrichr table changes whenever the input gene
# list or the gene-set library changes, and a stale number would silently
# select a different pathway.
pathway_name = 'ATR signaling pathway'
enr_res = prot_enr.res2d
is_pathway = enr_res['Term'].str.startswith(pathway_name, na=False)
hits = enr_res[is_pathway].index
if len(hits) != 1:
    raise ValueError(
        'expected exactly one term starting with {!r}, found {}'.format(
            pathway_name, len(hits)
        )
    )
i = hits[0]  # index label of the pathway row (8 in the saved output)

# Genes holds the overlapping input genes as one ';'-separated string.
trans = enr_res.Genes[i]
genes = trans.split(';')
print(enr_res.Term[i])
print('total genes:', len(genes))

ATR signaling pathway Homo sapiens 8991cbac-618b-11e5-8ac5-06603eb7f303
total genes: 14


# Step 3: Create HeatMap

Slice out the genes in the ATR signaling pathway from the df of genes significant in at least one cancer.

In [49]:
# Rows of sig_df (genes sig in >= 1 cancer) whose gene is in the pathway list.
bool_df = sig_df['Proteomics'].isin(genes)
plot_df = sig_df.loc[bool_df]
plot_df['Proteomics'].nunique()  # number of distinct pathway genes kept

14

In [50]:
# Keep only rows whose p-value is at or below the significance cutoff a.
a = 0.05
plot_df = plot_df[plot_df.P_Value.le(a)]

In [51]:
# Circle variable is the p-value, color variable is the difference in median;
# genes run along the x axis and cancers along the y axis.
p.plotCircleHeatMap(
    plot_df,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=600,
)

# Step 4: Create a HeatMap with both +/- differences in median

Read in pos_neg_df.csv to create a df containing only genes that have both a positive and a negative difference in median across different cancers. Then slice out the pathway genes from that df, using the list of genes in the pathway that have a significant p-value.

In [52]:
# Genes that have both a positive and a negative difference in median.
pos_neg_path = root + R'\pos_neg_df.csv'
pos_neg_df = pd.read_csv(pos_neg_path)

In [53]:
# Mask is True for rows of pos_neg_df whose gene is in the pathway gene list.
get = pos_neg_df['Proteomics'].isin(genes)
genes_pn = pos_neg_df.loc[get]  # pathway genes with both pos and neg differences
genes_pn['Proteomics'].unique()

array(['RFC5', 'MCM2', 'MCM7', 'RFC2', 'TIPIN', 'RFC4', 'TOPBP1', 'CHEK1',
       'PLK1'], dtype=object)

In [54]:
# Keep only rows whose p-value is at or below the cutoff a set in an earlier cell.
genes_pn = genes_pn[genes_pn.P_Value.le(a)]

In [55]:
# Heatmap restricted to pathway genes with both pos and neg differences in median.
p.plotCircleHeatMap(
    genes_pn,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=300,
    plot_width=500,
)

# Sig in multiple cancers

In [32]:
mult_sig_df = pd.read_csv(root + R'\mult_sig_pval_heatmap.csv')

# Distinct genes with a sig pval in > 1 cancer, submitted to Enrichr against
# the NCI-Nature_2016 gene-set library.
mult_sig_list = mult_sig_df['Proteomics'].unique().tolist()
enr2 = gp.enrichr(
    gene_list=mult_sig_list,
    description='Tumor_partition',
    gene_sets='NCI-Nature_2016',
    outdir='/Enrichr',
)

In [33]:
# Show the ATR signaling pathway row, selected by Term name rather than by row
# position: the position shifts whenever the Enrichr ranking changes, so a
# fixed position could silently display a different pathway.
enr2.res2d[enr2.res2d['Term'].str.startswith('ATR signaling pathway', na=False)]

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
4,ATR signaling pathway Homo sapiens 8991cbac-61...,6/39,5.2e-05,0.002162,0,0,8.892841,87.768209,RFC3;RFC4;MCM7;RFC2;TOPBP1;MCM2,NCI-Nature_2016


In [34]:
# Locate the pathway row by its Term name instead of a hard-coded row number:
# the position of a term in the Enrichr table changes whenever the input gene
# list or the gene-set library changes, and a stale number would silently
# select a different pathway.
pathway_name = 'ATR signaling pathway'
enr_res = enr2.res2d
is_pathway = enr_res['Term'].str.startswith(pathway_name, na=False)
hits = enr_res[is_pathway].index
if len(hits) != 1:
    raise ValueError(
        'expected exactly one term starting with {!r}, found {}'.format(
            pathway_name, len(hits)
        )
    )
i = hits[0]  # index label of the pathway row (4 in the saved output)

# Genes holds the overlapping input genes as one ';'-separated string.
trans2 = enr_res.Genes[i]
genes_mult = trans2.split(';')
print(enr_res.Term[i])
print('total genes:', len(genes_mult))

ATR signaling pathway Homo sapiens 8991cbac-618b-11e5-8ac5-06603eb7f303
total genes: 6


In [39]:
# Rows of mult_sig_df whose gene is in the multi-cancer pathway gene list.
bool_df = mult_sig_df['Proteomics'].isin(genes_mult)
plot_df2 = mult_sig_df.loc[bool_df]
plot_df2['Proteomics'].unique()

array(['MCM2', 'MCM7', 'RFC3', 'RFC2', 'RFC4', 'TOPBP1'], dtype=object)

In [40]:
# Keep only rows whose p-value is at or below the cutoff a (0.05, set in an
# earlier cell). Reusing a instead of repeating the literal keeps every heatmap
# in this notebook on the same significance threshold.
plot_df2 = plot_df2.loc[plot_df2['P_Value'] <= a]

In [41]:
# Heatmap of the pathway genes that are significant in multiple cancers.
p.plotCircleHeatMap(
    plot_df2,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=300,
    plot_width=500,
)

In [43]:
# Mask is True for rows of pos_neg_df whose gene is in the multi-cancer pathway list.
get = pos_neg_df['Proteomics'].isin(genes_mult)
genes_pn_mult = pos_neg_df.loc[get]  # multi-cancer pathway genes with pos and neg
genes_pn_mult['Proteomics'].unique()

array(['MCM2', 'MCM7', 'RFC2', 'RFC4', 'TOPBP1'], dtype=object)

In [44]:
# Keep only rows whose p-value is at or below the cutoff a (0.05, set in an
# earlier cell). Reusing a instead of repeating the literal keeps every heatmap
# in this notebook on the same significance threshold.
genes_pn_mult = genes_pn_mult.loc[genes_pn_mult['P_Value'] <= a]

In [45]:
# Heatmap of multi-cancer pathway genes with both pos and neg differences in median.
p.plotCircleHeatMap(
    genes_pn_mult,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=300,
    plot_width=400,
)

# Check other pathways the genes (+/- and sig in mult. cancers) are part of

In [21]:
# Distinct genes left in genes_pn_mult, re-submitted to Enrichr to see which
# other NCI-Nature_2016 pathways they belong to.
gnm = genes_pn_mult['Proteomics'].unique().tolist()
print(gnm)
enr3 = gp.enrichr(
    gene_list=gnm,
    description='Tumor_partition',
    gene_sets='NCI-Nature_2016',
    outdir='/Enrichr',
)

['MCM2', 'MCM7', 'RFC2', 'TOPBP1']


In [22]:
# First five rows of the Enrichr results for the genes in gnm.
enr3.res2d.head(n=5)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,ATR signaling pathway Homo sapiens 8991cbac-61...,4/39,1.233764e-11,2.578567e-09,0,0,512.820513,12881.213531,MCM7;RFC2;TOPBP1;MCM2,NCI-Nature_2016
1,Fanconi anemia pathway Homo sapiens 6befb873-6...,2/47,3.233274e-05,0.003378772,0,0,212.765957,2199.87874,RFC2;TOPBP1,NCI-Nature_2016
2,BARD1 signaling events Homo sapiens 75b04491-6...,1/29,0.005787756,0.4032137,0,0,172.413793,888.277681,TOPBP1,NCI-Nature_2016
3,E2F transcription factor network Homo sapiens ...,1/72,0.01432337,0.748396,0,0,69.444444,294.851589,TOPBP1,NCI-Nature_2016


In [23]:
# Inspect the rows of all_heatmap.csv (p-value and median per cancer) for one gene.
certain_gene = 'SYK'
all_df = pd.read_csv(root + R'\all_heatmap.csv')
gene_df = all_df[all_df.Proteomics.eq(certain_gene)]
gene_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
497,SYK,0.00687,-0.69755,Gbm
11509,SYK,0.03688,-0.225323,Hnscc
21864,SYK,0.230027,-0.489,Luad
34578,SYK,0.999914,0.2223,Lscc
48048,SYK,0.981705,0.0204,Brca
48872,SYK,0.897563,0.405618,Ov
61969,SYK,0.335351,0.079,En
76133,SYK,0.98191,0.0347,Colon
