# Create Heat Map for significant interacting proteins

Pancancer heat maps are created with circle size showing significance and color showing differences in median.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p

# Step 1: Run GSEA for significant genes in at least 1 cancer

First read sig_pval_heatmap.csv into a df. This csv file contains only genes with a significant p-value in at least one cancer. Then run an Enrichr enrichment analysis (through gseapy) using the list of genes from the df.

In [2]:
# Folder holding the CSV files read throughout this notebook.
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'
sig_df = pd.read_csv(root+R'\sig_pval_heatmap.csv')

# Submit each gene once: the Proteomics column can repeat a gene (presumably
# one row per cancer -- confirm against the CSV), and the multi-cancer list
# built later in this notebook is likewise de-duplicated with .unique().
prot_list = list(sig_df.Proteomics.unique()) # genes with a sig pval in >= 1 cancer
prot_enr = gp.enrichr(gene_list = prot_list, description='Tumor_partition', gene_sets='Reactome_2016', 
                       outdir='/Enrichr')

In [3]:
# Preview the first five rows of the enrichment results table
prot_enr.res2d.head(n=5)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Processing of Capped Intron-Containing Pre-mRN...,116/193,1.379053e-53,2.1099519999999998e-50,0,0,4.603878,560.363828,NUP107;NUP188;EIF4A3;HNRNPU;EFTUD2;SNRPD2;SNRP...,Reactome_2016
1,Gene Expression Homo sapiens R-HSA-74160,434/1631,1.734808e-53,1.327128e-50,0,0,2.038257,247.619892,TDRKH;RPL4;ATF2;MDC1;NUP107;HNRNPU;EHMT1;PHAX;...,Reactome_2016
2,mRNA Splicing - Major Pathway Homo sapiens R-H...,85/134,3.384052e-42,1.7258660000000002e-39,0,0,4.858892,463.973174,EIF4A3;HNRNPU;YBX1;PRPF19;USP39;ELAVL1;EFTUD2;...,Reactome_2016
3,mRNA Splicing Homo sapiens R-HSA-72172,85/144,1.0813669999999998e-38,4.13623e-36,0,0,4.521469,395.266853,EIF4A3;HNRNPU;YBX1;PRPF19;USP39;ELAVL1;EFTUD2;...,Reactome_2016
4,Infectious disease Homo sapiens R-HSA-5663205,123/348,8.726101e-27,2.670187e-24,0,0,2.707379,162.452135,RPL4;NUP107;NUP188;RPL10A;RPL9;PSMD8;RPS15;PSM...,Reactome_2016


# Step 2: Get the list of significant genes 

In [4]:
# Take the 'Genes' entry of enrichment row 1; it is a single ';'-separated
# string, so split it into a list of gene names.
trans = prot_enr.res2d.loc[1, 'Genes']
genes = trans.split(';')
print('total genes:', len(genes))

total genes: 434


# Step 3: Create HeatMap

Slice out the genes from the selected pathway (Gene Expression, row 1 of the enrichment results above) from the df of genes significant in at least 1 cancer.

In [5]:
# Keep the rows of sig_df (genes sig in >= 1 cancer) whose gene is in the
# pathway gene list, then count the distinct genes that remain.
bool_df = sig_df['Proteomics'].isin(genes)
plot_df = sig_df.loc[bool_df]
len(plot_df['Proteomics'].unique())

434

In [6]:
# Keep only rows whose p-value is at or below the cutoff `a`
a = 0.01
plot_df = plot_df[plot_df['P_Value'].le(a)]

In [7]:
# Heat map of the filtered pathway genes: proteins on x, cancers on y
p.plotCircleHeatMap(
    plot_df,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=500,
    plot_width=4500,
)

# Step 4: Create a HeatMap with both +/- differences in median

 Read in the pos_neg_df.csv to create a df with only genes that have a pos and neg difference in median in different cancers. Slice out the genes that have a pos and neg difference in median in the pathway using the list of genes with a significant p-value in the pathway. 

In [8]:
# Load the table of genes with both a positive and a negative difference in median
pos_neg_path = root + R'\pos_neg_df.csv'
pos_neg_df = pd.read_csv(pos_neg_path)

In [9]:
# Row mask over pos_neg_df: True where the gene is in the pathway gene list
get = pos_neg_df['Proteomics'].isin(genes)
# Pathway genes that also appear in the pos/neg table
genes_pn = pos_neg_df.loc[get]
genes_pn['Proteomics'].unique()

array(['MAPK3', 'USP39', 'RFC5', 'HNRNPD', 'ZKSCAN1', 'NUP37', 'PUS7',
       'YBX1', 'NUP153', 'RPS15A', 'EZH2', 'SSRP1', 'DDX39A', 'TEAD1',
       'PRMT1', 'CD2BP2', 'SESN1', 'ADAR', 'PARP1', 'RFC2', 'ZC3H8',
       'DEK', 'WDR75', 'BYSL', 'RFC4', 'RPN1', 'XPO5', 'TOPBP1', 'RPL39',
       'CHEK1', 'EXOSC8', 'ZNF776', 'HEATR1', 'HNRNPA1', 'FANCI', 'NFIA',
       'NR2F1', 'FUS', 'RPL9', 'GTF2E2', 'ZNF655', 'RPS24', 'PPP2R2A',
       'NAT10', 'HDAC1', 'MSH2', 'DCAF13', 'DDOST', 'TPX2', 'AKT2',
       'RPS26', 'WDR46', 'IGF2BP3', 'ATF2', 'NFIC', 'NUAK1', 'IMP3',
       'EXOSC9', 'E2F5', 'NOB1', 'RARA', 'ELL', 'GSR', 'BOP1', 'MDC1',
       'TRMT6', 'DKC1', 'G6PD', 'MPHOSPH10', 'DDX47', 'SP1', 'HNRNPA3',
       'UTP6', 'NUP210', 'CDKAL1', 'TOP3A', 'TFB2M', 'UTP3', 'RPL4',
       'DDX52', 'TXNRD1', 'TDRKH', 'PSMB8', 'DARS2', 'FAS', 'TXN',
       'GTPBP3', 'TP53', 'PSMB9', 'IGF2BP2', 'SUV39H1'], dtype=object)

In [10]:
# Keep only rows whose p-value is at or below the cutoff `a`
genes_pn = genes_pn[genes_pn['P_Value'].le(a)]

In [11]:
# Heat map of the pathway genes taken from the pos/neg table
p.plotCircleHeatMap(
    genes_pn,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=1500,
)

# Sig in multiple cancers

In [12]:
mult_sig_df = pd.read_csv(root + R'\mult_sig_pval_heatmap.csv')

# Distinct genes with a sig pval in > 1 cancer, submitted to Enrichr
mult_sig_list = mult_sig_df['Proteomics'].unique().tolist()
enr2 = gp.enrichr(
    gene_list=mult_sig_list,
    description='Tumor_partition',
    gene_sets='Reactome_2016',
    outdir='/Enrichr',
)

In [13]:
# Preview the first five rows of the multi-cancer enrichment results
enr2.res2d.head(n=5)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Gene Expression Homo sapiens R-HSA-74160,74/1631,2.0810670000000002e-17,3.184033e-14,0,0,2.917745,112.073703,WDR3;WDR4;PTEN;HNRNPU;NAT10;ZC3H8;ADAR;PPP2R2A...,Reactome_2016
1,"Cell Cycle, Mitotic Homo sapiens R-HSA-69278",37/462,2.789117e-16,2.133675e-13,0,0,5.150262,184.459924,TOP2A;AHCTF1;PCNA;MCM7;DCTN1;BUB1B;PPP2R2A;ORC...,Reactome_2016
2,Cell Cycle Homo sapiens R-HSA-1640170,38/566,3.204908e-14,1.634503e-11,0,0,4.317544,134.152603,TOP2A;AHCTF1;PCNA;MCM7;DCTN1;BUB1B;PPP2R2A;ORC...,Reactome_2016
3,Processing of Capped Intron-Containing Pre-mRN...,23/193,3.85826e-14,1.475784e-11,0,0,7.663729,236.70174,RANBP2;DDX5;NUP210;CPSF1;NUP155;NCBP1;NUP133;D...,Reactome_2016
4,DNA strand elongation Homo sapiens R-HSA-69190,11/32,1.039399e-12,3.180562e-10,0,0,22.106109,609.960125,GINS2;RFC3;PCNA;RFC4;MCM7;RFC2;GINS4;MCM3;MCM4...,Reactome_2016


In [14]:
# Take the 'Genes' entry of enrichment row 0 and split the ';'-separated
# string into a list of gene names.
trans2 = enr2.res2d.loc[0, 'Genes']
genes_mult = trans2.split(';')
print('total genes:', len(genes_mult))

total genes: 74


In [15]:
# Keep the rows of mult_sig_df whose gene is in the pathway gene list,
# then count the distinct genes that remain.
bool_df = mult_sig_df['Proteomics'].isin(genes_mult)
plot_df2 = mult_sig_df.loc[bool_df]
len(plot_df2['Proteomics'].unique())

74

In [16]:
# Keep only rows whose p-value is at or below the cutoff `a`
plot_df2 = plot_df2[plot_df2['P_Value'].le(a)]

In [17]:
# Heat map of the pathway genes that are significant in multiple cancers
p.plotCircleHeatMap(
    plot_df2,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=1500,
)

In [30]:
# Row mask over pos_neg_df: True where the gene is in the multi-cancer pathway list
get = pos_neg_df['Proteomics'].isin(genes_mult)
# Multi-cancer pathway genes that also appear in the pos/neg table
genes_pn_mult = pos_neg_df.loc[get]
genes_pn_mult['Proteomics'].unique()

array(['USP39', 'PUS7', 'SSRP1', 'DDX39A', 'ADAR', 'PARP1', 'RFC2',
       'ZC3H8', 'WDR75', 'BYSL', 'RFC4', 'XPO5', 'TOPBP1', 'HEATR1',
       'PPP2R2A', 'NAT10', 'MSH2', 'DCAF13', 'TPX2', 'WDR46', 'IMP3',
       'EXOSC9', 'GSR', 'MPHOSPH10', 'UTP6', 'NUP210', 'TFB2M', 'DARS2'],
      dtype=object)

In [31]:
# Keep only rows whose p-value is at or below the cutoff.
# Note: this rebinds the notebook-level cutoff `a` to 0.05.
a = 0.05
genes_pn_mult = genes_pn_mult[genes_pn_mult['P_Value'].le(a)]

In [32]:
# Heat map of the multi-cancer pathway genes taken from the pos/neg table
p.plotCircleHeatMap(
    genes_pn_mult,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=850,
)

In [40]:
# Sanity check: list every row for FAS in all_heatmap.csv.
# The frame is named `luad`, but the rows it returns span several cancers.
luad = pd.read_csv(root + R'\all_heatmap.csv')
luad[luad['Proteomics'].eq('FAS')]

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
9004,FAS,0.627584,-0.237472,Gbm
11112,FAS,0.000117,-0.513435,Hnscc
22668,FAS,0.00353,-0.8015,Luad
33678,FAS,0.058031,-0.69585,Lscc
52737,FAS,0.76446,0.17595,Brca
64407,FAS,0.907713,-0.059653,Ov
65817,FAS,0.002754,0.579,En
77642,FAS,0.187303,-0.207,Colon


# Check other pathways the genes (+/- and sig in mult. cancers) are part of

In [22]:
# Distinct genes left in genes_pn_mult, submitted to Enrichr
gnm = genes_pn_mult['Proteomics'].unique().tolist()
enr3 = gp.enrichr(
    gene_list=gnm,
    description='Tumor_partition',
    gene_sets='Reactome_2016',
    outdir='/Enrichr',
)

In [23]:
# Preview the first five rows of the third enrichment run
enr3.res2d.head(n=5)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Gene Expression Homo sapiens R-HSA-74160,28/1631,2.665841e-31,4.078736e-28,0,0,12.262416,863.269386,HEATR1;NAT10;ZC3H8;PPP2R2A;ADAR;USP39;WDR46;EX...,Reactome_2016
1,rRNA modification in the nucleus Homo sapiens ...,8/58,8.982745e-15,6.8718e-12,0,0,98.522167,3186.548859,UTP6;IMP3;HEATR1;WDR75;NAT10;MPHOSPH10;WDR46;D...,Reactome_2016
2,rRNA processing Homo sapiens R-HSA-72312,10/180,3.090586e-14,1.576199e-11,0,0,39.68254,1234.437725,UTP6;IMP3;EXOSC9;HEATR1;WDR75;NAT10;MPHOSPH10;...,Reactome_2016
3,Major pathway of rRNA processing in the nucleo...,9/166,9.062684e-13,3.466476e-10,0,0,38.726334,1073.859589,UTP6;IMP3;EXOSC9;HEATR1;WDR75;MPHOSPH10;WDR46;...,Reactome_2016
4,Regulation of TP53 Activity through Phosphoryl...,5/89,1.411826e-07,4.320187e-05,0,0,40.128411,632.953924,TPX2;RFC4;RFC2;SSRP1;TOPBP1,Reactome_2016


Regulation of TP53 Activity through Phosphorylation

In [33]:
# Row of the enr3 results to inspect: print its term, then split its
# ';'-separated 'Genes' string into a list.
i = 4
print(enr3.res2d.loc[i, 'Term'])
tp53 = enr3.res2d.loc[i, 'Genes']
tp53_list = tp53.split(';')
tp53_list

Regulation of TP53 Activity through Phosphorylation Homo sapiens R-HSA-6804756


['TPX2', 'RFC4', 'RFC2', 'SSRP1', 'TOPBP1']

In [34]:
# Rows of genes_pn_mult whose gene belongs to the selected pathway
in_tp53 = genes_pn_mult['Proteomics'].isin(tp53_list)
tp53_df = genes_pn_mult.loc[in_tp53]

In [36]:
# Only include p-values at or below the cutoff.
a = 0.01
# genes_pn_mult is still narrowed here because later cells slice from it.
genes_pn_mult = genes_pn_mult.loc[genes_pn_mult['P_Value'] <= a]
# tp53_df was sliced before this cell, so narrowing genes_pn_mult alone does
# not change it; apply the cutoff to the frame that is plotted next.
tp53_df = tp53_df.loc[tp53_df['P_Value'] <= a]

In [37]:
# Heat map restricted to the genes of the selected TP53-related pathway
p.plotCircleHeatMap(
    tp53_df,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=500,
)

In [46]:
# Row of the enr3 results to inspect: print its term, then split its
# ';'-separated 'Genes' string into a list.
i = 2
print(enr3.res2d.loc[i, 'Term'])
proc = enr3.res2d.loc[i, 'Genes']
proc_list = proc.split(';')
proc_list

rRNA processing Homo sapiens R-HSA-72312


['UTP6',
 'IMP3',
 'EXOSC9',
 'HEATR1',
 'WDR75',
 'NAT10',
 'MPHOSPH10',
 'WDR46',
 'DCAF13',
 'BYSL']

In [47]:
# Rows of genes_pn_mult whose gene belongs to the selected pathway
in_proc = genes_pn_mult['Proteomics'].isin(proc_list)
proc_df = genes_pn_mult.loc[in_proc]

In [50]:
# Only include p-values at or below the cutoff.
a = 0.01
# Kept so genes_pn_mult ends this cell in the same state as before.
genes_pn_mult = genes_pn_mult.loc[genes_pn_mult['P_Value'] <= a]
# proc_df was sliced before this cell, so narrowing genes_pn_mult alone does
# not change it; apply the cutoff to the frame that is plotted next.
proc_df = proc_df.loc[proc_df['P_Value'] <= a]

In [51]:
# Heat map restricted to the genes of the selected rRNA processing pathway
p.plotCircleHeatMap(
    proc_df,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=500,
)