# Create Combined Heatmap for significant Complexes

This notebook looks at genes that are significant in at least one cancer. Pancancer heat maps are created, with circle size showing significance and color showing the difference in median.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p

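First read mult_sig_pval_heatmap.csv into a df (`sig_df`). This csv file contains only genes with a significant p-value in at least one cancer. Then get the list of genes in the pathway of interest (the cells below look it up with `u.get_proteins_in_pathway` rather than running GSEA).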

In [2]:
# Folder holding the csv files for this step. The path is backslash-separated
# (Windows-style) and starts at the home directory; later cells build file
# paths by appending R'\<file name>' to `root`.
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'

# Load the table that the first heat map is sliced from.
sig_csv_path = root + R'\mult_sig_pval_heatmap.csv'
sig_df = pd.read_csv(sig_csv_path)

# Step 1: Get the list of genes from certain pathways

In [3]:
# Look up the gene list for the pathway plotted in this notebook.
pathway_name = 'DNA Replication'
dna_rep = u.get_proteins_in_pathway(pathway_name)

In [4]:
# Disabled alternative kept for reference: it combined the pathway genes with a
# second gene set split out of `prot_enr.res2d` (presumably an enrichment
# result; `prot_enr` is not defined in the visible notebook).
#mis_repair = prot_enr.res2d.Genes[4]
#repair_genes = mis_repair.split(';')
#genes = dna_genes +repair_genes

# Report how many genes the pathway lookup returned.
n_pathway_genes = len(dna_rep)
print('total genes:', n_pathway_genes)

total genes: 42


# Step 3: Create HeatMap

Slice out the significant genes in the pathway from sig_df (mult_sig_pval_heatmap.csv).

In [5]:
# Boolean mask: True for rows whose protein is in the pathway gene list.
bool_df = sig_df['Proteomics'].isin(dna_rep)

# Keep only the pathway rows for plotting.
plot_df = sig_df.loc[bool_df]

# Number of distinct pathway proteins found in sig_df.
plot_df['Proteomics'].nunique(dropna=False)

9

In [21]:
# Significance cutoff; `a` is reused by later filtering cells.
a = 0.05

# Keep only rows whose p-value is at or below the cutoff (<= a).
is_significant = plot_df['P_Value'].le(a)
plot_df = plot_df[is_significant]

In [22]:
# Circle heat map of the filtered pathway rows: P_Value drives the circles,
# Medians drives the color, proteins on the x axis and cancers on the y axis.
heatmap_kwargs = dict(
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=500,
)
p.plotCircleHeatMap(plot_df, **heatmap_kwargs)

# Step 4: Create a HeatMap with both pos and neg differences in median

Read in pos_neg_df.csv to create a df with only genes that have both a positive and a negative difference in median in different cancers. Then slice out the genes in the pathway from this df using the pathway gene list (`dna_rep`).

In [23]:
# Load the pos/neg table from the same csv folder as the other inputs.
pos_neg_csv_path = root + R'\pos_neg_df.csv'
pos_neg_df = pd.read_csv(pos_neg_csv_path)

In [24]:
# Boolean mask: True for rows of pos_neg_df whose protein is in the pathway gene list.
get = pos_neg_df['Proteomics'].isin(dna_rep)

# Keep only the pathway rows of the pos/neg table.
genes_pn = pos_neg_df.loc[get]

# Show which pathway proteins are present.
genes_pn['Proteomics'].unique()

array(['MCM6', 'MCM4', 'RFC5', 'MCM5', 'MCM2', 'MCM7', 'MCM3', 'RFC2',
       'RFC4', 'TOPBP1', 'CHEK1', 'GINS4', 'POLA2', 'GINS2', 'TOP2A'],
      dtype=object)

In [27]:
# Keep only rows whose p-value is at or below the cutoff `a` (<= a).
is_significant = genes_pn['P_Value'].le(a)
genes_pn = genes_pn[is_significant]

In [29]:
# Inspect the rows kept for CHEK1 after filtering.
genes_pn[genes_pn['Proteomics'].eq('CHEK1')]

Unnamed: 0,Proteomics,P_Value,Medians,Cancer,size2,size
256,CHEK1,0.001264,0.461686,Gbm,6.673721,20.021163
923,CHEK1,0.005425,0.312344,Hnscc,5.216671,15.650014
4533,CHEK1,0.013974,-0.528,En,4.270586,12.811757


In [28]:
# Circle heat map of the filtered pos/neg pathway rows: P_Value drives the
# circles, Medians drives the color.
heatmap_kwargs = dict(
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=600,
)
p.plotCircleHeatMap(genes_pn, **heatmap_kwargs)

# Revised Heatmap 

In [12]:
# Load the table used for the revised heat map.
all_csv_path = root + R'\all_heatmap.csv'
df = pd.read_csv(all_csv_path)

# Distinct protein names in df. The original note describes these as genes
# with a significant p-value in >= 1 cancer -- TODO confirm against how
# all_heatmap.csv was produced.
mult_sig_list = list(pd.unique(df['Proteomics']))

In [13]:
# Extra genes to plot alongside the pathway gene list.
found = ['TOPBP1', 'TOP2A', 'GINS2', 'GINS4', 'POLA2', 'CHEK1']

# Append only the genes that are not already in dna_rep, so re-running this
# cell does not keep adding the same six names again.
# NOTE: this changes dna_rep in place, so any cell that filters with dna_rep
# gives different results depending on whether this cell has already run.
missing_genes = [gene for gene in found if gene not in dna_rep]
dna_rep.extend(missing_genes)

# Show the tail of the list to confirm the additions.
dna_rep[-6:]

['TOPBP1', 'TOP2A', 'GINS2', 'GINS4', 'POLA2', 'CHEK1']

In [14]:
# Boolean mask: True for rows of df whose protein is in the (extended) pathway gene list.
get = df['Proteomics'].isin(dna_rep)

# Keep only the pathway rows.
genes_k = df.loc[get]

# Show which pathway proteins are present in df.
genes_k['Proteomics'].unique()

array(['MCM6', 'MCM4', 'PCNA', 'POLD2', 'POLD3', 'RFC5', 'MCM5', 'MCM2',
       'MCM7', 'MCM3', 'RFC3', 'RPA1', 'RFC2', 'POLD1', 'RFC4', 'TOPBP1',
       'RPA3', 'CHEK1', 'GINS4', 'POLA2', 'PRIM1', 'RFC1', 'GINS2',
       'TOP2A', 'CDK2', 'POLE', 'RPA2', 'UBA52', 'GMNN', 'POLD4', 'POLE2',
       'CDT1', 'CDC7', 'MCM10'], dtype=object)

In [15]:
# Inspect every CHEK1 row in the unfiltered table.
df[df['Proteomics'].eq('CHEK1')]

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
921,CHEK1,0.001264,0.461686,Gbm
11793,CHEK1,0.005425,0.312344,Hnscc
32436,CHEK1,0.923865,0.01005,Luad
35948,CHEK1,0.267259,0.2313,Lscc
46874,CHEK1,0.137476,0.53665,Brca
59014,CHEK1,0.404643,0.078545,Ov
66440,CHEK1,0.013974,-0.528,En


In [16]:
# Keep only rows whose p-value is at or below 0.05 (the same cutoff the
# earlier filtering cells use).
is_significant = genes_k['P_Value'].le(0.05)
genes_k = genes_k[is_significant]

In [17]:
# Circle heat map of the filtered rows for the extended pathway gene list:
# P_Value drives the circles, Medians drives the color.
heatmap_kwargs = dict(
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=800,
)
p.plotCircleHeatMap(genes_k, **heatmap_kwargs)

In [19]:
# Look up the gene list for a second pathway.
response_pathway = 'DNA Damage Response'
response = u.get_proteins_in_pathway(response_pathway)

In [20]:
# Display the full gene list returned for the 'DNA Damage Response' pathway.
response

['CDKN1B',
 'GADD45B',
 'RAD52',
 'TP53AIP1',
 'CDK6',
 'GADD45G',
 'MDM2',
 'NBN',
 'CDK5',
 'CYCS',
 'CASP8',
 'CHEK2',
 'BAX',
 'SESN1',
 'BRCA1',
 'BBC3',
 'PRKDC',
 'RPA2',
 'CASP9',
 'APAF1',
 'CHEK1',
 'CASP3',
 'RAD50',
 'RB1',
 'HUS1',
 'PMAIP1',
 'PIDD',
 'RRM2B',
 'E2F1',
 'RAD9A',
 'SMC1A',
 'CCNB3',
 'CCND3',
 'CDKN1A',
 'RAD17',
 'CDK4',
 'SFN',
 'ATRIP',
 'DDB2',
 'CCND2',
 'CCNE2',
 'ATR',
 'CDC2',
 'CDC25A',
 'FAS',
 'TNFRSF10B',
 'TLK2',
 'CCNB2',
 'FANCD2',
 'CDK2',
 'CCNB1',
 'ATM',
 'CCND1',
 'CCNE1',
 'H2AFX',
 'MRE11',
 'PML',
 'BID',
 'TLK1',
 'RAD51',
 'TP53',
 'CDC25C',
 'ABL1',
 'GADD45A',
 'RAD1']