# Create Combined Heatmap for significant Complexes

This notebook looks at genes that are significant in at least one cancer. Pan-cancer heatmaps are created in which circle size shows significance and color shows the difference in median.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gseapy as gp

import cptac
import cptac.utils as u

import plot_utils as p

# Step 1: Run GSEA for significant genes in at least 1 cancer

First, read sig_pval_heatmap.csv into a df. This csv file contains only genes with a significant p-value in at least one cancer. Then run the enrichment analysis (gseapy's Enrichr query against the KEGG_2016 gene sets) using the list of genes from the df.

In [2]:
# Load the table of proteins with a significant p-value in at least one cancer.
sig_df = pd.read_csv('../../../Make_Tables/csv/sig_pval_heatmap.csv')

# Gene names sent to Enrichr: one entry per row of the table (not de-duplicated).
prot_list = list(sig_df['Proteomics'])

# Query Enrichr through gseapy against the KEGG_2016 gene-set library.
prot_enr = gp.enrichr(
    gene_list=prot_list,
    description='Tumor_partition',
    gene_sets='KEGG_2016',
)

# Preview the first rows of the enrichment results table.
prot_enr.res2d.head()

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,KEGG_2016,Spliceosome Homo sapiens hsa03040,84/134,1.588886e-40,4.5124359999999994e-38,0,0,11.286154,1034.267321,RBM25;EIF4A3;DDX42;HNRNPU;PRPF19;USP39;EFTUD2;...
1,KEGG_2016,RNA transport Homo sapiens hsa03013,77/172,3.917898e-24,5.563415e-22,0,0,5.416175,291.912799,CYFIP2;NUP107;NUP188;POP4;RPP30;EIF4A3;PHAX;PN...
2,KEGG_2016,DNA replication Homo sapiens hsa03030,22/36,2.786306e-11,2.637703e-09,0,0,10.329937,251.055891,RFC5;FEN1;RFC3;PCNA;RFC4;MCM7;RFC1;RFC2;PRIM1;...
3,KEGG_2016,Ribosome biogenesis in eukaryotes Homo sapiens...,35/89,7.608707e-10,5.008356e-08,0,0,4.271897,89.695126,NVL;WDR3;POP4;RPP30;HEATR1;NAT10;NMD3;PWP2;WDR...
4,KEGG_2016,Mismatch repair Homo sapiens hsa03430,16/23,8.817528e-10,5.008356e-08,0,0,14.997296,312.680273,RFC5;RFC3;PCNA;RFC4;RFC1;RFC2;RPA1;MLH1;POLD3;...


# Step 2: Get the list of significant genes from certain pathways

In [3]:
def _enriched_gene_string(results, term):
    """Return the ';'-separated gene string reported for `term`.

    Parameters
    ----------
    results : pandas.DataFrame
        Enrichment table with 'Term' and 'Genes' columns (here prot_enr.res2d).
    term : str
        Exact value of the 'Term' column to look up.

    Returns
    -------
    str
        The 'Genes' entry of the first row whose 'Term' equals `term`.

    Raises
    ------
    KeyError
        If no row of `results` has that term.
    """
    hits = results.loc[results['Term'] == term, 'Genes']
    if hits.empty:
        raise KeyError('{!r} not found in the enrichment results'.format(term))
    return hits.iloc[0]


# Look the pathways up by name rather than by row position: the row order of
# the enrichment table changes with the gene list that was submitted, so a
# fixed position can silently select a different pathway.
dna_rep = _enriched_gene_string(prot_enr.res2d, 'DNA replication Homo sapiens hsa03030')
dna_genes = dna_rep.split(';')
mis_repair = _enriched_gene_string(prot_enr.res2d, 'Mismatch repair Homo sapiens hsa03430')
repair_genes = mis_repair.split(';')

# Plain concatenation: a gene that belongs to both pathways is listed twice,
# so the printed count is not a count of unique genes.
genes = dna_genes + repair_genes
print('total genes:', len(genes))

total genes: 38


# Step 3: Create HeatMap

Slice out the pathway proteins that were found to be significant in at least one cancer (from sig_pval_heatmap.csv).

In [4]:
# Boolean mask (a Series, despite the name): True where the row's protein is
# one of the pathway genes.
bool_df = sig_df['Proteomics'].isin(genes)
plot_df = sig_df.loc[bool_df]

# Number of distinct pathway proteins present in the table.
len(plot_df['Proteomics'].unique())

26

In [5]:
# Significance cutoff. Rows are kept when P_Value <= a, i.e. the cutoff value
# itself is included.
a = 0.05
plot_df = plot_df[plot_df['P_Value'].le(a)]

In [6]:
# Circle size is driven by the p-value and circle color by the difference in
# median; proteins go on the x-axis and cancers on the y-axis.
p.plotCircleHeatMap(
    plot_df,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_width=700,
    plot_height=400,
)

# Step 4: Create a HeatMap with both pos and neg differences in median

Read in pos_neg_df.csv to create a df containing only genes that have both a positive and a negative difference in median across different cancers. Then use the list of pathway genes with a significant p-value to slice out the pathway genes that have both a positive and a negative difference in median.

In [7]:
# Table of genes whose difference in median is positive in some cancers and
# negative in others.
pos_neg_df = pd.read_csv('../../../Make_Tables/csv/pos_neg_df.csv')

In [8]:
# Mask over pos_neg_df: True where the row's protein is in the pathway gene list.
get = pos_neg_df['Proteomics'].isin(genes)

# Pathway genes that have both a positive and a negative difference in median.
genes_pn = pos_neg_df.loc[get]
genes_pn['Proteomics'].unique()

array(['RFC2', 'MSH2', 'MSH6'], dtype=object)

In [9]:
# Remove every SSBP1 row from the pos/neg pathway table.
# NOTE(review): SSBP1 is not among the unique genes shown by the preceding
# cell, so with the current data this filter removes nothing - confirm whether
# it is still needed.
genes_pn = genes_pn.loc[genes_pn['Proteomics'] != 'SSBP1'] # NOT sig change in med for endo (blue)
# Show which genes remain after the filter.
genes_pn.Proteomics.unique()

array(['RFC2', 'MSH2', 'MSH6'], dtype=object)

In [10]:
# Keep rows with P_Value <= a (the cutoff value itself is included).
genes_pn = genes_pn[genes_pn['P_Value'].le(a)]

In [11]:
# Heatmap restricted to pathway genes with both positive and negative
# differences in median; circle size follows P_Value, color follows Medians.
p.plotCircleHeatMap(
    genes_pn,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=300,
    plot_width=300,
)

# Revised Heatmap 

Read in mult_sig_pval_heatmap.csv into a df. This csv file contains proteins with a significant p-value in more than one cancer. 

In [12]:
# Table of proteins with a significant p-value in more than one cancer.
df = pd.read_csv('../../../Make_Tables/csv/mult_sig_pval_heatmap.csv')

# Unique gene names from that table, used as the Enrichr query list.
mult_sig_list = list(df['Proteomics'].unique())

# Rebinds prot_enr: from here on it holds the results for the multi-cancer list.
prot_enr = gp.enrichr(
    gene_list=mult_sig_list,
    description='Tumor_partition',
    gene_sets='KEGG_2016',
)


In [13]:
# Preview the first rows of the enrichment results for the multi-cancer gene list.
prot_enr.res2d.head()

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,KEGG_2016,Ribosome biogenesis in eukaryotes Homo sapiens...,13/89,4.583854e-09,8.286973e-07,0,0,9.995601,191.922792,UTP6;IMP3;WDR3;HEATR1;NAT10;WDR75;IMP4;PWP2;UT...
1,KEGG_2016,DNA replication Homo sapiens hsa03030,9/36,8.204924e-09,8.286973e-07,0,0,19.296952,359.280903,RFC3;PCNA;RFC4;MCM7;RFC2;MCM3;MCM4;MCM5;MCM2
2,KEGG_2016,B cell receptor signaling pathway Homo sapiens...,11/73,5.135863e-08,3.458148e-06,0,0,10.313487,173.106033,PPP3CB;SYK;CHUK;PIK3CA;PRKCB;INPP5D;BLNK;RAC2;...
3,KEGG_2016,Fc gamma R-mediated phagocytosis Homo sapiens ...,12/93,7.506138e-08,3.7906e-06,0,0,8.629189,141.561492,PAK1;PTPRC;SYK;PIK3CA;MYO10;PRKCB;INPP5D;RAC2;...
4,KEGG_2016,RNA transport Homo sapiens hsa03013,15/172,3.543331e-07,1.431506e-05,0,0,5.593332,83.077922,RANBP2;NUP210;NUP155;NCBP1;NUP133;THOC3;THOC2;...


In [14]:
# Row 1 of the enrichment table is taken as the pathway of interest; its Term
# is printed so the selected pathway is visible in the output.
replication_row = 1
print(prot_enr.res2d.Term[replication_row])
genes = prot_enr.res2d.Genes[replication_row].split(';')

# Genes added by hand on top of the Enrichr overlap list.
# NOTE(review): the source of these additions is not recorded in this notebook.
found = ['TOPBP1', 'TOP2A', 'GINS2', 'GINS4', 'POLA2', 'CHEK1']
genes.extend(found)
genes

DNA replication Homo sapiens hsa03030


['RFC3',
 'PCNA',
 'RFC4',
 'MCM7',
 'RFC2',
 'MCM3',
 'MCM4',
 'MCM5',
 'MCM2',
 'TOPBP1',
 'TOP2A',
 'GINS2',
 'GINS4',
 'POLA2',
 'CHEK1']

In [15]:
# Mask over df: True where the row's protein is in the gene list built above
# (Enrichr overlap genes plus the hand-added ones).
get = df['Proteomics'].isin(genes)

# Rows of the multi-cancer table for those genes.
genes_k = df.loc[get]
genes_k['Proteomics'].unique()

array(['MCM4', 'PCNA', 'MCM5', 'MCM2', 'MCM7', 'MCM3', 'RFC3', 'RFC2',
       'RFC4', 'TOPBP1', 'GINS4', 'GINS2', 'TOP2A'], dtype=object)

In [16]:
# Keep rows with P_Value <= a (the cutoff value itself is included).
genes_k = genes_k[genes_k['P_Value'].le(a)]

In [17]:
# Revised heatmap for the selected genes; circle size follows P_Value and
# color follows Medians, with proteins on the x-axis and cancers on the y-axis.
p.plotCircleHeatMap(
    genes_k,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=400,
    plot_width=800,
)