# Create Heat Map

This notebook looks at genes in the DNA Replication pathway that are significant in at least one cancer. Pancancer heat maps are created, with circle size showing significance and color showing the difference in medians.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math
import scipy.stats
import re
import sys
import statsmodels.stats.multitest
from bokeh.palettes import RdBu
from bokeh.models import LinearColorMapper, ColorBar
from bokeh.models.ranges import FactorRange
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, export_png, export_svgs
from bokeh.layouts import row

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
import gseapy as gp

In [3]:
def create_circle_legend_df(legend_min, legend_max):
    """Build the four-row data frame behind the circle-size (p-value) legend.

    Rows are ordered from the largest p-value (legend_max) down to the
    smallest (legend_min). Two intermediate p-values are placed one quarter
    of the log10 range in from each end.

    Parameters
    ----------
    legend_min : float
        Smallest p-value shown in the legend.
    legend_max : float
        Largest p-value shown in the legend.
        log10 is taken of both bounds, so they are expected to be positive.

    Returns
    -------
    pandas.DataFrame
        Columns 'x_axis' (empty label), 'y_axis' (p-value formatted with
        '{:.1e}'), 'P_Value', 'Medians' (constant 1.5), 'size2'
        (-ln(P_Value)) and 'size' (3 * size2).
    """
    # Work on the exponents so the legend is evenly spaced on a log scale.
    exp_legend_min = np.log10(legend_min)
    exp_legend_max = np.log10(legend_max)
    quarter = (exp_legend_max - exp_legend_min) / 4

    # Exact exponents a quarter of the range in from each end.
    exp2 = exp_legend_min + quarter
    exp3 = exp_legend_max - quarter

    # Whole-number exponents give tidy labels (1e-5, 1e-3, ...), but for a
    # narrow range rounding can collapse both middle values onto the same
    # power of ten or push them outside [legend_min, legend_max]. Only use
    # the rounded exponents when they stay distinct and strictly inside.
    rounded2, rounded3 = round(exp2), round(exp3)
    if exp_legend_min < rounded2 < rounded3 < exp_legend_max:
        exp2, exp3 = rounded2, rounded3

    pval2 = 10.0 ** exp2
    pval3 = 10.0 ** exp3

    # Format the p-values in scientific notation for the y-axis labels.
    label = "{:.1e}".format
    p_values = [legend_max, pval3, pval2, legend_min]  # max to min

    data = {'P_Value': p_values,
            'y_axis': [label(p_value) for p_value in p_values],
            'x_axis': ['', '', '', ''],
            'Medians': [1.5, 1.5, 1.5, 1.5]}

    fake_df = pd.DataFrame(data, columns=['x_axis', 'y_axis', 'P_Value', 'Medians'])

    # Circle size: negative natural log of the p-value, scaled by 3.
    fake_df["size2"] = fake_df['P_Value'].apply(lambda p_value: -1 * np.log(p_value))
    fake_df['size'] = fake_df["size2"] * 3

    return fake_df

In [4]:
def create_circle_legend(df, circle_var, color_var, legend_min, legend_max,
                         x_axis = 'x_axis', y_axis = 'y_axis', 
                         plot_height = 200, plot_width = 120):
    """Create the small bokeh figure used as the circle-size (p-value) legend.

    Parameters
    ----------
    df : pandas.DataFrame
        Data being plotted in the heat map; only df[circle_var].min() is read.
    circle_var : str
        Column of df holding the p-values.
    color_var : str
        Colour column name. Used when the legend frame has a column of that
        name; otherwise the legend frame's own 'Medians' column is used.
    legend_min, legend_max : float
        Smallest and largest p-value to show. legend_min is lowered to the
        smallest p-value in df when that value is smaller.
    x_axis, y_axis : str
        Columns of the legend frame to plot. The defaults are the columns
        that create_circle_legend_df provides.
    plot_height, plot_width : int
        Size of the legend figure.

    Returns
    -------
    The bokeh figure holding the legend circles.
    """
    # Stretch the legend down to the smallest p-value actually present.
    smallest_pval = df[circle_var].min()
    if smallest_pval < legend_min:
        legend_min = smallest_pval
    circle_df = create_circle_legend_df(legend_min, legend_max)

    # The legend frame stores its constant colour value under 'Medians'.
    # Indexing it with a differently named color_var would raise KeyError,
    # so fall back to that column.
    color_field = color_var if color_var in circle_df.columns else 'Medians'

    # Mirror the larger-magnitude bound so the colour range is centred on 0.
    maxval = circle_df[color_field].max()
    minval = circle_df[color_field].min()
    if maxval > abs(minval):
        minval = maxval * -1
    if maxval < abs(minval):
        maxval = minval * -1
    colors = list(RdBu[9])
    exp_cmap = LinearColorMapper(palette=colors, low=minval, high=maxval)

    circle = figure(x_range=FactorRange(), y_range=FactorRange(), plot_width=plot_width,
                    plot_height=plot_height, toolbar_location=None, tools="hover")

    circle.scatter(x_axis, y_axis, source=circle_df, fill_alpha=1, line_width=0, size="size",
                   fill_color={"field": color_field, "transform": exp_cmap})

    circle.x_range.factors = sorted(circle_df[x_axis].unique().tolist())
    # Factors are listed max to min, following the row order of the legend frame.
    circle.y_range.factors = circle_df[y_axis].unique().tolist()
    circle.xaxis.major_label_orientation = math.pi/2

    circle.xaxis.axis_label = 'P-Values'

    return circle

# Step 1: Run GSEA for significant genes in at least 1 cancer

First read sig_pval_heatmap.csv into a df. This csv file contains only genes with a significant p-value in at least one cancer. Then run the enrichment analysis (Enrichr, via gseapy) using the list of genes from the df.

In [5]:
# Folder holding the Step 3 trans-effect csv files (home-relative, Windows-style separators).
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'
sig_df = pd.read_csv(root + R'\sig_pval_heatmap.csv')

# Gene names taken from the Proteomics column (genes with a sig pval in >= 1 cancer).
prot_list = sig_df['Proteomics'].tolist()

# Enrichr query against the Reactome 2016 gene sets; results are kept on prot_enr.
prot_enr = gp.enrichr(
    gene_list=prot_list,
    description='Tumor_partition',
    gene_sets='Reactome_2016',
    outdir='/Enrichr',
)

In [6]:
# Display the first 30 rows of the Enrichr results table.
prot_enr.res2d.head(30)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Gene Expression Homo sapiens R-HSA-74160,440/1631,6.403332e-53,9.797098e-50,0,0,2.015489,242.221802,TDRKH;RPL4;ATF2;MDC1;NUP107;HNRNPU;EHMT1;PHAX;...,Reactome_2016
1,Processing of Capped Intron-Containing Pre-mRN...,116/193,1.992679e-52,1.524399e-49,0,0,4.490372,534.556067,NUP107;NUP188;EIF4A3;HNRNPU;EFTUD2;SNRPD2;SNRP...,Reactome_2016
2,mRNA Splicing - Major Pathway Homo sapiens R-H...,85/134,2.4329929999999997e-41,1.2408259999999999e-38,0,0,4.739099,443.18566,EIF4A3;HNRNPU;YBX1;PRPF19;USP39;ELAVL1;EFTUD2;...,Reactome_2016
3,mRNA Splicing Homo sapiens R-HSA-72172,85/144,7.488069999999999e-38,2.864187e-35,0,0,4.409995,376.988047,EIF4A3;HNRNPU;YBX1;PRPF19;USP39;ELAVL1;EFTUD2;...,Reactome_2016
4,Major pathway of rRNA processing in the nucleo...,82/166,4.309535e-29,1.318718e-26,0,0,3.690519,241.043035,LTV1;RPL4;DDX47;RPP30;WDR3;RPLP0;PWP2;RPL10A;R...,Reactome_2016
5,rRNA processing Homo sapiens R-HSA-72312,85/180,2.535428e-28,6.465341e-26,0,0,3.527996,224.175973,LTV1;RPL4;DDX47;RPP30;WDR3;RPLP0;NAT10;PWP2;RP...,Reactome_2016
6,Infectious disease Homo sapiens R-HSA-5663205,127/348,4.2381300000000005e-28,9.263341e-26,0,0,2.726504,171.846789,RPL4;NUP107;NUP188;RPL10A;RPL9;PSMD8;RPS15;PSM...,Reactome_2016
7,"Cell Cycle, Mitotic Homo sapiens R-HSA-69278",145/462,3.3021579999999998e-24,6.315378000000001e-22,0,0,2.34481,126.777933,ANKLE2;NUP107;NUP188;ZWILCH;BUB1B;PPP2R2A;SMC4...,Reactome_2016
8,Cell Cycle Homo sapiens R-HSA-1640170,163/566,1.242046e-22,2.111478e-20,0,0,2.151557,108.524761,MDC1;ANKLE2;NUP107;NUP188;ZWILCH;BUB1B;PPP2R2A...,Reactome_2016
9,Transport of Mature Transcript to Cytoplasm Ho...,46/74,1.962264e-22,3.0022649999999997e-20,0,0,4.644166,232.128295,NUP205;NUP107;SEH1L;NUP188;EIF4A3;NUP160;NXF1;...,Reactome_2016


# Step 2: Get the list of significant genes 

In [7]:
# Row of the enrichment results table whose genes will be plotted.
index = 7

# The Genes field is one ';'-delimited string of gene names.
trans = prot_enr.res2d.Genes[index]
print(prot_enr.res2d.Term[index])

# Keep only the first 20 genes of the selected term.
genes = trans.split(';')[:20]
print('total genes:', len(genes))

Cell Cycle, Mitotic Homo sapiens R-HSA-69278
total genes: 20


# Step 3: Create HeatMap

Slice out genes from the DNA Replication pathway from the df of genes significant in at least one cancer.

In [8]:
# sig > 1 cancer
# Rows of sig_df (genes significant in at least one cancer) whose Proteomics
# entry is one of the selected pathway genes.
bool_df = sig_df['Proteomics'].isin(genes)
plot_df = sig_df.loc[bool_df]

# Number of distinct selected genes found in sig_df.
len(plot_df['Proteomics'].unique())

20

In [9]:
# Only include p-values < a certain cutoff
# Significance cutoff: keep rows whose P_Value is at or below it.
a = 0.05
plot_df = plot_df[plot_df['P_Value'].le(a)]

In [12]:
def plotCircleHeatMap (df, circle_var, color_var, x_axis, y_axis,plot_width= 1000, font_size=12,
                       plot_height = 650, legend_min = 1e-6, legend_max = 0.01,
                       x_axis_lab = "no_label", y_axis_lab = "no_label", 
                       show_plot = True, save_png = "plot.png"):
    """Draw a circle heat map with a colour bar and a circle-size legend.

    Each row of df becomes one circle at (x_axis, y_axis). The circle size is
    3 * -ln(df[circle_var]) (designed for p-values) and its fill colour is
    mapped from df[color_var] with the RdBu[9] palette.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. It is copied, so the caller's frame is not modified.
    circle_var : str
        Column that sets the circle size (p-values).
    color_var : str
        Column that sets the circle colour.
    x_axis, y_axis : str
        Categorical columns used for the x and y positions.
    plot_width, plot_height : int
        Size of the heat map figure.
    font_size : int
        Tick label font size, in points.
    legend_min, legend_max : float
        P-value range passed on to the circle-size legend.
    x_axis_lab, y_axis_lab : str
        Axis titles; the default "no_label" leaves the axis untitled.
    show_plot : bool
        When true, display the plot inline in the notebook.
    save_png : str
        File name for a png export. The default "plot.png" acts as the
        "do not save" sentinel: a file is written only for any other value.

    Returns
    -------
    None
    """
    # Work on a copy: the helper columns added below must not leak into the
    # caller's frame (which is often a slice and would raise
    # SettingWithCopyWarning).
    df = df.copy()

    # Circle size: negative natural log of the p-value, scaled by 3.
    df["size2"] = df[circle_var].apply(lambda pval: -1 * np.log(pval))
    df["size"] = df["size2"] * 3

    # Colour bar limits: mirror the larger-magnitude bound so the range is
    # centred on 0.
    maxval = df[color_var].max()
    minval = df[color_var].min()
    if maxval > abs(minval):
        minval = maxval * -1
    if maxval < abs(minval):
        maxval = minval * -1
    colors = list(RdBu[9])
    exp_cmap = LinearColorMapper(palette=colors, low=minval, high=maxval)

    heatmap = figure(x_range=FactorRange(), y_range=FactorRange(), plot_width=plot_width,
                     plot_height=plot_height,
                     toolbar_location=None, tools="hover")

    heatmap.scatter(x_axis, y_axis, source=df, fill_alpha=1, line_width=0, size="size",
                    fill_color={"field": color_var, "transform": exp_cmap})

    # x categories ascending, y categories descending.
    heatmap.x_range.factors = sorted(df[x_axis].unique().tolist())
    heatmap.y_range.factors = sorted(df[y_axis].unique().tolist(), reverse=True)
    heatmap.xaxis.major_label_orientation = math.pi/2

    # font size
    heatmap.axis.major_label_text_font_size = str(font_size)+"pt"

    if x_axis_lab != "no_label":
        heatmap.xaxis.axis_label = x_axis_lab
        heatmap.xaxis.axis_label_text_font_size = '12pt'
    if y_axis_lab != "no_label":
        heatmap.yaxis.axis_label = y_axis_lab
        heatmap.yaxis.axis_label_text_font_size = '12pt'

    bar = ColorBar(color_mapper=exp_cmap, location=(0,0))
    heatmap.add_layout(bar, "right")

    # Create Circle Legend
    circle_legend = create_circle_legend(df, circle_var, color_var,
                                         legend_min=legend_min, legend_max=legend_max)

    # Build the side-by-side layout once and reuse it for display and export.
    layout = row(heatmap, circle_legend)

    if show_plot:
        output_notebook()
        show(layout)

    if save_png != "plot.png":
        export_png(layout, filename=save_png)

In [15]:
# NOTE(review): `p` is the plot_utils module, so this runs plot_utils'
# plotCircleHeatMap rather than the function defined in this notebook.
# Confirm which one is intended.
p.plotCircleHeatMap(
    plot_df,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=300,
    plot_width=500,
    font_size=10,
    legend_min=5e-3,
    legend_max=0.05,
    x_axis_lab='Proteomics',
    save_png='test4.png',
)