# Figure 3 Circle Heatmap all trans Proteins

This notebook take lists of genes from the enrichment analysis and graphs the correlation in circle heat maps. 

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math as math
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import cptac
import cptac.utils as u

  import pandas.util.testing as tm


In [9]:
import pandas as pd
import numpy as np
from bokeh.palettes import RdBu
from bokeh.models import LinearColorMapper, ColumnDataSource, ColorBar
from bokeh.models.ranges import FactorRange
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.io import export_png
from bokeh.io import export_svgs


'''
@Param df: Dataframe. Contains column with x-axis categorical variables, y-axis categorical variables,
and columns for circle size and color gradient. 
@Param circle_var. String. Name of column for numeric data to base circle size off of 
@Param color_var. String. Name of column of numeric data to base color gradient off of. Can be the same or different as circle_var
@Param x_axis String. Name of column for x-axis categorical labels
@Param y_axis String. Name of column for y-axis categorical labels
@Param x_axis_lab. String. Default is no label. 
@Param y_axis_lab. String. Default is no label. 

This function creates a bokeh map that is heat map with extra variable of size of the circles. 

'''
def plotCircleHeatMap ( df, circle_var, color_var, x_axis, y_axis,x_axis_lab = "no_label", y_axis_lab = "no_label"):
  


    #added a new column to make the plot size
    df["size"] = (np.log10(df[circle_var]))
    df["size"] = np.abs(df["size"])
    df['size'] = np.where(df["size"]<0, np.abs(df["size"]), (df["size"]))*5
    
    colors = list((RdBu[9]))
    exp_cmap = LinearColorMapper(palette=colors, low = -.6, high =.6)
    p = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width=1000, 
               plot_height=650, 
               toolbar_location=None, tools="hover")

    p.scatter(x_axis,y_axis,source=df, fill_alpha=1,  line_width=0, size="size", 
              fill_color={"field":color_var, "transform":exp_cmap})

    p.x_range.factors = sorted(df[x_axis].unique().tolist())
    p.y_range.factors = sorted(df[y_axis].unique().tolist(), reverse = True)
    p.xaxis.major_label_orientation = math.pi/2
    
    if (x_axis_lab != "no_label" ):
        p.xaxis.axis_label = x_axis_lab
    if (x_axis_lab != "no_label" ):   
        p.yaxis.axis_label = y_axis_lab

    bar = ColorBar(color_mapper=exp_cmap, location=(0,0))
    p.add_layout(bar, "right")
    output_notebook()
    #return df
    show(p)

# Load in File and Subset

Insert list from enrichment analysis and add back proteomics to name

In [10]:
t_cell_genes = ['JUN','NFATC2','PIK3CD','CD3G','PIK3CG','VAV1','MAPK13','ZAP70','CD4','PTPRC','PPP3CC','LCK','GRAP2','PRKCQ','GRB2','LCP2','PTPN6','CARD11']
t_cells_column_names = []
for gene in t_cell_genes:
    gene += "_proteomics"
    t_cells_column_names.append(gene)
len(t_cell_genes)


18

Load appended version of data frame with correlations with FDR correction. Subset down dataframe to only have complement genes

In [12]:
df_FDR_append = pd.read_csv("../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_pearson_sig_all_prot_append_FDR.csv")
df_FDR_append = df_FDR_append.drop(['Unnamed: 0'], axis=1)

df_FDR_t_cell = df_FDR_append[df_FDR_append.Comparison.isin(t_cells_column_names)]
df_FDR_t_cell

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
2,GRB2_proteomics,-0.610889,1.886384e-11,GBM
83,CD4_proteomics,-0.467479,1.069659e-06,GBM
87,PIK3CD_proteomics,-0.466265,1.150323e-06,GBM
100,LCP2_proteomics,-0.460881,1.582634e-06,GBM
127,VAV1_proteomics,-0.447143,3.486858e-06,GBM
...,...,...,...,...
72176,MAPK13_proteomics,0.144000,1.333897e-01,Lscc
72451,PIK3CD_proteomics,-0.136597,1.547478e-01,Lscc
74174,PPP3CC_proteomics,-0.096720,3.148109e-01,Lscc
74519,PRKCQ_proteomics,-0.090094,3.492628e-01,Lscc


Make figure size of circles base on p_values and color based on correlation. Non sigificant correlations included. 

In [13]:
plotCircleHeatMap(df_FDR_t_cell, "P_value","Correlation","Comparison","Cancer Type")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Load in wide data frame and subset with genes in figure in order to see raw numbers associated with figure 

In [14]:
df_FDR_wide = pd.read_csv("../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_all_FDR_wide.csv")
df_FDR_wide = df_FDR_wide.drop(['Unnamed: 0'], axis=1)

df_FDR_wide_t_cell = df_FDR_wide[df_FDR_wide.Comparison.isin(t_cells_column_names)]
df_FDR_wide_t_cell

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_colon,P_value_colon,Correlation_Lscc,P_value_Lscc
2,GRB2_proteomics,-0.610889,1.886384e-11,,,,,,,-0.302439,0.001321,-0.532341,2.559824e-09,,,,
83,CD4_proteomics,-0.467479,1.069659e-06,,,,,,,,,-0.417232,6.378812e-06,,,,
87,PIK3CD_proteomics,-0.466265,1.150323e-06,,,,,,,-0.33176,0.000399,,,,,,
100,LCP2_proteomics,-0.460881,1.582634e-06,,,,,,,,,-0.390098,2.750943e-05,,,,
127,VAV1_proteomics,-0.447143,3.486858e-06,-0.285693,0.002483,,,,,,,-0.439464,1.745944e-06,,,,
200,PTPRC_proteomics,-0.424958,1.163774e-05,,,,,,,,,-0.402397,1.440882e-05,,,,
290,CARD11_proteomics,-0.408163,2.743707e-05,,,,,,,,,-0.40482,1.264613e-05,,,,
319,PTPN6_proteomics,-0.402017,3.713191e-05,,,,,,,-0.37199,6.3e-05,-0.411321,8.866717e-06,,,,
363,PIK3CG_proteomics,-0.394048,5.449073e-05,,,,,,,,,-0.402484,1.434139e-05,,,,
370,NFATC2_proteomics,-0.392602,5.835717e-05,,,,,,,,,-0.457536,5.679765e-07,,,,


# Killer T cell genes

In [16]:
killerTcell = ['JUN','NFATC2','PIK3CD','CD3G','PIK3CG','VAV1','MAPK13','ZAP70','CD4','PTPRC','PPP3CC','LCK','GRAP2','PRKCQ','GRB2','LCP2','PTPN6','CARD11']
killerTcells_column_names = []
for gene in killerTcell:
    gene += "_proteomics"
    killerTcells_column_names.append(gene)
len(killerTcells_column_names)


18

In [17]:
df_FDR_killerT = df_FDR_append[df_FDR_append.Comparison.isin(killerTcells_column_names)]
df_FDR_killerT

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
2,GRB2_proteomics,-0.610889,1.886384e-11,GBM
83,CD4_proteomics,-0.467479,1.069659e-06,GBM
87,PIK3CD_proteomics,-0.466265,1.150323e-06,GBM
100,LCP2_proteomics,-0.460881,1.582634e-06,GBM
127,VAV1_proteomics,-0.447143,3.486858e-06,GBM
...,...,...,...,...
72176,MAPK13_proteomics,0.144000,1.333897e-01,Lscc
72451,PIK3CD_proteomics,-0.136597,1.547478e-01,Lscc
74174,PPP3CC_proteomics,-0.096720,3.148109e-01,Lscc
74519,PRKCQ_proteomics,-0.090094,3.492628e-01,Lscc


In [18]:
plotCircleHeatMap(df_FDR_killerT,"P_value","Correlation","Comparison","Cancer Type")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
df_FDR_wide_killerTcells = df_FDR_wide[df_FDR_wide.Comparison.isin(killerTcells_column_names)]
df_FDR_wide_killerTcells

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_colon,P_value_colon,Correlation_Lscc,P_value_Lscc
2,GRB2_proteomics,-0.610889,1.886384e-11,,,,,,,-0.302439,0.001321,-0.532341,2.559824e-09,,,,
83,CD4_proteomics,-0.467479,1.069659e-06,,,,,,,,,-0.417232,6.378812e-06,,,,
87,PIK3CD_proteomics,-0.466265,1.150323e-06,,,,,,,-0.33176,0.000399,,,,,,
100,LCP2_proteomics,-0.460881,1.582634e-06,,,,,,,,,-0.390098,2.750943e-05,,,,
127,VAV1_proteomics,-0.447143,3.486858e-06,-0.285693,0.002483,,,,,,,-0.439464,1.745944e-06,,,,
200,PTPRC_proteomics,-0.424958,1.163774e-05,,,,,,,,,-0.402397,1.440882e-05,,,,
290,CARD11_proteomics,-0.408163,2.743707e-05,,,,,,,,,-0.40482,1.264613e-05,,,,
319,PTPN6_proteomics,-0.402017,3.713191e-05,,,,,,,-0.37199,6.3e-05,-0.411321,8.866717e-06,,,,
363,PIK3CG_proteomics,-0.394048,5.449073e-05,,,,,,,,,-0.402484,1.434139e-05,,,,
370,NFATC2_proteomics,-0.392602,5.835717e-05,,,,,,,,,-0.457536,5.679765e-07,,,,
