# Figure 3 Circle Heatmap all trans Proteins

This notebook take lists of genes from the enrichment analysis and graphs the correlation in circle heat maps. 

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math as math
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import cptac
import cptac.utils as u

In [168]:
import pandas as pd
import numpy as np
from bokeh.palettes import RdBu
from bokeh.models import LinearColorMapper, ColumnDataSource, ColorBar
from bokeh.models.ranges import FactorRange
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.io import export_png
from bokeh.io import export_svgs


'''
@Param df: Dataframe. Contains column with x-axis categorical variables, y-axis categorical variables,
and columns for circle size and color gradient. 
@Param circle_var. String. Name of column for numeric data to base circle size off of 
@Param color_var. String. Name of column of numeric data to base color gradient off of. Can be the same or different as circle_var
@Param x_axis String. Name of column for x-axis categorical labels
@Param y_axis String. Name of column for y-axis categorical labels
@Param x_axis_lab. String. Default is no label. 
@Param y_axis_lab. String. Default is no label. 

This function creates a bokeh map that is heat map with extra variable of size of the circles. 

'''
def plotCircleHeatMap ( df, circle_var, color_var, x_axis, y_axis,x_axis_lab = "no_label", y_axis_lab = "no_label"):
  


    #added a new column to make the plot size
    df["size"] = (np.log10(df[circle_var]))
    df["size"] = np.abs(df["size"])
    df['size'] = np.where(df["size"]<0, np.abs(df["size"]), (df["size"]))*5
    
    colors = list((RdBu[9]))
    exp_cmap = LinearColorMapper(palette=colors, low = -.6, high =.6)
    p = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width=1000, 
               plot_height=650, 
               toolbar_location=None, tools="hover")

    p.scatter(x_axis,y_axis,source=df, fill_alpha=1,  line_width=0, size="size", 
              fill_color={"field":color_var, "transform":exp_cmap})

    p.x_range.factors = sorted(df[x_axis].unique().tolist())
    p.y_range.factors = sorted(df[y_axis].unique().tolist(), reverse = True)
    p.xaxis.major_label_orientation = math.pi/2
    
    if (x_axis_lab != "no_label" ):
        p.xaxis.axis_label = x_axis_lab
    if (x_axis_lab != "no_label" ):   
        p.yaxis.axis_label = y_axis_lab

    bar = ColorBar(color_mapper=exp_cmap, location=(0,0))
    p.add_layout(bar, "right")
    output_notebook()
    #return df
    show(p)

# Complement Genes

Insert list from enrichment analysis and add back proteomics to name

In [169]:

complement_pos_neg = ['FGB','ITGAM','VWF','F10','SERPIND1','SERPINC1','PROS1','SERPINE1','PLAUR','F2','C8A','CLU','KNG1','C2','C3','PROCR','C5','C8G','C6','PROC','A2M','KLKB1','CFB']
complement_genes = ['CFD','CPB2','ITGAM','CFH','C1S','C1R','SERPINC1','PROS1','SERPINE1','ITGB2','F13A1','PLG','C8B','C8A','CLU','KNG1','C2','C3','THBD','C5','C8G','C6','C7','PLAU','VSIG4','A2M','FGB','FGA','CR2','SERPINB2','VWF','F10','FGG','SERPINF2','F11','PLAUR','F2','F3','PROCR','F9','PROC','SERPING1','F13B','KLKB1','CFB']
complement_column_names = []
for gene in complement_pos_neg:
    gene += "_proteomics"
    complement_column_names.append(gene)
len(complement_column_names)


23

Load appended version of data frame with correlations with FDR correction. Subset down dataframe to only have complement genes

In [170]:
df_FDR_append = pd.read_csv("../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_pearson_sig_all_prot_append_FDR.csv")
df_FDR_append = df_FDR_append.drop(['Unnamed: 0'], axis=1)

df_FDR_complement = df_FDR_append[df_FDR_append.Comparison.isin(complement_column_names)]
df_FDR_complement


Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
77,PROCR_proteomics,-0.470784,8.763500e-07,GBM
196,PLAUR_proteomics,-0.425639,1.122876e-05,GBM
305,FGB_proteomics,-0.404936,3.218521e-05,GBM
482,C2_proteomics,-0.373811,1.382574e-04,GBM
532,F10_proteomics,-0.367388,1.834809e-04,GBM
...,...,...,...,...
77361,FGB_proteomics,-0.039068,6.853102e-01,Lscc
77584,SERPIND1_proteomics,-0.035328,7.140663e-01,Lscc
77647,CLU_proteomics,-0.034342,7.217108e-01,Lscc
77872,SERPINC1_proteomics,-0.030863,7.489112e-01,Lscc


Make figure size of circles base on p_values and color based on correlation. Non sigificant correlations included. 

In [171]:
plotCircleHeatMap(df_FDR_complement, "P_value","Correlation","Comparison","Cancer Type")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Make same graph, but filter out non significant values

In [172]:
df_FDR_append_sig_only = df_FDR_complement[df_FDR_complement["P_value"] < 0.005]

In [173]:
plotCircleHeatMap(df_FDR_append_sig_only,"P_value","Correlation","Comparison","Cancer Type")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Load in wide data frame and subset with genes in figure in order to see raw numbers associated with figure 

In [174]:
df_FDR_wide = pd.read_csv("../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_all_FDR_wide.csv")
df_FDR_wide = df_FDR_wide.drop(['Unnamed: 0'], axis=1)

df_FDR_wide_complement = df_FDR_wide[df_FDR_wide.Comparison.isin(complement_column_names)]
df_FDR_wide_complement

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_colon,P_value_colon,Correlation_Lscc,P_value_Lscc
77,PROCR_proteomics,-0.470784,8.7635e-07,0.284925,0.002553,,,0.336022,0.0002412023,,,,,,,,
196,PLAUR_proteomics,-0.425639,1.122876e-05,,,,,0.467428,1.389846e-07,,,0.278504,0.003364,,,,
305,FGB_proteomics,-0.404936,3.218521e-05,,,,,0.303047,0.000994482,,,,,,,,
482,C2_proteomics,-0.373811,0.0001382574,,,0.368573,0.000607,0.267613,0.005781507,,,,,,,,
532,F10_proteomics,-0.367388,0.0001834809,,,,,0.30666,0.0008581297,,,,,,,,
631,C8A_proteomics,-0.35468,0.0003157269,,,0.43097,4.8e-05,0.244053,0.00857733,,,,,,,,
810,A2M_proteomics,-0.33301,0.0007571685,,,,,0.272221,0.003248877,,,,,0.311803,0.001877,,
833,PROS1_proteomics,-0.331142,0.0008141044,,,,,0.384178,2.24829e-05,-0.281229,0.002919,,,,,,
838,ITGAM_proteomics,-0.330556,0.0008327428,,,,,0.388339,1.798502e-05,,,,,,,,
903,C6_proteomics,-0.324377,0.001054494,,,0.405493,0.000143,0.287137,0.001862266,,,,,,,,


# Endocytosis genes

In [175]:

endocytosis = ['SH3GLB1','ARFGEF1','PRKCI','SMAD3','WIPF2','CAV1','VPS4B','PARD6G','ARPC4','ARPC5','IGF1R','EPN3','ACAP2','DAB2','PARD6B','ZFYVE16','ERBB3','ARPC3','NEDD4','CAPZA1','KIT','CHMP4C','MET']
endocytosis_column_names = []
for gene in endocytosis:
    gene += "_proteomics"
    endocytosis_column_names.append(gene)
len(endocytosis_column_names)


23

In [176]:
df_FDR_endocytosis = df_FDR_append[df_FDR_append.Comparison.isin(endocytosis_column_names)]
df_FDR_endocytosis

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
6,DAB2_proteomics,-0.556402,2.237010e-09,GBM
164,NEDD4_proteomics,-0.434925,6.843176e-06,GBM
206,ARPC3_proteomics,-0.423330,1.267190e-05,GBM
291,CAV1_proteomics,-0.407997,2.766397e-05,GBM
399,CAPZA1_proteomics,-0.387765,7.323051e-05,GBM
...,...,...,...,...
75498,ARPC4_proteomics,0.071803,4.560076e-01,Lscc
75838,DAB2_proteomics,-0.066031,4.931051e-01,Lscc
76307,ERBB3_proteomics,-0.057605,5.500006e-01,Lscc
77330,ARPC3_proteomics,0.039662,6.807908e-01,Lscc


In [177]:
plotCircleHeatMap(df_FDR_endocytosis,"P_value","Correlation","Comparison","Cancer Type")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [178]:
df_FDR_wide_endocytosis = df_FDR_wide[df_FDR_wide.Comparison.isin(endocytosis_column_names)]
df_FDR_wide_endocytosis

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_colon,P_value_colon,Correlation_Lscc,P_value_Lscc
6,DAB2_proteomics,-0.556402,2.23701e-09,,,,,0.326055,0.0003764707,,,,,,,,
164,NEDD4_proteomics,-0.434925,6.843176e-06,0.321219,0.000622,,,,,,,,,,,,
206,ARPC3_proteomics,-0.42333,1.26719e-05,,,,,0.311784,0.000693981,-0.301699,0.001359677,,,,,,
291,CAV1_proteomics,-0.407997,2.766397e-05,0.344889,0.000224,,,,,,,,,,,,
399,CAPZA1_proteomics,-0.387765,7.323051e-05,,,,,0.311677,0.0006970752,,,,,,,,
449,ZFYVE16_proteomics,-0.379396,0.0001075736,,,,,,,,,,,0.390961,7.5e-05,,
472,SH3GLB1_proteomics,-0.375266,0.0001295641,,,,,0.316491,0.0005690949,,,,,,,,
601,ARPC4_proteomics,-0.357529,0.0002800944,-0.264291,0.00527,,,0.354157,0.0001031524,-0.294842,0.001767048,,,,,,
662,ARPC5_proteomics,-0.3493,0.0003946124,,,,,0.353939,0.0001042416,-0.270303,0.004290947,,,,,,
697,MET_proteomics,-0.344604,0.0004779015,,,,,0.451064,7.889147e-05,0.485692,7.518188e-08,,,,,,


# Aktin Genes

In [179]:
aktin_genes = ['ITGB1','GSN','ITGAM','ITGB5','ITGA3','ITGB3','LIMK2','ARPC4','ARPC5','F2','BAIAP2','ARHGAP35','NRAS','DIAPH2','ARPC3','ARHGEF1','PFN1','VCL']

In [180]:
aktin_column_names = []
for gene in aktin_genes:
    gene += "_proteomics"
    aktin_column_names.append(gene)
len(aktin_column_names)

18

In [181]:
df_FDR_aktin = df_FDR_append[df_FDR_append.Comparison.isin(aktin_column_names)]
df_FDR_aktin

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
206,ARPC3_proteomics,-0.423330,0.000013,GBM
271,ITGB1_proteomics,-0.412364,0.000022,GBM
315,PFN1_proteomics,-0.402695,0.000036,GBM
526,VCL_proteomics,-0.368318,0.000176,GBM
601,ARPC4_proteomics,-0.357529,0.000280,GBM
...,...,...,...,...
75970,GSN_proteomics,0.077248,0.510065,Lscc
76446,NRAS_proteomics,-0.054907,0.568870,Lscc
77330,ARPC3_proteomics,0.039662,0.680791,Lscc
79285,VCL_proteomics,0.008139,0.932744,Lscc


In [182]:
plotCircleHeatMap(df_FDR_aktin,"P_value","Correlation","Comparison","Cancer Type")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
