In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import cptac
import cptac.utils as u
import plot_utils as p 

  import pandas.util.testing as tm


In [2]:
import statsmodels.stats.multitest
from bokeh.palettes import RdBu
from bokeh.models import LinearColorMapper, ColumnDataSource, ColorBar
from bokeh.models.ranges import FactorRange
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, export_png, export_svgs
from bokeh.layouts import row
import math as math




def plotCircleHeatMap(df, circle_var, color_var, x_axis, y_axis, plot_width=1000, plot_height=650,
                      x_axis_lab="no_label", y_axis_lab="", show_plot=True, save_png="plot.png"):
    """Draw a circle heat map: one circle per row of ``df`` on a categorical grid.

    Circle size is ``-ln(df[circle_var]) * 3`` (natural log), so smaller values
    of ``circle_var`` give larger circles. Circle colour is ``df[color_var]``
    mapped through a 9-step RdBu palette whose range is made symmetric around 0.

    @Param df: Dataframe holding one row per circle. It is copied, not modified.
    @Param circle_var: Column Label. Column that drives the circle size (designed for p-values).
    @Param color_var: Column Label. Column that drives the circle colour.
    @Param x_axis: Column Label. Categorical column used on the x-axis (sorted ascending).
    @Param y_axis: Column Label. Categorical column used on the y-axis (sorted descending).
    @Param plot_width: Int. Width passed to the bokeh figure.
    @Param plot_height: Int. Height passed to the bokeh figure.
    @Param x_axis_lab: String. X-axis label; the sentinel "no_label" leaves the axis unlabelled.
    @Param y_axis_lab: String. Y-axis label; "" or "no_label" leaves the axis unlabelled.
    @Param show_plot: Bool. When True, show the heat map next to its circle-size legend in the notebook.
    @Param save_png: String. File name for a PNG export. The export only happens when this
        differs from the default "plot.png", and it contains the heat map without the legend.

    Returns: None.
    """
    # Work on a copy so the helper columns below never leak into the caller's frame.
    plot_df = df.copy()

    # NOTE(review): a value of 0 in circle_var gives -ln(0) = inf as the glyph size;
    # confirm the inputs are strictly positive.
    plot_df["size2"] = -np.log(plot_df[circle_var].astype(float))
    plot_df["size"] = plot_df["size2"] * 3

    # Make the colour bar symmetric: mirror whichever bound has the larger magnitude.
    maxval = plot_df[color_var].max()
    minval = plot_df[color_var].min()
    if maxval > abs(minval):
        minval = maxval * -1
    elif maxval < abs(minval):
        maxval = minval * -1

    colors = list(RdBu[9])
    exp_cmap = LinearColorMapper(palette=colors, low=minval, high=maxval)

    # Named `heatmap` rather than `p`, which this notebook uses as the plot_utils alias.
    heatmap = figure(x_range=FactorRange(), y_range=FactorRange(), plot_width=plot_width,
                     plot_height=plot_height,
                     toolbar_location=None, tools="hover")

    heatmap.scatter(x_axis, y_axis, source=plot_df, fill_alpha=1, line_width=0, size="size",
                    fill_color={"field": color_var, "transform": exp_cmap})

    heatmap.x_range.factors = sorted(plot_df[x_axis].unique().tolist())
    heatmap.y_range.factors = sorted(plot_df[y_axis].unique().tolist(), reverse=True)
    heatmap.xaxis.major_label_orientation = math.pi/2

    if x_axis_lab != "no_label":
        heatmap.xaxis.axis_label = x_axis_lab
    # The y label is decided by y_axis_lab itself (it used to be gated on x_axis_lab).
    if y_axis_lab not in ("", "no_label"):
        heatmap.yaxis.axis_label = y_axis_lab

    bar = ColorBar(color_mapper=exp_cmap, location=(0, 0))
    heatmap.add_layout(bar, "right")

    # Create Circle Legend
    circle_legend = create_circle_legend(plot_df, circle_var, color_var)

    if show_plot:
        output_notebook()
        show(row(heatmap, circle_legend))

    if save_png != "plot.png":
        export_png(heatmap, filename=save_png)
             
        
def create_circle_legend_df(lowest_pval = 1e-6, highest_pval = .05):
    '''
    Build the three-row dataframe behind the circle-size legend.

    The rows hold lowest_pval, lowest_pval * 100 and highest_pval, in that order.

    @Param lowest_pval: Float. Lowest p-value to include in the legend.
    @Param highest_pval: Float. Highest p-value to include in the legend.

    Returns: df to be used in creating the circle legend, with columns
    'x_axis' (empty strings), 'y_axis' (the p-values formatted as "{:.1e}"),
    'P_Value', 'Correlation' (constant .5), 'size2' (-ln of P_Value) and
    'size' (size2 * 3).
    '''
    med_pval = lowest_pval * float(100)
    pvals = [lowest_pval, med_pval, highest_pval]

    data = {'P_Value': pvals,
            'y_axis': ["{:.1e}".format(pval) for pval in pvals],
            'x_axis': ['', '', ''],
            'Correlation': [.5, .5, .5]}

    fake_df = pd.DataFrame(data, columns = ['x_axis', 'y_axis', 'P_Value', "Correlation"])

    # Same size rule as the heat map circles: natural log, negated, times 3.
    fake_df["size2"] = -np.log(fake_df['P_Value'])
    fake_df['size'] = fake_df["size2"] * 3

    return fake_df

def create_circle_legend(df, circle_var, color_var, x_axis = 'x_axis', y_axis = 'y_axis', 
                         lowest_pval = 1e-6, highest_pval = .05, plot_height = 200, plot_width = 120):
    '''
    Build the small bokeh figure that serves as the circle-size legend.

    @Param df: Dataframe. Same as df passed to plotCircleHeatMap.
    @Param circle_var: Column Label. Same as passed to plotCircleHeatMap.
    @Param color_var: Column Label. Same as passed to plotCircleHeatMap. Not used here.
    @Param x_axis: Column Label of the legend dataframe used on the x-axis. The
        legend dataframe comes from create_circle_legend_df, whose columns are
        named 'x_axis' and 'y_axis', so the defaults should normally be kept.
    @Param y_axis: Column Label of the legend dataframe used on the y-axis.
    @Param lowest_pval: Float. Lowest p-value to include in the legend. Replaced by
        the smallest value in df[circle_var] when that value is lower.
    @Param highest_pval: Float. Highest p-value to include in the legend.
    @Param plot_height: Int. Height passed to the bokeh figure.
    @Param plot_width: Int. Width passed to the bokeh figure.

    Returns: bokeh figure holding the circle legend (not a dataframe).
    '''
    # Use the smallest pval
    observed_min = df[circle_var].min()
    if observed_min < lowest_pval:
        lowest_pval = observed_min
    circle_df = create_circle_legend_df(lowest_pval, highest_pval)

    circle = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width= plot_width, 
               plot_height=plot_height, toolbar_location=None, tools="hover")

    circle.scatter(x_axis, y_axis, source = circle_df, fill_alpha=1,  line_width=0, size="size")

    circle.x_range.factors = sorted(circle_df[x_axis].unique().tolist())
    circle.y_range.factors = sorted(circle_df[y_axis].unique().tolist(), reverse = True)
    circle.xaxis.major_label_orientation = math.pi/2

    circle.xaxis.axis_label = 'FDR P-Values'

    return circle
      


In [3]:
# Gene symbols for the first coagulation/complement panel.
# Several symbols (e.g. F2, F9, FGA) are listed more than once; the list is only
# used for membership filtering, so the repeats are kept as they were entered.
coag_genes = [
    'CPB2', 'ITGAM', 'CFH', 'C1S', 'C1R', 'PROS1', 'SERPINE1', 'ITGB2', 'F13A1', 'C4BPA',
    'PLG', 'C4BPB', 'C8B', 'C8A', 'C2', 'C3', 'VTN', 'C5', 'C4A', 'C4B',
    'C8G', 'C6', 'C7', 'C9', 'CFG', 'CD59', 'CD55', 'CD46', 'PLAU', 'VSIG4',
    'A2M', 'MBL2', 'FGB', 'FGA', 'SERPINB2', 'F10', 'SERPIND1', 'FGG', 'F11', 'PLAUR',
    'F2', 'F5', 'PROCR', 'F9', 'SERPING1', 'MASP2', 'F13B', 'MASP1', 'KLKB1', 'TNFAIP3',
    'TNFAIP6', 'TNFAIP8L2', 'TNFAIP8L3', 'TNFRSF12A', 'TNFRSF17',
    'TNFRSF1A',
    'TNFSF10', 'F2', 'F3', 'F7', 'F8', 'F9', 'F5', 'F10', 'F11', 'F13A1',
    'F13B', 'KLKB1', 'FGA', 'FGB', 'FGG',
]

# Column labels as they appear in the "Comparison" column: "<symbol>_proteomics".
coag_column_names = [symbol + "_proteomics" for symbol in coag_genes]



In [4]:
# Load the pan-cancer table and discard the saved index column.
df_FDR_append = pd.read_csv(
    "../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_pearson_sig_all_prot_append_FDR.csv"
).drop(columns=['Unnamed: 0'])

# Keep only the coagulation-panel rows, then strip the "_proteomics" suffix
# from every string value so the plot shows bare gene symbols.
df_FDR_coag = (
    df_FDR_append.loc[df_FDR_append["Comparison"].isin(coag_column_names)]
    .replace(to_replace='_proteomics', value='', regex=True)
)

In [5]:
# Coagulation panel: circle size from the FDR-corrected p-value, colour from the correlation.
plotCircleHeatMap(
    df_FDR_coag,
    "fdr_bh_p_val",
    "Correlation",
    "Comparison",
    "Cancer Type",
    x_axis_lab="Proteomics",
    plot_width=1000,
    plot_height=650,
)

In [12]:
# Gene symbols for the platelet panel.
# NOTE(review): 'SELP1' may be a typo for 'SELP' (which appears in the `heme` list) — confirm.
platlet = [
    'A1BG', 'A2M', 'APBB1IP', 'CLU', 'DGKA', 'DGKB',
    'F2', 'FGB', 'GNG2', 'GNG7', 'ITGB3', 'ITIH4',
    'ITPR3', 'KNG1', 'PECAM1', 'PFN1', 'PLA2G4A', 'PRKCB',
    'PROS1', 'QSOX1', 'RASGRP2', 'SERPINE1', 'VCL', 'VWF',
    'PTGS2', 'ITGA2B', 'PLA2G2A', 'ITGA5', 'ITGB1', 'ITGAV',
    'FN1', 'PTK2', 'SELP1',
]


In [13]:
# "<symbol>_proteomics" labels for the platelet panel.
platlet_column_names = [symbol + "_proteomics" for symbol in platlet]

# Rows of the pan-cancer table whose Comparison is one of the platelet proteins.
df_FDR_platlet = df_FDR_append.loc[df_FDR_append["Comparison"].isin(platlet_column_names)]


In [14]:
# All correlations (platelet panel), drawn with the plot_utils version of the heat map.
p.plotCircleHeatMap(
    df_FDR_platlet,
    "P_value",
    "Correlation",
    "Comparison",
    "Cancer Type",
    plot_width=1200,
    plot_height=650,
)

In [83]:
# MMP-family gene symbols.
research_genes = [
    'MMP1', 'MMP10', 'MMP11', 'MMP12', 'MMP13', 'MMP14', 'MMP15',
    'MMP19', 'MMP2', 'MMP24', 'MMP3', 'MMP7', 'MMP8', 'MMP9',
]

# Second exploratory gene list (colon).
research_genes_colon = [
    'GFM2', 'MRPL19', 'MRPS34', 'MRPL39', 'MRPL17', 'MTIF2',
    'MRPL37', 'MTIF3', 'MRPL15', 'MRPL12', 'MRPL33', 'MRPL55',
    'MRPL11', 'MRPL42', 'MRPL20', 'MRPL40', 'MRPS28', 'PTCD3',
    'GADD45GIP1', 'TSFM', 'MRPS18A', 'MRPL47', 'MRPS7', 'MRPL43',
    'MRPL21', 'TUFM', 'MRPL44', 'MRPS9', 'MRPL50', 'DAP3',
]

In [88]:
# Gene symbols for the `heme` panel.
heme = [
    'APP', 'DOCK5', 'DGKB', 'PROS1', 'F13A1', 'PIK3CD',
    'ARRB2', 'CLU', 'AKAP10', 'PPP2R5E', 'LAMP2', 'KIF13B',
    'RAC2', 'KIFAP3', 'JAK3', 'HRAS', 'SRGN', 'GUCY1A2',
    'VWF', 'PRKCB', 'APLP2', 'HGF', 'SERPINF2', 'ATP1B1',
    'APBB1IP', 'FGR', 'F9', 'STIM1', 'MMRN1', 'LCK',
    'CEACAM6', 'IRF7', 'PRKCQ', 'CD48', 'PDE5A', 'DAGLB',
    'TLN1', 'PFN1', 'ALDOA', 'DOCK2', 'CFD', 'ITIH3',
    'AK3', 'PRCP', 'RASGRP2', 'GNA14', 'KLC3', 'GNG2',
    'INPP5D', 'PLCG2', 'SPP2', 'CSK', 'CD74', 'SERPIND1',
    'ANGPT1', 'F12', 'PTK2', 'SELP', 'CD2', 'PROC',
    'P2RX1', 'CD9', 'F13B', 'PTPN6', 'GRB2', 'HRG',
]

In [92]:
# Gene symbols for the `TCA` panel.
TCA = [
    'ATP5S', 'NDUFA13', 'NDUFB7', 'NDUFB10', 'UQCRB',
    'NDUFB6', 'NDUFA12', 'NDUFB5', 'NDUFA10', 'NDUFB3',
    'NDUFB1', 'UQCR11', 'UQCR10', 'COX5B', 'LDHA',
    'PDK4', 'UQCRFS1', 'CYC1', 'NDUFV2', 'SLC16A3',
    'NDUFV1', 'PDK1', 'COX8A', 'NDUFA9', 'NDUFA8',
    'NDUFA7', 'SLC16A1', 'NDUFA6', 'NDUFA4', 'NDUFA3',
    'NDUFA2', 'NDUFC2', 'NDUFC1', 'COX6B1', 'COX7A2L',
    'NDUFS8', 'NDUFS7', 'UQCRQ', 'NDUFS5', 'NDUFS4',
    'UQCRC1', 'NDUFS3', 'NDUFS2', 'NDUFS1', 'UQCRC2',
]

In [93]:
# "<symbol>_proteomics" labels for the TCA panel.
explor_genes = [symbol + "_proteomics" for symbol in TCA]

# Rows of the pan-cancer table whose Comparison is one of the TCA-panel proteins.
df_FDR_explor = df_FDR_append.loc[df_FDR_append["Comparison"].isin(explor_genes)]


In [94]:
# All correlations (TCA panel), drawn with the plot_utils version of the heat map.
p.plotCircleHeatMap(
    df_FDR_explor,
    "P_value",
    "Correlation",
    "Comparison",
    "Cancer Type",
    plot_width=1200,
    plot_height=650,
)

In [24]:
# Wide-format pan-cancer FDR table.
prot_FDR = pd.read_csv(
    "../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_all_FDR_wide.csv"
)

In [10]:
# Gene symbols for the complement/coagulation comparison panel.
comp_coag = [
    'SERPINA1', 'ITGAM', 'PROS1', 'SERPINE1', 'ITGB2', 'F13A1',
    'C8B', 'C8A', 'SERPINA5', 'C4A', 'THBD', 'C8G',
    'PLAU', 'C3AR1', 'VSIG4', 'FGB', 'FGA', 'SERPINB2',
    'FGG', 'SERPINF2', 'PLAUR', 'F2', 'F3', 'F7',
    'PROCR', 'F9', 'SERPING1', 'CFB', 'C1QB', 'CFD',
    'C1QA', 'CPB2', 'C1S', 'C1R', 'SERPINC1', 'CFI',
    'C5AR1', 'PLG', 'KNG1', 'C2', 'C3', 'C6',
    'C7', 'C9', 'A2M', 'F10', 'SERPIND1', 'F11',
    'F13B', 'KLKB1', 'C1QC',
]

In [16]:
# Per-cancer (GBM) table of EGFR correlations.
GBM_trans = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/GBM_EGFR_all_pearson_FDR.csv"
)

# Rebinds coag_column_names (first built from coag_genes) to the comp_coag panel.
coag_column_names = [symbol + "_proteomics" for symbol in comp_coag]

GBM_trans

Unnamed: 0.1,Unnamed: 0,Comparison,Correlation,P_value
0,450,EGFR_proteomics,1.000000,0.000000e+00
1,1038,PHLDA1_proteomics,0.816848,6.553435e-25
2,638,GRB2_proteomics,-0.610889,1.886384e-11
3,1346,SOCS2_proteomics,0.562720,1.343464e-09
4,274,CDH4_proteomics,0.559180,1.790048e-09
...,...,...,...,...
1575,202,C20orf194_proteomics,-0.268567,7.191485e-03
1576,644,GSTCD_proteomics,0.268529,7.200059e-03
1577,1368,SSH2_proteomics,-0.268322,7.246687e-03
1578,1021,PDIA5_proteomics,-0.268302,7.251178e-03


In [22]:

# GBM rows for the comp_coag panel, reduced to the protein label and its correlation.
df_FDR_comp_coag = GBM_trans.loc[
    GBM_trans["Comparison"].isin(coag_column_names),
    ["Comparison", "Correlation"],
]

# Written as CSV text (index included) to a file named without an extension.
df_FDR_comp_coag.to_csv("GBM_comp_coag")