# UPAR Circle Heatmap

This notebooks takes the genes that are a hit from the Brca NCI-Nature_2016 UPAR pathway and maps them on a large circle heat map. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
import cptac.utils as u
import plot_utils as p 

  import pandas.util.testing as tm


In [28]:
import statsmodels.stats.multitest
from bokeh.palettes import RdBu
from bokeh.models import LinearColorMapper, ColumnDataSource, ColorBar
from bokeh.models.ranges import FactorRange
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, export_png, export_svgs
from bokeh.layouts import row
import math as math




def plotCircleHeatMap ( df, circle_var, color_var, x_axis, y_axis,plot_width= 1000, plot_height = 650, x_axis_lab = "no_label", y_axis_lab = "", show_plot = True, save_png = "plot.png"):
  
    # circle_var designed for pvalues. Normalized by taking log 10 of values and multiplying by 5 
    #added a new column to make the plot size
    
    df["size2"] = df[circle_var].apply(lambda x: -1*(np.log(x)))
    df['size'] = (df["size2"])*3
    #find values to set color bar min/ max as 
    maxval = df[color_var].max()
    minval = df[color_var].min()
    if maxval > abs(minval):
        minval = maxval * -1 
    if maxval < abs(minval):
        maxval = minval * -1
    colors = list((RdBu[9]))
    exp_cmap = LinearColorMapper(palette=colors, low = minval, high = maxval)
    p = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width= plot_width, 
               plot_height=plot_height, 
               toolbar_location=None, tools="hover")

    p.scatter(x_axis,y_axis,source=df, fill_alpha=1,  line_width=0, size="size", 
              fill_color={"field":color_var, "transform":exp_cmap})

    p.x_range.factors = sorted(df[x_axis].unique().tolist())
    p.y_range.factors = sorted(df[y_axis].unique().tolist(), reverse = True)
    p.xaxis.major_label_orientation = math.pi/2
    
    if (x_axis_lab != "no_label" ):
        p.xaxis.axis_label = x_axis_lab
    if (x_axis_lab != "no_label" ):   
        p.yaxis.axis_label = y_axis_lab

    bar = ColorBar(color_mapper=exp_cmap, location=(0,0))
    p.add_layout(bar, "right")
    
    # Create Circle Legend
    circle_legend = create_circle_legend(df, circle_var, color_var)
    
    if show_plot:
        output_notebook()
        show(row(p, circle_legend))
      
    if save_png != "plot.png":
        export_png(p, filename= save_png)
             
        
'''
@Param df: Dataframe. Same as df passed to plotCircleHeatMap.
@Param lowest_pval: Float. Lowest p-value to include in the legend.
@Param highest_pval: Float. Highest p-value to include in the legend.

Returns: df to be used in creating the circle legend. 
'''

def create_circle_legend_df(lowest_pval = 1e-6, highest_pval = .05):
    lowest_pval_str = "{:.1e}".format(lowest_pval, '.2f')
    med_pval_str = "{:.1e}".format(lowest_pval * float(100), '.2f')
    highest_pval_str = "{:.1e}".format(highest_pval, '.2f')
    
    
    data = {'P_Value':  [lowest_pval, (lowest_pval * float(100)), highest_pval],
            'y_axis': [lowest_pval_str, med_pval_str, highest_pval_str],
            'x_axis': ['', '', ''],
            'Correlation': [.5, .5, .5]}

    fake_df = pd.DataFrame (data, columns = ['x_axis', 'y_axis', 'P_Value', "Correlation"])
    
    fake_df["size2"] = fake_df['P_Value'].apply(lambda x: -1*(np.log(x)))
    fake_df['size'] = (fake_df["size2"])*3
    
    return fake_df

'''
@Param df: Dataframe. Same as df passed to plotCircleHeatMap.
@Param circle_var: Column Label. Same as passed to plotCircleHeatMap.
@Param color_var: Column Label. Same as passed to plotCircleHeatMap.
@Param x_axis: Column Label. Used on the x-axis.
@Param y_axis: Column Label. Used on the y-axis.
@Param lowest_pval: Float. Lowest p-value to include in the legend.
@Param highest_pval: Float. Highest p-value to include in the legend.

Returns: df to be used in creating the circle legend. 
'''

def create_circle_legend(df, circle_var, color_var, x_axis = 'x_axis', y_axis = 'y_axis', 
                         lowest_pval = 1e-6, highest_pval = .05, plot_height = 200, plot_width = 120):
    # Use the smallest pval
    if df[circle_var].min() < lowest_pval:
        lowest_pval = df[circle_var].min()
    circle_df = create_circle_legend_df(lowest_pval, highest_pval)
    
    
    circle = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width= plot_width, 
               plot_height=plot_height, toolbar_location=None, tools="hover")

    circle.scatter(x_axis, y_axis, source = circle_df, fill_alpha=1,  line_width=0, size="size")
    
    circle.x_range.factors = sorted(circle_df[x_axis].unique().tolist())
    circle.y_range.factors = sorted(circle_df[y_axis].unique().tolist(), reverse = True)
    circle.xaxis.major_label_orientation = math.pi/2
    
    circle.xaxis.axis_label = 'FDR P-Values'
    
    return circle
      


Load df with all of the genes that are FDR significant. Then get list of just the gene names and use them to run a GSEA. 

In [3]:
prot_FDR = pd.read_csv("../Step3.1_Pearson_dfs_by_cancer/csv_files/Brca_EGFR_all_pearson_FDR.csv")
df_FDR= prot_FDR.drop(['Unnamed: 0'], axis=1)
df_FDR = df_FDR.set_index("Comparison")
df1_transposed = df_FDR.T 
df1_transposed


Comparison,EGFR_proteomics,MPP6_proteomics,CPNE8_proteomics,CRYBG3_proteomics,PSAT1_proteomics,PHC3_proteomics,GATA3_proteomics,KRT16_proteomics,KRT5_proteomics,NXN_proteomics,...,MNS1_proteomics,RRP1_proteomics,ZDHHC20_proteomics,SFXN3_proteomics,AGAP3_proteomics,PPP1R14B_proteomics,RASA1_proteomics,HMGN1_proteomics,DDX27_proteomics,MSH6_proteomics
Correlation,1.0,0.6482017,0.6164527,0.6099971,0.6093187,-0.6032044,-0.5989113,0.5953255,0.5947704,0.5928643,...,0.270088,0.229647,0.229381,0.229358,-0.229335,0.229296,-0.229296,-0.229293,0.229265,0.229263
P_value,0.0,4.834438e-15,2.229263e-13,4.612176e-13,4.973595e-13,9.738761e-13,1.547846e-12,2.267369e-12,2.404384e-12,2.938573e-12,...,0.013535,0.013554,0.013665,0.013675,0.013685,0.013701,0.013701,0.013703,0.013714,0.013715
fdr_bh_p_val,0.0,2.441391e-11,7.505185e-10,1.004666e-09,1.004666e-09,1.639358e-09,2.233321e-09,2.514358e-09,2.514358e-09,2.514358e-09,...,0.049494,0.049545,0.049935,0.049953,0.04997,0.04998,0.04998,0.04998,0.049991,0.049991


In [4]:
brca_prot = df1_transposed.columns.values.tolist()
brca_genes = []
for gene in brca_prot :
    brca_genes.append((re.sub("_proteomics", "", gene)))
len(brca_genes)

2771

Run GSEA using reactome 2016 set

In [5]:
brca_enr = gp.enrichr(gene_list = brca_genes, description='Tumor_partition', gene_sets='NCI-Nature_2016', 
                       outdir='test/enrichr_kegg')
brca_enr.res2d.head(2)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,NCI-Nature_2016,Urokinase-type plasminogen activator (uPA) and...,25/42,3.300094e-12,6.897197e-10,0,0,4.448715,117.610996,ITGB1;ITGAM;LRP1;SRC;ITGB3;ITGB2;SERPINE1;PLG;...
1,NCI-Nature_2016,PDGFR-beta signaling pathway Homo sapiens c901...,48/128,5.660464e-12,5.915185e-10,0,0,2.802691,72.582722,USP6NL;DOCK4;TAGLN;LRP1;SRC;ARPC1B;ITGB3;BRK1;...


In [12]:
#Get append version of the df with all cancer type, fdr sig trans results
df_FDR_append = pd.read_csv("../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_pearson_sig_all_prot_append_FDR.csv")
df_FDR_append = df_FDR_append.drop(['Unnamed: 0'], axis=1)


In [13]:
#get just the upa genes
brca_df = brca_enr.res2d
upa = brca_df.iloc[0,9]
upa = upa.split(';')
upa.remove("EGFR")
len(upa)



24

In [14]:
#filter down df with just upa genes 
upa_column_names = []
for gene in upa:
    gene += "_proteomics"
    upa_column_names.append(gene)

df_FDR_upa = df_FDR_append[df_FDR_append.Comparison.isin(upa_column_names)]


In [17]:

df_FDR_upa = df_FDR_upa.replace(to_replace ='_proteomics', value = '', regex = True)

In [30]:
#Make plot using plot utils
plotCircleHeatMap(df_FDR_upa, "P_value","Correlation","Comparison","Cancer Type",plot_width= 1000, plot_height = 650, x_axis_lab= "Proteomics")

In [15]:
luad_FDR = pd.read_csv("../Step3.1_Pearson_dfs_by_cancer/csv_files/Luad_EGFR_all_pearson_FDR.csv")

In [17]:
df_FDR= luad_FDR.drop(['Unnamed: 0'], axis=1)
df_FDR = df_FDR.set_index("Comparison")
df1_transposed = df_FDR.T 
df1_transposed


Comparison,EGFR_proteomics,GGCT_proteomics,LANCL2_proteomics,NUDCD3_proteomics,TAX1BP1_proteomics,ADGRF1_proteomics,GRAP_proteomics,CDK13_proteomics,GRAMD2B_proteomics,MACC1_proteomics,...,WDR19_proteomics,SLC25A22_proteomics,OXSR1_proteomics,PCSK6_proteomics,TMEM205_proteomics,VGLL4_proteomics,ST5_proteomics,CSTF1_proteomics,SPRYD4_proteomics,LDB3_proteomics
Correlation,1.0,0.6090062,0.5888081,0.5683828,0.5615545,0.6068124,-0.5441949,0.541353,0.562198,0.5222575,...,-0.261754,0.261729,-0.261669,-0.324513,0.267685,0.261583,0.261481,0.261472,-0.261441,-0.278121
P_value,0.0,1.676601e-12,1.328507e-11,9.362813e-11,1.746498e-10,7.466411e-10,8.008013e-10,1.019153e-09,1.703448e-09,4.866322e-09,...,0.00574,0.005745,0.005756,0.005762,0.005768,0.005773,0.005793,0.005794,0.005801,0.00581


In [18]:
luad_prot = df1_transposed.columns.values.tolist()
luad_genes = []
for gene in luad_prot :
    luad_genes.append((re.sub("_proteomics", "", gene)))
len(luad_genes)

1198

In [25]:
luad_enr = gp.enrichr(gene_list = luad_genes, description='Tumor_partition', gene_sets='Reactome_2016', 
                       outdir='test/enrichr_kegg')
luad_enr.res2d.head(7)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,Reactome_2016,Immune System Homo sapiens R-HSA-168256,158/1547,1.186818e-11,1.815831e-08,0,0,1.705061,42.894496,ATF1;CYFIP2;APP;AHCYL1;NCF1;WIPF1;PROS1;WIPF2;...
1,Reactome_2016,Innate Immune System Homo sapiens R-HSA-168249,93/807,8.043351e-10,6.153164e-07,0,0,1.9239,40.288409,ATF1;CYFIP2;APP;AHCYL1;CDKN1B;WIPF1;WIPF2;PROS...
2,Reactome_2016,RHO GTPases Activate WASPs and WAVEs Homo sapi...,14/36,7.762482e-09,3.958866e-06,0,0,6.492302,121.237012,CYFIP2;WIPF1;WIPF2;WAS;BRK1;ARPC4;ARPC5;PTK2;A...
3,Reactome_2016,Hemostasis Homo sapiens R-HSA-109582,66/552,6.232478e-08,2.383923e-05,0,0,1.99608,33.116784,APP;DOCK5;DGKB;PROS1;F13A1;PIK3CD;ARRB2;CLU;AK...
4,Reactome_2016,Membrane Trafficking Homo sapiens R-HSA-199991,54/420,9.540055e-08,2.919257e-05,0,0,2.146435,34.697504,APP;GCC1;USE1;GOLGA5;KIF13B;KIFAP3;VPS36;CTSC;...
5,Reactome_2016,"Platelet activation, signaling and aggregation...",38/253,1.53935e-07,3.925342e-05,0,0,2.507473,39.334065,CFD;APP;ITIH3;DGKB;PROS1;F13A1;PIK3CD;ARRB2;RA...
6,Reactome_2016,Signaling by Rho GTPases Homo sapiens R-HSA-19...,48/367,2.939688e-07,6.425318e-05,0,0,2.183476,32.83902,CYFIP2;TRIO;CDKN1B;NCF1;WIPF1;PKN3;WIPF2;SRF;F...


In [39]:
#get just the upa genes
luad_df = luad_enr.res2d
genes = luad_df.iloc[3,9]
genes = genes.split(';')

genes.remove("GRB2")

len(genes)
genes

['APP',
 'DOCK5',
 'DGKB',
 'PROS1',
 'F13A1',
 'PIK3CD',
 'ARRB2',
 'CLU',
 'AKAP10',
 'PPP2R5E',
 'LAMP2',
 'KIF13B',
 'RAC2',
 'KIFAP3',
 'JAK3',
 'HRAS',
 'SRGN',
 'GUCY1A2',
 'VWF',
 'PRKCB',
 'APLP2',
 'HGF',
 'SERPINF2',
 'ATP1B1',
 'APBB1IP',
 'FGR',
 'F9',
 'STIM1',
 'MMRN1',
 'LCK',
 'CEACAM6',
 'IRF7',
 'PRKCQ',
 'CD48',
 'PDE5A',
 'DAGLB',
 'TLN1',
 'PFN1',
 'ALDOA',
 'DOCK2',
 'CFD',
 'ITIH3',
 'AK3',
 'PRCP',
 'RASGRP2',
 'GNA14',
 'KLC3',
 'GNG2',
 'INPP5D',
 'PLCG2',
 'SPP2',
 'CSK',
 'CD74',
 'SERPIND1',
 'ANGPT1',
 'F12',
 'PTK2',
 'SELP',
 'CD2',
 'PROC',
 'P2RX1',
 'CD9',
 'F13B',
 'PTPN6',
 'HRG']

In [41]:
#filter down df with just upa genes 
pathway_names = []
for gene in genes:
    gene += "_proteomics"
    pathway_names.append(gene)

df_FDR_pathway = df_FDR_append[df_FDR_append.Comparison.isin(pathway_names)]
df_FDR_pathway

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
53,APBB1IP_proteomics,-0.483056,4.103970e-07,GBM
58,CSK_proteomics,-0.481565,4.507321e-07,GBM
73,PLCG2_proteomics,-0.472453,7.918099e-07,GBM
87,PIK3CD_proteomics,-0.466265,1.150323e-06,GBM
183,DOCK2_proteomics,-0.428841,9.481834e-06,GBM
...,...,...,...,...
8413,KLC3_proteomics,0.431755,2.483261e-06,Lscc
8495,KIF13B_proteomics,-0.377294,4.842222e-05,Lscc
8547,CD9_proteomics,0.354370,1.457314e-04,Lscc
8632,STIM1_proteomics,0.326801,4.929629e-04,Lscc


In [42]:
#Make plot using plot utils
p.plotCircleHeatMap(df_FDR_pathway, "P_value","Correlation","Comparison","Cancer Type",plot_width= 1000, plot_height = 650)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["size2"] = df[circle_var].apply(lambda x: -1*(np.log(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['size'] = (df["size2"])*3
