# Make Figure 4

This notebooks takes all the trans genes that have positive and negative results and runs a GSEA using Reactome. It then takes a subset of genes from the top hit (Hemostasis) pathway and maps them on a large circle heat map. This heatmap focuses on coagulation and urokinase related genes. 

In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas.util.testing as tm
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import plot_utils as p 
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math as math
import scipy.stats
import re
import sys
import statsmodels.stats.multitest
from bokeh.palettes import RdBu
from bokeh.models import LinearColorMapper, ColumnDataSource, ColorBar
from bokeh.models.ranges import FactorRange
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, export_png, export_svgs
from bokeh.layouts import row


In [42]:

def plotCircleHeatMap ( df, circle_var, color_var, x_axis, y_axis, plot_width= 1000, plot_height = 650, font_size = 12,    x_axis_lab = "no_label", y_axis_lab = "no_label", show_plot = True, save_png = "plot.png", legend_min = 1e-6, legend_med = .0001, legend_max = 0.01, show_legend = True):
  
    # circle_var designed for pvalues. Normalized by taking log 10 of values and multiplying by 5 
    #added a new column to make the plot size
    
    df["size2"] = df[circle_var].apply(lambda x: -1*(np.log(x)))
    df['size'] = (df["size2"])*3
    #find values to set color bar min/ max as 
    maxval = df[color_var].max()
    minval = df[color_var].min()
    if maxval > abs(minval):
        minval = maxval * -1 
    if maxval < abs(minval):
        maxval = minval * -1
    colors = list((RdBu[9]))
    exp_cmap = LinearColorMapper(palette=colors, low = minval, high = maxval)
    p = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width= plot_width, 
               plot_height=plot_height, 
               toolbar_location=None, tools="hover")

    p.scatter(x_axis,y_axis,source=df, fill_alpha=1,  line_width=0, size="size", 
              fill_color={"field":color_var, "transform":exp_cmap})

    #p.x_range.factors = sorted(df[x_axis].unique().tolist())
    p.x_range.factors = df[x_axis].unique().tolist()
    p.y_range.factors = sorted(df[y_axis].unique().tolist(), reverse = True)
    p.xaxis.major_label_orientation = math.pi/2
    
    # font size
    p.axis.major_label_text_font_size = str(font_size)+"pt"
    p.axis.axis_label_text_font_size = str(font_size)+"pt"
    
    if (x_axis_lab != "no_label" ):
        p.xaxis.axis_label = x_axis_lab
    if (y_axis_lab != "no_label" ):   
        p.yaxis.axis_label = y_axis_lab

    bar = ColorBar(color_mapper=exp_cmap, location=(0,0))
    p.add_layout(bar, "right")
    
    if show_plot:  
        if show_legend:
            # Create Circle Legend
            circle_legend = create_circle_legend(circle_var, color_var, legend_min, legend_med, legend_max)
            circle_legend.axis.major_label_text_font_size = str(font_size - 1)+"pt" # font size
            output_notebook()
            show(row(p, circle_legend))
        else:
            output_notebook()
            show(p)
      
    if save_png != "plot.png":
        export_png(row(p, circle_legend), filename= save_png)
        
    return p
         

'''
@Param legend_min: Float. Lowest p-value to include in the legend.
@Param legend_max: Float. Highest p-value to include in the legend.
@Param color_var: Str. Used for column name (same as other df)

Returns: df to be used in creating the circle legend. 
'''

def create_circle_legend_df(color_var, legend_min, legend_med, legend_max):
    # Find middle pval
    
    #exp_min = abs(np.log10(legend_min))
    #exp_max = abs(np.log10(legend_max))
    #delta = (exp_min - exp_max) / 2 
    #exp_mid = -1 * (exp_max + delta)
    #pval2 = 1 * 10**exp_mid
    
    # Foramat scientific notation pvals as strings for y_axis labels  
    max_str = "{:.1e}".format(legend_max, '.2f')
    med_str = "{:.1e}".format(legend_med, '.2f')
    min_str = "{:.1e}".format(legend_min, '.2f')
    
    # max to min
    data = {'P_Value':  [legend_max, legend_med, legend_min],
            'y_axis': [max_str, med_str, min_str],
            'x_axis': ['', '', ''],
            color_var: [1.5, 1.5, 1.5]}

    fake_df = pd.DataFrame (data, columns = ['x_axis', 'y_axis', 'P_Value', color_var])
    
    fake_df["size2"] = fake_df['P_Value'].apply(lambda x: -1*(np.log(x)))
    fake_df['size'] = (fake_df["size2"])*3
    
    return fake_df

'''
@Param df: Dataframe. Same as df passed to plotCircleHeatMap.
@Param circle_var: Column Label. Same as passed to plotCircleHeatMap.
@Param color_var: Column Label. Same as passed to plotCircleHeatMap.
@Param x_axis: Column Label. Used on the x-axis.
@Param y_axis: Column Label. Used on the y-axis.
@Param legend_min: Float. Lowest p-value to include in the legend.
@Param legend_max: Float. Highest p-value to include in the legend.

Returns: df to be used in creating the circle legend. 
'''

def create_circle_legend(circle_var, color_var, legend_min, legend_med, legend_max,
                         x_axis = 'x_axis', y_axis = 'y_axis', 
                         plot_height = 200, plot_width = 140):
    # Get customized df
    circle_df = create_circle_legend_df(color_var, legend_min, legend_med, legend_max)
    
    maxval = circle_df[color_var].max()
    minval = circle_df[color_var].min()
    if maxval > abs(minval):
        minval = maxval * -1 
    if maxval < abs(minval):
        maxval = minval * -1
    colors = list((RdBu[9]))
    exp_cmap = LinearColorMapper(palette=colors, low = minval, high = maxval)
    
    circle = figure(x_range = FactorRange(), y_range = FactorRange(), plot_width= plot_width, 
               plot_height=plot_height, toolbar_location=None, tools="hover")

    circle.scatter(x_axis, y_axis, source = circle_df, fill_alpha=1,  line_width=0, size="size", 
              fill_color={"field":color_var, "transform":exp_cmap})
    
    circle.x_range.factors = sorted(circle_df[x_axis].unique().tolist())
    circle.y_range.factors = circle_df[y_axis].unique().tolist() # plots in reverse order of circle_df (max to min)
    circle.xaxis.major_label_orientation = math.pi/2
    
    circle.xaxis.axis_label = 'FDR p-value'
    
    return circle



# Step 1: Find Trans proteins with opposite effects in different cancers 

Load df with all of the genes that are FDR significant. This dataframe was made in the Make_Supplemental_Tables notebook. See
https://github.com/PayneLab/WhenMutationsDontMatter/blob/master/EGFR/Make_Tables/Make_Supplemental_Tables.ipynb

In [2]:
FDR_sig = pd.read_csv("Make_Tables/csv_files/Supplemental_Table_EGFR_sig_only.csv")
FDR_sig = FDR_sig.set_index("Comparison")



In [3]:
FDR_sig.max(axis=1)
FDR_sig.min(axis = 1)

Comparison
PHLDA1    3.507071e-21
GRB2     -6.108891e-01
SOCS2     3.420388e-06
CDH4      3.420388e-06
DAB2     -5.564015e-01
              ...     
CLTC      4.813589e-02
PLEC      4.824560e-02
LRRK2    -2.674570e-01
MBD1     -2.660975e-01
RRP12     4.993781e-02
Length: 6230, dtype: float64

In [4]:
def HasPosNeg(row):
    hasPos = False
    hasNeg= False

    for item in row:
        if pd.isnull(item):
            continue
        if item < 0:
            hasNeg = True
        if item > 0:
            hasPos = True
            
    if hasPos & hasNeg:
        return True
    return False

Subset data frame to include only trans genes that have opposite effects in different cancers by using apply function

In [5]:
col = ["Correlation_GBM","Correlation_ccRCC","Correlation_OV","Correlation_BR","Correlation_LUAD","Correlation_HNSCC","Correlation_LSCC","Correlation_CO"]
FDR_corr = FDR_sig[col]
FDR_corr["Pos_Neg"] = FDR_corr.apply(HasPosNeg, axis = 1)
FDR_corr_True = FDR_corr[FDR_corr['Pos_Neg']==True]
FDR_corr_True.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Correlation_GBM,Correlation_ccRCC,Correlation_OV,Correlation_BR,Correlation_LUAD,Correlation_HNSCC,Correlation_LSCC,Correlation_CO,Pos_Neg
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DAB2,-0.556402,,,0.326055,,,,,True
PLA2G15,-0.556624,-0.298029,,0.274185,,,,,True
CTSC,-0.546285,-0.302316,,0.26694,0.30276,,,,True
SCPEP1,-0.531494,-0.386583,,0.399187,,,,,True
FAM129B,-0.514984,,,0.344093,,,0.360092,,True
PPP1R18,-0.497202,,,0.359142,,,,,True
NPC2,-0.498791,-0.319133,,0.279599,0.29252,,,,True
CTSB,-0.496895,,,0.341048,,,,,True
KYNU,-0.495517,,,0.373575,-0.341363,,,,True
HSD17B11,-0.491843,0.272218,,0.481667,,-0.402146,,,True


In [6]:
def Pvalue_sig(row):
    numSig = 0

    for item in row:
        if pd.isnull(item):
            continue
        if item < 0.05:
            numSig += 1
            
    return numSig

In [7]:
df = FDR_corr_True.drop("Pos_Neg",axis = 1)
diff = df.max(axis=1) - df.min(axis = 1)
diff.sort_values(ascending = False).head(20)

Comparison
AADAT       1.053837
EHBP1       0.976850
ARHGAP10    0.973586
HSD17B11    0.973510
RARA        0.959860
CMBL        0.954009
CELSR1      0.950254
PPP2R3A     0.949914
HAAO        0.944973
TBC1D10C    0.941293
CTNND2      0.938110
FAM49A      0.934342
LPIN1       0.933992
SCPEP1      0.930681
ACSL4       0.930283
TES         0.921650
DSC2        0.919490
GLIPR2      0.917386
CXXC5       0.917084
CRYBG3      0.914881
dtype: float64

In [8]:
abs_val = FDR_corr_True.abs()
abs_val.sum(1).sort_values(ascending = False).head(20)

Comparison
MYO10       3.264387
KIF13B      3.140698
CD109       3.068188
IL16        2.949050
CGGBP1      2.940795
RCSD1       2.912306
CNNM4       2.851991
PLCG2       2.845178
BAG2        2.837454
RIN3        2.834336
BIN2        2.806953
SDC1        2.803257
WIPF1       2.779699
ITGB1       2.743227
MICALL1     2.735946
PSTPIP1     2.734371
CELSR1      2.669411
TRIM26      2.661191
HSD17B11    2.647874
ALDH1L1     2.639734
dtype: float64

The manuscript mentions 945 of trans proteins that opposite effects in different cancers. Here is the derivation of that number 

In [15]:
pos_neg_prot = FDR_corr_True.index.tolist()
pos_neg_prot
#print("Total number of trans proteins with opposite effects in different cancers is " + str(len(pos_neg_prot)))

['DAB2',
 'PLA2G15',
 'CTSC',
 'SCPEP1',
 'FAM129B',
 'PPP1R18',
 'NPC2',
 'CTSB',
 'KYNU',
 'HSD17B11',
 'SVIL',
 'SH3BP1',
 'APBB1IP',
 'COL6A2',
 'PPP2R3A',
 'CTSZ',
 'HAAO',
 'THEMIS2',
 'DOCK11',
 'FCGRT',
 'CD14',
 'PLCG2',
 'PROCR',
 'GBA',
 'UAP1',
 'CTNND2',
 'GMIP',
 'RASAL3',
 'REL',
 'DNAJC21',
 'EHBP1L1',
 'RAB3IL1',
 'FCGR2A',
 'CTSL',
 'RIN2',
 'RGS12',
 'PHLDB2',
 'FCGR3A',
 'MYO10',
 'LIG4',
 'ST6GAL1',
 'ACSL1',
 'GPSM3',
 'CTSS',
 'CD300A',
 'GMFG',
 'TES',
 'CXXC5',
 'RCSD1',
 'NEDD4',
 'OTULIN',
 'DOK3',
 'TPD52L2',
 'CHDH',
 'PIK3AP1',
 'ARPC1B',
 'TNS3',
 'FHL2',
 'CDKN2C',
 'IL16',
 'UPP1',
 'HCK',
 'FCGR2B',
 'SERPINB8',
 'CLDN10',
 'PLAUR',
 'TMED5',
 'WIPF1',
 'ARPC3',
 'AHR',
 'DPYD',
 'LUZP1',
 'MYO1G',
 'RASSF1',
 'RALGAPA1',
 'NOVA1',
 'ACSS1',
 'SLFN5',
 'FLNB',
 'SERPINB2',
 'EVI2B',
 'MRC1',
 'FHOD1',
 'EHD4',
 'NFKB2',
 'RHBDF2',
 'LRRC25',
 'ARHGAP10',
 'ACSL4',
 'ITGB1',
 'ZNF185',
 'P2RX4',
 'LPXN',
 'MYOF',
 'CAV1',
 'CRYBG1',
 'ARPC2',
 'FGB',
 '

# Run GSEA

In [13]:
pos_neg_enr = gp.enrichr(gene_list = pos_neg_prot, description='Tumor_partition', gene_sets='Reactome_2016')
#pos_neg_enr.res2d.head(5)

KeyError: 'userListId'

In [None]:
#get just the clotting cascade genes and add urokinase genes 
pos_neg_df = pos_neg_enr.res2d
coag = pos_neg_df.iloc[2,9]
coag = coag.split(';')
upa = ["F3","PLAUR","PLAU","PLG","MMP9","MMP12","SERPINE1"]
coag_upa =  coag + upa
len(coag_upa)

# Step 3 Make Data frame for Figure 4

In [19]:
#Get append version of the df with all cancer type, fdr sig trans results
df_FDR_append = pd.read_csv("Make_Tables/csv_files/sig_prot_heatmap_EGFR.csv")
 
coag_upa =  ["F2""F3","F9","F10","F11","F13A1","PLAUR","PLAU","PLG","MMP9","MMP12","SERPINE1"]
#subset dataframe to include genes only desired for figure 
df_FDR_append= df_FDR_append[df_FDR_append.Comparison.isin(coag_upa)]
df_FDR_append

Unnamed: 0,Comparison,Correlation,P_Value,Cancer
198,PLAUR,-0.425639,0.0006047225,GBM
504,F13A1,-0.371427,0.003256892,GBM
532,F10,-0.367388,0.003684421,GBM
1000,F9,-0.314237,0.01641199,GBM
1095,SERPINE1,-0.306814,0.0196221,GBM
1153,F11,-0.301704,0.02232032,GBM
1404,PLAU,-0.278913,0.03947268,GBM
1471,PLG,-0.274098,0.04390179,GBM
1646,PLAU,0.451865,0.0001480995,HNSCC
1665,SERPINE1,0.44059,0.0002212241,HNSCC


Set add new column to be unique index and order the new index. This way genes will be grouped by coagulation factors, regulators, and urokinase genes.

In [20]:

df_FDR_append["Index"] = df_FDR_append["Comparison"] + " " + df_FDR_append["Cancer"]
df_FDR_append = df_FDR_append.set_index("Index")
df_ordered = df_FDR_append.reindex(["F2 GBM","F3 GBM","F9 GBM","F10 GBM","F11 GBM","F13A1 GBM","F13B GBM","KLKB1 GBM","VWF CO","FGA GBM","FGB GBM","FGG GBM","SERPINC1 GBM", "SERPIND1 GBM","SERPING1 GBM","A2M GBM","PROS1 GBM","PROC OV","PROCR GBM","THBD GBM","KNG1 GBM","PLAUR GBM","PLAU GBM","PLG GBM","MMP9 BR","MMP12 BR","SERPINE1 GBM",
                                "F2 BR","F9 BR","F10 BR","F11 BR","F13A1 BR","F13B BR","FGA BR","FGB BR","FGG BR", "SERPIND1 BR","SERPING1 BR","A2M BR","PROS1 BR","PROCR BR","KLKB1 BR", "PLAUR BR","PLAU BR","PLG BR","SERPINE1 BR",
                               "VWF HNSCC","THBD HNSCC","PLAUR HNSCC","PLAU HNSCC","SERPINE1 HNSCC",
                               "F9 LUAD","F13A1 LUAD", "F13B LUAD", "SERPIND1 LUAD","PROS1 LUAD","PROC LUAD","VWF LUAD",
                                "PROCR ccRCC",
                                "SERPIND1 OV","PROC OV",
                               "F3 CO","SERPINC1 CO", "SERPIND1 CO","A2M CO","KNG1 CO","KLKB1 CO"])



# Step 4: Plot Figure 4

In [43]:
legend_min = df_ordered["P_Value"].min()
#Make plot using plot utils
plotCircleHeatMap(df_FDR_append, circle_var = "P_Value",color_var = "Correlation", x_axis = "Comparison", y_axis = "Cancer", plot_width= 700, plot_height = 500, legend_min = legend_min, legend_max = 0.05, font_size = 22, show_legend = True , save_png = "png_files/Figure4.png")

# Check if blanks are due to no data 

The follow code chunks show that the following cancers/genes don't have data: colon THBD, Kidney MMP12, and Ovarian MMP12. (As mentioned in EGFR Figure 2 legend)

In [None]:
#Get append version of the df with all proteins 
df_all_prot_append = pd.read_csv("Make_Tables/csv_files/all_prot_heatmap_EGFR.csv")
df_all_prot_append 


In [None]:
#subset dataframe to include genes only desired for figure 
df_all_comp_coag = df_all_prot_append[df_all_prot_append.Comparison.isin(coag_upa)]
print("Number of rows in data frame " + str(len(df_all_comp_coag)))


Our figure includes 27 genes for 8 cancers. If all data was present there would be 216 rows. However, the data frame only has 212 rows. 4 genes are missing. 

In [None]:
def find_missing_genes(test_list, full_list):
    for gene in full_list:
        if (gene not in test_list):
            print(gene)
        

In [None]:
#Get list of genes for colon, kidney, and ovarian
colon = df_all_prot_append[df_all_prot_append["Cancer"] == "CO"]
colon_list = colon.Comparison.to_list()

Kidney = df_all_prot_append[df_all_prot_append["Cancer"] == "ccRCC"]
Kidney_list = Kidney.Comparison.to_list()

Ovarian = df_all_prot_append[df_all_prot_append["Cancer"] == "OV"]
Ovarian_list = Ovarian.Comparison.to_list()

In [None]:
#Show the 3 missing genes 
print("Ovarian missing genes: ")
find_missing_genes(Ovarian_list, coag_upa)
print("Kidney missing genes: ")
find_missing_genes(Kidney_list, coag_upa)
print("Colon missing genes: ")
find_missing_genes(colon_list, coag_upa)