# Flagship vs Pancan Make Figure 4

This notebooks takes all the trans genes that have positive and negative results and runs a GSEA using Reactome. It then takes a subset of genes from the top hit (Hemostasis) pathway and maps them on a large circle heat map. This heatmap focuses on coagulation and urokinase related genes. The notebook does this with both the flagship and harmonzied paper

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas.util.testing as tm
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import plot_utils as p 


  import pandas.util.testing as tm


# Flagship Figure 4 

# Step 1: Find Trans proteins with opposite effects in different cancers 

Load df with all of the genes that are FDR significant. This dataframe was made in the Make_Supplemental_Tables notebook. See
https://github.com/PayneLab/WhenMutationsDontMatter/blob/master/EGFR/Make_Tables/Make_Supplemental_Tables.ipynb

In [3]:
FDR_sig = pd.read_csv("csv_files/Supplemental_Table_EGFR_sig_only.csv")
FDR_sig = FDR_sig.set_index("Comparison")



In [4]:
FDR_sig.max(axis=1)
FDR_sig.min(axis = 1)

Comparison
PHLDA1      9.649849e-19
CDH4        4.744022e-10
GRB2       -5.890078e-01
PHLDA3      2.351012e-14
GLA        -5.623150e-01
                ...     
IGLV2-23   -2.778010e-01
SCN10A     -3.648703e-01
PCYT1A      4.976210e-02
NCBP1       4.976210e-02
IGHV3-43   -2.451771e-01
Length: 6443, dtype: float64

In [5]:
def HasPosNeg(row):
    hasPos = False
    hasNeg= False

    for item in row:
        if pd.isnull(item):
            continue
        if item < 0:
            hasNeg = True
        if item > 0:
            hasPos = True
            
    if hasPos & hasNeg:
        return True
    return False

Subset data frame to include only trans genes that have opposite effects in different cancers by using apply function

In [7]:
col = ["Correlation_GBM","Correlation_ccRCC","Correlation_BR","Correlation_LUAD","Correlation_HNSCC","Correlation_LSCC"]
FDR_corr = FDR_sig[col]
FDR_corr["Pos_Neg"] = FDR_corr.apply(HasPosNeg, axis = 1)
FDR_corr_True = FDR_corr[FDR_corr['Pos_Neg']==True]
FDR_corr_True.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Correlation_GBM,Correlation_ccRCC,Correlation_BR,Correlation_LUAD,Correlation_HNSCC,Correlation_LSCC,Pos_Neg
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CTSC,-0.548303,,0.316126,0.28286,,,True
SHC1,-0.521169,0.361057,,,,,True
HSD17B11,-0.52163,,0.405826,,,,True
CTSB,-0.514005,,0.244299,,,,True
EPB41L3_3,-0.510933,,0.315277,,,,True
SCPEP1,-0.509046,-0.394109,0.384739,,,,True
DAB2,-0.500648,,0.41047,,,,True
OSTF1,-0.500063,-0.456002,0.30868,,,,True
PLA2G15,-0.498087,,0.271834,,,,True
SUB1,0.498468,-0.289319,-0.260239,,,,True


In [8]:
def Pvalue_sig(row):
    numSig = 0

    for item in row:
        if pd.isnull(item):
            continue
        if item < 0.05:
            numSig += 1
            
    return numSig

In [9]:
df = FDR_corr_True.drop("Pos_Neg",axis = 1)
diff = df.max(axis=1) - df.min(axis = 1)
diff.sort_values(ascending = False).head(20)

Comparison
CELSR1      1.085044
ARHGAP10    1.055842
RARA        1.014943
CRYBG3      0.967890
FAM89A      0.962051
MRTFB       0.954529
CA12        0.948078
ARFGEF3     0.946863
GLIPR2      0.944892
LPIN1       0.942117
CYRIA       0.938610
JPT2        0.935680
PLA2G4A     0.933300
HSD17B11    0.927456
CTNND2      0.921279
PREX1       0.917569
SLC9A3R1    0.911895
DAB2        0.911117
METTL26     0.900342
MPP1        0.897000
dtype: float64

In [10]:
abs_val = FDR_corr_True.abs()
abs_val.sum(1).sort_values(ascending = False).head(20)

Comparison
CELSR1     2.811137
PFKP       2.780631
CRYBG1     2.757146
FAM89A     2.682271
TANC2      2.673555
GMFG       2.669330
UPP1       2.540427
FERMT1     2.539596
CA12       2.537178
GPRC5C     2.535036
CLIC2      2.530128
JPT2       2.526792
RBM47      2.504163
SEMA4D     2.499493
KBTBD11    2.455456
CDK6       2.447587
ARRB2      2.446843
WIPF1      2.442668
ITGB1      2.441911
LCP1       2.440666
dtype: float64

The manuscript mentions 945 of trans proteins that opposite effects in different cancers. Here is the derivation of that number 

In [11]:
pos_neg_prot = FDR_corr_True.index.tolist()
pos_neg_prot
#print("Total number of trans proteins with opposite effects in different cancers is " + str(len(pos_neg_prot)))

['CTSC',
 'SHC1',
 'HSD17B11',
 'CTSB',
 'EPB41L3_3',
 'SCPEP1',
 'DAB2',
 'OSTF1',
 'PLA2G15',
 'SUB1',
 'RIN2',
 'PPP2R3A',
 'MTARC2',
 'SIPA1L3',
 'CLIC2',
 'CTSL',
 'ABI3',
 'CHDH',
 'DOCK11',
 'SVIL',
 'GSTO1',
 'FCGR2A',
 'NIBAN2',
 'TES',
 'GBA',
 'SNAPIN',
 'COL1A2',
 'COL1A1',
 'PHLDB2',
 'ALDH7A1',
 'CTNND2',
 'RAB3IL1',
 'CDKN2C',
 'COL6A3',
 'TMEM87B',
 'EML4',
 'NPC2',
 'ENG',
 'STAMBPL1',
 'EHBP1L1',
 'APBB1IP',
 'ITGB1',
 'KYNU',
 'PTER',
 'TNS3',
 'PCMTD2',
 'RGS12',
 'DOK3',
 'VAMP8',
 'FBN1',
 'EMB',
 'APC',
 'ACSL1',
 'FHOD1',
 'OTULIN',
 'CAPZA1',
 'ARHGAP10',
 'LIG4',
 'FHL2',
 'TNFAIP8L2',
 'TWF2',
 'TNFAIP8',
 'LUZP1',
 'FBXL17',
 'GMFG',
 'PLCG2',
 'PPM1F',
 'RALGAPB',
 'SYK',
 'WIPF1',
 'IGKV3-20',
 'DAPK3',
 'SULF1',
 'NOVA1',
 'CTSS',
 'DLGAP4',
 'HCK',
 'DNAJC21',
 'CHST3',
 'PAPSS2',
 'LETM1',
 'TPD52L2',
 'MAP3K3',
 'TUT7',
 'DIAPH2',
 'RASSF5',
 'PLIN2',
 'STAT6',
 'SDR39U1',
 'AMPD3',
 'CDK6',
 'LAIR1',
 'MANBA',
 'UAP1',
 'ILK',
 'CD14',
 'PIK3AP1',
 'E

# Run GSEA

In [14]:
pos_neg_enr = gp.enrichr(gene_list = pos_neg_prot, description='Tumor_partition', gene_sets='Reactome_2016')
pos_neg_enr.res2d.head(10)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,Reactome_2016,Fcgamma receptor (FCGR) dependent phagocytosis...,23/120,5.116037e-09,2e-06,0,0,5.017975,95.79759,LYN;ACTR3;HSP90AA1;AHCYL1;SYK;WIPF1;WIPF2;IGKV...
1,Reactome_2016,Innate Immune System Homo sapiens R-HSA-168249,75/807,5.116718e-09,2e-06,0,0,2.224755,42.472249,GSK3A;AHCYL1;WIPF1;WIPF2;ARRB2;C8A;CLU;FGF2;CT...
2,Reactome_2016,"Platelet activation, signaling and aggregation...",35/253,5.957088e-09,2e-06,0,0,3.421811,64.804598,SERPINA1;SHC1;DGKA;PLEK;F13A1;A1BG;ARRB2;RASGR...
3,Reactome_2016,Immune System Homo sapiens R-HSA-168256,119/1547,1.509957e-08,4e-06,0,0,1.836454,33.071972,AHCYL1;NCF1;NUP188;WIPF1;WIPF2;NCF4;CLU;FGF2;N...
4,Reactome_2016,FCGR activation Homo sapiens R-HSA-2029481,14/49,2.579549e-08,6e-06,0,0,8.408389,146.92033,LYN;SYK;IGKV1-5;IGLV2-11;HCK;FCGR3A;IGHG1;IGHG...
5,Reactome_2016,Extracellular matrix organization Homo sapiens...,36/283,3.363738e-08,6e-06,0,0,3.10508,53.431061,DDR1;ITGB1;LAMA5;COL15A1;SDC4;ITGB5;COL11A1;CO...
6,Reactome_2016,Binding and Uptake of Ligands by Scavenger Rec...,16/74,1.929421e-07,3e-05,0,0,5.804699,89.745737,HSP90AA1;CD163;IGKV1-5;IGLV2-11;COL1A1;COL3A1;...
7,Reactome_2016,Collagen formation Homo sapiens R-HSA-1474290,17/85,2.66734e-07,3.4e-05,0,0,5.263566,79.674671,COL15A1;CRTAP;COL11A1;COL12A1;CTSS;LOXL2;COL1A...
8,Reactome_2016,Platelet degranulation Homo sapiens R-HSA-114608,19/105,2.816681e-07,3.4e-05,0,0,4.657435,70.245932,SRGN;FGB;TGFB1;SERPINA1;TGFB3;SERPINF2;PLEK;F1...
9,Reactome_2016,Hemostasis Homo sapiens R-HSA-109582,53/552,3.541015e-07,3.8e-05,0,0,2.276278,33.811111,ITGB1;CD84;SERPINA1;DGKA;PLEK;F13A1;ARRB2;CLU;...


In [17]:
pos_neg_df = pos_neg_enr.res2d
fcgr = pos_neg_df.iloc[2,9]
fcgr = fcgr.split(';')


In [None]:
#get just the clotting cascade genes and add urokinase genes 
pos_neg_df = pos_neg_enr.res2d
coag = pos_neg_df.iloc[2,9]
coag = coag.split(';')
upa = ["F3","PLAUR","PLAU","PLG","MMP9","MMP12","SERPINE1"]
coag_upa =  coag + upa
len(coag_upa)

# Step 3 Make Data frame for Figure 4

In [20]:
#Get append version of the df with all cancer type, fdr sig trans results
df_FDR_append = pd.read_csv("csv_files/sig_prot_heatmap_EGFR.csv")
 
#coag_upa =  ["F2""F3","F9","F10","F11","F13A1","PLAUR","PLAU","PLG","MMP9","MMP12","SERPINE1"]
#subset dataframe to include genes only desired for figure 
df_FDR_append= df_FDR_append[df_FDR_append.Comparison.isin(fcgr)]
df_FDR_append

Unnamed: 0,Comparison,Correlation,P_Value,Cancer
12,SHC1,-0.521169,0.000022,GBM
137,APBB1IP,-0.419095,0.001191,GBM
243,PLCG2,-0.390149,0.002829,GBM
248,SYK,-0.389025,0.002931,GBM
402,PFN1,-0.361363,0.006317,GBM
...,...,...,...,...
8633,PLA2G4A,-0.324576,0.013817,ccRCC
8733,TMSB4X,-0.310140,0.019902,ccRCC
8803,PLCG2,-0.302984,0.023092,ccRCC
8879,SYK,-0.293638,0.028987,ccRCC


Set add new column to be unique index and order the new index. This way genes will be grouped by coagulation factors, regulators, and urokinase genes.

In [22]:

df_FDR_append["Index"] = df_FDR_append["Comparison"] + " " + df_FDR_append["Cancer"]
'''
df_FDR_append = df_FDR_append.set_index("Index")
df_ordered = df_FDR_append.reindex(["F2 GBM","F3 GBM","F9 GBM","F10 GBM","F11 GBM","F13A1 GBM","F13B GBM","KLKB1 GBM","VWF CO","FGA GBM","FGB GBM","FGG GBM","SERPINC1 GBM", "SERPIND1 GBM","SERPING1 GBM","A2M GBM","PROS1 GBM","PROC OV","PROCR GBM","THBD GBM","KNG1 GBM","PLAUR GBM","PLAU GBM","PLG GBM","MMP9 BR","MMP12 BR","SERPINE1 GBM",
                                "F2 BR","F9 BR","F10 BR","F11 BR","F13A1 BR","F13B BR","FGA BR","FGB BR","FGG BR", "SERPIND1 BR","SERPING1 BR","A2M BR","PROS1 BR","PROCR BR","KLKB1 BR", "PLAUR BR","PLAU BR","PLG BR","SERPINE1 BR",
                               "VWF HNSCC","THBD HNSCC","PLAUR HNSCC","PLAU HNSCC","SERPINE1 HNSCC",
                               "F9 LUAD","F13A1 LUAD", "F13B LUAD", "SERPIND1 LUAD","PROS1 LUAD","PROC LUAD","VWF LUAD",
                                "PROCR ccRCC",
                                "SERPIND1 OV","PROC OV",
                               "F3 CO","SERPINC1 CO", "SERPIND1 CO","A2M CO","KNG1 CO","KLKB1 CO"])
'''
df_FDR_append


Unnamed: 0,Comparison,Correlation,P_Value,Cancer,Index
12,SHC1,-0.521169,0.000022,GBM,SHC1 GBM
137,APBB1IP,-0.419095,0.001191,GBM,APBB1IP GBM
243,PLCG2,-0.390149,0.002829,GBM,PLCG2 GBM
248,SYK,-0.389025,0.002931,GBM,SYK GBM
402,PFN1,-0.361363,0.006317,GBM,PFN1 GBM
...,...,...,...,...,...
8633,PLA2G4A,-0.324576,0.013817,ccRCC,PLA2G4A ccRCC
8733,TMSB4X,-0.310140,0.019902,ccRCC,TMSB4X ccRCC
8803,PLCG2,-0.302984,0.023092,ccRCC,PLCG2 ccRCC
8879,SYK,-0.293638,0.028987,ccRCC,SYK ccRCC


# Step 4: Plot Figure 4

In [25]:
legend_min = df_FDR_append["P_Value"].min()
#Make plot using plot utils
p.plotCircleHeatMap(df_FDR_append, circle_var = "P_Value",color_var = "Correlation", x_axis = "Comparison", y_axis = "Cancer", plot_width= 700, plot_height = 500, legend_min = legend_min, legend_max = 0.05, font_size = 10, show_legend = True)

# Check if blanks are due to no data 

The follow code chunks show that the following cancers/genes don't have data: colon THBD, Kidney MMP12, and Ovarian MMP12. (As mentioned in EGFR Figure 2 legend)

In [None]:
#Get append version of the df with all proteins 
df_all_prot_append = pd.read_csv("Make_Tables/csv_files/all_prot_heatmap_EGFR.csv")
df_all_prot_append 


In [None]:
#subset dataframe to include genes only desired for figure 
df_all_comp_coag = df_all_prot_append[df_all_prot_append.Comparison.isin(coag_upa)]
print("Number of rows in data frame " + str(len(df_all_comp_coag)))


Our figure includes 27 genes for 8 cancers. If all data was present there would be 216 rows. However, the data frame only has 212 rows. 4 genes are missing. 

In [None]:
def find_missing_genes(test_list, full_list):
    for gene in full_list:
        if (gene not in test_list):
            print(gene)
        

In [None]:
#Get list of genes for colon, kidney, and ovarian
colon = df_all_prot_append[df_all_prot_append["Cancer"] == "CO"]
colon_list = colon.Comparison.to_list()

Kidney = df_all_prot_append[df_all_prot_append["Cancer"] == "ccRCC"]
Kidney_list = Kidney.Comparison.to_list()

Ovarian = df_all_prot_append[df_all_prot_append["Cancer"] == "OV"]
Ovarian_list = Ovarian.Comparison.to_list()

In [None]:
#Show the 3 missing genes 
print("Ovarian missing genes: ")
find_missing_genes(Ovarian_list, coag_upa)
print("Kidney missing genes: ")
find_missing_genes(Kidney_list, coag_upa)
print("Colon missing genes: ")
find_missing_genes(colon_list, coag_upa)