# Make EGFR Figure 2

This notebook takes all the trans genes that have both positive and negative results across cancers and runs an enrichment analysis using Reactome. It then takes the genes from the Formation of Fibrin Clot (Clotting Cascade) term, which falls under the top hit (Hemostasis), adds urokinase-related genes, and maps them on a large circle heat map. This heatmap focuses on coagulation and urokinase related genes. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas.util.testing as tm
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
import cptac.utils as u
import plot_utils as p 

  import pandas.util.testing as tm


# Step 1: Find Trans genes with opposite effects in different cancers 

Load df with all of the genes that are FDR significant.

In [2]:
# Load the FDR-significant correlation table and tidy it in one pass.
FDR_sig = (
    pd.read_csv("Pval_corr_table_Fig_2_return_sig.csv")
    .replace(to_replace="_proteomics", value="", regex=True)  # Remove proteomics labeling
    .drop(columns=["Unnamed: 0"])
    .set_index("Comparison")
    .iloc[1:]  # Drop EGFR (positional: assumes EGFR is the first row -- TODO confirm)
)
FDR_sig

Unnamed: 0_level_0,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_Lscc,P_value_Lscc,Correlation_Colon,P_value_Colon
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PHLDA1,0.816848,3.507071e-21,,,,,0.364797,0.002164,,,0.664271,8.888640e-12,0.713420,2.644826e-14,,
GRB2,-0.610889,6.729990e-08,,,,,,,-0.302439,0.020631,-0.532341,3.320092e-06,,,,
CDH4,0.559180,3.420388e-06,,,,,,,,,,,,,,
PLA2G15,-0.556624,3.420388e-06,-0.298029,0.02216,,,0.274185,0.016061,,,,,,,,
SOCS2,0.562720,3.420388e-06,,,,,,,,,,,0.472624,1.417921e-02,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCDC6,,,,,,,,,,,,,,,0.287735,0.049589
SSR2,,,,,,,,,,,,,,,0.301814,0.049589
PSMB1,,,,,,,,,,,,,,,0.287839,0.049589
MRPL39,,,,,,,,,,,,,,,-0.288110,0.049589


In [3]:
def HasPosNeg(row):
    """Report whether ``row`` holds both a positive and a negative value.

    Null entries (as judged by ``pd.isnull``) are skipped, and zeros count
    as neither positive nor negative.

    Parameters
    ----------
    row : iterable
        Values to inspect, e.g. one row of correlation coefficients.

    Returns
    -------
    bool
        True only if at least one value is > 0 and at least one is < 0.
    """
    found_positive = False
    found_negative = False

    for value in row:
        if not pd.isnull(value):
            if value < 0:
                found_negative = True
            if value > 0:
                found_positive = True

    return found_positive and found_negative

Subset data frame to include only trans genes that have opposite effects in different cancers by using apply function

In [4]:
# Correlation columns, one per cancer type, used to look for sign flips.
col = ["Correlation_Gbm","Correlation_kidney","Correlation_Ovar","Correlation_Brca","Correlation_Luad","Correlation_hnscc","Correlation_Lscc","Correlation_Colon"]

# Take an explicit copy: adding a column to the bare slice FDR_sig[col]
# triggers pandas' SettingWithCopyWarning.
FDR_corr = FDR_sig[col].copy()

# Flag each gene (row) that has both a positive and a negative correlation.
FDR_corr["Pos_Neg"] = FDR_corr.apply(HasPosNeg, axis = 1)

# Keep only the flagged genes.
FDR_corr_True = FDR_corr[FDR_corr["Pos_Neg"]]
FDR_corr_True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Correlation_Gbm,Correlation_kidney,Correlation_Ovar,Correlation_Brca,Correlation_Luad,Correlation_hnscc,Correlation_Lscc,Correlation_Colon,Pos_Neg
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
PLA2G15,-0.556624,-0.298029,,0.274185,,,,,True
DAB2,-0.556402,,,0.326055,,,,,True
CTSC,-0.546285,-0.302316,,0.266940,0.30276,,,,True
SCPEP1,-0.531494,-0.386583,,0.399187,,,,,True
FAM129B,-0.514984,,,0.344093,,,0.360092,,True
...,...,...,...,...,...,...,...,...,...
MUC5B,,,,,,-0.283446,,0.332804,True
HVCN1,,,,,,-0.267908,,0.313116,True
ANP32E,,,,,,,-0.313115,0.343896,True
SLC25A6,,,,,,,0.304578,-0.429471,True


# Step 2: Run GSEA 

In [5]:
# Gene names (the frame's index) of trans genes with opposite effects
# in different cancers, collected into a plain list.
pos_neg_prot = list(FDR_corr_True.index)
len(pos_neg_prot)

945

Run the enrichment analysis (gseapy's Enrichr interface) using the Reactome 2016 gene set

In [6]:
# Enrichment of the opposite-effect gene list against the Reactome 2016 library.
# NOTE(review): gp.enrichr presumably queries the Enrichr web service and writes
# its reports under `outdir` -- confirm against the installed gseapy version.
pos_neg_enr = gp.enrichr(
    gene_list=pos_neg_prot,
    description="Tumor_partition",
    gene_sets="Reactome_2016",
    outdir="test/enrichr_Reactome",
)
pos_neg_enr.res2d.head(5)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,Reactome_2016,Hemostasis Homo sapiens R-HSA-109582,80/552,2.1650839999999998e-19,3.312578e-16,0,0,3.067249,131.82013,ITGB1;DOCK5;ITGAM;DGKB;DGKA;PROS1;ITGB3;SERPIN...
1,Reactome_2016,Innate Immune System Homo sapiens R-HSA-168249,98/807,4.949186e-18,3.786127e-15,0,0,2.570104,102.411734,AHCYL1;WIPF1;WIPF2;PROS1;ARAF;ICAM3;FGF1;CLU;R...
2,Reactome_2016,Formation of Fibrin Clot (Clotting Cascade) Ho...,20/39,7.420408000000001e-17,3.784408e-14,0,0,10.853344,403.090083,FGB;FGA;VWF;F10;SERPIND1;SERPINC1;PROS1;FGG;F1...
3,Reactome_2016,Immune System Homo sapiens R-HSA-168256,145/1547,5.663493e-16,2.166286e-13,0,0,1.983699,69.642373,AHCYL1;NCF1;NCF2;WIPF1;PROS1;WIPF2;NCF4;ARAF;I...
4,Reactome_2016,Response to elevated platelet cytosolic Ca2+ H...,28/110,1.385496e-13,4.239617e-11,0,0,5.387205,159.501943,ITIH4;PROS1;ITGB3;SERPINE1;F13A1;PLG;A1BG;CLU;...


In [7]:
#get just the clotting cascade genes and add urokinase genes 
pos_neg_df = pos_neg_enr.res2d

# Select the clotting-cascade term by name rather than by row/column position
# (previously iloc[2, 9]), so a change in the enrichment ranking cannot
# silently pick a different pathway.
clotting_term = "Formation of Fibrin Clot (Clotting Cascade)"
clotting_rows = pos_neg_df[pos_neg_df["Term"].str.startswith(clotting_term, na=False)]
if clotting_rows.empty:
    raise ValueError("Term starting with '" + clotting_term + "' not found in enrichment results")

# The Genes column is a ';'-separated string of the overlapping gene symbols.
coag = clotting_rows["Genes"].iloc[0].split(';')

# Urokinase-related genes appended by hand.
upa = ["PLAUR","PLAU","PLG","MMP9","MMP12","SERPINE1"]
coag_upa =  coag + upa
len(coag_upa)

26

# Step 3 Make Data frame for figure 2

In [8]:
# Long-format ("append") table of FDR-significant trans results for every
# cancer type: drop the stray CSV index column, then remove proteomics labeling.
df_FDR_append = (
    pd.read_csv("Pval_corr_table_Fig_2_return_sig_append.csv")
    .drop(columns=["Unnamed: 0"])
    .replace(to_replace="_proteomics", value="", regex=True)
)

# Keep only the genes wanted for the figure.
df_FDR_append = df_FDR_append[df_FDR_append["Comparison"].isin(coag_upa)]

Add a new column that combines gene and cancer type, use it as a unique index, and reorder the rows by that index. This way genes will be grouped by coagulation factors, regulators, and urokinase genes.

In [9]:

# Build a "<gene> <cancer type>" key per row and make it the index.
# .assign returns a new frame, so nothing is written onto the filtered slice.
df_FDR_append = df_FDR_append.assign(
    Index=df_FDR_append["Comparison"] + " " + df_FDR_append["Cancer Type"]
).set_index("Index")

# Row order for the figure, grouped by cancer type.
# NOTE(review): the first group mixes in a few non-GBM keys ("VWF Colon",
# "PROC Ovarian", "MMP9 Breast", "MMP12 Breast"), presumably so every gene shows
# up once in the intended gene order -- confirm against plot_utils.
# "PROC Ovarian" used to be listed a second time in the Ovarian group, which
# duplicated that row in df_ordered; it is now listed once.
figure_row_order = [
    # GBM (plus placeholders for genes without a GBM entry)
    "F2 GBM","F9 GBM","F10 GBM","F11 GBM","F13A1 GBM","F13B GBM","KLKB1 GBM","VWF Colon",
    "FGA GBM","FGB GBM","FGG GBM","SERPINC1 GBM","SERPIND1 GBM","SERPING1 GBM","A2M GBM",
    "PROS1 GBM","PROC Ovarian","PROCR GBM","THBD GBM","KNG1 GBM",
    "PLAUR GBM","PLAU GBM","PLG GBM","MMP9 Breast","MMP12 Breast","SERPINE1 GBM",
    # Breast
    "F2 Breast","F9 Breast","F10 Breast","F11 Breast","F13A1 Breast","F13B Breast",
    "FGA Breast","FGB Breast","FGG Breast","SERPIND1 Breast","SERPING1 Breast","A2M Breast",
    "PROS1 Breast","PROCR Breast","KLKB1 Breast",
    "PLAUR Breast","PLAU Breast","PLG Breast","SERPINE1 Breast",
    # Head and Neck
    "VWF Head and Neck","THBD Head and Neck","PLAUR Head and Neck","PLAU Head and Neck","SERPINE1 Head and Neck",
    # Luad
    "F9 Luad","F13A1 Luad","F13B Luad","SERPIND1 Luad","PROS1 Luad","PROC Luad","VWF Luad",
    # Kidney
    "PROCR Kidney",
    # Ovarian ("PROC Ovarian" is already listed in the first group)
    "SERPIND1 Ovarian",
    # Colon
    "SERPINC1 Colon","SERPIND1 Colon","A2M Colon","KNG1 Colon","KLKB1 Colon",
]
assert len(set(figure_row_order)) == len(figure_row_order), "duplicate key in figure_row_order"

# reindex keeps the listed order; a key absent from the index yields an all-NaN row.
df_ordered = df_FDR_append.reindex(figure_row_order)


# Step 4: Plot Figure 2

In [10]:
# The smallest p-value in the figure data sets the lower end of the legend.
legend_min = df_ordered["P_value"].min()

#Make plot using plot utils
p.plotCircleHeatMap(
    df_ordered,
    circle_var="P_value",
    color_var="Correlation",
    x_axis="Comparison",
    y_axis="Cancer Type",
    plot_width=1000,
    plot_height=500,
    legend_min=legend_min,
    legend_max=0.05,
    font_size=10,
    show_legend=True,
    save_png="Figure2.png",
)

# Check if blanks are due to no data 

The following code chunks show that these cancer/gene combinations have no data: Colon THBD, Kidney MMP12, and Ovarian MMP12. (As mentioned in the EGFR Figure 2 legend)

In [11]:
# Long-format ("append") table covering all proteins, not only the
# significant ones; the stray CSV index column is dropped on load.
df_all_prot_append = (
    pd.read_csv("Pval_corr_table_Fig_2_return_all_append.csv")
    .drop(columns=["Unnamed: 0"])
)


In [12]:
# Strip the "_proteomics" suffix wherever it occurs in the table.
df_all_prot_append = df_all_prot_append.replace(
    to_replace="_proteomics", value="", regex=True
)

# Restrict to the genes shown in the figure and report how many rows remain.
df_all_comp_coag = df_all_prot_append.loc[
    df_all_prot_append["Comparison"].isin(coag_upa)
]
print(f"Number of rows in data frame {len(df_all_comp_coag)}")

Number of rows in data frame 205


Our figure includes 26 genes for 8 cancers. If all data were present there would be 208 rows (26 × 8). However, the data frame only has 205 rows, so 3 gene/cancer combinations are missing. 

In [13]:
def find_missing_genes(test_list, full_list):
    """Print every entry of ``full_list`` that does not occur in ``test_list``.

    Entries are printed one per line, in ``full_list`` order. Nothing is
    returned.
    """
    absent = (gene for gene in full_list if gene not in test_list)
    for gene in absent:
        print(gene)
        

In [14]:
# For colon, kidney, and ovarian: the rows of the all-protein table for that
# cancer type, and the list of genes (Comparison values) found in those rows.
colon = df_all_prot_append.loc[df_all_prot_append["Cancer Type"].eq("Colon")]
colon_list = colon["Comparison"].tolist()

Kidney = df_all_prot_append.loc[df_all_prot_append["Cancer Type"].eq("Kidney")]
Kidney_list = Kidney["Comparison"].tolist()

Ovarian = df_all_prot_append.loc[df_all_prot_append["Cancer Type"].eq("Ovarian")]
Ovarian_list = Ovarian["Comparison"].tolist()

In [15]:
# Show the 3 missing genes: for each cancer type, print a header followed by
# the figure genes that are absent from that cancer's gene list.
missing_gene_reports = [
    ("Ovarian missing genes: ", Ovarian_list),
    ("Kidney missing genes: ", Kidney_list),
    ("Colon missing genes: ", colon_list),
]
for header, observed_genes in missing_gene_reports:
    print(header)
    find_missing_genes(observed_genes, coag_upa)

Ovarian missing genes: 
MMP12
Kidney missing genes: 
MMP12
Colon missing genes: 
THBD
