# Make EGFR Figure 2

This notebooks takes all the trans genes that have positive and negative results and runs a GSEA using Reactome. It then takes a subset of genes from the top hit(Hemostasis) pathway and maps them on a large circle heat map. This heatmap focuses on coagulation and urokinase related genes. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas.util.testing as tm
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
import cptac.utils as u
import plot_utils as p 

  import pandas.util.testing as tm


# Step 1: Find Trans proteins with opposite effects in different cancers 

Load df with all of the genes that are FDR significant.

In [2]:
FDR_sig = pd.read_csv("Pval_corr_table_Fig_2_return_sig.csv")
FDR_sig = FDR_sig.replace(to_replace ='_proteomics', value = '', regex = True) #Remove proteomics labeling
FDR_sig= FDR_sig.drop(['Unnamed: 0'], axis=1)
FDR_sig = FDR_sig.set_index("Comparison")
FDR_sig = FDR_sig[1:] #Drop EGFR 
FDR_sig

Unnamed: 0_level_0,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_Lscc,P_value_Lscc,Correlation_Colon,P_value_Colon
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PHLDA1,0.816848,3.507071e-21,,,,,0.364797,0.002164,,,0.664271,8.888640e-12,0.713420,2.644826e-14,,
GRB2,-0.610889,6.729990e-08,,,,,,,-0.302439,0.020631,-0.532341,3.320092e-06,,,,
CDH4,0.559180,3.420388e-06,,,,,,,,,,,,,,
PLA2G15,-0.556624,3.420388e-06,-0.298029,0.02216,,,0.274185,0.016061,,,,,,,,
SOCS2,0.562720,3.420388e-06,,,,,,,,,,,0.472624,1.417921e-02,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCDC6,,,,,,,,,,,,,,,0.287735,0.049589
SSR2,,,,,,,,,,,,,,,0.301814,0.049589
PSMB1,,,,,,,,,,,,,,,0.287839,0.049589
MRPL39,,,,,,,,,,,,,,,-0.288110,0.049589


In [3]:
def HasPosNeg(row):
    hasPos = False
    hasNeg= False

    for item in row:
        if pd.isnull(item):
            continue
        if item < 0:
            hasNeg = True
        if item > 0:
            hasPos = True
            
    if hasPos & hasNeg:
        return True
    return False

def CountPosNeg(row):
    hasPos = False
    hasNeg= False
    counter = 0
    for item in row:
        if pd.isnull(item):
            continue
        if item < -0:
            hasNeg = True
            counter += 1
        if item > 0:
            hasPos = True
            counter += 1
    return counter


Subset data frame to include only trans genes that have opposite effects in different cancers by using apply function

In [4]:
col = ["Correlation_Gbm","Correlation_kidney","Correlation_Ovar","Correlation_Brca","Correlation_Luad","Correlation_hnscc","Correlation_Lscc","Correlation_Colon"]
FDR_corr = FDR_sig[col]
FDR_corr["Pos_Neg"] = FDR_corr.apply(HasPosNeg, axis = 1)
FDR_corr_False = FDR_corr[FDR_corr['Pos_Neg']==False]
FDR_corr_False = FDR_corr_False.drop(["Pos_Neg"], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
FDR_corr_False["Num_corr"] = FDR_corr_False.apply(CountPosNeg, axis = 1)
FDR_corr_False.sort_values(by=['Num_corr'],ascending=False)
FDR_corr_False = FDR_corr_False[FDR_corr_False['Num_corr'].isin([2,3,4,5,6])]
FDR_corr_False

Unnamed: 0_level_0,Correlation_Gbm,Correlation_kidney,Correlation_Ovar,Correlation_Brca,Correlation_Luad,Correlation_hnscc,Correlation_Lscc,Correlation_Colon,Num_corr
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
PHLDA1,0.816848,,,0.364797,,0.664271,0.713420,,4
GRB2,-0.610889,,,,-0.302439,-0.532341,,,3
SOCS2,0.562720,,,,,,0.472624,,2
GLA,-0.550491,-0.298348,,,,,,,2
PHLDA3,0.525883,0.432530,,,,0.763784,0.682034,,4
...,...,...,...,...,...,...,...,...,...
S100A14,,,,,,,0.370227,0.351373,2
MOV10,,,,,,,0.302787,0.299218,2
ATP2B1,,,,,,,0.298873,0.346409,2
MFSD6,,,,,,,0.295201,0.380169,2


The manuscript mentions the total number of trans proteins that opposite effects in different cancers. Here is the derivation of that number 

In [6]:
same_sign_prot = FDR_corr_False.index.tolist()
print("Total number of trans proteins with same effects in different cancers is " + str(len(same_sign_prot)))

Total number of trans proteins with same effects in different cancers is 1136


# Run GSEA

In [10]:
same_sign_enr = gp.enrichr(gene_list = same_sign_prot, description='Tumor_partition', gene_sets='KEGG_2016', 
                       outdir='test/enrichr_KEGG')
same_sign_enr.res2d.head(10)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,KEGG_2016,Arrhythmogenic right ventricular cardiomyopath...,18/74,1.159284e-07,3.4e-05,0,0,4.282451,68.392004,DSP;ITGA4;JUP;ITGB4;ACTN2;ITGA2;GJA1;CDH2;PKP2...
1,KEGG_2016,Metabolic pathways Homo sapiens hsa01100,114/1239,1.813751e-07,2.7e-05,0,0,1.619889,25.145046,MSMO1;ENO1;NSDHL;NAMPT;ENPP1;ENPP3;ATP6V1E1;PT...
2,KEGG_2016,Regulation of actin cytoskeleton Homo sapiens ...,32/214,5.044867e-07,4.9e-05,0,0,2.632618,38.172238,CYFIP2;NCKAP1;ITGB4;WAS;PIK3CD;IQGAP2;ITGAL;FG...
3,KEGG_2016,Biosynthesis of amino acids Homo sapiens hsa01230,17/74,6.249417e-07,4.6e-05,0,0,4.044537,57.778675,PRPS1;ARG1;PGAM1;GPT;ENO1;PFKL;PKM;CBS;PSAT1;P...
4,KEGG_2016,Glycolysis / Gluconeogenesis Homo sapiens hsa0...,16/67,7.613663e-07,4.5e-05,0,0,4.20433,59.231243,GPI;ACSS2;ADH1B;PGAM1;ENO1;HK2;LDHA;PFKL;PKM;P...
5,KEGG_2016,Bacterial invasion of epithelial cells Homo sa...,17/78,1.377343e-06,6.7e-05,0,0,3.837125,51.783367,WAS;FN1;PIK3CD;SEPT1;PIK3CG;SEPT2;CD2AP;SEPT9;...
6,KEGG_2016,B cell receptor signaling pathway Homo sapiens...,16/73,2.588556e-06,0.000108,0,0,3.858769,49.640789,CR2;JUN;SYK;INPPL1;PIK3CD;NFATC2;PIK3CG;VAV1;P...
7,KEGG_2016,Carbon metabolism Homo sapiens hsa01200,20/113,5.248171e-06,0.000192,0,0,3.116041,37.883681,GPI;PRPS1;H6PD;ACSS2;ECHS1;PGAM1;GPT;ENO1;HK2;...
8,KEGG_2016,T cell receptor signaling pathway Homo sapiens...,19/104,5.547202e-06,0.000181,0,0,3.216414,38.925738,JUN;NFATC2;PIK3CD;CD3G;PIK3CG;VAV1;ZAP70;CD4;P...
9,KEGG_2016,Axon guidance Homo sapiens hsa04360,21/127,9.30435e-06,0.000273,0,0,2.911168,33.725962,EPHA4;SEMA4A;SEMA4D;SEMA4B;LIMK1;GNAI3;NFATC2;...


In [52]:
#get just the clotting cascade genes and add urokinase genes 
same_sign_df = same_sign_enr.res2d
bcell = same_sign_df.iloc[6,9]
bcell = bcell.split(';')
tcell = same_sign_df.iloc[8,9]
tcell = tcell.split(";")
immune_both =  bcell + tcell
immune = set(immune_both)
immune = list(immune)
btcell_apm = (["CD5","CD8A","BLK","BLNK","CD79A","LYN","CD3E","HLA-DMA","HLA-DMB"])

immune= immune + btcell_apm

#remove = ["CR2","NCK1","CARD11","INPPL1","JUN","KRAS"]
immune.remove("JUN")
immune.remove("KRAS")
immune.remove("CR2")
immune.remove("NCK1")
immune.remove("INPPL1")
immune.remove("CARD11")
immune

['PRKCQ',
 'PPP3CC',
 'LCP2',
 'INPP5D',
 'PIK3CD',
 'GRAP2',
 'CD4',
 'PTPN6',
 'BTK',
 'NFATC2',
 'PIK3CG',
 'SYK',
 'PTPRC',
 'ZAP70',
 'VAV1',
 'RAC2',
 'LCK',
 'GRB2',
 'CD3G',
 'CD5',
 'CD8A',
 'BLK',
 'BLNK',
 'CD79A',
 'LYN',
 'CD3E',
 'HLA-DMA',
 'HLA-DMB']

# Step 3 Make Data frame for figure 2

In [53]:
#Get append version of the df with all cancer type, fdr sig trans results
df_FDR_append = pd.read_csv("Pval_corr_table_Fig_2_return_sig_append.csv")
df_FDR_append = df_FDR_append.drop(['Unnamed: 0'], axis=1)

#Remove Proteomics labeling 
df_FDR_append = df_FDR_append.replace(to_replace ='_proteomics', value = '', regex = True) 

#subset dataframe to include genes only desired for figure 
df_FDR_append= df_FDR_append[df_FDR_append.Comparison.isin(immune)]
df_FDR_append

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
1,GRB2,-0.610889,6.72999e-08,GBM
75,SYK,-0.470849,0.000120251,GBM
82,CD4,-0.467479,0.0001362923,GBM
86,PIK3CD,-0.466265,0.000139908,GBM
100,LCP2,-0.460881,0.0001665085,GBM
125,VAV1,-0.447143,0.0002894593,GBM
141,HLA-DMB,-0.442005,0.0003476225,GBM
199,PTPRC,-0.424958,0.0006196952,GBM
204,BTK,-0.42338,0.0006552047,GBM
287,INPP5D,-0.408181,0.001009137,GBM


Set add new column to be unique index and order the new index. This way genes will be grouped by coagulation factors, regulators, and urokinase genes.

In [58]:

df_FDR_append["Index"] = df_FDR_append["Comparison"] + " " + df_FDR_append["Cancer Type"]
df_FDR_append = df_FDR_append.set_index("Index")
df_ordered = df_FDR_append.reindex(['CD3E Head and Neck', 'CD3G Head and Neck','CD4 Head and Neck','CD8A Luad', 'LCK Head and Neck', 'ZAP70 Head and Neck','LCP2 Head and Neck', 'GRAP2 Head and Neck','VAV1 Head and Neck','GRB2 Head and Neck', 'NFATC2 Head and Neck','PPP3CC Head and Neck', 'RAC2 Head and Neck','CD79A Head and Neck','SYK Head and Neck','BTK Head and Neck','CD5 Head and Neck','PTPN6 Head and Neck','LYN Head and Neck','PTPRC Head and Neck','PIK3CG Head and Neck','INPP5D Head and Neck','PRKCQ Head and Neck',
                                    'CD4 GBM', 'LCP2 GBM', 'VAV1 GBM','GRB2 GBM', 'NFATC2 GBM','RAC2 GBM','SYK GBM','BTK GBM','BLNK GBM', 'PTPN6 GBM','PTPRC GBM', 'PIK3CD GBM', 'PIK3CG GBM','INPP5D GBM','PRKCQ GBM', 
                                    'CD3G Luad','LCK Luad', 'ZAP70 Luad','GRB2 Luad','PPP3CC Luad', 'RAC2 Luad','BLK Luad','BTK Luad','PTPN6 Luad','PIK3CD Luad','INPP5D Luad','PRKCQ Luad',
                                    'VAV1 Kidney',"ZAP70 Kidney", "INPP5D Kidney","GRAP2 Kidney","HLA-DMA GBM","HLA-DMB GBM", 
                                    "NFATC2 Lscc"])
                                    
df_ordered.head(40)

Unnamed: 0_level_0,Comparison,Correlation,P_value,Cancer Type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CD3E Head and Neck,CD3E,-0.339994,0.02079403,Head and Neck
CD3G Head and Neck,CD3G,-0.381032,0.00321078,Head and Neck
CD4 Head and Neck,CD4,-0.417232,0.0005663144,Head and Neck
CD8A Luad,CD8A,-0.292478,0.02581604,Luad
LCK Head and Neck,LCK,-0.331149,0.009506279,Head and Neck
ZAP70 Head and Neck,ZAP70,-0.410167,0.0007208683,Head and Neck
LCP2 Head and Neck,LCP2,-0.390098,0.001462062,Head and Neck
GRAP2 Head and Neck,GRAP2,-0.428093,0.0003591048,Head and Neck
VAV1 Head and Neck,VAV1,-0.439464,0.0002264489,Head and Neck
GRB2 Head and Neck,GRB2,-0.532341,3.320092e-06,Head and Neck


# Step 4: Plot Figure 2

In [59]:
legend_min = df_ordered["P_value"].min()
#Make plot using plot utils
p.plotCircleHeatMap(df_ordered, circle_var = "P_value",color_var = "Correlation", x_axis = "Comparison", y_axis = "Cancer Type", plot_width= 1000, plot_height = 500, legend_min = legend_min, legend_max = 0.05, font_size = 10, show_legend = True , save_png = "Figure2.png")

# Check if blanks are due to no data 

The follow code chunks show that the following cancers/genes don't have data: colon THBD, Kidney MMP12, and Ovarian MMP12. (As mentioned in EGFR Figure 2 legend)

In [27]:
#Get append version of the df with all proteins 
df_all_prot_append = pd.read_csv("Pval_corr_table_Fig_2_return_all_append.csv")
df_all_prot_append = df_all_prot_append.drop(['Unnamed: 0'], axis=1)
#Remove Proteomics labeling 
df_all_prot_append = df_all_prot_append.replace(to_replace ='_proteomics', value = '', regex = True) 



NameError: name 'coag_upa' is not defined

In [45]:
#MHC class2 (CD4) molecules
mhc2 = ["HLA-DPA1","HLA-DPB1","HLA-DMA","HLA-DMB","HLA-DOA","HLA-DOB","HLA-DQ","HLA-DRA","HLA-DRB1"]
#subset dataframe to include genes only desired for figure 
df_all_mhc2 = df_all_prot_append[df_all_prot_append.Comparison.isin(mhc2)]
print("Number of missing columns" )
print(56 - len(df_all_mhc2))
#HLA-DP and HLA-DM missing for all. HLA-DRA and HLA-DRB1 consistently not sig 
df_all_mhc2

Number of missing columns
4


Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
141,HLA-DMB,-0.442005,0.000348,GBM
603,HLA-DMA,-0.373678,0.005056,GBM
1667,HLA-DPB1,-0.262423,0.055717,GBM
1681,HLA-DRA,-0.261173,0.057358,GBM
2494,HLA-DPA1,-0.218791,0.126822,GBM
2705,HLA-DRB1,-0.209905,0.146458,GBM
12448,HLA-DPB1,-0.222952,0.109021,Kidney
12942,HLA-DPA1,-0.199014,0.164162,Kidney
13489,HLA-DRA,-0.17688,0.229302,Kidney
15164,HLA-DMA,0.123195,0.443495,Kidney


In [42]:
#CD8 and MHC class 1 molecules
mhc1 = ["CD8A","HLA-A","HLA-B","HLA-C"]

df_all_mhc1 = df_all_prot_append[df_all_prot_append.Comparison.isin(mhc1)]
df_all_mhc1

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
4460,HLA-C,0.149001,0.338227,GBM
4767,HLA-B,0.140454,0.37155,GBM
6132,CD8A,-0.1552,0.528789,GBM
8068,HLA-A,-0.058388,0.749978,GBM
13260,CD8A,-0.240222,0.201476,Kidney
16995,HLA-A,-0.08214,0.673774,Kidney
17871,HLA-B,-0.056599,0.769694,Kidney
19497,HLA-C,-0.022882,0.917089,Kidney
26082,CD8A,0.108046,0.656611,Ovarian
26858,HLA-C,0.08565,0.719148,Ovarian


In [None]:
#Remove Proteomics labeling 
df_all_prot_append = df_all_prot_append.replace(to_replace ='_proteomics', value = '', regex = True) 

#subset dataframe to include genes only desired for figure 
df_all_comp_coag = df_all_prot_append[df_all_prot_append.Comparison.isin(coag_upa)]
print("Number of rows in data frame " + str(len(df_all_comp_coag)))

Our Figure includes 26 genes for 8 cancers. If all data was present there would be 208 rows. However, the data frame only has 205 rows. 3 genes are missing. 

In [None]:
def find_missing_genes(test_list, full_list):
    for gene in full_list:
        if (gene not in test_list):
            print(gene)
        

In [None]:
#Get list of genes for colon, kidney, and ovarian
colon = df_all_prot_append[df_all_prot_append["Cancer Type"] == "Colon"]
colon_list = colon.Comparison.to_list()

Kidney = df_all_prot_append[df_all_prot_append["Cancer Type"] == "Kidney"]
Kidney_list = Kidney.Comparison.to_list()

Ovarian = df_all_prot_append[df_all_prot_append["Cancer Type"] == "Ovarian"]
Ovarian_list = Ovarian.Comparison.to_list()

In [None]:
#Show the 3 missing genes 
print("Ovarian missing genes: ")
find_missing_genes(Ovarian_list, coag_upa)
print("Kidney missing genes: ")
find_missing_genes(Kidney_list, coag_upa)
print("Colon missing genes: ")
find_missing_genes(colon_list, coag_upa)