# Make EGFR Immune Figure (7A)

This notebook takes all the trans genes that have the same directional effect in different cancers and runs a gene set enrichment analysis (gseapy's Enrichr) using NCI-Nature_2016. Top hits included T and B cell signaling pathways. The heat map includes genes from the enrichment results as well as additional genes related to the immune system.

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas.util.testing as tm
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
import cptac.utils as u
import plot_utils as p 
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


# Step 1: Find trans proteins with the same directional effect in different cancers

Load df with all of the genes that are FDR significant.

In [2]:
# Read the table of FDR-significant genes and key it by the "Comparison" column.
FDR_sig = pd.read_csv(
    "Make_Tables/csv_files/Supplemental_Table_EGFR_sig_only.csv"
).set_index("Comparison")
FDR_sig

Unnamed: 0_level_0,Correlation_GBM,P_value_GBM,Correlation_ccRCC,P_value_ccRCC,Correlation_OV,P_value_OV,Correlation_LUAD,P_value_LUAD,Correlation_LSCC,P_value_LSCC,Correlation_BR,P_value_BR,Correlation_CO,P_value_CO,Correlation_HNSCC,P_value_HNSCC
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PHLDA1,0.816848,3.507071e-21,,,,,,,0.713420,2.644826e-14,0.364797,0.002164,,,0.664271,8.888640e-12
GRB2,-0.610889,6.729990e-08,,,,,-0.302439,0.020631,,,,,,,-0.532341,3.320092e-06
SOCS2,0.562720,3.420388e-06,,,,,,,0.472624,1.417921e-02,,,,,,
CDH4,0.559180,3.420388e-06,,,,,,,,,,,,,,
DAB2,-0.556402,3.420388e-06,,,,,,,,,0.326055,0.003543,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CLTC,,,,,,,,,,,,,,,0.267632,4.813589e-02
PLEC,,,,,,,,,,,,,,,0.267522,4.824560e-02
LRRK2,,,,,,,,,,,,,,,-0.267457,4.830683e-02
MBD1,,,,,,,,,,,,,,,-0.266098,4.993781e-02


In [3]:
def HasPosNeg(row):
    """Tell whether `row` mixes signs.

    Null entries (as judged by `pd.isnull`) are ignored, and zeros count as
    neither positive nor negative.

    Returns:
        True if at least one remaining value is negative and at least one is
        positive; False otherwise.
    """
    present = [value for value in row if not pd.isnull(value)]
    any_negative = any(value < 0 for value in present)
    any_positive = any(value > 0 for value in present)
    return any_positive and any_negative

def CountPosNeg(row):
    """Count the entries of `row` that are strictly negative or strictly positive.

    Null entries (as judged by `pd.isnull`) are skipped, and values equal to
    zero (including the boolean False) are not counted.

    Returns:
        int: the number of non-null, non-zero entries.
    """
    return sum(
        1
        for value in row
        if not pd.isnull(value) and (value < 0 or value > 0)
    )


Subset data frame to include only trans genes that have the same effects in different cancers by using apply function

In [4]:
# Correlation columns, one per cancer type.
col = ["Correlation_GBM","Correlation_ccRCC","Correlation_OV","Correlation_BR","Correlation_LUAD","Correlation_HNSCC","Correlation_LSCC","Correlation_CO"]

# Take an explicit copy so that adding a column below operates on an
# independent frame instead of on a selection derived from FDR_sig.
FDR_corr = FDR_sig[col].copy()
# True when a protein has both a positive and a negative correlation.
FDR_corr["Pos_Neg"] = FDR_corr.apply(HasPosNeg, axis = 1)

# Keep proteins without mixed signs; copy so later cells can add columns safely.
FDR_corr_False = FDR_corr[FDR_corr['Pos_Neg']==False].copy()


In [5]:
# Get only proteins significant in at least two cancers.
# Count over the correlation columns only (`col`), so the helper columns
# ("Pos_Neg", "Num_corr") never enter the count and re-running this cell
# gives the same result.
FDR_corr_False = FDR_corr_False.copy()
FDR_corr_False["Num_corr"] = FDR_corr_False[col].apply(CountPosNeg, axis = 1)
# NOTE(review): the previous filter was isin([2,3,4,5,6]), which dropped
# proteins significant in 7 or 8 of the 8 cancers. ">= 2" matches the stated
# intent; confirm the reported protein count is unaffected.
FDR_corr_False = FDR_corr_False[FDR_corr_False['Num_corr'] >= 2]
FDR_corr_False.head()

Unnamed: 0_level_0,Correlation_GBM,Correlation_ccRCC,Correlation_OV,Correlation_BR,Correlation_LUAD,Correlation_HNSCC,Correlation_LSCC,Correlation_CO,Pos_Neg,Num_corr
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PHLDA1,0.816848,,,0.364797,,0.664271,0.71342,,False,4
GRB2,-0.610889,,,,-0.302439,-0.532341,,,False,3
SOCS2,0.56272,,,,,,0.472624,,False,2
GLA,-0.550491,-0.298348,,,,,,,False,2
PHLDA3,0.525883,0.43253,,,,0.763784,0.682034,,False,4


The manuscript mentions that there are 1136 proteins with the same directional effect. Here is the derivation of that number. 

In [6]:
# Gene names (index labels) of the proteins retained by the filter above.
same_sign_prot = FDR_corr_False.index.tolist()
n_same_sign = len(same_sign_prot)
print("Total number of trans proteins with same effects in different cancers is " + str(n_same_sign))

Total number of trans proteins with same effects in different cancers is 1136


The manuscript mentions pathways enriched in these 1136 proteins. Here are the enrichment analysis results.

# Run GSEA

In [7]:
# Submit the same-direction protein list to Enrichr against the NCI-Nature_2016
# gene-set library; result files are written under `outdir`.
same_sign_enr = gp.enrichr(
    gene_list = same_sign_prot,
    description='Tumor_partition',
    gene_sets='NCI-Nature_2016',
    outdir='test/enrichr_NCI-Nature',
)
# Show the first ten rows of the results table.
same_sign_enr.res2d.head(10)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,NCI-Nature_2016,CXCR4-mediated signaling events Homo sapiens 4...,29/100,1.33389e-13,2.547731e-11,0,0,6.934069,205.563979,GNAI3;PIK3CD;CD3G;ARRB2;PIK3CG;GNAI1;GRK6;PTK2...
1,NCI-Nature_2016,TCR signaling in naive CD4+ T cells Homo sapie...,20/64,1.854077e-10,1.770644e-08,0,0,7.665363,171.769016,MAP4K1;WAS;CD3G;VAV1;ZAP70;CD4;PTPRC;STIM1;LCK...
2,NCI-Nature_2016,HIF-1-alpha transcription factor network Homo ...,17/66,1.054419e-07,6.713133e-06,0,0,5.83347,93.715316,EGLN1;JUN;TFRC;SLC2A1;ENO1;NDRG1;HK2;NT5E;LDHA...
3,NCI-Nature_2016,TCR signaling in naive CD8+ T cells Homo sapie...,15/53,1.530121e-07,7.306328e-06,0,0,6.629184,104.030127,CD3G;VAV1;ZAP70;PTPRC;STIM1;LCK;GRAP2;TRAF6;LC...
4,NCI-Nature_2016,Integrin family cell surface interactions Homo...,10/26,7.73261e-07,2.190643e-05,0,0,10.461812,147.225406,ITGA4;ITGB4;ITGA2;ITGB8;ITGA7;ITGAV;ITGA6;ITGB...
5,NCI-Nature_2016,a6b1 and a6b4 Integrin signaling Homo sapiens ...,13/45,7.942656e-07,2.190643e-05,0,0,6.812556,95.688121,COL17A1;LAMA5;LAMB3;ITGB4;LAMC2;YWHAZ;YWHAQ;CD...
6,NCI-Nature_2016,PDGFR-beta signaling pathway Homo sapiens c901...,23/128,8.028533e-07,2.190643e-05,0,0,3.691927,51.816536,CYFIP2;STAT5A;NCKAP1;STAT5B;JUN;YES1;LRP1;SPHK...
7,NCI-Nature_2016,Arf6 trafficking events Homo sapiens 7a5b8f09-...,13/49,2.310351e-06,5.515963e-05,0,0,6.054319,78.573622,RALA;ITGA4;ITGA2;MAPK8IP3;ACAP1;CDH1;CTNNA1;IT...
8,NCI-Nature_2016,Validated transcriptional targets of TAp63 iso...,13/53,5.988083e-06,0.0001270804,0,0,5.447729,65.512972,GPX2;TFAP2C;JAG1;S100A2;ITGB4;PRKCD;YWHAQ;SP1;...
9,NCI-Nature_2016,Beta1 integrin cell surface interactions Homo ...,14/66,1.616497e-05,0.0003087508,0,0,4.514055,49.802049,COL18A1;LAMA5;ITGA4;LAMB3;ITGA2;TNC;FN1;LAMC2;...


We selected proteins related to B and T cell signaling based off of enrichment results. 

In [8]:
# Genes grouped by the signaling pathway they were selected for.
t_only = ['CD3E', 'CD3G', 'CD4', 'GRAP2', 'FYB1', 'PRKCQ']
b_only = ['PPP3CC', 'CD79A', 'SYK', 'BTK', 'LYN', 'BLNK', 'HLA-DMA', 'HLA-DMB']
both = [
    'CD2', 'CD5', 'GRB2', 'LCK', 'ZAP70', 'VAV1', 'PTPN6', 'PTPRC',
    'PIK3CD', 'PIK3CG', 'INPP5D', 'PLCG1', 'PLCG2', 'NFATC2', 'PRKCB',
    'NFKB2', 'DOCK2', 'RAC2', 'ELMO1', 'WAS',
]
# Full gene list in group order: T cell, then B cell, then shared.
immune = [*t_only, *b_only, *both]
len(immune)

34

# Make data frame for figure 7A

In [9]:
# Load the appended (long-format) table of FDR-significant trans results for all cancer types.
df_FDR_append = pd.read_csv("Make_Tables/csv_files/sig_prot_heatmap_EGFR.csv")

# Keep only the rows whose gene is wanted for the figure.
is_immune_gene = df_FDR_append["Comparison"].isin(immune)
df_FDR_append = df_FDR_append[is_immune_gene]


 Add new column that will be the unique index and order the new index. This way genes will be grouped by proteins associated with T cell, B cell, or both.

In [10]:
# Order index for Heatmap.
# Build a unique "<gene> <cancer>" key per row. `assign` returns a new frame,
# so no column is written onto the filtered selection from the previous cell.
df_FDR_append = df_FDR_append.assign(
    Index=df_FDR_append["Comparison"] + " " + df_FDR_append["Cancer"]
).set_index("Index")

# Cancer labels must match the "Cancer" column exactly. The previous spelling
# 'LUAd' matched no row, so every LUAD entry reindexed to NaN and was dropped
# by the P_Value filter below.
cancer_list = ['BR', 'CO', 'GBM', 'HNSCC', 'ccRCC', 'LSCC', 'LUAD', 'OV']

# Keys in plotting order: genes follow `immune` (T cell, B cell, both),
# and within each gene the cancers follow `cancer_list`.
ordered_list = [gene + ' ' + cancer for gene in immune for cancer in cancer_list]

# Keys with no matching row become all-NaN rows; the significance filter
# removes them, since NaN < 0.05 is False.
ordered_df = df_FDR_append.reindex(ordered_list)
ordered_df = ordered_df.loc[ordered_df['P_Value'] < 0.05] # Keep sig


# Make Figure 7A

In [11]:
# Smallest p-value among the selected genes, passed to the plot as `legend_min`.
legend_min = df_FDR_append["P_Value"].min()

# Draw the circle heat map with plot_utils and save it as a PNG.
p.plotCircleHeatMap(
    ordered_df,
    circle_var = "P_Value",
    color_var = "Correlation",
    x_axis = "Comparison",
    y_axis = "Cancer",
    plot_width= 1000,
    plot_height = 500,
    legend_min = legend_min,
    legend_max = 0.05,
    font_size = 10,
    show_legend = True,
    save_png = "png_files/Figure7A.png",
)

# Check if blanks are due to no data 

The following code chunks list, for each cancer, the genes from the figure's gene list that have no data (for example, CD79A is missing in GBM, OV, ccRCC, and CO).

In [12]:
# Load the appended (long-format) table that covers all proteins.
df_all_prot_append = pd.read_csv("Make_Tables/csv_files/all_prot_heatmap_EGFR.csv")

# Keep only the rows whose gene is wanted for the figure.
in_figure = df_all_prot_append["Comparison"].isin(immune)
df_all_immune = df_all_prot_append[in_figure]
# With 8 cancer types and 34 genes there should be 272 rows; any shortfall is
# the number of gene/cancer combinations without data.
print("Number of rows in data frame " + str(len(df_all_immune)))


Number of rows in data frame 248


In [13]:
def find_missing_genes(test_list, full_list):
    """Return the entries of `full_list` that do not occur in `test_list`.

    The result is a new list that keeps the order (and any repeats) of
    `full_list`.
    """
    return [gene for gene in full_list if gene not in test_list]

Print the list of missing genes for each cancer.

In [14]:
cancers = ["GBM", "LUAD","LSCC","HNSCC","OV","ccRCC","BR","CO"]
# Distinct genes that made it into the figure data frame.
fig_genes = set(ordered_df.Comparison.to_list())
for cancer in cancers:
    # Genes that have a row for this cancer in the all-protein table.
    cancer_check = df_all_immune.loc[df_all_immune["Cancer"] == cancer, "Comparison"].to_list()
    print(cancer)
    # Genes from the figure list with no row for this cancer.
    print(find_missing_genes(cancer_check, immune))

GBM
['CD3E', 'CD3G', 'CD79A', 'CD5', 'LCK']
LUAD
[]
LSCC
[]
HNSCC
[]
OV
['CD3G', 'CD79A', 'CD2', 'LCK']
ccRCC
['CD79A', 'BTK', 'NFKB2']
BR
['FYB1', 'LYN']
CO
['CD3E', 'CD3G', 'GRAP2', 'FYB1', 'PRKCQ', 'PPP3CC', 'CD79A', 'BLNK', 'PIK3CD', 'PIK3CG']


In [1]:
# Gene list used for the second heat map below.
p53 = [
    'FOXA1', 'STEAP3', 'JUN', 'PPP1R13B', 'GPX1', 'S100A2', 'CSE1L', 'HGF',
    'IGFBP3', 'CAV1', 'SERPINE1', 'SH2D1A', 'HTT', 'NDRG1', 'BBC3', 'PYCARD',
    'CARM1', 'TP53BP2', 'BCL2', 'SFN', 'MET', 'TP63',
]

In [4]:
# Reload the appended (long-format) table of FDR-significant trans results for all cancer types.
df_FDR_append = pd.read_csv(
    "Make_Tables/csv_files/sig_prot_heatmap_EGFR.csv"
)

In [5]:
# Keep only the rows whose gene is in the p53 list.
is_p53_gene = df_FDR_append["Comparison"].isin(p53)
df_FDR_append = df_FDR_append[is_p53_gene]

In [6]:
# Smallest p-value among the selected genes, passed to the plot as `legend_min`.
legend_min = df_FDR_append["P_Value"].min()

# Draw the circle heat map with plot_utils (no PNG is saved here).
p.plotCircleHeatMap(
    df_FDR_append,
    circle_var = "P_Value",
    color_var = "Correlation",
    x_axis = "Comparison",
    y_axis = "Cancer",
    plot_width= 1000,
    plot_height = 500,
    legend_min = legend_min,
    legend_max = 0.05,
    font_size = 10,
    show_legend = True,
)