# Make EGFR Immune Figure (7A)

This notebook takes all the trans genes that have the same directional effect across cancers and runs a GSEA using NCI-Nature_2016. Top hits included T and B cell signaling pathways. The heat map includes genes from the GSEA as well as additional genes related to the immune system.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas.util.testing as tm
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
import cptac.utils as u
import plot_utils as p 
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


# Step 1: Find trans proteins with the same directional effect in different cancers

Load df with all of the genes that are FDR significant.

In [2]:
# Read the FDR-significant results and key the rows by the "Comparison" column.
FDR_sig = pd.read_csv("csv_files/Supplemental_Table_EGFR_sig_only.csv").set_index("Comparison")
FDR_sig

Unnamed: 0_level_0,Correlation_GBM,P_value_GBM,Correlation_ccRCC,P_value_ccRCC,Correlation_OV,P_value_OV,Correlation_LUAD,P_value_LUAD,Correlation_LSCC,P_value_LSCC,Correlation_BR,P_value_BR,Correlation_CO,P_value_CO,Correlation_HNSCC,P_value_HNSCC
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PHLDA1,0.789551,9.649849e-19,,,,,,,0.682116,1.398013e-12,0.269548,0.016601,,,0.583064,1.395609e-08
CDH4,0.656331,4.744022e-10,,,,,,,,,,,,,,
GRB2,-0.589008,3.310325e-07,,,,,-0.269432,0.048945,,,,,,,-0.481605,2.053150e-05
SOCS2,0.565367,2.006861e-06,,,,,,,,,,,,,,
PHLDA3,0.561528,2.006861e-06,0.364164,0.005159,,,,,0.651618,5.103796e-11,,,,,0.693125,2.351012e-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VPS37B,,,,,,,,,,,,,,,-0.246134,4.868684e-02
SCN10A,,,,,,,,,,,,,,,-0.364870,4.918329e-02
PCYT1A,,,,,,,,,,,,,,,0.245333,4.976210e-02
NCBP1,,,,,,,,,,,,,,,0.245341,4.976210e-02


In [3]:
def HasPosNeg(row):
    """Return True when `row` contains both a negative and a positive value.

    Entries that `pd.isnull` reports as missing are skipped, and zeros count
    as neither sign.
    """
    observed = [value for value in row if not pd.isnull(value)]
    saw_negative = any(value < 0 for value in observed)
    saw_positive = any(value > 0 for value in observed)
    return saw_negative and saw_positive

def CountPosNeg(row):
    """Count the entries of `row` that are strictly negative or strictly positive.

    Entries that `pd.isnull` reports as missing are skipped, and zeros are not
    counted.

    Parameters
    ----------
    row : iterable
        Values to inspect (for example one DataFrame row passed by `apply`).

    Returns
    -------
    int
        Number of non-null, non-zero entries.
    """
    # The sign flags the earlier version set were never read, so only the
    # tally is kept.
    return sum(
        1
        for item in row
        if not pd.isnull(item) and (item < 0 or item > 0)
    )


Subset the data frame to include only trans genes that have the same effect in different cancers, using the `apply` function.

In [4]:
# Correlation columns for the six cancers considered in this figure.
col = ["Correlation_GBM","Correlation_ccRCC","Correlation_BR","Correlation_LUAD","Correlation_HNSCC","Correlation_LSCC"]
# Work on an explicit copy so that adding a column never writes into a slice
# of FDR_sig (that chained assignment is what raises SettingWithCopyWarning).
FDR_corr = FDR_sig[col].copy()
FDR_corr["Pos_Neg"] = FDR_corr.apply(HasPosNeg, axis = 1)

# Keep the genes whose correlations never mix signs. Copy again so that later
# cells can add columns to this frame safely.
FDR_corr_False = FDR_corr[~FDR_corr["Pos_Neg"].astype(bool)].copy()


In [5]:
# Get only proteins significant in at least two cancers.
# Count over the correlation columns only: the boolean Pos_Neg flag (and, when
# this cell is re-run, the Num_corr column itself) must not be tallied.
corr_cols = [c for c in FDR_corr_False.columns if c.startswith("Correlation_")]
# assign() builds a new frame instead of writing into a filtered slice.
FDR_corr_False = FDR_corr_False.assign(
    Num_corr=FDR_corr_False[corr_cols].apply(CountPosNeg, axis=1)
)
# The former sort_values() call discarded its result and had no effect, so it
# is dropped; row order is unchanged.
FDR_corr_False = FDR_corr_False[FDR_corr_False["Num_corr"] >= 2]
FDR_corr_False.head()

Unnamed: 0_level_0,Correlation_GBM,Correlation_ccRCC,Correlation_BR,Correlation_LUAD,Correlation_HNSCC,Correlation_LSCC,Pos_Neg,Num_corr
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PHLDA1,0.789551,,0.269548,,0.583064,0.682116,False,4
GRB2,-0.589008,,,-0.269432,-0.481605,,False,3
PHLDA3,0.561528,0.364164,,,0.693125,0.651618,False,4
GLA,-0.562315,-0.27304,,,,,False,2
SEC23IP,-0.543668,,-0.30393,-0.287779,,,False,3


The manuscript mentions that there are 1136 proteins with the same directional effect. Here is the derivation of that number. (Note: the recorded output of the cell below reports 1196; confirm which value is current.)

In [6]:
# Gene names (the frame's index) that survived the filters above.
same_sign_prot = list(FDR_corr_False.index)
n_same_sign = len(same_sign_prot)
print("Total number of trans proteins with same effects in different cancers is " + str(n_same_sign))

Total number of trans proteins with same effects in different cancers is 1196


The manuscript mentions pathways enriched in these 1136 proteins. Here are the enrichment analysis results.

# Run GSEA

In [7]:
# Submit the same-sign gene list to Enrichr through gseapy, using the
# NCI-Nature_2016 gene-set library, then show the first ten result rows.
same_sign_enr = gp.enrichr(
    gene_list=same_sign_prot,
    description='Tumor_partition',
    gene_sets='NCI-Nature_2016',
    outdir='test/enrichr_NCI-Nature',
)
same_sign_enr.res2d.head(10)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,NCI-Nature_2016,HIF-1-alpha transcription factor network Homo ...,18/66,3.71078e-08,7e-06,0,0,5.970713,102.155549,NCOA1;EGLN1;JUN;SMAD4;PFKFB3;TFRC;SERPINE1;SLC...
1,NCI-Nature_2016,CXCR4-mediated signaling events Homo sapiens 4...,21/100,3.747482e-07,2.9e-05,0,0,4.236197,62.683058,STAT5A;YES1;RALB;ITGA4;ITGA2;PXN;LIMK1;GNAI3;P...
2,NCI-Nature_2016,Validated transcriptional targets of AP1 famil...,12/35,4.565562e-07,2.9e-05,0,0,8.275999,120.825892,FOSL1;GJA1;JUN;ITGB4;PLAU;MMP1;SP1;MGP;NFATC2;...
3,NCI-Nature_2016,TCR signaling in naive CD4+ T cells Homo sapie...,16/64,7.728543e-07,3.7e-05,0,0,5.298305,74.563976,MAP4K1;PRKCB;WAS;VAV1;ZAP70;CD4;LCK;GRAP2;FLNA...
4,NCI-Nature_2016,Regulation of RAC1 activity Homo sapiens 351aa...,12/38,1.25313e-06,4.8e-05,0,0,7.319906,99.476549,ARHGAP9;EPS8;BCR;ABR;TRIO;RAP1GDS1;ELMO1;ARHGA...
5,NCI-Nature_2016,Direct p53 effectors Homo sapiens 67c3b75d-619...,24/136,1.629203e-06,5.2e-05,0,0,3.417601,45.547804,RB1;FOXA1;STEAP3;JUN;PPP1R13B;GPX1;CD82;S100A2...
6,NCI-Nature_2016,Validated transcriptional targets of TAp63 iso...,13/53,1.044404e-05,0.00025,0,0,5.154945,59.124536,TFAP2C;JAG1;S100A2;ITGB4;IGFBP3;CABLES1;DICER1...
7,NCI-Nature_2016,AP-1 transcription factor network Homo sapiens...,15/69,1.094039e-05,0.00025,0,0,4.410104,50.376841,JUN;CDKN1B;MMP1;NFATC2;FOSL2;FOSL1;GJA1;ELF1;F...
8,NCI-Nature_2016,Integrin family cell surface interactions Homo...,9/26,1.172488e-05,0.00025,0,0,8.379157,95.135245,ITGA4;ITGB4;ITGA2;ITGB8;ITGAV;ITGA6;ITGB6;ITGA...
9,NCI-Nature_2016,HIF-2-alpha transcription factor network Homo ...,10/34,1.970169e-05,0.000378,0,0,6.597808,71.48597,EGLN1;EFNA1;MMP14;SP1;APEX1;SERPINE1;SLC2A1;PG...


We selected proteins related to B and T cell signaling based off of enrichment results. 

In [8]:
# Genes chosen for the figure, split into T-cell, B-cell and shared groups.
t_only = ['CD3E', 'CD3G', 'CD4', 'GRAP2', 'FYB1', 'PRKCQ']
b_only = ['PPP3CC', 'CD79A', 'SYK', 'BTK', 'LYN', 'BLNK', 'HLA-DMA', 'HLA-DMB']
both = [
    'CD2', 'CD5', 'GRB2', 'LCK', 'ZAP70', 'VAV1', 'PTPN6', 'PTPRC',
    'PIK3CD', 'PIK3CG', 'INPP5D', 'PLCG1', 'PLCG2', 'NFATC2', 'PRKCB',
    'NFKB2', 'DOCK2', 'RAC2', 'ELMO1', 'WAS',
]
# Concatenation order (T cell, B cell, both) is the gene order used downstream.
immune = [*t_only, *b_only, *both]
len(immune)

34

# Make data frame for figure 7A

In [9]:
# Load the appended table of FDR-significant trans results for all cancer
# types, then keep only the rows whose gene is wanted for the figure.
df_FDR_append = pd.read_csv("csv_files/sig_prot_heatmap_EGFR.csv")
keep_rows = df_FDR_append["Comparison"].isin(immune)
df_FDR_append = df_FDR_append.loc[keep_rows]


 Add new column that will be the unique index and order the new index. This way genes will be grouped by proteins associated with T cell, B cell, or both.

In [10]:
# Order index for Heatmap
# Build a unique "<gene> <cancer>" key per row and use it as the index.
# assign() returns a new frame, so nothing is written into the filtered slice
# that df_FDR_append currently is (avoids SettingWithCopyWarning).
df_FDR_append = df_FDR_append.assign(
    Index=df_FDR_append["Comparison"] + " " + df_FDR_append["Cancer"]
).set_index("Index")

cancer_list = ['BR', 'GBM', 'HNSCC', 'ccRCC', 'LSCC', 'LUAD']
# Gene-major ordering: every cancer for the first gene, then the next gene,
# following the order of `immune`.
ordered_list = [gene + ' ' + cancer for gene in immune for cancer in cancer_list]

# reindex() yields all-NaN rows for keys that are absent from the table; the
# P_Value comparison below is False for NaN, so those rows are dropped too.
ordered_df = df_FDR_append.reindex(ordered_list)
ordered_df = ordered_df.loc[ordered_df['P_Value'] < 0.05] # Keep sig


# Make Figure 7A

In [11]:
# Smallest p-value in the immune-gene table; passed to the plot as legend_min.
legend_min = df_FDR_append["P_Value"].min()
# Draw the circle heat map with the plot_utils helper and save it as a PNG.
p.plotCircleHeatMap(
    ordered_df,
    circle_var="P_Value",
    color_var="Correlation",
    x_axis="Comparison",
    y_axis="Cancer",
    plot_width=1000,
    plot_height=500,
    legend_min=legend_min,
    legend_max=0.05,
    font_size=10,
    show_legend=True,
    save_png="png_files/Figure7A.png",
)

# Check if blanks are due to no data 

The code chunks below show that the following cancers/genes don't have data: colon THBD, kidney MMP12, and ovarian MMP12.

In [14]:
# Load the appended table that holds every protein, not only significant ones.
df_all_prot_append = pd.read_csv("csv_files/all_prot_heatmap_EGFR.csv")

# Keep only the rows for genes wanted in the figure.
is_immune_gene = df_all_prot_append["Comparison"].isin(immune)
df_all_immune = df_all_prot_append[is_immune_gene]
# With 8 cancer types and 34 genes a complete table would have 8 * 34 = 272 rows.
# The recorded output is 194 rows, i.e. 78 gene/cancer pairs have no data.
print("Number of rows in data frame " + str(len(df_all_immune)))


Number of rows in data frame 194


In [15]:
def find_missing_genes(test_list, full_list):
    """Return the entries of `full_list` that do not occur in `test_list`.

    The result follows the order of `full_list`, and repeated entries in
    `full_list` are reported each time they appear.
    """
    return [gene for gene in full_list if gene not in test_list]

Print the list of missing genes for each cancer.

In [16]:
cancers = ["GBM", "LUAD","LSCC","HNSCC","OV","ccRCC","BR","CO"]
fig_genes = set(ordered_df.Comparison.to_list())
# For each cancer, print its name followed by the figure genes that have no
# row for that cancer in the all-protein table.
for cancer in cancers:
    cancer_check = df_all_immune.loc[df_all_immune["Cancer"] == cancer, "Comparison"].to_list()
    print(cancer)
    print(find_missing_genes(cancer_check, immune))

GBM
['CD3E', 'CD3G', 'CD79A', 'CD5']
LUAD
['PIK3CG', 'NFATC2']
LSCC
['PPP3CC', 'PIK3CG', 'ELMO1']
HNSCC
[]
OV
['CD3E', 'CD3G', 'CD4', 'GRAP2', 'FYB1', 'PRKCQ', 'PPP3CC', 'CD79A', 'SYK', 'BTK', 'LYN', 'BLNK', 'HLA-DMA', 'HLA-DMB', 'CD2', 'CD5', 'GRB2', 'LCK', 'ZAP70', 'VAV1', 'PTPN6', 'PTPRC', 'PIK3CD', 'PIK3CG', 'INPP5D', 'PLCG1', 'PLCG2', 'NFATC2', 'PRKCB', 'NFKB2', 'DOCK2', 'RAC2', 'ELMO1', 'WAS']
ccRCC
['CD79A']
BR
[]
CO
['CD3E', 'CD3G', 'CD4', 'GRAP2', 'FYB1', 'PRKCQ', 'PPP3CC', 'CD79A', 'SYK', 'BTK', 'LYN', 'BLNK', 'HLA-DMA', 'HLA-DMB', 'CD2', 'CD5', 'GRB2', 'LCK', 'ZAP70', 'VAV1', 'PTPN6', 'PTPRC', 'PIK3CD', 'PIK3CG', 'INPP5D', 'PLCG1', 'PLCG2', 'NFATC2', 'PRKCB', 'NFKB2', 'DOCK2', 'RAC2', 'ELMO1', 'WAS']


In [17]:
enrich_df = same_sign_enr.res2d
# NOTE(review): despite the variable name, the gene list recorded for this cell
# (JUN, CDKN1B, MMP1, ...) is the one Enrichr reported for the
# "AP-1 transcription factor network" term, not "Direct p53 effectors".
# The name `p53_genes` is kept because later cells read it; confirm which
# pathway was intended.
# Select by term and column label instead of the positional iloc[7, 9], which
# would silently pick a different pathway if the Enrichr ranking shifted.
ap1_rows = enrich_df[enrich_df["Term"].str.startswith("AP-1 transcription factor network")]
p53_genes = ap1_rows["Genes"].iloc[0]

# Enrichr returns the overlapping genes as one ';'-separated string.
p53_genes = p53_genes.split(';')
p53_genes

['JUN',
 'CDKN1B',
 'MMP1',
 'NFATC2',
 'FOSL2',
 'FOSL1',
 'GJA1',
 'ELF1',
 'FABP4',
 'PLAU',
 'SP1',
 'BAG1',
 'TRIP6',
 'CTNNB1',
 'JUNB']

In [18]:
# Reload the appended table of FDR-significant trans results for all cancer
# types, replacing the filtered frame held under this name.
sig_heatmap_csv = "csv_files/sig_prot_heatmap_EGFR.csv"
df_FDR_append = pd.read_csv(sig_heatmap_csv)

In [19]:
# Keep only the rows whose gene appears in the selected pathway gene list.
in_pathway = df_FDR_append["Comparison"].isin(p53_genes)
df_FDR_append = df_FDR_append.loc[in_pathway]
df_FDR_append

Unnamed: 0,Comparison,Correlation,P_Value,Cancer
267,NFATC2,-0.385156,0.003265,GBM
1238,FABP4,-0.282661,0.04074,GBM
1329,GJA1,0.275381,0.048127,GBM
1376,NFATC2,-0.501696,8e-06,HNSCC
1381,PLAU,0.494005,1.1e-05,HNSCC
1480,GJA1,0.418377,0.00027,HNSCC
1551,TRIP6,0.395337,0.000638,HNSCC
1684,JUN,0.364643,0.001901,HNSCC
1782,CDKN1B,-0.350198,0.002962,HNSCC
1999,BAG1,-0.327064,0.005692,HNSCC


In [20]:
# Smallest p-value among the selected rows; passed to the plot as legend_min.
legend_min = df_FDR_append["P_Value"].min()
# Draw the circle heat map with the plot_utils helper (no PNG is saved here).
p.plotCircleHeatMap(
    df_FDR_append,
    circle_var="P_Value",
    color_var="Correlation",
    x_axis="Comparison",
    y_axis="Cancer",
    plot_width=1000,
    plot_height=500,
    legend_min=legend_min,
    legend_max=0.05,
    font_size=10,
    show_legend=True,
)