# Make EGFR Immune Figure 

This notebook takes all the trans genes that have the same results across cancers and runs a GSEA using NCI-Nature_2016. Top hits included T and B cell signaling pathways. The heat map includes genes from the GSEA as well as additional genes related to the immune system. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas.util.testing as tm
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
import cptac.utils as u
import plot_utils as p 
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


# Step 1: Find trans proteins with the same effects in different cancers 

Load df with all of the genes that are FDR significant.

In [2]:
# Read the table of FDR-significant results and index it by the "Comparison" column.
FDR_sig = (
    pd.read_csv("../Make_Tables/csv_files/Supplemental_Table_EGFR_sig_only.csv")
    .set_index("Comparison")
)
FDR_sig

Unnamed: 0_level_0,Correlation_Gbm,P_value_Gbm,Correlation_Kidney,P_value_Kidney,Correlation_Ovarian,P_value_Ovarian,Correlation_Luad,P_value_Luad,Correlation_Lscc,P_value_Lscc,Correlation_Brca,P_value_Brca,Correlation_Colon,P_value_Colon,Correlation_Hnscc,P_value_Hnscc
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PHLDA1,0.816848,3.507071e-21,,,,,,,0.713420,2.644826e-14,0.364797,0.002164,,,0.664271,8.888640e-12
GRB2,-0.610889,6.729990e-08,,,,,-0.302439,0.020631,,,,,,,-0.532341,3.320092e-06
SOCS2,0.562720,3.420388e-06,,,,,,,0.472624,1.417921e-02,,,,,,
CDH4,0.559180,3.420388e-06,,,,,,,,,,,,,,
DAB2,-0.556402,3.420388e-06,,,,,,,,,0.326055,0.003543,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CLTC,,,,,,,,,,,,,,,0.267632,4.813589e-02
PLEC,,,,,,,,,,,,,,,0.267522,4.824560e-02
LRRK2,,,,,,,,,,,,,,,-0.267457,4.830683e-02
MBD1,,,,,,,,,,,,,,,-0.266098,4.993781e-02


In [3]:
def HasPosNeg(row):
    """Return True if `row` holds at least one positive and one negative value.

    Parameters
    ----------
    row : iterable
        Values to inspect (e.g. one DataFrame row passed by ``apply(axis=1)``).
        Null entries (as judged by ``pd.isnull``) and zeros are ignored.

    Returns
    -------
    bool
        True when both signs are present, otherwise False.
    """
    hasPos = False
    hasNeg = False

    for item in row:
        if pd.isnull(item):
            continue
        if item < 0:
            hasNeg = True
        elif item > 0:
            hasPos = True
        # Logical `and` (not bitwise `&`); stop as soon as both signs are seen.
        if hasPos and hasNeg:
            return True
    return False

def CountPosNeg(row):
    """Count the entries of `row` that are strictly positive or strictly negative.

    Parameters
    ----------
    row : iterable
        Values to inspect (e.g. one DataFrame row passed by ``apply(axis=1)``).
        Null entries (as judged by ``pd.isnull``) and zeros are not counted.

    Returns
    -------
    int
        Number of non-null, non-zero entries.
    """
    counter = 0
    for item in row:
        if pd.isnull(item):
            continue
        # Explicit sign tests keep the original comparison semantics.
        if item < 0 or item > 0:
            counter += 1
    return counter


Subset data frame to include only trans genes that have the same effects in different cancers by using apply function

In [4]:
# Correlation columns, one per cancer type.
col = ["Correlation_Gbm","Correlation_Kidney","Correlation_Ovarian","Correlation_Brca","Correlation_Luad","Correlation_Hnscc","Correlation_Lscc","Correlation_Colon"]

# Work on a copy so that adding a column never writes into a slice of FDR_sig
# (the resulting SettingWithCopyWarning was hidden by warnings.filterwarnings).
FDR_corr = FDR_sig[col].copy()
FDR_corr["Pos_Neg"] = FDR_corr.apply(HasPosNeg, axis=1)

# Keep only proteins whose correlations do NOT include both signs.
# Copy again so later cells can add columns to this subset safely.
FDR_corr_False = FDR_corr[~FDR_corr["Pos_Neg"].astype(bool)].copy()


In [5]:
#Get only proteins significant in atleast two cancers
FDR_corr_False["Num_corr"] = FDR_corr_False.apply(CountPosNeg, axis = 1)
FDR_corr_False.sort_values(by=['Num_corr'],ascending=False)
FDR_corr_False = FDR_corr_False[FDR_corr_False['Num_corr'].isin([2,3,4,5,6])]
FDR_corr_False.head()

Unnamed: 0_level_0,Correlation_Gbm,Correlation_Kidney,Correlation_Ovarian,Correlation_Brca,Correlation_Luad,Correlation_Hnscc,Correlation_Lscc,Correlation_Colon,Pos_Neg,Num_corr
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PHLDA1,0.816848,,,0.364797,,0.664271,0.71342,,False,4
GRB2,-0.610889,,,,-0.302439,-0.532341,,,False,3
SOCS2,0.56272,,,,,,0.472624,,False,2
GLA,-0.550491,-0.298348,,,,,,,False,2
PHLDA3,0.525883,0.43253,,,,0.763784,0.682034,,False,4


In [6]:
# Names (index labels) of the proteins that passed the same-sign filter.
same_sign_prot = FDR_corr_False.index.tolist()
n_same_sign = len(same_sign_prot)
print(f"Total number of trans proteins with same effects in different cancers is {n_same_sign}")

Total number of trans proteins with same effects in different cancers is 1136


# Run GSEA

In [9]:
# Run gseapy's enrichr on the same-sign protein list with the NCI-Nature_2016 gene sets.
same_sign_enr = gp.enrichr(
    gene_list=same_sign_prot,
    description='Tumor_partition',
    gene_sets='NCI-Nature_2016',
    outdir='test/enrichr_KEGG',
)
# Display the first 20 rows of the results table.
same_sign_enr.res2d.head(20)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,NCI-Nature_2016,CXCR4-mediated signaling events Homo sapiens 4...,29/100,1.33389e-13,2.787831e-11,0,0,5.105634,151.3591,GNAI3;PIK3CD;CD3G;ARRB2;PIK3CG;GNAI1;GRK6;PTK2...
1,NCI-Nature_2016,TCR signaling in naive CD4+ T cells Homo sapie...,20/64,1.854077e-10,1.937511e-08,0,0,5.501761,123.286002,MAP4K1;WAS;CD3G;VAV1;ZAP70;CD4;PTPRC;STIM1;LCK...
2,NCI-Nature_2016,HIF-1-alpha transcription factor network Homo ...,17/66,1.054419e-07,7.345784e-06,0,0,4.534784,72.851793,EGLN1;JUN;TFRC;SLC2A1;ENO1;NDRG1;HK2;NT5E;LDHA...
3,NCI-Nature_2016,TCR signaling in naive CD8+ T cells Homo sapie...,15/53,1.530121e-07,7.994882e-06,0,0,4.982727,78.192676,CD3G;VAV1;ZAP70;PTPRC;STIM1;LCK;GRAP2;TRAF6;LC...
4,NCI-Nature_2016,Integrin family cell surface interactions Homo...,10/26,7.73261e-07,3.232231e-05,0,0,6.771398,95.291503,ITGA4;ITGB4;ITGA2;ITGB8;ITGA7;ITGAV;ITGA6;ITGB...
5,NCI-Nature_2016,a6b1 and a6b4 Integrin signaling Homo sapiens ...,13/45,7.942656e-07,2.766692e-05,0,0,5.086072,71.438194,COL17A1;LAMA5;LAMB3;ITGB4;LAMC2;YWHAZ;YWHAQ;CD...
6,NCI-Nature_2016,PDGFR-beta signaling pathway Homo sapiens c901...,23/128,8.028533e-07,2.397091e-05,0,0,3.163512,44.400192,CYFIP2;STAT5A;NCKAP1;STAT5B;JUN;YES1;LRP1;SPHK...
7,NCI-Nature_2016,Arf6 trafficking events Homo sapiens 7a5b8f09-...,13/49,2.310351e-06,6.035792e-05,0,0,4.670882,60.619231,RALA;ITGA4;ITGA2;MAPK8IP3;ACAP1;CDH1;CTNNA1;IT...
8,NCI-Nature_2016,Validated transcriptional targets of TAp63 iso...,13/53,5.988083e-06,0.0001390566,0,0,4.318363,51.931507,GPX2;TFAP2C;JAG1;S100A2;ITGB4;PRKCD;YWHAQ;SP1;...
9,NCI-Nature_2016,Beta1 integrin cell surface interactions Homo ...,14/66,1.616497e-05,0.0003378478,0,0,3.734528,41.201798,COL18A1;LAMA5;ITGA4;LAMB3;ITGA2;TNC;FN1;LAMC2;...


In [10]:
#get just the clotting cascade genes and add urokinase genes 
same_sign_df = same_sign_enr.res2d
T_H_cell = same_sign_df.iloc[1,9]
T_H_cell = T_H_cell.split(';')
T_K_cell = same_sign_df.iloc[3,9]
T_K_cell = T_K_cell.split(";")
immune_both =  T_H_cell + T_K_cell
immune = set(immune_both)
immune = list(immune)
btcell_apm = (["FYB1","PLCG1","CD5","CD8A","BLK","BLNK","CD79A","INPP5D","LYN","CD3E","HLA-DMA","HLA-DMB","PRKCB","NFKB2","CHUK","PLCG2","PPP3CB","PPP3CC","MAP3K8","NFATC2","PIK3CG","SYK","BTK","RAC2","ELMO1","DOCK2","WAS"])

immune= immune + btcell_apm

immune.remove("FLNA")
immune.remove("WAS")
immune.remove("NCK1")
immune.remove("CARD11")
immune.remove("TRAF6")
immune.remove("STIM1")
immune.remove("KRAS")



In [None]:
# Split the semicolon-separated entry at row 0, column 9 of the enrichment table into a list.
cxcr4 = same_sign_df.iloc[0, 9].split(';')


# Step 3 Make Data frame for figure 2

In [11]:
#Get append version of the df with all cancer type, fdr sig trans results
df_FDR_append = pd.read_csv("../Make_Tables/csv_files/sig_prot_heatmap_EGFR.csv")

#subset dataframe to include genes only desired for figure 
df_FDR_append= df_FDR_append[df_FDR_append.Comparison.isin(immune)]
df_FDR_append

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Cancer,Correlation_Hnscc,P_value_Hnscc,Correlation_Luad,P_value_Luad,Correlation_Lscc,P_value_Lscc,Correlation_Brca,P_value_Brca,Correlation,P_Value
689,PRKCB,,,Kidney,,,,,,,,,-0.322809,0.011837
737,PLCG2,,,Kidney,,,,,,,,,-0.317009,0.013691
885,GRAP2,,,Kidney,,,,,,,,,-0.30529,0.021277
945,DOCK2,,,Kidney,,,,,,,,,-0.293756,0.024582
1015,VAV1,,,Kidney,,,,,,,,,-0.285693,0.030254
1064,WAS,,,Kidney,,,,,,,,,-0.279814,0.035337
1096,PLCG1,,,Kidney,,,,,,,,,-0.287556,0.037584
1108,ZAP70,,,Kidney,,,,,,,,,-0.276087,0.038339
1165,INPP5D,,,Kidney,,,,,,,,,-0.271891,0.041827
1362,PRKCB,,,Colon,,,,,,,,,0.395927,0.006516


In [None]:
# The smallest P_Value in the table is passed as the legend minimum.
legend_min = df_FDR_append["P_Value"].min()

# Draw the circle heat map with plot_utils and save it as Figure2.png.
p.plotCircleHeatMap(
    df_FDR_append,
    circle_var="P_Value",
    color_var="Correlation",
    x_axis="Comparison",
    y_axis="Cancer",
    plot_width=1000,
    plot_height=500,
    legend_min=legend_min,
    legend_max=0.05,
    font_size=10,
    show_legend=True,
    save_png="Figure2.png",
)

CHUK not significant in any cancer. 

Add a new column to use as a unique index ("gene cancer") and reorder the rows by that index. This way the genes are plotted in the curated order listed in the next cell rather than in file order.

In [None]:

# Build a unique "<gene> <cancer>" label per row and use it as the index.
# .assign returns a new frame, so nothing is written into the filtered slice
# (avoids chained assignment), and the cell can be re-run safely.
df_FDR_append = (
    df_FDR_append
    .assign(Index=df_FDR_append["Comparison"] + " " + df_FDR_append["Cancer"])
    .set_index("Index")
)

# Curated row order for the figure.
heatmap_order = ['CD3E Hnscc', 'CD3G Hnscc','CD4 Hnscc','CD8A Luad', 'LCK Hnscc', 'ZAP70 Hnscc','LCP2 Hnscc', 'GRAP2 Hnscc','VAV1 Hnscc','GRB2 Hnscc',"LCP2 Hnscc","FYB1 Hnscc" , "PLCG1 Gbm",'NFATC2 Hnscc','PPP3CC Hnscc', 'RAC2 Hnscc','CD79A Hnscc','SYK Hnscc','BTK Hnscc','CD5 Hnscc','PTPN6 Hnscc','LYN Hnscc','PTPRC Hnscc','PIK3CG Hnscc','INPP5D Hnscc','PRKCQ Hnscc','BLNK Gbm','BLK Luad',"HLA-DMA Gbm", "HLA-DMB Gbm","HLA-DMB Hnscc",
                 "NFKB2 Gbm" ,"NFKB2 Luad", "PRKCB Hnscc", "PRKCB Luad", "PRKCB Kidney","PLCG2 Hnscc","PLCG2 Gbm","PLCG2 Luad","PLCG2 Kidney","ELMO1 Gbm","DOCK2 Gbm","WAS Gbm","ELMO1 Hnscc","DOCK2 Hnscc","WAS Hnscc",
                 'CD4 Gbm', 'LCP2 Gbm', 'VAV1 Gbm','GRB2 Gbm', "FYB1 Gbm", "LCP2 Gbm", 'NFATC2 Gbm','RAC2 Gbm','SYK Gbm','BTK Gbm', 'PTPN6 Gbm','PTPRC Gbm','PIK3CG Gbm','INPP5D Gbm','PRKCQ Gbm',
                 'CD3G Luad','LCK Luad',"FYB1 Luad", 'ZAP70 Luad','GRB2 Luad','PPP3CC Luad', 'RAC2 Luad','BTK Luad','PTPN6 Luad','INPP5D Luad','PRKCQ Luad',"WAS Luad","DOCK2 Luad","ELMO1 Hnscc",
                 'VAV1 Kidney',"PLCG1 Kidney","ZAP70 Kidney", "INPP5D Kidney","GRAP2 Kidney","WAS Kidney","HLA-DMA Gbm","HLA-DMB Gbm",
                 "NFATC2 Lscc" ,"PLCG2 Brca", "PLCG1 Brca","NFKB2 Colon", "PRKCB Colon"]

# The curated list repeats a few labels (e.g. "LCP2 Hnscc", "HLA-DMA Gbm");
# reindexing with repeated labels duplicates those rows, so keep only the first
# occurrence of each label while preserving order.
heatmap_order = list(dict.fromkeys(heatmap_order))

# Labels that are absent from df_FDR_append still yield all-NaN rows, as before.
df_ordered = df_FDR_append.reindex(heatmap_order)

                                    

# Step 4: Plot Figure 2

In [None]:
# The smallest P_Value in the ordered table is passed as the legend minimum.
legend_min = df_ordered["P_Value"].min()

# Draw the circle heat map with plot_utils and save it as Figure_EGFR_immune.png.
p.plotCircleHeatMap(
    df_ordered,
    circle_var="P_Value",
    color_var="Correlation",
    x_axis="Comparison",
    y_axis="Cancer",
    plot_width=1200,
    plot_height=500,
    legend_min=legend_min,
    legend_max=0.05,
    font_size=10,
    show_legend=True,
    save_png="Figure_EGFR_immune.png",
)

# Check if blanks are due to no data 

The following code chunks show that these cancer/gene combinations don't have data: Colon THBD, Kidney MMP12, and Ovarian MMP12 (as mentioned in the EGFR Figure 2 legend).

In [None]:
#Get append version of the df with all proteins 
df_all_prot_append = pd.read_csv("../Make_Tables/csv_files/all_prot_heatmap_EGFR.csv")
df_all_prot_append

#subset dataframe to include genes only desired for figure 
df_all_immune = df_all_prot_append[df_all_prot_append.Comparison.isin(immune)]
print("Number of rows in data frame " + str(len(df_all_immune)))
#total of 37 immune proteins x 8 cancers should have 296 rows.  missing data 31 genes

        


In [None]:
def find_NA_genes(immune, cancer_list):
    """Return the genes in `immune` that do not appear in `cancer_list`.

    Parameters
    ----------
    immune : iterable of str
        Genes to look for.
    cancer_list : iterable of str
        Genes that are present for one cancer.

    Returns
    -------
    list
        Genes from `immune` that are absent from `cancer_list`, in the order
        (and with the multiplicity) in which they occur in `immune`.
    """
    # A set gives constant-time membership tests instead of rescanning the list.
    present = set(cancer_list)
    return [gene for gene in immune if gene not in present]

In [None]:
# For every cancer, print its name followed by the immune genes that have no row for it.
cancers = ["Gbm", "Luad","Lscc","Hnscc","Ovarian","Kidney","Brca","Colon"]
for cancer in cancers:
    rows_for_cancer = df_all_immune["Cancer"] == cancer
    cancer_check = df_all_immune.loc[rows_for_cancer, "Comparison"].to_list()
    print(cancer)
    print(find_NA_genes(immune, cancer_check))

In [None]:
#MHC class2 (CD4) molecules
mhc2 = ["HLA-DPA1","HLA-DPB1","HLA-DMA","HLA-DMB","HLA-DOA","HLA-DOB","HLA-DQ","HLA-DRA","HLA-DRB1"]
#subset dataframe to include genes only desired for figure 
df_all_mhc2 = df_all_prot_append[df_all_prot_append.Comparison.isin(mhc2)]
print("Number of missing columns" )
print(56 - len(df_all_mhc2))
#HLA-DP and HLA-DM missing for all. HLA-DRA and HLA-DRB1 consistently not sig 
df_all_mhc2

In [None]:
#CD8 and MHC class 1 molecules
mhc1 = ["CD28","PDCD1","CD279","CTL4","CD152"] #don't have data for pd-1 (pdcd1 or cd279) and CTLA4 (aka CD152)

df_all_mhc1 = df_all_prot_append[df_all_prot_append.Comparison.isin(mhc1)]
df_all_mhc1