# Make EGFR Figure 7A

This notebook takes all the trans genes that have the same Pearson correlation sign in at least *N* results (TODO: fill in the threshold *N*, which is missing from this text) and runs a GSEA using Reactome. It then takes a subset of genes from the top hit (Hemostasis) pathway and maps them onto a large circle heat map. This heat map focuses on coagulation- and urokinase-related genes. 

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas.util.testing as tm
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import plot_utils as p 

  import pandas.util.testing as tm


# Step 1: Find Trans proteins with opposite effects in different cancers 

Load df with all of the genes that are FDR significant.

In [3]:
# Load the table of FDR-significant results and tidy it in a single chain.
FDR_sig = (
    pd.read_csv("Pval_corr_table_Fig_2_return_sig.csv")
    .replace(to_replace='_proteomics', value='', regex=True)  # Remove proteomics labeling
    .drop(columns=['Unnamed: 0'])  # presumably a saved row index; not needed here
    .set_index("Comparison")
    .iloc[1:]  # Drop EGFR (relies on EGFR being the first row)
)
FDR_sig

FileNotFoundError: [Errno 2] File Pval_corr_table_Fig_2_return_sig.csv does not exist: 'Pval_corr_table_Fig_2_return_sig.csv'

# Step 3: Make data frame for Figure 2

In [3]:
# Gene lists for the figure, split by the group they are assigned to.
# Original author's open notes, kept as written: "Sig PTEN", "CD2? CHUK".
t_only = [
    'CD3E', 'CD3G', 'CD4', 'CD8A',
    'GRAP2', 'FYB1', 'PRKCQ',
]
b_only = [
    'PPP3CC', 'CD79A', 'SYK', 'BTK', 'LYN',
    'BLNK', 'BLK', 'HLA-DMA', 'HLA-DMB',
]
both = [
    'CD2', 'CD5', 'GRB2', 'LCK', 'ZAP70', 'VAV1', 'PTPN6',
    'PTPRC', 'PIK3CD', 'PIK3CG', 'INPP5D', 'PLCG1', 'PLCG2', 'NFATC2',
    'PRKCB', 'CHUK', 'NFKB2', 'DOCK2', 'RAC2', 'ELMO1', 'WAS',
]

# Full gene set in plotting order: t_only, then b_only, then both.
immune = [*t_only, *b_only, *both]

In [4]:
# Get the appended version of the df: all cancer types, FDR-significant trans results.
df_FDR_append = pd.read_csv("Pval_corr_table_Fig_2_return_sig_append.csv")
df_FDR_append = df_FDR_append.drop(['Unnamed: 0'], axis=1)

# Remove Proteomics labeling
df_FDR_append = df_FDR_append.replace(to_replace ='_proteomics', value = '', regex = True)

# Subset the dataframe to the genes wanted for the figure.
# .copy() makes the subset an independent frame, so adding a column to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df_FDR_append = df_FDR_append[df_FDR_append.Comparison.isin(immune)].copy()
df_FDR_append

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
1,GRB2,-0.610889,6.729990e-08,GBM
24,WAS,-0.501918,4.961122e-05,GBM
72,PLCG2,-0.472453,1.145235e-04,GBM
75,SYK,-0.470849,1.202510e-04,GBM
82,CD4,-0.467479,1.362923e-04,GBM
...,...,...,...,...
7781,PRKCQ,-0.284462,3.293176e-02,Head and Neck
7876,RAC2,-0.277100,3.855935e-02,Head and Neck
8461,NFATC2,-0.297428,4.438700e-02,Lscc
8573,PRKCB,0.395927,6.516094e-03,Colon


Add a new column to serve as a unique index, then order the rows by that index. This way genes will be grouped by whether their proteins are associated with T cells, B cells, or both.

In [5]:
# Order index for Heatmap: key every row by "<gene> <cancer type>".
df_FDR_append["Index"] = df_FDR_append["Comparison"] + " " + df_FDR_append["Cancer Type"]
df_FDR_append = df_FDR_append.set_index("Index")

# Target row order: genes in `immune` order, and within each gene the cancers
# in `cancer_list` order.
cancer_list = ['Breast', 'Colon', 'GBM', 'Head and Neck', 'Kidney', 'Lscc', 'Luad']
ordered_list = [gene + ' ' + cancer for gene in immune for cancer in cancer_list]

# reindex yields all-NaN rows for keys absent from the table; NaN < 0.05 is
# False, so the significance mask below also discards those rows.
ordered_df = df_FDR_append.reindex(ordered_list)
is_significant = ordered_df['P_value'] < 0.05
ordered_df = ordered_df.loc[is_significant]  # Keep sig
     

In [6]:
                               
# Replace the cancer labels with the abbreviations listed below.
# The replacement is applied to every cell of the frame, as in the original.
cancer_abbreviations = {
    'Head and Neck': 'HNSCC',
    'Luad': 'LUAD',
    'Lscc': 'LSCC',
    'Ovarian': 'OV',
    'Colon': 'CO',
    'Kidney': 'ccRCC',
    'Breast': 'BR',
    'Gbm': 'GBM',
    'Endo': 'EC',
}
ordered_df = ordered_df.replace(cancer_abbreviations)
ordered_df['Cancer Type'].unique()

array(['HNSCC', 'LUAD', 'GBM', 'ccRCC', 'BR', 'LSCC', 'CO'], dtype=object)

# Step 4: Plot Figure 2

In [1]:
# Smallest p-value among the plotted rows.
# NOTE(review): this value is not passed to the plot call, which uses a fixed 1e-8 — confirm that is intended.
legend_min = ordered_df["P_value"].min()

# Make plot using plot utils
heatmap_options = dict(
    circle_var="P_value",
    color_var="Correlation",
    x_axis="Comparison",
    y_axis="Cancer Type",
    plot_width=1000,
    plot_height=500,
    legend_min=1e-8,
    legend_med=0.00001,
    legend_max=0.05,
    font_size=12,
    show_legend=True,
    save_png="Figure2.png",
)
p.plotCircleHeatMap(ordered_df, **heatmap_options)

NameError: name 'ordered_df' is not defined

# Check if blanks are due to no data 

The following code chunks show that these cancer/gene combinations have no data: Colon THBD, Kidney MMP12, and Ovarian MMP12 (as mentioned in the EGFR Figure 2 legend).

In [9]:
# Get the appended version of the df with all proteins, tidied in one chain.
df_all_prot_append = (
    pd.read_csv("Pval_corr_table_Fig_2_return_all_append.csv")
    .drop(columns=['Unnamed: 0'])
    .replace(to_replace='_proteomics', value='', regex=True)  # Remove Proteomics labeling
)



In [10]:
# MHC class2 (CD4) molecules
mhc2 = [
    "HLA-DPA1", "HLA-DPB1",
    "HLA-DMA", "HLA-DMB",
    "HLA-DOA", "HLA-DOB",
    "HLA-DQ",
    "HLA-DRA", "HLA-DRB1",
]

# Restrict the all-protein table to the rows for the genes above.
in_mhc2 = df_all_prot_append["Comparison"].isin(mhc2)
df_all_mhc2 = df_all_prot_append[in_mhc2]

# NOTE(review): 56 is a hard-coded expected row count — confirm how it was derived.
n_missing = 56 - len(df_all_mhc2)
print("Number of missing columns" )
print(n_missing)



Number of missing columns
4


Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
141,HLA-DMB,-0.442005,0.000348,GBM
603,HLA-DMA,-0.373678,0.005056,GBM
1667,HLA-DPB1,-0.262423,0.055717,GBM
1681,HLA-DRA,-0.261173,0.057358,GBM
2494,HLA-DPA1,-0.218791,0.126822,GBM
2705,HLA-DRB1,-0.209905,0.146458,GBM
12448,HLA-DPB1,-0.222952,0.109021,Kidney
12942,HLA-DPA1,-0.199014,0.164162,Kidney
13489,HLA-DRA,-0.17688,0.229302,Kidney
15164,HLA-DMA,0.123195,0.443495,Kidney


In [11]:
# CD8 and MHC class 1 molecules
mhc1 = ["CD8A", "HLA-A", "HLA-B", "HLA-C"]

# Rows of the all-protein table whose Comparison gene is in mhc1.
in_mhc1 = df_all_prot_append["Comparison"].isin(mhc1)
df_all_mhc1 = df_all_prot_append[in_mhc1]
df_all_mhc1

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
4460,HLA-C,0.149001,0.338227,GBM
4767,HLA-B,0.140454,0.37155,GBM
6132,CD8A,-0.1552,0.528789,GBM
8068,HLA-A,-0.058388,0.749978,GBM
13260,CD8A,-0.240222,0.201476,Kidney
16995,HLA-A,-0.08214,0.673774,Kidney
17871,HLA-B,-0.056599,0.769694,Kidney
19497,HLA-C,-0.022882,0.917089,Kidney
26082,CD8A,0.108046,0.656611,Ovarian
26858,HLA-C,0.08565,0.719148,Ovarian


In [12]:
# Remove Proteomics labeling
df_all_prot_append = df_all_prot_append.replace(to_replace='_proteomics', value='', regex=True)

# Subset the dataframe to the genes wanted for the figure.
# NOTE(review): coag_upa is not defined in any visible cell — it must be defined before this cell can run.
in_coag_upa = df_all_prot_append["Comparison"].isin(coag_upa)
df_all_comp_coag = df_all_prot_append[in_coag_upa]

n_rows = len(df_all_comp_coag)
print("Number of rows in data frame " + str(n_rows))

NameError: name 'coag_upa' is not defined

Our figure includes 26 genes for 8 cancers. If all data were present there would be 26 × 8 = 208 rows. However, the data frame has only 205 rows, so 3 gene/cancer combinations are missing. 

In [None]:
def find_missing_genes(test_list, full_list):
    """Print every gene in ``full_list`` that does not appear in ``test_list``.

    Args:
        test_list: Collection of gene names that are present.
        full_list: Iterable of gene names expected to be present.

    Returns:
        None. Each missing gene is printed on its own line, in the order it
        occurs in ``full_list``.
    """
    # Build the lookup once so each membership test is a hash lookup rather
    # than a scan of the whole list.
    try:
        present = set(test_list)
    except TypeError:
        # Unhashable entries: fall back to testing against the original container.
        present = test_list

    for gene in full_list:
        if gene not in present:
            print(gene)
        

In [None]:
# Get the list of genes with data for colon, kidney, and ovarian.
cancer_type = df_all_prot_append["Cancer Type"]

colon = df_all_prot_append.loc[cancer_type == "Colon"]
colon_list = colon["Comparison"].tolist()

Kidney = df_all_prot_append.loc[cancer_type == "Kidney"]
Kidney_list = Kidney["Comparison"].tolist()

Ovarian = df_all_prot_append.loc[cancer_type == "Ovarian"]
Ovarian_list = Ovarian["Comparison"].tolist()

In [None]:
# Show the 3 missing genes: one heading per cancer, followed by the genes in
# coag_upa that are not in that cancer's gene list.
missing_gene_checks = [
    ("Ovarian missing genes: ", Ovarian_list),
    ("Kidney missing genes: ", Kidney_list),
    ("Colon missing genes: ", colon_list),
]
for heading, present_genes in missing_gene_checks:
    print(heading)
    find_missing_genes(present_genes, coag_upa)