# Make Supplemental Tables and variations

This notebook combines dfs with p-values and correlations for 8 cancers. It makes the supplemental data table as well as the dataframes used in downstream analysis.  
Description of created csv files: 
* Supplemental_Table_2 - The supplemental table provided with the manuscript. It has the FDR-corrected p-values and correlations for all proteins.
* Supplemental_Table_EGFR_sig_only - The filtered version of Supplemental Table 2, which keeps only the FDR-significant p-values.
* all_prot_heatmap_EGFR - All data appended into one long table for easy use with the heatmap function.
* sig_prot_heatmap_EGFR - Only the significant genes, in long format for the heatmap.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 


# Flagship

Read in the single cancer dfs with FDR corrected p-values and correlation values. Merge all dfs into one pancancer data frame.

In [2]:
# Load the per-cancer trans-effect tables (one file per cancer type).
# Each file is read into its own frame; the reads are independent of each other.

# Brain / head and neck
Gbm_df = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_GBM")
Hnscc_df = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_HNSCC")

# Lung
Luad_df = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_LUAD")
Lscc_df = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_LSCC")

# Breast / ovarian
Brca_df = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_BR")
Ovarian_df = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_OV")

# Colon / kidney
Colon_df = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_CO")
Kidney_df = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_ccRCC")


# Make Supplemental Table 2

Merge all cancer data frames into one wide data frame

In [3]:
# Build the wide pan-cancer table: start from GBM and outer-join every other
# cancer on the "Comparison" column so rows missing in one cancer are kept.
pancan = Gbm_df
for cancer_df in (Kidney_df, Ovarian_df, Luad_df, Lscc_df, Brca_df, Colon_df, Hnscc_df):
    pancan = pancan.merge(cancer_df, on="Comparison", how="outer")

# Drop the first row of the merged table before saving.
# NOTE(review): in the displayed outputs row 0 is EGFR itself — confirm that is the intent.
pancan = pancan.iloc[1:]
pancan.to_csv('csv_files/Supplemental_Table_2.csv', index=False)
pancan

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM,Correlation_ccRCC,P_value_ccRCC,Correlation_OV,P_value_OV,Correlation_LUAD,P_value_LUAD,Correlation_LSCC,P_value_LSCC,Correlation_BR,P_value_BR,Correlation_CO,P_value_CO,Correlation_HNSCC,P_value_HNSCC
1,PHLDA1,0.816848,3.507071e-21,0.254436,0.060261,,,0.260110,0.074530,0.713420,2.644826e-14,0.364797,0.002164,0.386104,0.122847,0.587915,1.282608e-08
2,GRB2,-0.610889,6.729990e-08,-0.217427,0.120342,-0.190090,0.346111,-0.302439,0.020631,-0.198042,2.437176e-01,-0.177379,0.142733,0.150960,0.347409,-0.496325,1.084446e-05
3,SOCS2,0.562720,3.420388e-06,,,,,,,0.472624,1.417921e-02,,,,,-0.014519,9.611234e-01
4,CDH4,0.559180,3.420388e-06,0.148407,0.513490,,,,,,,,,,,,
5,DAB2,-0.556402,3.420388e-06,-0.076173,0.673774,0.076981,0.750510,-0.086403,0.597546,-0.072496,7.501117e-01,0.326055,0.003543,-0.147519,0.360266,-0.224967,7.751436e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14874,PIM1,,,,,,,,,,,,,,,-0.002105,9.917500e-01
14875,HEPHL1,,,,,,,,,,,,,,,-0.001563,9.931644e-01
14876,SULT6B1,,,,,,,,,,,,,,,-0.001236,9.950909e-01
14877,CLEC4G,,,,,,,,,,,,,,,0.001142,9.963768e-01


# Make Supplemental_Table_EGFR_sig_only

In [4]:
# Keep only the rows whose p-value column is at or below 0.05, one frame per cancer.
Gbm_df_sig = Gbm_df[Gbm_df["P_value_GBM"].le(0.05)]
Kidney_df_sig = Kidney_df[Kidney_df["P_value_ccRCC"].le(0.05)]
Colon_df_sig = Colon_df[Colon_df["P_value_CO"].le(0.05)]
Ovarian_df_sig = Ovarian_df[Ovarian_df["P_value_OV"].le(0.05)]
Luad_df_sig = Luad_df[Luad_df["P_value_LUAD"].le(0.05)]
Lscc_df_sig = Lscc_df[Lscc_df["P_value_LSCC"].le(0.05)]
Brca_df_sig = Brca_df[Brca_df["P_value_BR"].le(0.05)]
Hnscc_df_sig = Hnscc_df[Hnscc_df["P_value_HNSCC"].le(0.05)]

In [5]:
# Wide table restricted to significant rows: outer-join the filtered
# per-cancer frames on "Comparison", starting from GBM.
pancan = Gbm_df_sig
for sig_df in (Kidney_df_sig, Ovarian_df_sig, Luad_df_sig, Lscc_df_sig,
               Brca_df_sig, Colon_df_sig, Hnscc_df_sig):
    pancan = pancan.merge(sig_df, on="Comparison", how="outer")

# Drop the first row of the merged table before saving.
pancan = pancan.iloc[1:]
pancan.to_csv('csv_files/Supplemental_Table_EGFR_sig_only.csv', index=False)


In [6]:
# Display the current `pancan` frame as the cell output.
pancan

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM,Correlation_ccRCC,P_value_ccRCC,Correlation_OV,P_value_OV,Correlation_LUAD,P_value_LUAD,Correlation_LSCC,P_value_LSCC,Correlation_BR,P_value_BR,Correlation_CO,P_value_CO,Correlation_HNSCC,P_value_HNSCC
1,PHLDA1,0.816848,3.507071e-21,,,,,,,0.713420,2.644826e-14,0.364797,0.002164,,,0.587915,1.282608e-08
2,GRB2,-0.610889,6.729990e-08,,,,,-0.302439,0.020631,,,,,,,-0.496325,1.084446e-05
3,SOCS2,0.562720,3.420388e-06,,,,,,,0.472624,1.417921e-02,,,,,,
4,CDH4,0.559180,3.420388e-06,,,,,,,,,,,,,,
5,DAB2,-0.556402,3.420388e-06,,,,,,,,,0.326055,0.003543,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6497,TMUB2,,,,,,,,,,,,,,,0.245838,4.970798e-02
6498,PARL,,,,,,,,,,,,,,,0.245796,4.972477e-02
6499,RGS14,,,,,,,,,,,,,,,-0.245782,4.972477e-02
6500,STEAP1,,,,,,,,,,,,,,,0.252364,4.972477e-02


# Make All Heatmap

In [7]:
# Create long df for heatmap
#
# Each per-cancer frame gets a `Cancer` label column and its cancer-specific
# `P_value_<cancer>` / `Correlation_<cancer>` columns renamed to the shared
# names `P_Value` / `Correlation`, then all frames are stacked row-wise.

cancer = ['GBM','HNSCC','LSCC','LUAD','BR','OV','ccRCC','CO']
merged_dfs = [Gbm_df,Hnscc_df,Lscc_df,Luad_df,Brca_df,Ovarian_df,Kidney_df,Colon_df]

# zip pairs each label with its frame, replacing the manual index counter.
long_frames = [
    df.assign(Cancer=label).rename(columns={
        'P_value_' + label: 'P_Value',
        'Correlation_' + label: 'Correlation',
    })
    for label, df in zip(cancer, merged_dfs)
]

# DataFrame.append was removed in pandas 2.0; a single pd.concat gives the same
# stacked result (original row indices are kept) without re-copying per iteration.
all_long_df = pd.concat(long_frames)
        


In [8]:
# Save the long-format table (row index omitted) and show it as the cell output.
all_long_df.to_csv(path_or_buf='csv_files/all_prot_heatmap_EGFR.csv', index=False)
all_long_df

Unnamed: 0,Comparison,Correlation,P_Value,Cancer
0,EGFR,1.000000,0.000000e+00,GBM
1,PHLDA1,0.816848,3.507071e-21,GBM
2,GRB2,-0.610889,6.729990e-08,GBM
3,SOCS2,0.562720,3.420388e-06,GBM
4,CDH4,0.559180,3.420388e-06,GBM
...,...,...,...,...
7108,AK1,-0.000256,9.985768e-01,CO
7109,KRI1,-0.000217,9.986912e-01,CO
7110,MUL1,-0.000272,9.986912e-01,CO
7111,CADPS,0.000064,9.997745e-01,CO


# Significant P values of Heatmap df 

In [9]:
# Keep the long-format rows with P_Value at or below 0.05 and save them.
sig_mask = all_long_df["P_Value"] <= 0.05
only_sig_pvals = all_long_df[sig_mask]
only_sig_pvals.to_csv(path_or_buf='csv_files/sig_prot_heatmap_EGFR.csv', index=False)

# Harmonized

Read in the single cancer dfs with FDR corrected p-values and correlation values. Merge all dfs into one pancancer data frame.

In [10]:
# Load the harmonized per-cancer trans-effect tables (one file per cancer type).
# Each file is read into its own `_pc` frame; the reads are independent of each other.

# Brain / head and neck
Gbm_df_pc = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_GBM_Harmonized")
Hnscc_df_pc = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_HNSCC_Harmonized")

# Lung
Luad_df_pc = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_LUAD_Harmonized")
Lscc_df_pc = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_LSCC_Harmonized")

# Breast / ovarian
Brca_df_pc = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_BR_Harmonized")
Ovarian_df_pc = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_OV_Harmonized")

# Colon / kidney
Colon_df_pc = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_CO_Harmonized")
Kidney_df_pc = pd.read_csv("csv_files/trans_effects_all_prot_fdr_corrected_ccRCC_Harmonized")

In [11]:
# Display the harmonized colon frame as the cell output.
Colon_df_pc

Unnamed: 0,Comparison,Correlation_CO,P_value_CO
0,EGFR,1.000000,0.000000
1,RAB27B,0.466296,0.005035
2,S100A16,0.461603,0.005035
3,STAU1,-0.443358,0.010505
4,YWHAB,-0.435715,0.012681
...,...,...,...
7755,CSRP1,0.000037,0.999993
7756,CHRDL2,-0.000037,0.999993
7757,FAM136A,0.000017,0.999993
7758,NTAN1,0.000001,0.999993


# Harmonized Supplemental Table 2

Merge all cancer data frames into one wide data frame

In [12]:
# Wide harmonized table: outer-join every harmonized per-cancer frame on
# "Comparison", starting from GBM. No rows are dropped and nothing is saved here;
# the merged frame is only displayed.
harmonized = Gbm_df_pc
for cancer_df_pc in (Kidney_df_pc, Ovarian_df_pc, Luad_df_pc, Lscc_df_pc,
                     Brca_df_pc, Colon_df_pc, Hnscc_df_pc):
    harmonized = harmonized.merge(cancer_df_pc, on="Comparison", how="outer")

harmonized

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM,Correlation_ccRCC,P_value_ccRCC,Correlation_OV,P_value_OV,Correlation_LUAD,P_value_LUAD,Correlation_LSCC,P_value_LSCC,Correlation_BR,P_value_BR,Correlation_CO,P_value_CO,Correlation_HNSCC,P_value_HNSCC
0,EGFR,1.000000,0.000000e+00,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000e+00,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000e+00
1,PHLDA1,0.789551,9.649849e-19,0.180866,0.249285,0.383373,0.128340,0.261161,0.058276,0.682116,1.398013e-12,0.212226,0.066258,0.437505,0.143566,0.583064,1.395609e-08
2,CDH4,0.656331,4.744022e-10,0.015524,0.965357,,,,,,,,,,,,
3,GRB2,-0.589008,3.310325e-07,-0.245694,0.081461,-0.097221,0.639167,-0.265172,0.054181,-0.098621,5.999783e-01,-0.168101,0.160235,-0.017903,0.972007,-0.481605,2.053150e-05
4,PHLDA3,0.561528,2.006861e-06,0.364164,0.005159,-0.260799,0.159831,0.216646,0.131629,0.651618,5.103796e-11,0.152625,0.207694,0.161425,0.624900,0.693125,2.351012e-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14778,MROH2B,,,,,,,,,,,,,,,-0.007516,9.823474e-01
14779,TRIM54,,,,,,,,,,,,,,,-0.003303,9.873752e-01
14780,HEPHL1,,,,,,,,,,,,,,,0.002924,9.884925e-01
14781,SH3BGR,,,,,,,,,,,,,,,0.002168,9.956978e-01


In [13]:
# Rebuild the wide harmonized table from the per-cancer frames (outer join on
# "Comparison", GBM first), then drop the first row and save.
harmonized = Gbm_df_pc
for cancer_df_pc in (Kidney_df_pc, Ovarian_df_pc, Luad_df_pc, Lscc_df_pc,
                     Brca_df_pc, Colon_df_pc, Hnscc_df_pc):
    harmonized = harmonized.merge(cancer_df_pc, on="Comparison", how="outer")

# Drop the first row of the merged table before saving.
# NOTE(review): in the displayed outputs row 0 is EGFR itself — confirm that is the intent.
harmonized = harmonized.iloc[1:]
harmonized.to_csv('csv_files/Supplemental_Table_2_Harmonized.csv', index=False)
harmonized

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM,Correlation_ccRCC,P_value_ccRCC,Correlation_OV,P_value_OV,Correlation_LUAD,P_value_LUAD,Correlation_LSCC,P_value_LSCC,Correlation_BR,P_value_BR,Correlation_CO,P_value_CO,Correlation_HNSCC,P_value_HNSCC
1,PHLDA1,0.789551,9.649849e-19,0.180866,0.249285,0.383373,0.128340,0.261161,0.058276,0.682116,1.398013e-12,0.212226,0.066258,0.437505,0.143566,0.583064,1.395609e-08
2,CDH4,0.656331,4.744022e-10,0.015524,0.965357,,,,,,,,,,,,
3,GRB2,-0.589008,3.310325e-07,-0.245694,0.081461,-0.097221,0.639167,-0.265172,0.054181,-0.098621,5.999783e-01,-0.168101,0.160235,-0.017903,0.972007,-0.481605,2.053150e-05
4,PHLDA3,0.561528,2.006861e-06,0.364164,0.005159,-0.260799,0.159831,0.216646,0.131629,0.651618,5.103796e-11,0.152625,0.207694,0.161425,0.624900,0.693125,2.351012e-14
5,GLA,-0.562315,2.006861e-06,-0.273040,0.047412,-0.021412,0.935707,0.213431,0.138822,0.006542,9.809384e-01,-0.000452,0.997917,-0.029155,0.947540,0.019987,9.161161e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14778,MROH2B,,,,,,,,,,,,,,,-0.007516,9.823474e-01
14779,TRIM54,,,,,,,,,,,,,,,-0.003303,9.873752e-01
14780,HEPHL1,,,,,,,,,,,,,,,0.002924,9.884925e-01
14781,SH3BGR,,,,,,,,,,,,,,,0.002168,9.956978e-01


# Make Supplemental_Table_EGFR_sig_only_Harmonized

In [14]:
# Keep only the harmonized rows whose p-value column is at or below 0.05, one frame per cancer.
Gbm_df_sig_pc = Gbm_df_pc[Gbm_df_pc["P_value_GBM"].le(0.05)]
Kidney_df_sig_pc = Kidney_df_pc[Kidney_df_pc["P_value_ccRCC"].le(0.05)]
Colon_df_sig_pc = Colon_df_pc[Colon_df_pc["P_value_CO"].le(0.05)]
Ovarian_df_sig_pc = Ovarian_df_pc[Ovarian_df_pc["P_value_OV"].le(0.05)]
Luad_df_sig_pc = Luad_df_pc[Luad_df_pc["P_value_LUAD"].le(0.05)]
Lscc_df_sig_pc = Lscc_df_pc[Lscc_df_pc["P_value_LSCC"].le(0.05)]
Brca_df_sig_pc = Brca_df_pc[Brca_df_pc["P_value_BR"].le(0.05)]
Hnscc_df_sig_pc = Hnscc_df_pc[Hnscc_df_pc["P_value_HNSCC"].le(0.05)]

In [15]:
# Wide harmonized table restricted to significant rows: outer-join the filtered
# harmonized per-cancer frames on "Comparison", starting from GBM.
harmonized = pd.merge(Gbm_df_sig_pc, Kidney_df_sig_pc, on="Comparison", how = "outer")
harmonized = pd.merge(harmonized, Ovarian_df_sig_pc, on="Comparison", how = "outer")
harmonized = pd.merge(harmonized, Luad_df_sig_pc, on="Comparison", how = "outer")
harmonized = pd.merge(harmonized, Lscc_df_sig_pc, on="Comparison", how = "outer")
harmonized = pd.merge(harmonized, Brca_df_sig_pc, on="Comparison", how = "outer")
harmonized = pd.merge(harmonized, Colon_df_sig_pc, on="Comparison", how = "outer")
harmonized = pd.merge(harmonized, Hnscc_df_sig_pc, on="Comparison", how = "outer")

# Drop the first row of the harmonized merge itself. This previously sliced
# `pancan` (the flagship significant-only table), which discarded the merge above
# and wrote flagship data to the harmonized CSV.
harmonized = harmonized[1:]
harmonized.to_csv('csv_files/Supplemental_Table_EGFR_sig_only_Harmonized.csv', index=False)

# Make All Heatmap (Harmonized)

In [16]:
# Create long df for heatmap
#
# Each harmonized per-cancer frame gets a `Cancer` label column and its
# cancer-specific `P_value_<cancer>` / `Correlation_<cancer>` columns renamed to
# the shared names `P_Value` / `Correlation`, then all frames are stacked row-wise.

cancer = ['GBM','HNSCC','LSCC','LUAD','BR','OV','ccRCC','CO']
merged_dfs = [Gbm_df_pc,Hnscc_df_pc,Lscc_df_pc,Luad_df_pc,Brca_df_pc,Ovarian_df_pc,Kidney_df_pc,Colon_df_pc]

# zip pairs each label with its frame, replacing the manual index counter.
long_frames_pc = [
    df.assign(Cancer=label).rename(columns={
        'P_value_' + label: 'P_Value',
        'Correlation_' + label: 'Correlation',
    })
    for label, df in zip(cancer, merged_dfs)
]

# DataFrame.append was removed in pandas 2.0; a single pd.concat gives the same
# stacked result (original row indices are kept) without re-copying per iteration.
all_long_df_pc = pd.concat(long_frames_pc)

# Significant P values of Heatmap df 

In [17]:
# Keep the harmonized long-format rows with P_Value at or below 0.05 and save them.
sig_mask_pc = all_long_df_pc["P_Value"] <= 0.05
only_sig_pvals_pc = all_long_df_pc[sig_mask_pc]
only_sig_pvals_pc.to_csv(path_or_buf='csv_files/sig_prot_heatmap_EGFR_Harmonized.csv', index=False)