# Combine Pearson Tables

This notebook takes all of the data frames created by the notebook pearson_dfs_EGFRprot_all_prot_sig and merges them into one wide dataframe.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import cptac
import cptac.utils as u

  import pandas.util.testing as tm


# Step 1: Read in Files

# Step 2: Merge and Format Data frame

Drop unnecessary columns and merge data frames. Rename columns to include cancer type.

# GBM 

In [2]:
# Load the GBM Pearson results and discard the "Unnamed: 0" column
# (presumably a row index saved with the CSV — confirm against the Step3.1 notebook).
df = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/GBM_EGFR_all_pearson_return_all.csv"
).drop(columns=["Unnamed: 0"])
df

Unnamed: 0,Comparison,Correlation,P_value
0,EGFR_proteomics,1.000000,0.000000e+00
1,PHLDA1_proteomics,0.816848,6.553435e-25
2,GRB2_proteomics,-0.610889,1.886384e-11
3,SOCS2_proteomics,0.562720,1.343464e-09
4,CDH4_proteomics,0.559180,1.790048e-09
...,...,...,...
10698,ZNF195_proteomics,-0.000056,9.997124e-01
10699,BBS2_proteomics,0.000036,9.997140e-01
10700,PSMB4_proteomics,-0.000033,9.997438e-01
10701,BAZ1B_proteomics,-0.000025,9.998003e-01


# Kidney

In [3]:
# Load the kidney Pearson results exactly as stored on disk.
df2 = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/Kidney_EGFR_all_pearson_return_all.csv"
)

In [4]:
# Discard the "Unnamed: 0" column from the kidney table.
# errors="ignore" keeps this cell safe to re-run: df2 is reassigned in place, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df2 = df2.drop(columns=["Unnamed: 0"], errors="ignore")


In [5]:
# Outer-join GBM (left) with kidney (right) on "Comparison" so proteins present in
# only one of the two tables are kept (their missing values become NaN).
# The _x/_y columns produced by the merge are renamed to per-cancer names.
Gbm_kidney = df.merge(df2, how="outer", on="Comparison").rename(
    columns={
        "Correlation_x": "Correlation_Gbm",
        "P_value_x": "P_value_Gbm",
        "Correlation_y": "Correlation_kidney",
        "P_value_y": "P_value_kidney",
    }
)
Gbm_kidney

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney
0,EGFR_proteomics,1.000000,0.000000e+00,1.000000,0.000000
1,PHLDA1_proteomics,0.816848,6.553435e-25,0.254436,0.007311
2,GRB2_proteomics,-0.610889,1.886384e-11,-0.217427,0.022506
3,SOCS2_proteomics,0.562720,1.343464e-09,,
4,CDH4_proteomics,0.559180,1.790048e-09,0.148407,0.257781
...,...,...,...,...,...
11851,SEMA3G_proteomics,,,0.000838,0.995693
11852,GSTA1_proteomics,,,0.000486,0.995981
11853,LY6D_proteomics,,,-0.000398,0.997551
11854,FILIP1_proteomics,,,-0.000286,0.997638


# Ovarian 

In [6]:
# Load the ovarian Pearson results and discard the "Unnamed: 0" column.
df_Ovar = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/Ovar_EGFR_all_pearson_return_all.csv"
).drop(columns=["Unnamed: 0"])


In [7]:
# Start the pan-cancer table: outer-join the ovarian results onto the GBM+kidney
# table by "Comparison", then tag the two newly added columns with the cancer type.
pancan = Gbm_kidney.merge(df_Ovar, how="outer", on="Comparison").rename(
    columns={"Correlation": "Correlation_Ovar", "P_value": "P_value_Ovar"}
)


# BRCA

In [8]:
# Load the BRCA Pearson results and discard the "Unnamed: 0" column.
df_Brca = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/Brca_EGFR_all_pearson_return_all.csv"
).drop(columns=["Unnamed: 0"])


In [9]:
# Outer-join the BRCA results onto the running pan-cancer table by "Comparison",
# then tag the two newly added columns with the cancer type.
pancan = pancan.merge(df_Brca, how="outer", on="Comparison").rename(
    columns={"Correlation": "Correlation_Brca", "P_value": "P_value_Brca"}
)


# Luad

In [10]:
# Load the LUAD Pearson results and discard the "Unnamed: 0" column.
df_Luad = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/Luad_EGFR_all_pearson_return_all.csv"
).drop(columns=["Unnamed: 0"])


In [11]:
# Outer-join the LUAD results onto the running pan-cancer table by "Comparison",
# then tag the two newly added columns with the cancer type.
pancan = pancan.merge(df_Luad, how="outer", on="Comparison").rename(
    columns={"Correlation": "Correlation_Luad", "P_value": "P_value_Luad"}
)

# HNSCC

In [12]:
# Load the HNSCC Pearson results and discard the "Unnamed: 0" column.
df_hnscc = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/Hnscc_EGFR_all_pearson_return_all.csv"
).drop(columns=["Unnamed: 0"])


In [13]:
# Outer-join the HNSCC results onto the running pan-cancer table by "Comparison",
# then tag the two newly added columns with the cancer type.
pancan = pancan.merge(df_hnscc, how="outer", on="Comparison").rename(
    columns={"Correlation": "Correlation_hnscc", "P_value": "P_value_hnscc"}
)


# Colon

In [14]:
# Load the colon Pearson results and discard the "Unnamed: 0" column.
df_colon = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/Colon_EGFR_all_pearson_return_all.csv"
).drop(columns=["Unnamed: 0"])


In [15]:
# Outer-join the colon results onto the running pan-cancer table by "Comparison",
# then tag the two newly added columns with the cancer type.
pancan = pancan.merge(df_colon, how="outer", on="Comparison").rename(
    columns={"Correlation": "Correlation_colon", "P_value": "P_value_colon"}
)


# LSCC

In [16]:
# Load the LSCC Pearson results and discard the "Unnamed: 0" column.
df_Lscc = pd.read_csv(
    "../Step3.1_Pearson_dfs_by_cancer/csv_files/Lscc_EGFR_all_pearson_return_all.csv"
).drop(columns=["Unnamed: 0"])


In [17]:
# Outer-join the LSCC results onto the running pan-cancer table by "Comparison",
# tag the two newly added columns with the cancer type, and display the result.
pancan = pancan.merge(df_Lscc, how="outer", on="Comparison").rename(
    columns={"Correlation": "Correlation_Lscc", "P_value": "P_value_Lscc"}
)
pancan

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_colon,P_value_colon,Correlation_Lscc,P_value_Lscc
0,EGFR_proteomics,1.000000,0.000000e+00,1.000000,0.000000,1.00000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000e+00,1.000000,0.000000,1.000000,0.000000e+00
1,PHLDA1_proteomics,0.816848,6.553435e-25,0.254436,0.007311,,,0.364797,0.000190,0.260110,0.011346,0.664271,3.426615e-15,0.386104,0.021968,0.713420,4.569893e-18
2,GRB2_proteomics,-0.610889,1.886384e-11,-0.217427,0.022506,-0.19009,0.085202,-0.177379,0.057899,-0.302439,0.001321,-0.532341,2.559824e-09,0.150960,0.139949,-0.198042,3.992126e-02
3,SOCS2_proteomics,0.562720,1.343464e-09,,,,,,,,,0.020297,8.984786e-01,,,0.472624,3.078071e-04
4,CDH4_proteomics,0.559180,1.790048e-09,0.148407,0.257781,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24328,TAGAP_proteomics,,,,,,,,,,,,,,,-0.002387,9.833413e-01
24329,ZNF283_proteomics,,,,,,,,,,,,,,,0.002610,9.835395e-01
24330,TRIM39_proteomics,,,,,,,,,,,,,,,-0.002379,9.851135e-01
24331,ADGRA3_proteomics,,,,,,,,,,,,,,,0.002228,9.877509e-01


In [18]:
# Keep the protein label plus the correlation columns listed below (no p-values).
# NOTE(review): Correlation_kidney and Correlation_Ovar are not selected here —
# confirm that leaving them out of the correlation-only table is intentional.
pancan_corr = pancan.loc[
    :,
    [
        "Comparison",
        "Correlation_Luad",
        "Correlation_Gbm",
        "Correlation_Brca",
        "Correlation_hnscc",
        "Correlation_colon",
        "Correlation_Lscc",
    ],
]

In [19]:
# Remove the EGFR-vs-EGFR row (the self-correlation) before saving.
# The row is selected by its "Comparison" label instead of by position ([1:]):
#   * the position of EGFR_proteomics depends on the row order produced by the outer
#     merges, which differs between pandas versions (newer releases sort the keys);
#   * a positional slice drops one more protein every time this cell is re-run.
pancan_corr = pancan_corr[pancan_corr["Comparison"] != "EGFR_proteomics"]
# The index is still written as the first CSV column, as before.
pancan_corr.to_csv("csv_files/pancan_EGFR_all_wide_corr.csv")

In [20]:
# Remove the EGFR-vs-EGFR row (the self-correlation) before saving.
# The row is selected by its "Comparison" label instead of by position ([1:]):
#   * the position of EGFR_proteomics depends on the row order produced by the outer
#     merges, which differs between pandas versions (newer releases sort the keys);
#   * a positional slice drops one more protein every time this cell is re-run.
pancan = pancan[pancan["Comparison"] != "EGFR_proteomics"]
# The index is still written as the first CSV column, as before.
pancan.to_csv("csv_files/pancan_EGFR_all_return_all_wide.csv")


In [21]:
# Display the wide table in its current state (the frame most recently assigned to
# `pancan`, i.e. the one written to csv_files/pancan_EGFR_all_return_all_wide.csv).
pancan

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_colon,P_value_colon,Correlation_Lscc,P_value_Lscc
1,PHLDA1_proteomics,0.816848,6.553435e-25,0.254436,0.007311,,,0.364797,0.000190,0.260110,0.011346,0.664271,3.426615e-15,0.386104,0.021968,0.713420,4.569893e-18
2,GRB2_proteomics,-0.610889,1.886384e-11,-0.217427,0.022506,-0.190090,0.085202,-0.177379,0.057899,-0.302439,0.001321,-0.532341,2.559824e-09,0.150960,0.139949,-0.198042,3.992126e-02
3,SOCS2_proteomics,0.562720,1.343464e-09,,,,,,,,,0.020297,8.984786e-01,,,0.472624,3.078071e-04
4,CDH4_proteomics,0.559180,1.790048e-09,0.148407,0.257781,,,,,,,,,,,,
5,PLA2G15_proteomics,-0.556624,2.197562e-09,-0.298029,0.001566,-0.035395,0.759903,0.274185,0.003025,-0.182930,0.055768,-0.089313,3.557176e-01,,,-0.147438,1.278266e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24328,TAGAP_proteomics,,,,,,,,,,,,,,,-0.002387,9.833413e-01
24329,ZNF283_proteomics,,,,,,,,,,,,,,,0.002610,9.835395e-01
24330,TRIM39_proteomics,,,,,,,,,,,,,,,-0.002379,9.851135e-01
24331,ADGRA3_proteomics,,,,,,,,,,,,,,,0.002228,9.877509e-01
