# Pearson Dfs EGFR proteomics vs all phosphoprotemics 

This notebook creates dfs for each cancer type that are EGFR proteomics vs all phospho

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import cptac
import cptac.utils as u
import plot_utils as p
import cptac.pancan as pc



In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
g = pc.PancanGbm()
hn = pc.PancanHnscc()
l = pc.PancanLuad()
o = pc.PancanOv()
c = pc.PancanCcrcc()
col = pc.PancanCoad()
b = pc.PancanBrca()
ls = pc.PancanLscc()
en = pc.PancanUcec()

                                                 

In [4]:
def get_phosphosites(df1):
    
    df1 = df1.loc[:,~df1.columns.duplicated()]
    phospho_sites = df1.columns.values.tolist()
    phospho_sites.remove('EGFR_proteomics')
    return phospho_sites

In [5]:
def get_phosphotrans(cancer, prot_col,return_all = True):
    df1 = cancer.join_omics_to_omics(df1_name="proteomics",df2_name="phosphoproteomics", df1_source = "umich", df2_source = "umich",
genes1 = "EGFR", tissue_type = "tumor")
    drop = ["Peptide", "Database_ID"]
    df1 = u.reduce_multiindex(df=df1,levels_to_drop= drop,  flatten= True)
    df1.columns = df1.columns.str.replace("umich_phosphoproteomics_", "")
    df1.columns = df1.columns.str.replace("umich_", "")
    phospho_sites = get_phosphosites(df1)
    trans_df = p.wrap_pearson_corr(df1,prot_col,comparison_columns = phospho_sites, return_all = return_all)
    return trans_df
    


In [13]:
def get_phosphotrans_pathway(cancer, prot_col,path_genes, return_all = True):
    df1 = cancer.join_omics_to_omics(df1_name="proteomics",df2_name="phosphoproteomics", df1_source = "umich", df2_source = "umich",
genes1 = "EGFR", genes2 = path_genes, tissue_type = "tumor")
    drop = ["Peptide", "Database_ID"]
    df1 = u.reduce_multiindex(df=df1,levels_to_drop= drop,  flatten= True)
    df1.columns = df1.columns.str.replace("umich_phosphoproteomics_", "")
    df1.columns = df1.columns.str.replace("umich_", "")
    phospho_sites = get_phosphosites(df1)
    trans_df = p.wrap_pearson_corr(df1,prot_col,comparison_columns = phospho_sites, return_all = return_all)
    return trans_df
    

# Step 1 Create Data frames

For each cancer type, load proteomic data and phosphoprteomic data for EGFR. Also, load clinical data and use it filter out non-tumor samples

# Step 2 Create list of phosphosites

For each cancer type, create list of phosphosites by using the getphoshosites function which extracts column names and removes extra columns.

# Step 3 Run Pearson Correlation Function 

Run function on df and compare EGFR proteomics to all phosphosites in list. Store all comparisons. Save df as csv file

# GBM

In [7]:
gbm_trans = get_phosphotrans(g,"EGFR_proteomics")
gbm_trans

Unnamed: 0,Comparison,Correlation,P_value
9565,EGFR_S695,0.794188,1.049406e-22
9576,EGFR_Y1197,0.837900,1.792640e-22
9566,EGFR_S991,0.786851,2.913899e-22
9562,EGFR_S1166Y1172,0.766134,1.590295e-20
9570,EGFR_T693,0.764357,2.198515e-20
...,...,...,...
32904,TCF7L2_S154S156,0.000044,9.997966e-01
31391,SRSF4_S330S332,0.000019,9.998477e-01
36582,ZC3H4_S1275,0.000022,9.998507e-01
18975,MRTFA_S907,0.000019,9.998681e-01


In [8]:

gbm_trans_sig =gbm_trans.loc[(gbm_trans["P_value"] <= 0.05)]

gbm_trans_sig.to_csv('csv_files/gbm_trans_sig_phospho.csv', index=False)
gbm_trans_sig


Unnamed: 0,Comparison,Correlation,P_value
9565,EGFR_S695,0.794188,1.049406e-22
9576,EGFR_Y1197,0.837900,1.792640e-22
9566,EGFR_S991,0.786851,2.913899e-22
9562,EGFR_S1166Y1172,0.766134,1.590295e-20
9570,EGFR_T693,0.764357,2.198515e-20
...,...,...,...
1755,ANKS1B_S364,0.248113,4.991656e-02
1351,AMER2_S371,0.206135,4.995777e-02
6630,CLASP2_S324,0.293982,4.997786e-02
27075,ROBO1_S1055,-0.206108,4.998873e-02


In [9]:
gbm_trans.to_csv("csv_files/GBM_EGFR_phos_trans_all.csv")
gbm_trans['Cancer Type']='GBM'
#df.to_csv("csv_files/GBM_EGFR_prot_all_phospho2.csv")
gbm_trans

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
9565,EGFR_S695,0.794188,1.049406e-22,GBM
9576,EGFR_Y1197,0.837900,1.792640e-22,GBM
9566,EGFR_S991,0.786851,2.913899e-22,GBM
9562,EGFR_S1166Y1172,0.766134,1.590295e-20,GBM
9570,EGFR_T693,0.764357,2.198515e-20,GBM
...,...,...,...,...
32904,TCF7L2_S154S156,0.000044,9.997966e-01,GBM
31391,SRSF4_S330S332,0.000019,9.998477e-01,GBM
36582,ZC3H4_S1275,0.000022,9.998507e-01,GBM
18975,MRTFA_S907,0.000019,9.998681e-01,GBM


In [10]:
prot = u.get_proteins_in_pathways('EGF/EGFR Signaling Pathway', 'wikipathways')
prot_list = list(prot.member)
print('Num interacting proteins:', len(prot_list))
prot_list.remove("EGFR")
len(prot_list)

Num interacting proteins: 159


158

In [14]:
gbm_trans_path = get_phosphotrans_pathway(g,"EGFR_proteomics", prot_list)
gbm_trans_path

Unnamed: 0,Comparison,Correlation,P_value
732,PTPN11_Y62,0.656249,1.246446e-13
1000,VAV3_Y217,0.632098,1.822312e-11
284,ERRFI1_Y394,0.643884,9.492971e-09
522,MAP3K3_T294,-0.635680,2.067788e-06
661,PLCG1_Y783,0.437178,5.419829e-06
...,...,...,...
525,MAP3K4_S1155,0.001124,9.948084e-01
509,MAP3K1_S531,-0.000854,9.955038e-01
437,IQSEC1_S181,0.000692,9.968035e-01
788,RAF1_S316S321,0.000185,9.986188e-01


In [15]:
gbm_trans_sig =gbm_trans_path.loc[(gbm_trans_path["P_value"] <= 0.05)]
gbm_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
732,PTPN11_Y62,0.656249,1.246446e-13
1000,VAV3_Y217,0.632098,1.822312e-11
284,ERRFI1_Y394,0.643884,9.492971e-09
522,MAP3K3_T294,-0.635680,2.067788e-06
661,PLCG1_Y783,0.437178,5.419829e-06
...,...,...,...
409,INPP5D_S959,-0.248265,4.792134e-02
92,ATXN2_S728,0.208981,4.807156e-02
651,PLCE1_S788,0.218699,4.838584e-02
473,JUN_T239S243,0.221339,4.848377e-02


In [16]:
gbm_trans_sig_copy = gbm_trans_sig
gbm_trans_sig_copy[['Gene','Site']] = gbm_trans_sig_copy.Comparison.str.split("_",expand=True)
gbm_trans_sig_copy


Unnamed: 0,Comparison,Correlation,P_value,Gene,Site
732,PTPN11_Y62,0.656249,1.246446e-13,PTPN11,Y62
1000,VAV3_Y217,0.632098,1.822312e-11,VAV3,Y217
284,ERRFI1_Y394,0.643884,9.492971e-09,ERRFI1,Y394
522,MAP3K3_T294,-0.635680,2.067788e-06,MAP3K3,T294
661,PLCG1_Y783,0.437178,5.419829e-06,PLCG1,Y783
...,...,...,...,...,...
409,INPP5D_S959,-0.248265,4.792134e-02,INPP5D,S959
92,ATXN2_S728,0.208981,4.807156e-02,ATXN2,S728
651,PLCE1_S788,0.218699,4.838584e-02,PLCE1,S788
473,JUN_T239S243,0.221339,4.848377e-02,JUN,T239S243


In [17]:
Gbm_df_site = gbm_trans_sig_copy.loc[gbm_trans_sig_copy["Gene"].isin(["JAK1","STAT1","STAT3"])]
Gbm_df_site

Unnamed: 0,Comparison,Correlation,P_value,Gene,Site
931,STAT3_S727,-0.283679,0.004235,STAT3,S727
930,STAT3_S701,-0.228037,0.022498,STAT3,S701


# Brca

In [18]:
brca_trans = get_phosphotrans(b,"EGFR_proteomics")
brca_trans

Unnamed: 0,Comparison,Correlation,P_value
6395,EGFR_T693,0.710923,5.716717e-17
13341,MYB_S653,-0.629275,1.803304e-13
7838,FOXA1_S331,-0.593246,1.454351e-12
10597,KIF1B_S1141,0.564918,2.672365e-11
16984,PREX1_S1200,-0.581471,2.721136e-11
...,...,...,...
4783,COG7_S506,-0.000058,9.996442e-01
14325,NIPBL_S2511S2513S2515,-0.000071,9.996444e-01
6366,EEPD1_S16,0.000038,9.997284e-01
25313,ZC3H13_T985S986,-0.000034,9.997364e-01


In [19]:

brca_trans_sig =brca_trans.loc[(brca_trans["P_value"] <= 0.05)]

brca_trans_sig.to_csv('csv_files/brca_trans_sig_phospho.csv', index=False)
brca_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
6395,EGFR_T693,0.710923,5.716717e-17
13341,MYB_S653,-0.629275,1.803304e-13
7838,FOXA1_S331,-0.593246,1.454351e-12
10597,KIF1B_S1141,0.564918,2.672365e-11
16984,PREX1_S1200,-0.581471,2.721136e-11
...,...,...,...
17592,PXN_S230,0.256550,4.983268e-02
1756,ARHGEF7_S25,0.187479,4.984861e-02
20318,SLC4A1_S350,0.250114,4.992475e-02
1196,APBB2_S716,-0.180911,4.993898e-02


In [20]:
brca_trans_path = get_phosphotrans(b,"EGFR_proteomics", prot_list)
brca_trans_path

Unnamed: 0,Comparison,Correlation,P_value
6395,EGFR_T693,0.710923,5.716717e-17
13341,MYB_S653,-0.629275,1.803304e-13
7838,FOXA1_S331,-0.593246,1.454351e-12
10597,KIF1B_S1141,0.564918,2.672365e-11
16984,PREX1_S1200,-0.581471,2.721136e-11
...,...,...,...
4783,COG7_S506,-0.000058,9.996442e-01
14325,NIPBL_S2511S2513S2515,-0.000071,9.996444e-01
6366,EEPD1_S16,0.000038,9.997284e-01
25313,ZC3H13_T985S986,-0.000034,9.997364e-01


# HNSCC

In [21]:
hnscc_trans = get_phosphotrans(hn, "EGFR_proteomics")
hnscc_trans

Unnamed: 0,Comparison,Correlation,P_value
6769,EGFR_Y1197,8.314116e-01,1.395159e-29
6762,EGFR_S991,7.735060e-01,2.530986e-24
6765,EGFR_T693,6.961250e-01,4.155866e-18
6760,EGFR_S1166,7.890642e-01,6.840802e-17
6768,EGFR_Y1172,8.356440e-01,1.140040e-15
...,...,...,...
2544,BCL7C_S122S126,-1.439699e-05,9.998969e-01
24288,TCOF1_S1228,-8.902610e-06,9.999264e-01
7875,FAM83G_S610S613S614,1.221104e-05,9.999274e-01
3714,CCDC120_S331,4.849565e-06,9.999608e-01


In [22]:

hnscc_trans_sig =hnscc_trans.loc[(hnscc_trans["P_value"] <= 0.05)]

hnscc_trans_sig.to_csv('csv_files/hnscc_trans_sig_phospho.csv', index=False)
hnscc_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
6769,EGFR_Y1197,0.831412,1.395159e-29
6762,EGFR_S991,0.773506,2.530986e-24
6765,EGFR_T693,0.696125,4.155866e-18
6760,EGFR_S1166,0.789064,6.840802e-17
6768,EGFR_Y1172,0.835644,1.140040e-15
...,...,...,...
25167,TOP2A_S1106,0.344061,4.992616e-02
27809,ZNF316_S22,0.188273,4.993328e-02
20867,SAV1_S94,-0.182455,4.996007e-02
7449,ESF1_S198,0.182437,4.998344e-02



# Kidney 

In [23]:
ccrcc_trans = get_phosphotrans(c, "EGFR_proteomics")
ccrcc_trans

Unnamed: 0,Comparison,Correlation,P_value
5509,EGFR_T693,0.570784,7.494312e-11
5512,EGFR_Y1172,0.738071,2.150911e-09
8187,IFT22_S137,0.563316,2.302971e-09
3044,CAV2_Y19S20,0.521948,2.561794e-08
3039,CAV1_S9,0.528519,3.116960e-08
...,...,...,...
6393,FARP1_S833,-0.000037,9.997523e-01
6758,FNIP1_S98S99,-0.000052,9.997618e-01
19669,TLN2_T2041,-0.000017,9.998886e-01
7131,GIT1_S388,0.000010,9.999510e-01


In [24]:

ccrcc_trans_sig =ccrcc_trans.loc[(ccrcc_trans["P_value"] <= 0.05)]

ccrcc_trans_sig.to_csv('csv_files/ccrcc_trans_sig_phospho.csv', index=False)
ccrcc_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
5509,EGFR_T693,0.570784,7.494312e-11
5512,EGFR_Y1172,0.738071,2.150911e-09
8187,IFT22_S137,0.563316,2.302971e-09
3044,CAV2_Y19S20,0.521948,2.561794e-08
3039,CAV1_S9,0.528519,3.116960e-08
...,...,...,...
12882,PARD6G_S295,-0.187449,4.988537e-02
14384,PPP1R9A_S1140,-0.273352,4.990587e-02
15815,REXO1_S610,-0.209686,4.990707e-02
585,AHNAK_S5318,0.194676,4.991397e-02


# Ovarian 

# Colon

# LUAD

In [25]:
luad_trans = get_phosphotrans(l, "EGFR_proteomics")
luad_trans

Unnamed: 0,Comparison,Correlation,P_value
6755,EGFR_S1064,8.282459e-01,6.506343e-28
6759,EGFR_T693,7.725070e-01,1.956120e-22
6762,EGFR_Y1197,6.901359e-01,1.024526e-14
6756,EGFR_S1071,8.167945e-01,1.456467e-12
6761,EGFR_Y1172,8.107491e-01,1.476270e-11
...,...,...,...
24807,TNKS1BP1_S1452,1.948477e-05,9.998418e-01
25702,TULP4_S1343,-2.260408e-05,9.998609e-01
5902,DENND2B_S105,1.379495e-05,9.998924e-01
7304,EPS15L1_S708,-5.879395e-06,9.999717e-01


In [26]:

luad_trans_sig = luad_trans.loc[(hnscc_trans["P_value"] <= 0.05)]

luad_trans_sig.to_csv('csv_files/luad_trans_sig_phospho.csv', index=False)
luad_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
6759,EGFR_T693,0.772507,1.956120e-22
6762,EGFR_Y1197,0.690136,1.024526e-14
6756,EGFR_S1071,0.816794,1.456467e-12
6761,EGFR_Y1172,0.810749,1.476270e-11
6758,EGFR_S991,0.560493,1.584502e-10
...,...,...,...
16573,PDLIM4_S120,0.000117,9.992298e-01
10684,JCAD_S1317,0.000089,9.993624e-01
27627,ZNF638_S1401,-0.000112,9.994469e-01
25702,TULP4_S1343,-0.000023,9.998609e-01


# Lscc

In [27]:
lscc_trans = get_phosphotrans(ls, "EGFR_proteomics")
lscc_trans


Unnamed: 0,Comparison,Correlation,P_value
7277,EGFR_T693,0.813983,3.116274e-27
7279,EGFR_Y1197,0.769996,7.342722e-19
7276,EGFR_S991,0.695488,3.422569e-17
7274,EGFR_S1071,0.838113,1.080435e-13
7272,EGFR_S1039,0.608678,5.649394e-12
...,...,...,...
26717,TPM4_S170,-0.000024,9.998817e-01
44,ABCC3_S911,0.000016,9.998961e-01
2354,ATN1_S34,-0.000011,9.999137e-01
10561,HSD17B4_S198,0.000008,9.999491e-01


In [28]:
lscc_trans_sig =lscc_trans.loc[(lscc_trans["P_value"] <= 0.05)]

lscc_trans_sig.to_csv('csv_files/lscc_trans_sig_phospho.csv', index=False)
lscc_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
7277,EGFR_T693,0.813983,3.116274e-27
7279,EGFR_Y1197,0.769996,7.342722e-19
7276,EGFR_S991,0.695488,3.422569e-17
7274,EGFR_S1071,0.838113,1.080435e-13
7272,EGFR_S1039,0.608678,5.649394e-12
...,...,...,...
14558,MINK1_S601,0.254297,4.991322e-02
24236,SRRM2_S1984T1986,-0.278792,4.993036e-02
10491,HNRNPU_S66,-0.244184,4.996570e-02
19483,PRR12_S1361,-0.235221,4.997639e-02
