# Pearson Dfs EGFR proteomics vs all phosphoprotemics 

This notebook creates dfs for each cancer type that are EGFR proteomics vs all phospho

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import cptac
import cptac.utils as u
import plot_utils as p
import cptac.pancan as pc

  import pandas.util.testing as tm


In [2]:
g = pc.PancanGbm()
hn = pc.PancanHnscc()
l = pc.PancanLuad()
#o = pc.PancanOv()
c = pc.PancanCcrcc()
#col = pc.PancanCoad()
b = pc.PancanBrca()
ls = pc.PancanLscc()
en = pc.PancanUcec()

                                                 

In [26]:
def getphosphosites(df1):
    
    df1 = df1.loc[:,~df1.columns.duplicated()]
    phospho_sites = df1.columns.values.tolist()
    phospho_sites.remove('EGFR_proteomics')
    return phospho_sites

In [33]:
def get_phosphotrans(cancer, prot_col,return_all = True):
    df1 = cancer.join_omics_to_omics(df1_name="proteomics",df2_name="phosphoproteomics", df1_source = "umich", df2_source = "umich",
genes1 = "EGFR", tissue_type = "tumor")
    drop = ["Peptide", "Database_ID"]
    df1 = u.reduce_multiindex(df=df1,levels_to_drop= drop,  flatten= True)
    df1.columns = df1.columns.str.replace("umich_phosphoproteomics_", "")
    df1.columns = df1.columns.str.replace("umich_", "")
    phospho_sites = getphosphosites(df1)
    trans_df = p.wrap_pearson_corr(df1,prot_col,comparison_columns = phospho_sites, return_all = return_all)
    return trans_df
    


In [49]:
def get_phosphotrans_pathway(cancer, prot_col,path_genes, return_all = True):
    df1 = cancer.join_omics_to_omics(df1_name="proteomics",df2_name="phosphoproteomics", df1_source = "umich", df2_source = "umich",
genes1 = "EGFR", genes2 = path_genes, tissue_type = "tumor")
    drop = ["Peptide", "Database_ID"]
    df1 = u.reduce_multiindex(df=df1,levels_to_drop= drop,  flatten= True)
    df1.columns = df1.columns.str.replace("umich_phosphoproteomics_", "")
    df1.columns = df1.columns.str.replace("umich_", "")
    phospho_sites = getphosphosites(df1)
    trans_df = p.wrap_pearson_corr(df1,prot_col,comparison_columns = phospho_sites, return_all = return_all)
    return trans_df
    

# Step 1 Create Data frames

For each cancer type, load proteomic data and phosphoprteomic data for EGFR. Also, load clinical data and use it filter out non-tumor samples

# Step 2 Create list of phosphosites

For each cancer type, create list of phosphosites by using the getphoshosites function which extracts column names and removes extra columns.

# Step 3 Run Pearson Correlation Function 

Run function on df and compare EGFR proteomics to all phosphosites in list. Store all comparisons. Save df as csv file

# GBM

In [34]:
gbm_trans = get_phosphotrans(g,"EGFR_proteomics")
gbm_trans



Unnamed: 0,Comparison,Correlation,P_value
8469,EGFR_S695,0.794188,1.049406e-22
8466,EGFR_Y1197,0.837900,1.792640e-22
8472,EGFR_S991,0.786851,2.913899e-22
8465,EGFR_S1166Y1172,0.766134,1.590295e-20
8470,EGFR_T693,0.764357,2.198515e-20
...,...,...,...
20106,TCF7L2_S154S156,0.000044,9.997966e-01
26225,SRSF4_S330S332,0.000019,9.998477e-01
3479,ZC3H4_S1275,0.000022,9.998507e-01
20798,MRTFA_S907,0.000019,9.998681e-01


In [70]:

gbm_trans_sig =gbm_trans.loc[(gbm_trans["P_value"] <= 0.05)]

gbm_trans_sig.to_csv('csv_files/gbm_trans_sig_phospho.csv', index=False)
gbm_trans_sig


Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
8469,EGFR_S695,0.794188,1.049406e-22,GBM
8466,EGFR_Y1197,0.837900,1.792640e-22,GBM
8472,EGFR_S991,0.786851,2.913899e-22,GBM
8465,EGFR_S1166Y1172,0.766134,1.590295e-20,GBM
8470,EGFR_T693,0.764357,2.198515e-20,GBM
...,...,...,...,...
34835,ANKS1B_S364,0.248113,4.991656e-02,GBM
34021,AMER2_S371,0.206135,4.995777e-02,GBM
33447,CLASP2_S324,0.293982,4.997786e-02,GBM
33740,ROBO1_S1055,-0.206108,4.998873e-02,GBM


In [37]:
gbm_trans.to_csv("csv_files/GBM_EGFR_phos_trans_all.csv")
gbm_trans['Cancer Type']='GBM'
#df.to_csv("csv_files/GBM_EGFR_prot_all_phospho2.csv")
gbm_trans

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
8469,EGFR_S695,0.794188,1.049406e-22,GBM
8466,EGFR_Y1197,0.837900,1.792640e-22,GBM
8472,EGFR_S991,0.786851,2.913899e-22,GBM
8465,EGFR_S1166Y1172,0.766134,1.590295e-20,GBM
8470,EGFR_T693,0.764357,2.198515e-20,GBM
...,...,...,...,...
20106,TCF7L2_S154S156,0.000044,9.997966e-01,GBM
26225,SRSF4_S330S332,0.000019,9.998477e-01,GBM
3479,ZC3H4_S1275,0.000022,9.998507e-01,GBM
20798,MRTFA_S907,0.000019,9.998681e-01,GBM


In [66]:
prot = u.get_proteins_in_pathways('EGF/EGFR Signaling Pathway', 'wikipathways')
prot_list = list(prot.member)
print('Num interacting proteins:', len(prot_list))
prot_list.remove("EGFR")
len(prot_list)

Num interacting proteins: 159


158

In [67]:
gbm_trans_path = get_phosphotrans_pathway(g,"EGFR_proteomics", prot_list)
gbm_trans_path



Unnamed: 0,Comparison,Correlation,P_value
732,PTPN11_Y62,0.656249,1.246446e-13
1000,VAV3_Y217,0.632098,1.822312e-11
284,ERRFI1_Y394,0.643884,9.492971e-09
522,MAP3K3_T294,-0.635680,2.067788e-06
661,PLCG1_Y783,0.437178,5.419829e-06
...,...,...,...
525,MAP3K4_S1155,0.001124,9.948084e-01
509,MAP3K1_S531,-0.000854,9.955038e-01
437,IQSEC1_S181,0.000692,9.968035e-01
788,RAF1_S316S321,0.000185,9.986188e-01


In [85]:
gbm_trans_sig =gbm_trans_path.loc[(gbm_trans_path["P_value"] <= 0.05)]
gbm_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
732,PTPN11_Y62,0.656249,1.246446e-13
1000,VAV3_Y217,0.632098,1.822312e-11
284,ERRFI1_Y394,0.643884,9.492971e-09
522,MAP3K3_T294,-0.635680,2.067788e-06
661,PLCG1_Y783,0.437178,5.419829e-06
...,...,...,...
409,INPP5D_S959,-0.248265,4.792134e-02
92,ATXN2_S728,0.208981,4.807156e-02
651,PLCE1_S788,0.218699,4.838584e-02
473,JUN_T239S243,0.221339,4.848377e-02


In [92]:
gbm_trans_sig_copy = gbm_trans_sig
gbm_trans_sig_copy[['Gene','Site']] = gbm_trans_sig_copy.Comparison.str.split("_",expand=True)
gbm_trans_sig_copy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Comparison,Correlation,P_value,Gene,Site
732,PTPN11_Y62,0.656249,1.246446e-13,PTPN11,Y62
1000,VAV3_Y217,0.632098,1.822312e-11,VAV3,Y217
284,ERRFI1_Y394,0.643884,9.492971e-09,ERRFI1,Y394
522,MAP3K3_T294,-0.635680,2.067788e-06,MAP3K3,T294
661,PLCG1_Y783,0.437178,5.419829e-06,PLCG1,Y783
...,...,...,...,...,...
409,INPP5D_S959,-0.248265,4.792134e-02,INPP5D,S959
92,ATXN2_S728,0.208981,4.807156e-02,ATXN2,S728
651,PLCE1_S788,0.218699,4.838584e-02,PLCE1,S788
473,JUN_T239S243,0.221339,4.848377e-02,JUN,T239S243


In [106]:
Gbm_df_site = gbm_trans_sig_copy.loc[gbm_trans_sig_copy["Gene"].isin(["JAK1","STAT1","STAT3"])]
Gbm_df_site

Unnamed: 0,Comparison,Correlation,P_value,Gene,Site
931,STAT3_S727,-0.283679,0.004235,STAT3,S727
930,STAT3_S701,-0.228037,0.022498,STAT3,S701


# Brca

In [74]:
brca_trans = get_phosphotrans(b,"EGFR_proteomics")
brca_trans



Unnamed: 0,Comparison,Correlation,P_value
6502,EGFR_T693,0.694899,2.774818e-16
2455,FOXA1_S331,-0.627608,4.727599e-14
20990,GATA3_S162,-0.641222,3.869228e-13
16513,NUMA1_S169,-0.584269,5.774908e-12
19120,PREX1_S1200,-0.595679,8.349990e-12
...,...,...,...
26680,RPS6KA5_S212,0.000042,9.997666e-01
22833,USP34_T3350,-0.000026,9.997832e-01
5772,DAXX_S668,0.000019,9.998835e-01
10915,KLC2_S445,-0.000011,9.999434e-01


In [76]:

brca_trans_sig =brca_trans.loc[(brca_trans["P_value"] <= 0.05)]

brca_trans_sig.to_csv('csv_files/brca_trans_sig_phospho.csv', index=False)
brca_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
6502,EGFR_T693,0.694899,2.774818e-16
2455,FOXA1_S331,-0.627608,4.727599e-14
20990,GATA3_S162,-0.641222,3.869228e-13
16513,NUMA1_S169,-0.584269,5.774908e-12
19120,PREX1_S1200,-0.595679,8.349990e-12
...,...,...,...
11749,AMOTL1_S856,0.209616,4.998563e-02
5180,SLAIN2_S247,-0.246077,4.998752e-02
16357,PLEKHG1_S108,0.338803,4.998925e-02
2298,KLF2_S248,-0.329118,4.999487e-02


In [None]:
brca_trans_path = get_phosphotrans(b,"EGFR_proteomics", prot_list)
brca_trans_path

# HNSCC

In [36]:
hnscc_trans = get_phosphotrans(hn, "EGFR_proteomics")
hnscc_trans



Unnamed: 0,Comparison,Correlation,P_value
6365,EGFR_Y1197,8.446212e-01,1.040053e-29
6367,EGFR_S991,7.905225e-01,9.689413e-25
6366,EGFR_T693,7.025493e-01,1.195033e-17
6358,EGFR_S1064,8.327813e-01,4.779083e-17
6362,EGFR_S1166,7.890642e-01,6.840802e-17
...,...,...,...
17539,OBSCN_S5786,1.226560e-04,9.993398e-01
9552,YEATS2_S465,7.319426e-05,9.994145e-01
91,AKAP11_S1113,1.029966e-04,9.994901e-01
3305,EIF4E2_S13,4.576030e-05,9.997705e-01


In [78]:

hnscc_trans_sig =hnscc_trans.loc[(hnscc_trans["P_value"] <= 0.05)]

hnscc_trans_sig.to_csv('csv_files/hnscc_trans_sig_phospho.csv', index=False)
hnscc_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
6365,EGFR_Y1197,0.844621,1.040053e-29
6367,EGFR_S991,0.790522,9.689413e-25
6366,EGFR_T693,0.702549,1.195033e-17
6358,EGFR_S1064,0.832781,4.779083e-17
6362,EGFR_S1166,0.789064,6.840802e-17
...,...,...,...
25319,PLCD1_S216,-0.212185,4.984154e-02
12031,VRK3_T104,-0.203989,4.984730e-02
2031,LRRFIP1_T652,0.316214,4.986071e-02
16853,SCAF1_T976,0.214596,4.996841e-02



# Kidney 

In [38]:
ccrcc_trans = get_phosphotrans(c, "EGFR_proteomics")
ccrcc_trans



Unnamed: 0,Comparison,Correlation,P_value
5142,EGFR_T693,0.570784,7.494312e-11
5140,EGFR_Y1172,0.738071,2.150911e-09
9415,IFT22_S137,0.563316,2.302971e-09
852,CAV2_Y19S20,0.521948,2.561794e-08
11311,CAV1_S9,0.528519,3.116960e-08
...,...,...,...
9709,FARP1_S833,-0.000037,9.997523e-01
20132,FNIP1_S98S99,-0.000052,9.997618e-01
20786,TLN2_T2041,-0.000017,9.998886e-01
956,GIT1_S388,0.000010,9.999510e-01


In [80]:

ccrcc_trans_sig =ccrcc_trans.loc[(ccrcc_trans["P_value"] <= 0.05)]

ccrcc_trans_sig.to_csv('csv_files/ccrcc_trans_sig_phospho.csv', index=False)
ccrcc_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
5142,EGFR_T693,0.570784,7.494312e-11
5140,EGFR_Y1172,0.738071,2.150911e-09
9415,IFT22_S137,0.563316,2.302971e-09
852,CAV2_Y19S20,0.521948,2.561794e-08
11311,CAV1_S9,0.528519,3.116960e-08
...,...,...,...
11851,PARD6G_S295,-0.187449,4.988537e-02
5908,PPP1R9A_S1140,-0.273352,4.990587e-02
216,REXO1_S610,-0.209686,4.990707e-02
16630,AHNAK_S5318,0.194676,4.991397e-02


# Ovarian 

# Colon

# LUAD

In [39]:
luad_trans = get_phosphotrans(l, "EGFR_proteomics")
luad_trans



Unnamed: 0,Comparison,Correlation,P_value
6290,EGFR_S1064,0.829086,5.159469e-28
6296,EGFR_T693,0.765129,1.322244e-21
6295,EGFR_Y1197,0.692721,7.424229e-15
6291,EGFR_S1071,0.813803,3.551941e-12
6294,EGFR_Y1172,0.816226,1.457530e-11
...,...,...,...
8951,PBK_T24,0.000029,9.998211e-01
4928,SP100_S394,-0.000011,9.999079e-01
11591,DCBLD2_S724S727,-0.000017,9.999189e-01
16937,RUNDC1_S491,-0.000014,9.999245e-01


In [81]:

luad_trans_sig = luad_trans.loc[(hnscc_trans["P_value"] <= 0.05)]

luad_trans_sig.to_csv('csv_files/luad_trans_sig_phospho.csv', index=False)
luad_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
6296,EGFR_T693,0.765129,1.322244e-21
1860,ELF1_S334,-0.578271,1.034134e-08
6292,EGFR_Y1092,0.790612,1.601382e-08
5301,SSR3_S11,0.511356,1.818200e-08
17812,RCSD1_S179,-0.443884,1.062375e-06
...,...,...,...
12783,PRKCD_S304Y313,-0.000206,9.990058e-01
22822,KIAA1522_S728S732,-0.000111,9.991304e-01
27083,EPB41L1_S75,-0.000062,9.995437e-01
23258,SETD2_S1413S1415S1417,-0.000069,9.996835e-01


# Lscc

In [40]:
lscc_trans = get_phosphotrans(ls, "EGFR_proteomics")
lscc_trans




Unnamed: 0,Comparison,Correlation,P_value
6821,EGFR_T693,0.813983,3.116274e-27
6820,EGFR_Y1197,0.769996,7.342722e-19
6822,EGFR_S991,0.695488,3.422569e-17
6817,EGFR_S1071,0.838113,1.080435e-13
6815,EGFR_S1039,0.608678,5.649394e-12
...,...,...,...
29074,TPM4_S170,-0.000024,9.998817e-01
7622,ABCC3_S911,0.000016,9.998961e-01
16684,ATN1_S34,-0.000011,9.999137e-01
26833,HSD17B4_S198,0.000008,9.999491e-01


In [84]:
lscc_trans_sig =lscc_trans.loc[(lscc_trans["P_value"] <= 0.05)]

lscc_trans_sig.to_csv('csv_files/lscc_trans_sig_phospho.csv', index=False)
lscc_trans_sig

Unnamed: 0,Comparison,Correlation,P_value
6821,EGFR_T693,0.813983,3.116274e-27
6820,EGFR_Y1197,0.769996,7.342722e-19
6822,EGFR_S991,0.695488,3.422569e-17
6817,EGFR_S1071,0.838113,1.080435e-13
6815,EGFR_S1039,0.608678,5.649394e-12
...,...,...,...
16277,MINK1_S601,0.254297,4.991322e-02
9528,SRRM2_S1984T1986,-0.278792,4.993036e-02
28890,HNRNPU_S66,-0.244184,4.996570e-02
25365,PRR12_S1361,-0.235221,4.997639e-02
