# Figure 1: Map color legend to phospho results for MAPK Signaling diagram

In [1]:
import pandas as pd

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
# Record which cptac release produced the results below.
cptac_version = cptac.version()
print('cptac version:', cptac_version)

cptac version: 0.8.6


In [3]:
# Silence every warning for the rest of the notebook so cell output stays clean.
# NOTE(review): this blanket filter also hides any pandas warnings (deprecations,
# chained-assignment warnings) raised by later cells — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [4]:
def check_status_color(site, df, cancer):
    '''
    Return the color used in Figure 1 for one phosphosite, based on its
    p-value and differential expression in a single cancer.

    Color legend:
      * 'dark_red' / 'dark_blue' - p-value < 0.05, median > 0 / < 0
      * 'red' / 'blue'           - 0.05 <= p-value < 0.1, median > 0 / < 0
      * 'grey'                   - p-value >= 0.1, or a median of exactly 0
                                   (no direction of change to color)
      * 'white'                  - p-value or median is NaN (no data)

    @Param site: String. Name of the phosphosite (value in the 'Phospho' column).
    @Param df: df with a 'Phospho' column plus '<cancer>_P_Value' and
        '<cancer>_Median' columns. Each site must appear exactly once.
    @Param cancer: String. Cancer name used as the prefix in df columns.
    @Return: String. One of the color names listed above.
    @Raise ValueError: if site matches zero rows or more than one row of df.
    '''
    rows = df.loc[df['Phospho'] == site]
    if len(rows) != 1:
        raise ValueError(
            'expected exactly one row for phosphosite %r, found %d' % (site, len(rows)))

    # Pull scalars out of the single matching row; calling float() on a whole
    # Series is deprecated in recent pandas.
    row = rows.iloc[0]
    pval = float(row[cancer + '_P_Value'])
    dif_expression = float(row[cancer + '_Median'])

    if pd.isna(pval) or pd.isna(dif_expression):
        return 'white'
    if pval >= 0.1 or dif_expression == 0:
        return 'grey'

    hue = 'red' if dif_expression > 0 else 'blue'
    # The darker hue marks the stricter significance cutoff.
    return 'dark_' + hue if pval < .05 else hue
    

# Step 1: Get tables with MAPK signaling phosphoproteomics.

In [5]:
# Load the combined table followed by the LUAD, EC and CO tables, in that order.
df, luad_df, e_df, c_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Make_Tables/csv/phospho_MAPK.csv",
        "../Make_Tables/csv/LUAD_phospho_MAPK.csv",
        "../Make_Tables/csv/EC_phospho_MAPK.csv",
        "../Make_Tables/csv/CO_phospho_MAPK.csv",
    )
)

# Step 2: Create df with all phosphosites mapped with colors


Color Legend:
* red - differential expression > 0
* blue - differential expression < 0
* grey - not significant
* white - no data / not enough data for t-test

All sites used in Figure 1 have at least one FDR-corrected p-value < 0.1. The darker hue of a color (dark_red, dark_blue) represents a p-value < 0.05.

In [6]:
# Drop rows with NaN (p-values are NaN when there was not enough data for the t-test).
# The explicit copy makes e_df_2 an independent frame, so adding the 'color'
# column below is an unambiguous write rather than a chained assignment.
e_df_2 = e_df.dropna().copy()
e_df_2['color'] = e_df_2['Phospho'].apply(lambda x: check_status_color(x, e_df_2, 'EC'))

In [7]:
# Drop rows with NaN, then take an explicit copy so that adding the 'color'
# column below is an unambiguous write rather than a chained assignment.
luad_df_2 = luad_df.dropna().copy()
luad_df_2['color'] = luad_df_2['Phospho'].apply(lambda x: check_status_color(x, luad_df_2, 'LUAD'))

# Step 3: Show data for sites used in KRAS Figure 

* Shown phosphosites are significant in at least 1 cancer.

In [8]:
# Get list of phosphosites sig in at least 1 cancer
e = e_df.loc[e_df['EC_P_Value'] < 0.1]
e_sig_phospho = list(e.Phospho)

luad = luad_df.loc[luad_df['LUAD_P_Value'] < 0.1]
luad_sig_phospho = list(luad.Phospho)

fig_phospho = list(set(luad_sig_phospho + e_sig_phospho))

In [9]:
# Outer merge keeps sites that were measured in only one of the two cancers;
# their color in the other cancer is NaN after the merge.
combined = e_df_2.merge(luad_df_2, on = 'Phospho', how = 'outer', suffixes = ['_Endo', '_Luad'])

# Keep only the sites shown in the figure and mark missing colors as 'white'
# (no data). .assign builds a new frame instead of writing to a filtered slice
# of `combined`.
fig_data = combined[combined['Phospho'].isin(fig_phospho)].assign(
    color_Luad=lambda d: d['color_Luad'].fillna('white'),
    color_Endo=lambda d: d['color_Endo'].fillna('white'),
)
fig_data

Unnamed: 0,Phospho,EC_P_Value,EC_Median,color_Endo,LUAD_P_Value,LUAD_Median,color_Luad
0,MKNK2_S220,0.001348,0.94700,dark_red,0.015097,1.33870,dark_red
1,NFKB2_S858,0.001348,0.69285,dark_red,0.015097,1.40475,dark_red
2,RAF1_T330,0.002399,1.12066,dark_red,0.118795,0.97875,grey
3,TAB1_S378,0.002916,0.46350,dark_red,,,white
4,MKNK1_S401,0.003520,0.47800,dark_red,0.424119,0.20325,grey
...,...,...,...,...,...,...,...
864,PLA2G4A_S434S435S437,,,white,0.082020,0.89615,red
865,FLNB_S2114,,,white,0.082020,2.17800,red
866,PLA2G4A_S435S437,,,white,0.085660,0.99990,red
867,NFKB2_S161,,,white,0.085660,0.79615,red


In [23]:
# Show the figure rows whose phosphosite name contains 'STK'.
fig_data.loc[fig_data['Phospho'].str.contains('STK')]

Unnamed: 0,Phospho,EC_P_Value,EC_Median,color_Endo,LUAD_P_Value,LUAD_Median,color_Luad
6,STK3_S413,0.008592,0.61475,dark_red,,,white


# Get number of significant phosphosites

The manuscript mentions the number of phosphosites significant in each cancer and in multiple cancers.

In [11]:
# Rows with p-value < 0.1 in each cancer's table.
luad_sig_2 = luad_df[luad_df['LUAD_P_Value'] < 0.1]
e_sig_2 = e_df[e_df['EC_P_Value'] < 0.1]
c_sig_2 = c_df[c_df['CO_P_Value'] < 0.1]

# Print one header and one count per cancer.
for header, sig_rows in (('LUAD', luad_sig_2), ('\nEC', e_sig_2), ('\nCO', c_sig_2)):
    print(header)
    print('p-val < 0.1 : ', len(sig_rows))

LUAD
p-val < 0.1 :  40

EC
p-val < 0.1 :  26

CO
p-val < 0.1 :  0


In [12]:
print('Sig in both')
# Named p_cutoff rather than `p`, which would shadow the `plot_utils as p`
# module alias imported at the top of the notebook.
p_cutoff = 0.1
# A site counts only if it is below the cutoff in both EC and LUAD.
both_sig = (df['EC_P_Value'] < p_cutoff) & (df['LUAD_P_Value'] < p_cutoff)
sig = df.loc[both_sig]
print('p-val < ',p_cutoff,' : ', len(sig), '\n')
print(list(sig.Phospho), '\n')

Sig in both
p-val <  0.1  :  4 

['SOS1_S1161', 'MKNK2_S220', 'NFKB2_S858', 'SOS1_S1229'] 



# Check that Colon has measurements in the MAPK pathway

In [13]:
# Get protein column
e_sig_2['Protein'], e_sig_2['Site'] = e_sig_2.Phospho.str.split('_', 1).str
luad_sig_2['Protein'], luad_sig_2['Site'] = luad_sig_2.Phospho.str.split('_', 1).str
c_df['Protein'], c_df['Site'] = c_df.Phospho.str.split('_', 1).str

In [14]:
# Proteins carrying at least one significant site in EC or in LUAD
# (the set below is the union of the two protein lists).
e_list = e_sig_2['Protein'].tolist()
luad_list = luad_sig_2['Protein'].tolist()
prot = list({*e_list, *luad_list})
print('Proteins')
print('Num proteins with sig sites in EC and LUAD:', len(prot), 'proteins')

# CO rows for those proteins, keeping only rows without missing values.
in_prot = c_df['Protein'].isin(prot)
prot_sig = c_df.loc[in_prot].dropna()
print(len(prot_sig['Protein'].unique()), 'of those proteins found in CO')

Proteins
Num proteins with sig sites in EC and LUAD: 36 proteins
32 of those proteins found in CO


In [15]:
# Does the CO table contain the sites that are significant in EC or LUAD?
e_list = e_sig_2['Phospho'].tolist()
luad_list = luad_sig_2['Phospho'].tolist()
sites = list({*e_list, *luad_list})
print('Phosphosites')
print('Num sig sites in EC or LUAD:', len(sites) ,'sites')

# CO rows whose phosphosite is one of those sites (rows with NaN are kept here).
in_sites = c_df['Phospho'].isin(sites)
sites_sig = c_df.loc[in_sites]
print(len(sites_sig['Phospho'].unique()), 'sig sites found in CO')

Phosphosites
Num sig sites in EC or LUAD: 62 sites
0 sig sites found in CO
