# Analyze Phospho Results for MAPK Signaling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u
import plot_utils as p

import warnings
warnings.filterwarnings('ignore')

Get tables with MAPK signaling phosphoproteomics.

In [74]:
df = pd.read_csv("csv/phospho_MAPK.csv")
luad_df = pd.read_csv("csv/Single_Cancer/Luad_phospho_MAPK.csv")
e_df = pd.read_csv("csv/Single_Cancer/Endo_phospho_MAPK.csv")
c_df = pd.read_csv("csv/Single_Cancer/Colon_phospho_MAPK.csv")

# Get Count of Significant Sites

In [6]:
print('LUAD')
luad_sig = luad_df.loc[luad_df['Luad_P_Value'] < 0.05]
print('p-val < 0.05 : ', len(luad_sig))
luad_sig_2 = luad_df.loc[luad_df['Luad_P_Value'] < 0.1]
print('p-val < 0.1 : ', len(luad_sig_2))

LUAD
p-val < 0.05 :  20
p-val < 0.1 :  40


In [7]:
print('ENDO')
e_sig = e_df.loc[e_df['Endo_P_Value'] < 0.05]
print('p-val < 0.05 : ', len(e_sig))
e_sig_2 = e_df.loc[e_df['Endo_P_Value'] < 0.1]
print('p-val < 0.1 : ', len(e_sig_2))

ENDO
p-val < 0.05 :  17
p-val < 0.1 :  26


In [8]:
print('sig in both\n')
pval = [0.05, 0.1]
for p in pval:
    sig = df.loc[df['Endo_P_Value'] < p]
    sig = sig.loc[sig['Luad_P_Value'] < p]
    print('p-val < ',p,' : ', len(sig))
    print(list(sig.Phospho), '\n')

sig in both

p-val <  0.05  :  3
['SOS1_S1161', 'MKNK2_S220', 'NFKB2_S858'] 

p-val <  0.1  :  4
['SOS1_S1161', 'MKNK2_S220', 'NFKB2_S858', 'SOS1_S1229'] 



# Get df with colors of legend from Figure

In [77]:
'''

df: df with pval and differential expressions. nan dropped.

'''

def check_status_color(site, df, cancer):
    df = df.loc[df['Phospho'] == site]
    pval = float(df[cancer+'_P_Value'])
    dif_expression = float(df[cancer+'_Median'])
    color = ''
    if pval >= 0.1:
        color += 'grey'
    else:
        if pval < .05:
            color += 'dark_'
        if dif_expression < 0:
            color += 'blue'
        elif dif_expression > 0:
            color += 'red'
        
    return color
    

In [78]:
e_df_2 = e_df.dropna() # drop nan (p-vals nan when not enough data to do t-test)
e_df_2['color'] = e_df_2['Phospho'].apply(lambda x: check_status_color(x, e_df_2, 'Endo'))
e_df_2

Unnamed: 0,Phospho,Endo_P_Value,Endo_Median,color
0,MKNK2_S220,0.001348,0.94700,dark_red
1,NFKB2_S858,0.001348,0.69285,dark_red
2,RAF1_T330,0.002399,1.12066,dark_red
3,TAB1_S378,0.002916,0.46350,dark_red
4,MKNK1_S401,0.003520,0.47800,dark_red
...,...,...,...,...
844,TAB2_S372,0.998789,0.12665,grey
845,CACNA1H_S1144,0.999600,0.40850,grey
846,MEF2C_S406,0.999600,0.08330,grey
847,RAPGEF2_S1281,0.999600,-0.02472,grey


In [79]:
luad_df_2 = luad_df.dropna()
luad_df_2['color'] = luad_df_2['Phospho'].apply(lambda x: check_status_color(x, luad_df_2, 'Luad'))
luad_df_2

Unnamed: 0,Phospho,Luad_P_Value,Luad_Median,color
0,SOS1_S1161,0.000055,1.30250,dark_red
1,MKNK1_S209S214,0.015097,1.49000,dark_red
2,MKNK2_S220,0.015097,1.33870,dark_red
3,FLNB_S316,0.015097,0.98345,dark_red
4,FLNB_T2585,0.015097,1.00035,dark_red
...,...,...,...,...
858,MAPT_T529,0.998304,0.22970,grey
859,MEF2C_S222,0.998304,0.18835,grey
860,NFATC3_S98,0.999373,-0.14765,grey
861,TGFBR2_S578,0.999373,-0.73630,grey


# Show data for phosphosites used in KRAS Figure 

* Shown phosphosites are significant in at least 1 cancer.

In [75]:
# Get list of phosphosites sig in at least 1 cancer
e = e_df.loc[e_df['Endo_P_Value'] < 0.1]
e_sig_phospho = list(e.Phospho)

luad = luad_df.loc[luad_df['Luad_P_Value'] < 0.1]
luad_sig_phospho = list(luad.Phospho)

fig_phospho = list(set(luad_sig_phospho + e_sig_phospho))

In [82]:
combined = e_df_2.merge(luad_df_2, on = 'Phospho', how = 'outer', suffixes = ['_Endo', '_Luad'])
fig_data = combined[combined['Phospho'].isin(fig_phospho)]
fig_data

Unnamed: 0,Phospho,Endo_P_Value,Endo_Median,color_Endo,Luad_P_Value,Luad_Median,color_Luad
0,MKNK2_S220,0.001348,0.94700,dark_red,0.015097,1.33870,dark_red
1,NFKB2_S858,0.001348,0.69285,dark_red,0.015097,1.40475,dark_red
2,RAF1_T330,0.002399,1.12066,dark_red,0.118795,0.97875,grey
3,TAB1_S378,0.002916,0.46350,dark_red,,,
4,MKNK1_S401,0.003520,0.47800,dark_red,0.424119,0.20325,grey
...,...,...,...,...,...,...,...
864,PLA2G4A_S434S435S437,,,,0.082020,0.89615,red
865,FLNB_S2114,,,,0.082020,2.17800,red
866,PLA2G4A_S435S437,,,,0.085660,0.99990,red
867,NFKB2_S161,,,,0.085660,0.79615,red


# Get counts of sites affected for each protein

In [9]:
# proteins affected multiple times p < .1
luad_sig_2['Protein'], luad_sig_2['Site'] = luad_sig_2['Phospho'].str.split('_', 1).str
luad = luad_sig_2.Protein.value_counts().to_frame('Luad_Sig_Phosphosites')

e_sig_2['Protein'], e_sig_2['Site'] = e_sig_2['Phospho'].str.split('_', 1).str
ec = e_sig_2.Protein.value_counts().to_frame('Endo_Sig_Phosphosites')

m = luad.join(ec, how = 'outer').sort_values(by = ['Luad_Sig_Phosphosites', 'Endo_Sig_Phosphosites'], ascending = False)
m.head(7)

m.to_csv('KRAS_Table_1.csv')

In [10]:
# Check - Get all sites for a protein
df['Protein'], df['Site'] = df['Phospho'].str.split('_', 1).str
#df.loc[df['Protein'] == 'NFKB2']

# Check non-NaN measurements in Colon in the MAPK pathway

In [11]:
# does colon have data for proteins sig in other cancers?
e_list = list(e_sig_2.Protein)
luad_list = list(luad_sig_2.Protein)
prot = list(set(e_list + luad_list))
print('Num proteins with sig sites in endo or luad:', len(prot), 'proteins')

prot_sig = c_df[c_df.Protein.isin(prot)].dropna()
print(len(prot_sig.Protein.value_counts()), 'of those proteins found in colon')


Num proteins with sig sites in endo or luad: 36 proteins


AttributeError: 'DataFrame' object has no attribute 'Protein'

In [None]:
# does colon have data for sites sig in other cancers?
e_list = list(e_sig_2.Phospho)
luad_list = list(luad_sig_2.Phospho)
sites = list(set(e_list + luad_list))
print('Num sig sites in endo or luad:', len(sites) ,'sites')

sites_sig = c_df[c_df.Phospho.isin(sites)]
print(len(sites_sig.Phospho.value_counts()), 'found in colon')