Phospho for Interacting Proteins

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

In [2]:
gbm = cptac.Gbm()
endo = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that luad index is up-to-date...       



Checking that ovarian index is up-to-date...



                                            

In [3]:
# colon, endo no peptide

In [4]:
#ip = u.get_interacting_proteins('PTEN')
ip = 'MCM2'
len(ip)

4

In [5]:
gene = 'PTEN'

# Gbm

In [6]:
mut_type_gbm = gbm.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = gbm.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'phosphoproteomics', omics_genes = ip)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
prot_and_mutations = gbm.reduce_multiindex(prot_and_mutations, levels_to_drop = 3, flatten = True)

ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type_gbm = mut_type_gbm[['Mutation']] 
merged = ip_df.join(mut_type_gbm) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Deletion']
get = merged['Mutation'].isin(compare)
del_wt = merged[get]
del_wt['Mutation'].value_counts()
del_wt



AttributeError: 'Gbm' object has no attribute 'reduce_multiindex'

In [None]:
cols = list(del_wt.columns[:-1])

# Get only sig sites
g_sig = u.wrap_ttest(del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if g_sig is not None:
    g_sig_list = list(g_sig.Comparison)
else: 
    g_sig_list = None

print('significant pvals: \n', g_sig)
# Get all pvals
g = u.wrap_ttest(del_wt, 'Mutation', cols,return_all = True, correction_method = 'fdr_bh')
g_pval = g.rename(columns = {'Comparison': 'Phosphoproteomics','P_Value': 'Gbm_P_Value'})
g_pval

In [None]:
d = del_wt[del_wt.Mutation == "Deletion"]
wt = del_wt[del_wt.Mutation == "Wildtype_Tumor"]
del_med = d.median()
wt_med = wt.median()

gbm_d = {}

for phos in cols:
    dif_gbm = wt_med[phos] - del_med[phos]
    gbm_d[phos] = dif_gbm

gbm_df = pd.DataFrame.from_dict(gbm_d, orient='index', columns=['Gbm_Median'])
gbm_df = gbm_df.reset_index().rename(columns={'index':'Phosphoproteomics'})
gbm_df

In [None]:
g_merged = g_pval.merge(gbm_df, on='Phosphoproteomics',how='outer')
#g_merged = g_merged.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
#g_merged = g_merged.replace(to_replace = r'_[A-Z*]*$', value = '', regex = True)
g_merged

# Endo

In [None]:
#no peptide in phospho
mut_type = endo.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = endo.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'phosphoproteomics', omics_genes = ip)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
prot_and_mutations = endo.reduce_multiindex(prot_and_mutations, flatten = True)

ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
get = merged['Mutation'].isin(compare)
trunc_wt = merged[get]
trunc_wt['Mutation'] = np.where(
                trunc_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
trunc_wt['Mutation'].value_counts()
trunc_wt

In [None]:
cols = list(trunc_wt.columns[:-1])

# Get only sig sites
e_sig = u.wrap_ttest(trunc_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if e_sig is not None:
    e_sig_list = list(e_sig.Comparison)
else: 
    e_sig_list = None

print('significant pvals: \n', e_sig)
# Get all pvals
e_pval = u.wrap_ttest(trunc_wt, 'Mutation', cols, return_all = True,correction_method = 'fdr_bh')
e_pval = e_pval.rename(columns = {'Comparison': 'Phosphoproteomics','P_Value': 'En_P_Value'})
e_pval

In [None]:
t = trunc_wt[trunc_wt.Mutation == "Truncation"]
wt = trunc_wt[trunc_wt.Mutation == "Wildtype_Tumor"]
trunc_med = t.median()
wt_med = wt.median()

en_d = {}

for phos in cols:
    dif = wt_med[phos] - trunc_med[phos]
    en_d[phos] = dif

en_df = pd.DataFrame.from_dict(en_d, orient='index', columns=['En_Median'])
en_df = en_df.reset_index().rename(columns={'index':'Phosphoproteomics'})
en_df

In [None]:
e_merged = e_pval.merge(en_df, on='Phosphoproteomics',how='outer')
e_merged

# Ovarian

In [None]:
gene = 'PTEN'

mut_type = o.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = o.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'phosphoproteomics', omics_genes = ip)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

# accounting for duplicates (keep peptide)
prot_and_mutations = o.reduce_multiindex(prot_and_mutations, levels_to_drop = 3, flatten=True)

prot_and_mutations = prot_and_mutations.loc[:,~prot_and_mutations.columns.duplicated()] # drop perfect duplicate columns 
# dropped 244 perfect duplicate cols  
ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Deletion']
get = merged['Mutation'].isin(compare)
del_wt = merged[get]
del_wt['Mutation'].value_counts()
print(len(del_wt.columns))
del_wt = del_wt.dropna(axis = 'columns', how='all')

In [None]:
# check same peptide, dif database id
del_wt.columns.value_counts()
t = del_wt['NEDD4_phosphoproteomics_S739_R.LTIFGNSAVSQPASSS*NHSSR.R']
t

In [None]:
cols = list(del_wt.columns[:-1])

# Get only sig sites
o_sig = u.wrap_ttest(del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if o_sig is not None:
    o_sig_list = list(o_sig.Comparison)
else: 
    o_sig_list = None
print('significant pvals: \n',o_sig)

# Get all pvals
o_pval = u.wrap_ttest(del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
o_pval = o_pval.rename(columns = {'Comparison': 'Phosphoproteomics','P_Value': 'Ov_P_Value'})
o_pval

In [None]:
d = del_wt[del_wt.Mutation == "Deletion"]
wt = del_wt[del_wt.Mutation == "Wildtype_Tumor"]
del_med = d.median()
wt_med = wt.median()

ov_d = {}

for phos in cols:
    dif = wt_med[phos] - del_med[phos]
    ov_d[phos] = dif

o_df = pd.DataFrame.from_dict(ov_d, orient='index', columns=['Ov_Median'])
o_df = o_df.reset_index().rename(columns={'index':'Phosphoproteomics'})
o_df

In [None]:
o_merged = o_pval.merge(o_df, on='Phosphoproteomics',how='outer')
#o_merged = o_merged.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
#o_merged = o_merged.replace(to_replace = r'_[A-Z.*]*$', value = '', regex = True)
o_merged

# Breast

In [None]:
mut_type = b.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = b.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'phosphoproteomics', omics_genes = ip)
#prot_and_mutations = prot_and_mutations.loc[:,~prot_and_mutations.columns.duplicated()] # drop duplicated columns
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
#prot_and_mutations = b.reduce_multiindex(prot_and_mutations, levels_to_drop = 3)
prot_and_mutations = b.reduce_multiindex(prot_and_mutations, levels_to_drop = 3, flatten = True)

ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Deletion']
get = merged['Mutation'].isin(compare)
del_wt = merged[get]
del_wt['Mutation'].value_counts()
prot_and_mutations

In [None]:
cols = list(del_wt.columns[:-1])

# Get only sig sites
b_sig = u.wrap_ttest(del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if b_sig is not None:
    b_sig_list = list(b_sig.Comparison)
else: 
    b_sig_list = None
print('significant pvals: \n',b_sig)

# Get all pvals
b_pval = u.wrap_ttest(del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
b_pval = b_pval.rename(columns = {'Comparison': 'Phosphoproteomics','P_Value': 'Brca_P_Value'})
b_pval

In [None]:
d = del_wt[del_wt.Mutation == "Deletion"]
wt = del_wt[del_wt.Mutation == "Wildtype_Tumor"]
del_med = d.median()
wt_med = wt.median()

b_d = {}

for phos in cols:
    dif = wt_med[phos] - del_med[phos]
    b_d[phos] = dif

b_df = pd.DataFrame.from_dict(b_d, orient='index', columns=['Brca_Median'])
b_df = b_df.reset_index().rename(columns={'index':'Phosphoproteomics'})
b_df

In [None]:
b_merged = b_pval.merge(b_df, on='Phosphoproteomics',how='outer')
#b_merged = b_merged.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
b_merged

# Colon

In [None]:
#no peptide phospho
mut_type = col.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = col.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'phosphoproteomics', omics_genes = ip)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
prot_and_mutations = col.reduce_multiindex(prot_and_mutations, levels_to_drop = 2, flatten = True)
ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Deletion']
get = merged['Mutation'].isin(compare)
del_wt = merged[get]
del_wt['Mutation'].value_counts()
prot_and_mutations

In [None]:
cols = list(del_wt.columns[:-1])

# Get only sig sites
c_sig = u.wrap_ttest(del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if c_sig is not None:
    c_sig_list = list(c_sig.Comparison)
else: 
    c_sig_list = None
print('significant pvals: \n',c_sig)

# Get all pvals
c_pval = u.wrap_ttest(del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
c_pval = c_pval.rename(columns = {'Comparison': 'Phosphoproteomics','P_Value': 'Colon_P_Value'})
c_pval

In [None]:
d = del_wt[del_wt.Mutation == "Deletion"]
wt = del_wt[del_wt.Mutation == "Wildtype_Tumor"]
del_med = d.median()
wt_med = wt.median()

d = {}

for phos in cols:
    dif = wt_med[phos] - del_med[phos]
    d[phos] = dif

c_df = pd.DataFrame.from_dict(d, orient='index', columns=['Colon_Median'])
c_df = c_df.reset_index().rename(columns={'index':'Phosphoproteomics'})
c_df

In [None]:
c_merged = c_pval.merge(c_df, on='Phosphoproteomics',how='outer')
#c_merged = c_merged.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
c_merged

# Head and Neck

In [None]:
mut_type = h.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = h.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'phosphoproteomics', omics_genes = ip)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

# CODE accounting for duplicates (keep peptide)
#prot_and_mutations = h.reduce_multiindex(prot_and_mutations, levels_to_drop = 3)
prot_and_mutations = h.reduce_multiindex(prot_and_mutations, levels_to_drop = 3, flatten=True)
#prot_and_mutations = prot_and_mutations.loc[:,~prot_and_mutations.columns.duplicated()] # drop duplicated columns


ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Deletion']
get = merged['Mutation'].isin(compare)
del_wt = merged[get]
del_wt['Mutation'].value_counts()
del_wt.head()

In [None]:
cols = list(del_wt.columns[:-1])

# Get only sig sites
h_sig = u.wrap_ttest(del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if h_sig is not None:
    h_sig_list = list(h_sig.Comparison)
else: 
    h_sig_list = None
print('significant pvals: \n',h_sig)

# Get all pvals
h_pval = u.wrap_ttest(del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
h_pval = h_pval.rename(columns = {'Comparison': 'Phosphoproteomics','P_Value': 'Hnscc_P_Value'})
h_pval

In [None]:
d = del_wt[del_wt.Mutation == "Deletion"]
wt = del_wt[del_wt.Mutation == "Wildtype_Tumor"]
del_med = d.median()
wt_med = wt.median()

h_d = {}

for phos in cols:
    dif = wt_med[phos] - del_med[phos]
    h_d[phos] = dif

h_df = pd.DataFrame.from_dict(h_d, orient='index', columns=['Hnscc_Median'])
h_df = h_df.reset_index().rename(columns={'index':'Phosphoproteomics'})
h_df

In [None]:
h_merged = h_pval.merge(h_df, on='Phosphoproteomics',how='outer')
#h_merged = h_merged.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
#h_merged = h_merged.replace(to_replace = r'_[A-Za-z]*$', value = '', regex = True)
h_merged

# Lung

In [None]:
mut_type = l.get_genotype_all_vars(gene)
mut_type = mut_type.rename(columns={'PTEN':'cnv'})

# different code because no somatic mutation data for pten (can't join to somatic mutations)
omics = l.join_omics_to_omics(df1_name = 'CNV', df2_name='phosphoproteomics',genes1="PTEN", 
    genes2=ip)
#omics = l.reduce_multiindex(omics, levels_to_drop =3)
omics = l.reduce_multiindex(omics, levels_to_drop = 3, flatten = True)
omics = omics.drop(columns='PTEN_CNV__')

# Get only tumor samples
phos = l.get_phosphoproteomics(tissue_type='tumor')
tumor_ids = list(phos.index)
get = omics.index.isin(tumor_ids)
omics = omics[get]

merged = omics.join(mut_type) # checked and there is 110 tumor samples for lung

compare = ['No_Mutation','Deletion']
get = merged['Mutation'].isin(compare)
del_wt = merged[get]
print(del_wt['Mutation'].value_counts())
del_wt.columns[:-2]

In [None]:
cols = list(del_wt.columns[:-2])

# Get only sig sites
l_sig = u.wrap_ttest(del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if l_sig is not None:
    l_sig_list = list(l_sig.Comparison)
else: 
    l_sig_list = None
print('significant pvals: \n',l_sig)

# Get all pvals
l_pval = u.wrap_ttest(del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
l_pval = l_pval.rename(columns = {'Comparison': 'Phosphoproteomics','P_Value': 'Luad_P_Value'})
l_pval

In [None]:
d = del_wt[del_wt.Mutation == "Deletion"]
wt = del_wt[del_wt.Mutation == "No_Mutation"]
del_med = d.median()
wt_med = wt.median()

l_d = {}

for phos in cols:
    dif = wt_med[phos] - del_med[phos]
    l_d[phos] = dif

l_df = pd.DataFrame.from_dict(l_d, orient='index', columns=['Luad_Median'])
l_df = l_df.reset_index().rename(columns={'index':'Phosphoproteomics'})
l_df

In [None]:
l_merged = l_pval.merge(l_df, on='Phosphoproteomics',how='outer')
#l_merged = l_merged.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
l_merged

# Merge

In [None]:
n = g_merged.merge(h_merged, on='Phosphoproteomics',how='outer')
n = n.merge(l_merged, on='Phosphoproteomics',how='outer')
n= n.merge(b_merged, on='Phosphoproteomics',how='outer')
n = n.merge(o_merged, on='Phosphoproteomics',how='outer')
n = n.merge(e_merged, on='Phosphoproteomics',how='outer')
all_df = n.merge(c_merged, on='Phosphoproteomics',how='outer')
#all_df = all_df.dropna(axis='index', thresh = 1)
all_df


In [None]:
# Keep only genes significant in > 1 cancer
all_sig = []

sig_lists = [g_sig_list, e_sig_list, b_sig_list, o_sig_list, c_sig_list, h_sig_list, h_sig_list, l_sig_list]

for next_list in sig_lists:
    if next_list is not None:
        all_sig.append(next_list)
        
flat_list = [item for sublist in all_sig for item in sublist]
sig = set(flat_list) # remove duplicates

#sig = [re.sub(r'_phosphoproteomics', '', i) for i in sig] 

bool_df = all_df['Phosphoproteomics'].isin(sig)
sig_df = all_df[bool_df]
print(len(sig_df))
sig_df

In [None]:
g_merged
#g_merged = g_merged.replace(to_replace = r'_phosphoproteomics_S1064_[A-Z*]*$', value = '', regex = True)

s = g_merged['Phosphoproteomics']
b = s.str.contains('EGFR_phosphoproteomics_S1064_', case=True, regex=True)
n = g_merged[b]
n

In [None]:
s = o_merged['Phosphoproteomics']
b = s.str.contains('EGFR_phosphoproteomics_S1064', case=True, regex=True)
o_merged[b]

In [None]:
# ALTERNATE one cutoff - not specific to cancer
# Keep only genes significant in > 1 cancer
cols = ['Gbm_P_Value','Hnscc_P_Value','Luad_P_Value','Brca_P_Value','Ov_P_Value','En_P_Value','Colon_P_Value']

# change cutoff for each cancer -FIX
bc = .05/28

t = []

for c in cols:
    df = all_df[all_df[c] <= bc]
    t.append(list(df.Phosphoproteomics))
    
flat_list = [item for sublist in t for item in sublist]
sig = set(flat_list) # remove duplicates



bool_df = all_df['Phosphoproteomics'].isin(sig)
sig_df = all_df[bool_df]
print(len(sig_df))
sig_df

Combine

In [None]:
sig_df.to_csv('at_least_one_significant_interacting.csv')

In [None]:
stdev = ip_df.std()

In [None]:
stdev = stdev.to_frame()

In [None]:
stdev.loc[stdev[0] > .2]

In [None]:
d_stdev = stdev.std()
d_stdev

In [None]:
a = stdev.hist(bins = 20)
plt.title("Standard Deviations of Pancancer CNV deletions and amplifications of PTEN on Interacting Proteins\n")
plt.xlabel('StDeV')
plt.ylabel('Count')

In [None]:
plt.rcParams['figure.figsize']=(10,8)
sns.set(font_scale = 1.3)
a = sns.distplot(stdev, bins = 20)

a.set_title("Standard Deviation Distribution for Pancancer PTEN Mutation Effect on Proteomics\n\n (Median of proteomics with Wildtype PTEN - Median of proteomics with Mutated PTEN)")
a.set(xlabel = 'StDeV', ylabel = 'Frequency')

Mean

In [None]:
mean_df = stdev.mean()

In [None]:
a = mean_df.hist(bins = 20)
plt.title("Distribution of the Mean between Cancers for PTEN Mutation Effect on all Proteins\n\n (Median of proteomics with Wildtype PTEN - Median of proteomics with Mutated PTEN)")
plt.xlabel('Mean')
plt.ylabel('Count')

In [None]:
m_df = stdev.median()

In [None]:
a = m_df.hist(bins = 20)
plt.title("Distribution of the Medians between Cancers for PTEN Mutation Effect on all Proteins\n\n (Median of proteomics with Wildtype PTEN - Median of proteomics with Mutated PTEN)")
plt.xlabel('Median')
plt.ylabel('Count')