# Find MAPK signaling phosphosites without enough data to do a t-test 

In [1]:
import pandas as pd
import re

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'KRAS', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    mut_type['Mutation'].where(mut_type['Mutation'] != 'Missense_Mutation', 'Missense', inplace = True) # replace when false
    mut_type['Mutation'].where(mut_type['Mutation'] != 'nonsynonymous SNV', 'Missense', inplace = True)

    # Keep only tumor samples from proteomics
    prot_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = [gene_in], omics_df_name = 'phosphoproteomics', omics_genes = all_prot, tissue_type = 'tumor')
    # Reduce a multiindex 
    if cancer_object.get_cancer_type() in ('endometrial'):
        prot_and_mutations = u.reduce_multiindex(prot_and_mutations, flatten = True)
    elif cancer_object.get_cancer_type() in ('colon'):
        prot_and_mutations = u.reduce_multiindex(prot_and_mutations, levels_to_drop = [2], flatten = True)
    elif cancer_object.get_cancer_type() in ('luad'):
        prot_and_mutations = u.reduce_multiindex(prot_and_mutations, levels_to_drop = [2,3], flatten = True)

    # Keep certain missense mutations
    prot_and_mutations['KRAS_Location'] = [','.join(map(str, l)) for l in prot_and_mutations['KRAS_Location']]
    hotspots = ['G12', 'G13', 'Q61', 'No_mutation']
    hotspots_wt = pd.DataFrame()
    for site in hotspots:
        df = prot_and_mutations[prot_and_mutations.KRAS_Location.str.contains(site, regex= True, na=False)]
        hotspots_wt = hotspots_wt.append(df)
    #print(hotspots_wt.KRAS_Location.value_counts())
    
    # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
    mut_type = mut_type[['Mutation']]
    prot_df = hotspots_wt.iloc[:,:-4] # Keep only proteomics
    merged = prot_df.join(mut_type)
    merged = merged.join(hotspots_wt['KRAS_Location'])

    # Keep only Wildtype and deletion
    compare = ['Wildtype_Tumor','Missense']
    get = merged['Mutation'].isin(compare)
    mut_wt = merged[get]
    
    mut_wt = mut_wt.rename(columns={col: re.sub(r'_phosphoproteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [4]:
# Adds '_i' to duplicate col names where i is a number (increases with each duplicate)

def rename_duplicate_cols(df):
    cols = pd.Series(df.columns[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.columns=cols
    return df

In [5]:
en = cptac.Endometrial()
l = cptac.Luad()
col = cptac.Colon()

                                                

In [6]:
'''
df: DataFrame containing omics data and a binary column.
mincount: Int of the minimum num of actual values needed to pass cutoff and not be listed in not_enough_data list. 
omics_cols: Names of columns to check if there is enough data. 

Returns: List of genes with not enough data (num of non-NaN values are less than mincount).
'''

def find_few_data_genes(df, mincount, omics_cols = None, binary_labels = ('Missense','Wildtype_Tumor'), 
    binary_col = 'Mutation', gene = 'KRAS'):
    
    # Separate into binary groups
    label_1 = binary_labels[0]
    label_2 = binary_labels[1]
    partition1 = df.loc[df[binary_col] == label_1]
    partition2 = df.loc[df[binary_col] == label_2]
    
    # Get list of columns
    if omics_cols is None:
        omics_cols = list(df.columns)
        omics_cols.remove(binary_col)
    else:
        set_omics_cols = set(omics_cols)
        if binary_col in (set_omics_cols):
            omics_cols.remove(binary_col)
        
    # Append genes with less data than mincount (for mut or wt) to list
    not_enough_data = []
    for c in omics_cols:
        #print(len(partition1[c].dropna(axis='rows')))
        #print(len(partition2[c].dropna(axis='rows')))
        if len(partition1[c].dropna(axis='rows')) <= mincount: #non-nan vals less than min
            not_enough_data.append(c)
            continue
        elif len(partition2[c].dropna(axis='rows')) <= mincount:
            not_enough_data.append(c)
            continue
        
    print('sites with not enough data: ', len(not_enough_data), '/', len(omics_cols))        
    return not_enough_data

In [7]:
luad_df = pd.read_csv("../Make_Tables/csv/Single_Cancer/Luad_phospho_MAPK.csv")
e_df = pd.read_csv("../Make_Tables/csv/Single_Cancer/Endo_phospho_MAPK.csv")
c_df = pd.read_csv("../Make_Tables/csv/Single_Cancer/Colon_phospho_MAPK.csv")

Check of the proteins involved in the MAPK pathways (and sig in at least one cancer) have enough data.

In [8]:
# Keep sig sites
luad_sig_2 = luad_df.loc[luad_df['Luad_P_Value'] < 0.1]
e_sig_2 = e_df.loc[e_df['Endo_P_Value'] < 0.1]

# Create protein column
e_sig_2['Protein'], e_sig_2['Site'] = e_sig_2.Phospho.str.split('_', 1).str
luad_sig_2['Protein'], luad_sig_2['Site'] = luad_sig_2.Phospho.str.split('_', 1).str
c_df['Protein'], c_df['Site'] = c_df.Phospho.str.split('_', 1).str

# Get list of proteins sig in at least 1 cancer
e_list = list(e_sig_2.Protein)
luad_list = list(luad_sig_2.Protein)
sig_prot_list = list(set(e_list + luad_list))

In [15]:
cancer_obj = {'Endo':en, 'Colon':col, 'Luad':l}
mincount = 5

few_data = {}
for c in cancer_obj:
    print(c)
    mut_wt = all_prot_format_df(cancer_obj[c], sig_prot_list)
    mut_wt_cols = list(mut_wt.columns[:-1])
    nd_list = find_few_data_genes(mut_wt, mincount, mut_wt_cols, gene = 'KRAS')
    few_data[c] = nd_list
    

Endo
sites with not enough data:  230 / 657
Colon
sites with not enough data:  149 / 356
Luad
sites with not enough data:  1 / 411


In [18]:
few_data['Luad']

['DUSP9']

Check how many sites of the proteins involved in the MAPK pathways (with at least one sig site) have enough data.

In [11]:
# Creat list of all sig sites in at least one cancer
luad_phos = list(luad_sig_2.Phospho)
e_phos = list(e_sig_2.Phospho)

In [12]:
cancer_obj = {'Endo':en, 'Luad':l}
mincount = 5

few_data = {}
for c in cancer_obj:
    print(c)
    mut_wt = all_prot_format_df(cancer_obj[c], sig_prot_list)
    mut_wt_cols = list(mut_wt.columns[:-1])
    if c == 'Endo':
        nd_list = find_few_data_genes(mut_wt, mincount, e_phos, gene = 'KRAS')
    if c == 'Luad':
        nd_list = find_few_data_genes(mut_wt, mincount, luad_phos, gene = 'KRAS')
    few_data[c] = nd_list
    

Endo
sites with not enough data:  0 / 26
Luad


KeyError: 'NFKB2_S858_1'

In [None]:
few_data['Luad']
#mut_wt_cols

In [None]:
# Test
mincount = 0
nd_genes = find_few_data_genes(mut_wt, mincount, prot_list, gene = 'PTEN')
print(len(nd_genes))

In [None]:
nd_genes[5]

In [None]:
gene = 'CCDC140' # gene in not_enough_data list (< mincount)

prot = cancer_obj.get_proteomics()
if cancer_obj == l:
    prot = u.reduce_multiindex(prot, levels_to_drop = 1)
test_prot_list = list(prot.columns)
test_mut_wt = all_prot_format_df(cancer_obj, test_prot_list)

gene_df = test_mut_wt[[gene,'Mutation']]
mut_df = gene_df.loc[gene_df['Mutation'] == 'Missense']
wt_df = gene_df.loc[gene_df['Mutation'] == 'Wildtype_Tumor']

print('Num of missense with proteomics data:', len(mut_df[gene].dropna()), '(<=', mincount, '?)')
print('Num of wt with proteomics data:', len(wt_df[gene].dropna()), '(<=', mincount, '?)')
mut_df