# Find MAPK signaling phosphosites without enough data to do a t-test 

In [1]:
import pandas as pd
import re

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
print('cptac version:', cptac.version())

cptac version: 0.8.5


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_phospho_format_df(cancer_object, all_prot, gene_in = 'KRAS', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    mut_type['Mutation'].where(mut_type['Mutation'] != 'Missense_Mutation', 'Missense', inplace = True) # replace when false
    mut_type['Mutation'].where(mut_type['Mutation'] != 'nonsynonymous SNV', 'Missense', inplace = True)

    # Keep only tumor samples from proteomics
    prot_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = [gene_in], omics_df_name = 'phosphoproteomics', omics_genes = all_prot, tissue_type = 'tumor')
    # Reduce a multiindex 
    if cancer_object.get_cancer_type() in ('endometrial'):
        prot_and_mutations = u.reduce_multiindex(prot_and_mutations, flatten = True)
    elif cancer_object.get_cancer_type() in ('colon'):
        prot_and_mutations = u.reduce_multiindex(prot_and_mutations, levels_to_drop = [2], flatten = True)
    elif cancer_object.get_cancer_type() in ('luad'):
        prot_and_mutations = u.reduce_multiindex(prot_and_mutations, levels_to_drop = [2,3], flatten = True)

    # Keep certain missense mutations
    prot_and_mutations['KRAS_Location'] = [','.join(map(str, l)) for l in prot_and_mutations['KRAS_Location']]
    hotspots = ['G12', 'G13', 'Q61', 'No_mutation']
    hotspots_wt = pd.DataFrame()
    for site in hotspots:
        df = prot_and_mutations[prot_and_mutations.KRAS_Location.str.contains(site, regex= True, na=False)]
        hotspots_wt = hotspots_wt.append(df)
    #print(hotspots_wt.KRAS_Location.value_counts())
    
    # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
    mut_type = mut_type[['Mutation']]
    prot_df = hotspots_wt.iloc[:,:-4] # Keep only proteomics
    merged = prot_df.join(mut_type)
    merged = merged.join(hotspots_wt['KRAS_Location'])

    # Keep only Wildtype and deletion
    compare = ['Wildtype_Tumor','Missense']
    get = merged['Mutation'].isin(compare)
    mut_wt = merged[get]
    
    mut_wt = mut_wt.rename(columns={col: re.sub(r'_phosphoproteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [4]:
# Adds '_i' to duplicate col names where i is a number (increases with each duplicate)

def rename_duplicate_cols(df):
    cols = pd.Series(df.columns[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.columns=cols
    return df

In [5]:
en = cptac.Endometrial()
l = cptac.Luad()
col = cptac.Colon()

                                                

In [6]:
'''
df: DataFrame containing omics data and a binary column.
mincount: Int of the minimum num of actual values needed to pass cutoff and not be listed in not_enough_data list. 
omics_cols: Names of columns to check if there is enough data. 

Returns: List of genes with not enough data (num of non-NaN values are less than mincount).
'''

def find_few_data_genes(df, mincount, omics_cols = None, binary_labels = ('Missense','Wildtype_Tumor'), 
    binary_col = 'Mutation', gene = 'KRAS'):
    
    # Separate into binary groups
    label_1 = binary_labels[0]
    label_2 = binary_labels[1]
    partition1 = df.loc[df[binary_col] == label_1]
    partition2 = df.loc[df[binary_col] == label_2]
    
    # Get list of columns
    if omics_cols is None:
        omics_cols = list(df.columns)
        omics_cols.remove(binary_col)
    else:
        set_omics_cols = set(omics_cols)
        if binary_col in (set_omics_cols):
            omics_cols.remove(binary_col)
        
    # Append genes with less data than mincount (for mut or wt) to list
    not_enough_data = []
    for c in omics_cols:
        #print(len(partition1[c].dropna(axis='rows')))
        #print(len(partition2[c].dropna(axis='rows')))
        if len(partition1[c].dropna(axis='rows')) <= mincount: #non-nan vals less than min
            not_enough_data.append(c)
            continue
        elif len(partition2[c].dropna(axis='rows')) <= mincount:
            not_enough_data.append(c)
            continue
        
    print('sites without enough data: ', len(not_enough_data), '/', len(omics_cols))        
    return not_enough_data

In [7]:
luad_df = pd.read_csv("../Make_Tables/csv/Single_Cancer/Luad_phospho_MAPK.csv")
e_df = pd.read_csv("../Make_Tables/csv/Single_Cancer/Endo_phospho_MAPK.csv")
c_df = pd.read_csv("../Make_Tables/csv/Single_Cancer/Colon_phospho_MAPK.csv")

# Phosphosites

Check of the phosphosites of proteins involved in the MAPK pathways that have enough data to do a t-tes.

In [8]:
# Keep sig sites
luad_sig_2 = luad_df.loc[luad_df['Luad_P_Value'] < 0.1]
e_sig_2 = e_df.loc[e_df['Endo_P_Value'] < 0.1]

# Create protein column
e_sig_2['Protein'], e_sig_2['Site'] = e_sig_2.Phospho.str.split('_', 1).str
luad_sig_2['Protein'], luad_sig_2['Site'] = luad_sig_2.Phospho.str.split('_', 1).str
c_df['Protein'], c_df['Site'] = c_df.Phospho.str.split('_', 1).str

# Get list of proteins sig in at least 1 cancer
e_list = list(e_sig_2.Protein)
luad_list = list(luad_sig_2.Protein)
sig_prot_list = list(set(e_list + luad_list))

In [9]:
# Get sites without enough data to do a t-test
cancer_obj = {'Endo':en, 'Colon':col, 'Luad':l}
mincount = 5

few_data = {}
for c in cancer_obj:
    print(c)
    mut_wt = all_phospho_format_df(cancer_obj[c], sig_prot_list) # give list of proteins to get all sites for those proteins 
    if cancer_obj[c].get_cancer_type() not in ('endometrial'): # endo has sites for all proteins - the others add cols of nan for the proteins wihtout sites
        mut_wt = mut_wt.dropna(axis = 'columns', how='all') # drop added protein columns (no sites in dataset)
    #(added with nan if not found in dataset -from sig_prot_list with join_mutations_to_omics)
    mut_wt_cols = list(mut_wt.columns[:-1])
    nd_list = find_few_data_genes(mut_wt, mincount, mut_wt_cols, gene = 'KRAS')
    few_data[c] = nd_list  

Endo
sites without enough data:  230 / 657
Colon
sites without enough data:  147 / 354
Luad
sites without enough data:  0 / 410


# Missing Proteins

Any protein without a site is missing from the phosproteomic data (no sites with data for that protein).

In [10]:
cancer_obj = {'Endo':en, 'Colon':col, 'Luad':l}
missing = {}
for c in cancer_obj:
    print(c)
    mut_wt = all_phospho_format_df(cancer_obj[c], sig_prot_list)
    null_columns = mut_wt.columns[mut_wt.isnull().all()]
    missing[c] = null_columns
    print(len(missing[c]))
    print(missing[c], '\n')

Endo
13
Index(['ARAF_S277', 'BRAF_S319', 'BRAF_S429', 'DAXX_S510', 'MAP3K11_S783',
       'MAP3K5_S955', 'MAP3K7_S310', 'MAP4K4_T59', 'PLA2G4A_S731',
       'RAPGEF2_S1442', 'RAPGEF2_T1436', 'RAPGEF2_T763', 'RPS6KA5_T700'],
      dtype='object') 

Colon
2
Index(['TRAF2', 'TRAF6'], dtype='object') 

Luad
1
Index(['DUSP9'], dtype='object') 



# Check Values

In [11]:
cancer_obj = en

In [12]:
# Get all sites of proteins in the MAPK pathway
test_mut_wt = all_phospho_format_df(cancer_obj, sig_prot_list)
df_cols = list(test_mut_wt.columns[:-1])
print(cancer_obj.get_cancer_type())
print('Num sites:', len(df_cols))
#df_cols[20:]

endometrial
Num sites: 658


In [13]:
 # Get list of sites without enough data
mincount = 5
#test_mut_wt = test_mut_wt.dropna(axis = 'columns', how = 'all') # drop sites with only nan
nd_sites = find_few_data_genes(test_mut_wt, mincount, gene = 'KRAS')

sites without enough data:  230 / 658


In [14]:
# Uncomment to use a different list of sites
#site = df_cols[342] # df_cols = all sites of proteins in MAPK pathway
site = nd_sites[40] # nd_sites = list of sites without enough data

print('Site:', site, '\n')
if site in (nd_genes): # Check if site in not_enough_data list (< mincount)
    print('NOT enough data')
else:
    print('Enough data')

Site: FLNA_S2279 



NameError: name 'nd_genes' is not defined

In [None]:
test_mut_wt = all_phospho_format_df(cancer_obj, sig_prot_list)
df_cols = list(mut_wt.columns[:-1])

site_df = test_mut_wt[[site,'Mutation']]
mut_df = site_df.loc[site_df['Mutation'] == 'Missense']
wt_df = site_df.loc[site_df['Mutation'] == 'Wildtype_Tumor']

print('Num of missense samples with phospho data:', len(mut_df[site].dropna()), '(<=', mincount, '?)')
print('Num of wt samples with phospho data:', len(wt_df[site].dropna()), '(<=', mincount, '?)')
mut_df

In [None]:
wt_df