# Find immune proteins without enough data for a t-test

In [1]:
import pandas as pd
import numpy as np
import re

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
print('cptac version:', cptac.version())

cptac version: 0.8.5


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
'''
Params
cancer_object: Object of the loaded cancer data set.
all_prot: List of trans proteins used to get proteomics data. 
gene_in: String. Gene used to get mutation status for the Mutation col.
utils: utils package from cptac.  

Returns a dataframe with trans proteomics and mutation status of a specific gene.
'''

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad', 'endometrial'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot, tissue_type = 'tumor')
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
            prot_and_mutations = rename_duplicate_cols(prot_and_mutations)
        
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    elif cancer_object.get_cancer_type() in ('luad'): 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot, tissue_type = 'tumor')
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = rename_duplicate_cols(omics)
        omics = omics.drop(columns='PTEN_CNV')
       
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
            mut_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')
        
    elif cancer_object.get_cancer_type() in ('endometrial'):

        # merge cnv with genotype all mut type
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = all_prot, tissue_type = 'tumor') # drop Normal samples

        prot_df = prot_and_mutations.iloc[:,:-4] #drop Mutation and Location cols
        mut_type = mut_type[['Mutation']] # Get Mutation col that includes CNV
        merged = prot_df.join(mut_type) # merge 

        # Create Truncation category and keep truncation and wt
        compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
                        mut_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
    
    mut_wt = mut_wt.rename(columns={col: re.sub(r'_proteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [5]:
''' Adds '_i' to duplicate col names where the int i increases by one with each duplicate
Returns a df with unique column names. '''

def rename_duplicate_cols(df):
    cols = pd.Series(df.columns[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.columns=cols
    return df

In [6]:
'''
df: DataFrame containing omics data and a binary column.
mincount: Int of the minimum num of actual values needed to pass cutoff and not be listed in not_enough_data list. 
omics_cols: Names of columns to check if there is enough data. 

Returns: List of genes with not enough data (num of non-NaN values are less than mincount).
'''

def find_few_data_genes(df, mincount, omics_cols = None, binary_labels = ('Deletion','Wildtype_Tumor'), 
    binary_col = 'Mutation'):
    
    # Separate into binary groups
    label_1 = binary_labels[0]
    label_2 = binary_labels[1]
    partition1 = df.loc[df[binary_col] == label_1]
    partition2 = df.loc[df[binary_col] == label_2]
    
    # Get list of columns
    if omics_cols is None:
        omics_cols = list(df.columns)
        omics_cols.remove(binary_col)
    else:
        omics_cols = set(omics_cols)
        if binary_col in (omics_cols):
            omics_cols.remove(binary_col)
        
    # Append genes with less data than mincount (for mut or wt) to list
    not_enough_data = []
    for c in omics_cols:
        #print(len(partition1[c].dropna(axis='rows')))
        #print(len(partition2[c].dropna(axis='rows')))
        if len(partition1[c].dropna(axis='rows')) <= mincount: #non-nan vals less than min
            not_enough_data.append(c)
            continue
        elif len(partition2[c].dropna(axis='rows')) <= mincount:
            not_enough_data.append(c)
            continue
        
    
    print('genes with not enough data: ', len(not_enough_data), '/', len(omics_cols))        
    return not_enough_data

Load cancer data sets.

In [7]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

                                                

# Step 1: Find proteins without enough data to do a t-test

Use the function find_few_data_genes to find genes with less than 5 nonNaN measurements in sample groups of PTEN deletion or PTEN wt.

In [8]:
cancer_obj = {'EC':en, 'CO':col, 'LUAD':l, 'LSCC': ls, 'GBM': g, 'OV': o, 'BR': b, 'HNSCC': h}

In [10]:
# ordered and grouped proteins
immune_pten = ['BTK', 'BLNK', 'CD2', 'CD4', 'CD5', 'DOCK2', 'ELMO1', 'FYB1', 'GRB2', 'INPP5D', 'LCK', 
               'NFATC2', 'NFKB2', 'PIK3CD', 'PIK3CG', 'PLCG1', 'PLCG2', 'PRKCB', 'PRKCQ', 'PTPRC',
               'RAC2', 'SYK', 'VAV1', 'WAS', 'ZAP70', 'PRKCB', 'CHUK']
len(immune_pten)

27

In [11]:
mincount = 5
gene = 'PTEN'
prot_list = immune_pten

few_data_prot = {}
for c in cancer_obj:
    print(c)
    mut_wt = all_prot_format_df(cancer_obj[c], prot_list)
    if cancer_obj[c].get_cancer_type() in ('endometrial'):
        labels = ('Truncation', 'Wildtype_Tumor')
    else:
        labels = ('Deletion', 'Wildtype_Tumor')
    nd_list = find_few_data_genes(mut_wt, mincount, omics_cols = prot_list, binary_labels = labels)
    few_data_prot[c] = nd_list

EC
genes with not enough data:  0 / 26
CO
genes with not enough data:  5 / 26
LUAD
genes with not enough data:  0 / 26
LSCC
genes with not enough data:  0 / 26
GBM
genes with not enough data:  2 / 26
OV
genes with not enough data:  2 / 26
BR
genes with not enough data:  1 / 26
HNSCC
genes with not enough data:  0 / 26


In [12]:
# Print results
few_data_prot

{'EC': [],
 'CO': ['PRKCQ', 'FYB1', 'BLNK', 'PIK3CD', 'PIK3CG'],
 'LUAD': [],
 'LSCC': [],
 'GBM': ['LCK', 'CD5'],
 'OV': ['CD2', 'LCK'],
 'BR': ['FYB1'],
 'HNSCC': []}

# Step 2: Find proteins with only NaN data

In [13]:
missing = {}
for c in cancer_obj:
    mut_wt = all_prot_format_df(cancer_obj[c], prot_list)
    null_columns = mut_wt.columns[mut_wt.isnull().all()]
    missing[c] = null_columns
    
# print results
missing

{'EC': Index([], dtype='object', name='Name'),
 'CO': Index(['FYB1', 'PIK3CG'], dtype='object', name='Name'),
 'LUAD': Index([], dtype='object', name='Name'),
 'LSCC': Index([], dtype='object', name='Name'),
 'GBM': Index(['CD5', 'LCK'], dtype='object', name='Name'),
 'OV': Index(['CD2', 'LCK'], dtype='object', name='Name'),
 'BR': Index(['FYB1'], dtype='object', name='Name'),
 'HNSCC': Index([], dtype='object', name='Name')}

Check results for Colon.

In [14]:
# check missing genes in colon
mut_wt = all_prot_format_df(col, prot_list)
null_columns = mut_wt.columns[mut_wt.isnull().all()]
mut_wt[mut_wt.isnull().any(axis=1)][null_columns].head()
null_columns

Index(['FYB1', 'PIK3CG'], dtype='object', name='Name')

In [None]:
# Checks

In [16]:
df = pd.read_csv("../Make_Tables/csv/all_heatmap.csv")

In [17]:
df.loc[df['Proteomics'] == 'FYB1']

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
1459,FYB1,0.032718,-0.847324,GBM
11325,FYB1,0.018251,-0.644981,HNSCC
23159,FYB1,0.087517,-0.91735,LUAD
35577,FYB1,0.304008,-0.76585,LSCC
61881,FYB1,0.985704,0.131534,OV
66946,FYB1,0.113205,-0.53,EC
