# Find cell cycle genes without enough data to do a t-test in each cancer

In [1]:
import pandas as pd
import numpy as np
import re

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
print('cptac version:', cptac.version())

cptac version: 0.8.5


In [3]:
#import warnings
#warnings.filterwarnings('ignore')

In [4]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad', 'endometrial'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot, tissue_type = 'tumor')
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
            prot_and_mutations = rename_duplicate_cols(prot_and_mutations)
        
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    elif cancer_object.get_cancer_type() in ('luad'): 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot, tissue_type = 'tumor')
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = rename_duplicate_cols(omics)
        omics = omics.drop(columns='PTEN_CNV')
       
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
            mut_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')
        
    elif cancer_object.get_cancer_type() in ('endometrial'):

        # merge cnv with genotype all mut type
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = all_prot, tissue_type = 'tumor') # drop Normal samples

        prot_df = prot_and_mutations.iloc[:,:-4] #drop Mutation and Location cols
        mut_type = mut_type[['Mutation']] # Get Mutation col that includes CNV
        merged = prot_df.join(mut_type) # merge 

        # Create Truncation category and keep truncation and wt
        compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
                        mut_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
    
    mut_wt = mut_wt.rename(columns={col: re.sub(r'_proteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [5]:
# Adds '_i' to duplicate col names where i is a number (increases with each duplicate)

def rename_duplicate_cols(df):
    cols = pd.Series(df.columns[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.columns=cols
    return df

In [6]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that luad index is up-to-date...       



Checking that ovarian index is up-to-date...



                                            

In [7]:
'''
df: DataFrame containing omics data and a binary column.
mincount: Int of the minimum num of actual values needed to pass cutoff and not be listed in not_enough_data list. 
omics_cols: Names of columns to check if there is enough data. 

Returns: List of genes with not enough data (num of non-NaN values are less than mincount).
'''

def find_few_data_genes(df, mincount, omics_cols = None, binary_labels = ('Deletion','Wildtype_Tumor'), 
    binary_col = 'Mutation'):
    
    # Separate into binary groups
    label_1 = binary_labels[0]
    label_2 = binary_labels[1]
    partition1 = df.loc[df[binary_col] == label_1]
    partition2 = df.loc[df[binary_col] == label_2]
    
    # Get list of columns
    if omics_cols is None:
        omics_cols = list(df.columns)
        omics_cols.remove(binary_col)
    else:
        omics_cols = set(omics_cols)
        if binary_col in (omics_cols):
            omics_cols.remove(binary_col)
        
    # Append genes with less data than mincount (for mut or wt) to list
    not_enough_data = []
    for c in omics_cols:
        #print(len(partition1[c].dropna(axis='rows')))
        #print(len(partition2[c].dropna(axis='rows')))
        if len(partition1[c].dropna(axis='rows')) <= mincount: #non-nan vals less than min
            not_enough_data.append(c)
            continue
        elif len(partition2[c].dropna(axis='rows')) <= mincount:
            not_enough_data.append(c)
            continue
        
    
    print('genes with not enough data: ', len(not_enough_data), '/', len(omics_cols))        
    return not_enough_data

In [8]:
cancer_obj = {'Endo':en, 'Colon':col, 'Luad':l, 'Lscc': ls, 'Gbm': g, 'Ov': o, 'Brca': b, 'Hnscc': h}

In [9]:
prot_list = ['DCTN1', 'ACTR1A', 'MAD2L1', 'MCM4', 'PCNA', 'MCM5', 'MCM2','PCM1',
              'MCM7', 'TPR', 'MCM3', 'MCM6', 'NUP153', 'RFC3', 'CDK11B', 'XPO1','PRKCB', 'RFC2', 'NUF2', 
              'RAB8A', 'RFC4', 'RANBP2', 'CENPF', 'GINS4', 'NDC80', 'OPTN', 'PPP2R2A', 'PPP2R2D',
              'TPX2', 'GINS2', 'BUB1B', 'TOP2A', 'TOPBP1', 'TP53', 'NUP210', 'MSH2', 'MSH6']

In [10]:
mincount = 5
gene = 'PTEN'

few_data_prot = {}
for c in cancer_obj:
    print(c)
    #prot = cancer_obj[c].get_proteomics()
    #if isinstance(prot.keys(), pd.core.indexes.multi.MultiIndex):
    #    prot = u.reduce_multiindex(prot, levels_to_drop = 1)
    #prot_list = list(prot.columns)
    mut_wt = all_prot_format_df(cancer_obj[c], prot_list)
    if cancer_obj[c].get_cancer_type() in ('endometrial'):
        labels = ('Truncation', 'Wildtype_Tumor')
    else:
        labels = ('Deletion', 'Wildtype_Tumor')
    nd_list = find_few_data_genes(mut_wt, mincount, omics_cols = prot_list, binary_labels = labels)
    few_data_prot[c] = nd_list
    print('\n')


Endo


  return array(a, dtype, copy=False, order=order)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

genes with not enough data:  0 / 37


Colon



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


genes with not enough data:  7 / 37


Luad
genes with not enough data:  1 / 37


Lscc




genes with not enough data:  0 / 37


Gbm
genes with not enough data:  0 / 37


Ov




genes with not enough data:  0 / 37


Brca




genes with not enough data:  1 / 37


Hnscc
genes with not enough data:  0 / 37






In [11]:
few_data_prot

{'Endo': [],
 'Colon': ['NDC80', 'BUB1B', 'GINS2', 'TOPBP1', 'NUF2', 'CENPF', 'GINS4'],
 'Luad': ['XPO1'],
 'Lscc': [],
 'Gbm': [],
 'Ov': [],
 'Brca': ['DCTN1'],
 'Hnscc': []}

In [12]:
col = cptac.Colon()

                                          

In [13]:
cancer_obj = {'Endo':en, 'Colon':col, 'Luad':l, 'Lscc': ls, 'Gbm': g, 'Ov': o, 'Brca': b, 'Hnscc': h}
missing = {}
for c in cancer_obj:
    print(c)
    mut_wt = all_prot_format_df(cancer_obj[c], prot_list)
    null_columns = mut_wt.columns[mut_wt.isnull().all()]
    missing[c] = null_columns

Endo


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Colon



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Luad
Lscc




Gbm
Ov




Brca




Hnscc


In [14]:
missing

{'Endo': Index([], dtype='object', name='Name'),
 'Colon': Index(['NUF2', 'CENPF', 'GINS2', 'BUB1B', 'TOPBP1'], dtype='object', name='Name'),
 'Luad': Index(['XPO1'], dtype='object', name='Name'),
 'Lscc': Index([], dtype='object', name='Name'),
 'Gbm': Index([], dtype='object', name='Name'),
 'Ov': Index([], dtype='object', name='Name'),
 'Brca': Index(['DCTN1'], dtype='object', name='Name'),
 'Hnscc': Index([], dtype='object', name='Name')}

In [15]:
BUB1B, CENPF, GINS2, NUF2, TOPBP1 

NameError: name 'BUB1B' is not defined

In [None]:
mut_wt = all_prot_format_df(col, prot_list)
mut_wt

In [None]:
mut_wt

In [None]:
null_columns = mut_wt.columns[mut_wt.isnull().all()]
mut_wt[mut_wt.isnull().any(axis=1)][null_columns].head()
null_columns

In [None]:
# checks

In [None]:
# Get df
cancer_obj = col
prot = cancer_obj.get_proteomics()
if cancer_obj == l:
    prot = u.reduce_multiindex(prot, levels_to_drop = 1)
prot_list = list(prot.columns)
#prot_list = list(u.get_interacting_proteins_biogrid('KRAS', num_results=100))
#prot_list = ['KRAS']

mut_wt = all_prot_format_df(cancer_obj, prot_list)

mut_wt_cols = list(mut_wt.columns[:-1])
#print('Last column name:', mut_wt_cols[-1])

In [None]:
# Test
mincount = 0
nd_genes = find_few_data_genes(mut_wt, mincount, prot_list, gene = 'PTEN')
print(len(nd_genes))

In [None]:
nd_genes[5]

In [None]:
gene = 'CCDC140' # gene in not_enough_data list (< mincount)

prot = cancer_obj.get_proteomics()
if cancer_obj == l:
    prot = u.reduce_multiindex(prot, levels_to_drop = 1)
test_prot_list = list(prot.columns)
test_mut_wt = all_prot_format_df(cancer_obj, test_prot_list)

gene_df = test_mut_wt[[gene,'Mutation']]
mut_df = gene_df.loc[gene_df['Mutation'] == 'Missense']
wt_df = gene_df.loc[gene_df['Mutation'] == 'Wildtype_Tumor']

print('Num of missense with proteomics data:', len(mut_df[gene].dropna()), '(<=', mincount, '?)')
print('Num of wt with proteomics data:', len(wt_df[gene].dropna()), '(<=', mincount, '?)')
mut_df