# Find cell cycle genes without enough data to do a t-test in each cancer

In [1]:
import pandas as pd
import numpy as np
import re

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
#import warnings
#warnings.filterwarnings('ignore')

In [34]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad', 'endometrial'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot, tissue_type = 'tumor')
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
            prot_and_mutations = rename_duplicate_cols(prot_and_mutations)
        
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    elif cancer_object.get_cancer_type() in ('luad'): 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot, tissue_type = 'tumor')
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = rename_duplicate_cols(omics)
        omics = omics.drop(columns='PTEN_CNV')
       
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
            mut_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')
        
    elif cancer_object.get_cancer_type() in ('endometrial'):

        # merge cnv with genotype all mut type
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = all_prot, tissue_type = 'tumor') # drop Normal samples

        prot_df = prot_and_mutations.iloc[:,:-4] #drop Mutation and Location cols
        mut_type = mut_type[['Mutation']] # Get Mutation col that includes CNV
        merged = prot_df.join(mut_type) # merge 

        # Create Truncation category and keep truncation and wt
        compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
                        mut_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
    
    mut_wt = mut_wt.rename(columns={col: re.sub(r'_proteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [4]:
# Adds '_i' to duplicate col names where i is a number (increases with each duplicate)

def rename_duplicate_cols(df):
    cols = pd.Series(df.columns[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.columns=cols
    return df

In [5]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that luad index is up-to-date...       



Checking that ovarian index is up-to-date...



                                            

In [6]:
'''
df: DataFrame containing omics data and a binary column.
mincount: Int of the minimum num of actual values needed to pass cutoff and not be listed in not_enough_data list. 
omics_cols: Names of columns to check if there is enough data. 

Returns: List of genes with not enough data (num of non-NaN values are less than mincount).
'''

def find_few_data_genes(df, mincount, omics_cols = None, binary_labels = ('Deletion','Wildtype_Tumor'), 
    binary_col = 'Mutation'):
    
    # Separate into binary groups
    label_1 = binary_labels[0]
    label_2 = binary_labels[1]
    partition1 = df.loc[df[binary_col] == label_1]
    partition2 = df.loc[df[binary_col] == label_2]
    
    # Get list of columns
    if omics_cols is None:
        omics_cols = list(df.columns)
        omics_cols.remove(binary_col)
    else:
        omics_cols = set(omics_cols)
        if binary_col in (omics_cols):
            omics_cols.remove(binary_col)
        
    # Append genes with less data than mincount (for mut or wt) to list
    not_enough_data = []
    for c in omics_cols:
        #print(len(partition1[c].dropna(axis='rows')))
        #print(len(partition2[c].dropna(axis='rows')))
        if len(partition1[c].dropna(axis='rows')) <= mincount: #non-nan vals less than min
            not_enough_data.append(c)
            continue
        elif len(partition2[c].dropna(axis='rows')) <= mincount:
            not_enough_data.append(c)
            continue
        
    
    print('genes with not enough data: ', len(not_enough_data), '/', len(omics_cols))        
    return not_enough_data

In [7]:
cancer_obj = {'Endo':en, 'Colon':col, 'Luad':l, 'Lscc': ls, 'Gbm': g, 'Ov': o, 'Brca': b, 'Hnscc': h}

In [20]:
prot_list = ['DCTN1', 'ACTR1A', 'MAD2L1', 'MCM4', 'PCNA', 'MCM5', 'MCM2','PCM1',
              'MCM7', 'TPR', 'MCM3', 'MCM6', 'NUP153', 'RFC3', 'CDK11B', 'XPO1','PRKCB', 'RFC2', 'NUF2', 
              'RAB8A', 'RFC4', 'RANBP2', 'CENPF', 'GINS4', 'NDC80', 'OPTN', 'PPP2R2A', 'PPP2R2D',
              'TPX2', 'GINS2', 'BUB1B', 'TOP2A', 'TOPBP1', 'TP53', 'NUP210', 'MSH2', 'MSH6']

In [9]:
mincount = 5
gene = 'PTEN'

few_data_prot = {}
for c in cancer_obj:
    print(c)
    #prot = cancer_obj[c].get_proteomics()
    #if isinstance(prot.keys(), pd.core.indexes.multi.MultiIndex):
    #    prot = u.reduce_multiindex(prot, levels_to_drop = 1)
    #prot_list = list(prot.columns)
    mut_wt = all_prot_format_df(cancer_obj[c], prot_list)
    if cancer_obj[c].get_cancer_type() in ('endometrial'):
        labels = ('Truncation', 'Wildtype_Tumor')
    else:
        labels = ('Deletion', 'Wildtype_Tumor')
    nd_list = find_few_data_genes(mut_wt, mincount, omics_cols = prot_list, binary_labels = labels)
    few_data_prot[c] = nd_list
    print('\n')


Endo


  return array(a, dtype, copy=False, order=order)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


genes with not enough data:  0 / 37


Colon


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


genes with not enough data:  7 / 37


Luad
genes with not enough data:  1 / 37


Lscc




genes with not enough data:  0 / 37


Gbm
genes with not enough data:  0 / 37


Ov




genes with not enough data:  0 / 37


Brca
genes with not enough data:  1 / 37


Hnscc




genes with not enough data:  0 / 37






In [10]:
few_data_prot

{'Endo': [],
 'Colon': ['NUF2', 'GINS4', 'CENPF', 'NDC80', 'BUB1B', 'TOPBP1', 'GINS2'],
 'Luad': ['XPO1'],
 'Lscc': [],
 'Gbm': [],
 'Ov': [],
 'Brca': ['DCTN1'],
 'Hnscc': []}

In [42]:
col = cptac.Colon()

                                          

In [43]:
cancer_obj = {'Endo':en, 'Colon':col, 'Luad':l, 'Lscc': ls, 'Gbm': g, 'Ov': o, 'Brca': b, 'Hnscc': h}
missing = {}
for c in cancer_obj:
    print(c)
    mut_wt = all_prot_format_df(cancer_obj[c], prot_list)
    null_columns = mut_wt.columns[mut_wt.isnull().all()]
    missing[c] = null_columns

Endo


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Colon




Luad
Lscc


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Gbm
Ov




Brca




Hnscc




In [44]:
missing

{'Endo': Index([], dtype='object', name='Name'),
 'Colon': Index(['NUF2', 'CENPF', 'GINS2', 'BUB1B', 'TOPBP1'], dtype='object', name='Name'),
 'Luad': Index(['XPO1'], dtype='object', name='Name'),
 'Lscc': Index([], dtype='object', name='Name'),
 'Gbm': Index([], dtype='object', name='Name'),
 'Ov': Index([], dtype='object', name='Name'),
 'Brca': Index(['DCTN1'], dtype='object', name='Name'),
 'Hnscc': Index([], dtype='object', name='Name')}

In [None]:
BUB1B, CENPF, GINS2, NUF2, TOPBP1 

In [48]:
mut_wt = all_prot_format_df(col, prot_list)
mut_wt

  return array(a, dtype, copy=False, order=order)


Name,DCTN1,ACTR1A,MAD2L1,MCM4,PCNA,MCM5,MCM2,PCM1,MCM7,TPR,...,TPX2,GINS2,BUB1B,TOP2A,TOPBP1,TP53,NUP210,MSH2,MSH6,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01CO001,,,,,,,,,,,...,,,,,,,,,,Wildtype_Tumor
01CO005,-0.2120,0.0468,-0.0627,0.1760,0.0313,0.0517,0.0363,-0.2020,0.258,0.1740,...,-0.277,,,-0.0488,,-0.489,0.5620,0.3090,0.4610,Deletion
01CO006,-0.1250,0.0500,-0.1620,0.2420,-0.0178,0.1790,0.2330,0.0144,0.215,0.0299,...,,,,-0.6340,,-0.796,-0.3520,0.2270,0.0907,Wildtype_Tumor
01CO008,-0.2510,-0.3780,-0.0747,-0.4790,-0.3960,-0.3320,-0.4490,0.1920,-0.368,0.3660,...,,,,-0.6550,,,0.2520,0.1140,0.0143,Wildtype_Tumor
01CO013,-0.4470,-0.4460,1.0600,0.7490,1.0100,0.9190,0.7990,-0.4050,0.927,0.4820,...,0.468,,,0.5180,,-1.390,0.0852,0.6560,0.6460,Wildtype_Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21CO007,-0.2500,-0.4660,0.0842,0.3090,0.7900,0.4900,0.6050,-0.5170,0.565,0.2520,...,0.642,,,0.0887,,,-0.1130,0.3840,0.3030,Wildtype_Tumor
22CO004,-0.2610,-0.5280,0.1390,-0.1260,0.4260,-0.1300,-0.1700,-0.5210,-0.456,0.1550,...,0.438,,,0.1710,,0.850,0.0758,0.0689,0.0213,Wildtype_Tumor
22CO006,-0.0767,0.1290,0.1260,-0.1990,0.1890,-0.0601,-0.1810,0.0689,-0.172,0.0756,...,,,,0.1450,,,0.7110,-0.2720,-0.1750,Wildtype_Tumor
24CO005,,,,,,,,,,,...,,,,,,,,,,Wildtype_Tumor


In [36]:
mut_wt

Name,DCTN1,ACTR1A,MAD2L1,MCM4,PCNA,MCM5,MCM2,PCM1,MCM7,TPR,...,TPX2,GINS2,BUB1B,TOP2A,TOPBP1,TP53,NUP210,MSH2,MSH6,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,-0.2660,-0.01850,0.639,0.759,0.800,0.6890,0.6150,0.267,0.8020,-0.0715,...,0.925,1.68000,1.420000,1.380,0.3830,0.2950,0.6940,-0.7670,-1.950,Truncation
C3L-00032,0.0384,0.21200,0.309,-0.440,-0.285,-0.2580,-0.0715,0.727,-0.2180,-0.2350,...,-0.786,-0.06140,0.021800,0.168,-0.1510,-0.8710,-0.0541,-0.1170,-0.144,Truncation
C3L-00098,-0.4350,-0.47900,0.294,1.660,1.220,1.5900,1.4100,-0.308,1.7000,0.1700,...,2.090,0.30900,0.936000,3.280,0.8440,3.0100,-0.5780,0.2110,0.407,Wildtype_Tumor
C3L-00137,-0.0387,0.00114,0.837,0.913,0.469,1.0600,0.8550,0.248,1.2100,0.0681,...,0.422,1.16000,0.940000,0.629,0.4180,0.4410,1.1600,0.6680,0.450,Truncation
C3L-00139,-0.2770,-0.50400,0.520,1.560,1.570,1.6800,1.5100,0.386,1.3500,0.2580,...,2.130,-0.01940,1.330000,3.120,0.9720,-1.2200,1.3600,0.6790,0.828,Wildtype_Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01520,-0.0934,0.10300,0.589,0.728,0.663,0.6760,0.4810,0.640,0.4520,-0.1380,...,0.265,0.79300,0.195000,1.000,0.0932,-0.2570,0.5130,-0.1980,0.510,Truncation
C3N-01521,-0.3890,-0.97900,0.191,0.300,0.822,0.0119,0.2310,-0.458,-0.0943,0.3050,...,0.620,0.67500,0.000046,0.384,0.4400,1.2000,0.9370,0.1570,0.563,Wildtype_Tumor
C3N-01537,-0.3070,-0.77300,0.648,0.836,-0.069,0.4240,0.7840,-0.254,0.7000,-0.0173,...,1.230,-0.00635,1.150000,0.964,0.5770,-0.0333,0.7190,-0.0961,0.176,Wildtype_Tumor
C3N-01802,-0.4780,0.14000,-0.755,-0.119,0.368,-0.1780,0.1380,-0.713,0.4100,0.0605,...,1.040,0.19700,0.627000,0.668,0.7550,0.2920,1.3100,-0.2970,-0.332,Wildtype_Tumor


In [37]:
null_columns = mut_wt.columns[mut_wt.isnull().all()]
mut_wt[mut_wt.isnull().any(axis=1)][null_columns].head()
null_columns

Index([], dtype='object', name='Name')

In [12]:
# checks

In [13]:
# Get df
cancer_obj = col
prot = cancer_obj.get_proteomics()
if cancer_obj == l:
    prot = u.reduce_multiindex(prot, levels_to_drop = 1)
prot_list = list(prot.columns)
#prot_list = list(u.get_interacting_proteins_biogrid('KRAS', num_results=100))
#prot_list = ['KRAS']

mut_wt = all_prot_format_df(cancer_obj, prot_list)

mut_wt_cols = list(mut_wt.columns[:-1])
#print('Last column name:', mut_wt_cols[-1])



In [14]:
# Test
mincount = 0
nd_genes = find_few_data_genes(mut_wt, mincount, prot_list, gene = 'PTEN')
print(len(nd_genes))

TypeError: find_few_data_genes() got an unexpected keyword argument 'gene'

In [None]:
nd_genes[5]

In [None]:
gene = 'CCDC140' # gene in not_enough_data list (< mincount)

prot = cancer_obj.get_proteomics()
if cancer_obj == l:
    prot = u.reduce_multiindex(prot, levels_to_drop = 1)
test_prot_list = list(prot.columns)
test_mut_wt = all_prot_format_df(cancer_obj, test_prot_list)

gene_df = test_mut_wt[[gene,'Mutation']]
mut_df = gene_df.loc[gene_df['Mutation'] == 'Missense']
wt_df = gene_df.loc[gene_df['Mutation'] == 'Wildtype_Tumor']

print('Num of missense with proteomics data:', len(mut_df[gene].dropna()), '(<=', mincount, '?)')
print('Num of wt with proteomics data:', len(wt_df[gene].dropna()), '(<=', mincount, '?)')
mut_df