In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats

import cptac
import cptac.algorithms as al

In [None]:
def format_cis_comparison_data(cancer_object, specific_omics, gene, compare_mut_type = False):
    
    if compare_mut_type = True:
        
    else:    
        # Step 1 - Create dataframe in order to do comparisons with wrap_ttest - drop nan values
        omics_and_mutations = cancer_object.append_mutations_to_omics(
            mutation_genes = gene, omics_df_name = specific_omics.name, omics_genes = gene).dropna()

    # Check if values in omics data (if not found in proteomics, after na dropped dataframe should be empty)
    if omics_and_mutations[gene+"_"+specific_omics.name].empty:
        print('Not possible to do T-test. No data for', gene, 'in', specific_omics.name)
        return None
    else:
        
        # Step 2 - Create the binary column needed to do the comparison
        omics_and_mutations['binary_mutations'] = np.where(
            omics_and_mutations[gene+'_Mutation_Status'] == 'Wildtype_Tumor', 'Wildtype', 'Mutated')

        # Step 3 - Format the dataframe correctly for the T-test(just omics and binary columns for tumors)
        tumors = omics_and_mutations.loc[omics_and_mutations['Sample_Status'] == 'Tumor']
        columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
        omics_binary_mutations = tumors.drop(columns_to_drop, axis = 1)
        #check if only one column of omics data (total 2 columns)
        if len(omics_binary_mutations.columns) != 2:
            print('exeption with columns. check omics data')
            return None
        else:
            # Make a list of the column label of omics to be used in the wrap_ttest function
            omics_col_list = [omics_binary_mutations.columns[0]] 

            # Step 4 - T-test comparing means of mutated vs wildtype effect on cis omics
            print("Doing t-test comparison\n")
            significant_pval_results = al.wrap_ttest(omics_binary_mutations, 'binary_mutations', omics_col_list)
            print(significant_pval_results)

            formated_data_for_boxplot = {'data': omics_binary_mutations, 'x': "binary_mutations", 
                                         'y': gene+"_proteomics", 'pval': significant_pval_results}
            return formated_data_for_boxplot

  

In [71]:
def get_missence_truncation_comparison(cancer_object, specific_omics, gene):
    #get omics data and tumors
    omics_and_mutations = cancer_object.append_mutations_to_omics(
            mutation_genes = gene, omics_df_name = specific_omics.name, omics_genes = gene)
    tumors = omics_and_mutations.loc[omics_and_mutations['Sample_Status'] == 'Tumor']
    
    #data for mutation comparison
    somatic_mutations = cancer_object.get_mutations().reset_index()
    gene_df = somatic_mutations.loc[somatic_mutations['Gene'] == gene]
    
    if cancer_object.get_cancer_type() == 'colon':
        missence_truncation_groups = {'frameshift substitution': 'T', 
            'frameshift deletion': 'T', 'frameshift insertion': 'T', 
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M', 
            'nonframeshift substitution': 'M'}
    else: 
        missence_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T', 
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}
        
    mutations_replaced_M_T = gene_df.replace(missence_truncation_groups)
    
    # group mutation categories
    miss = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'M']
    trunc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'T']
    
    #get lists of unique samples for missence and trucation categories
    miss_unique_samples = list(miss['Sample_ID'].unique())
    trunc_unique_samples = list(trunc['Sample_ID'].unique())
    
    # Step 2 - Create the binary column needed to do the comparison
    # Get mutation catagories with omics data
    missence_omics = tumors.loc[tumors.index.isin(miss_unique_samples)]
    missence_omics['binary_mutations'] = 'M'
    truncation_omics = tumors.loc[tumors.index.isin(trunc_unique_samples)]
    truncation_omics['binary_mutations'] = 'T'
    binary_mut_omics = missence_omics.append(truncation_omics)
    
    # Step 3 - Format the dataframe correctly for the T-test(just omics and binary columns for tumors)
    columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
    omics_binary_mutations = binary_mut_omics.drop(columns_to_drop, axis = 1)
    
    # Make a list of the column label of omics to be used in the wrap_ttest function
    omics_col_list = [omics_binary_mutations.columns[0]] 

    # Step 4 - T-test comparing means of mutated vs wildtype effect on cis omics
    print("Doing t-test comparison\n")
    significant_pval_results = al.wrap_ttest(omics_binary_mutations, 'binary_mutations', omics_col_list)
    print(significant_pval_results)

    formated_data_for_boxplot = {'data': omics_binary_mutations, 'x': "binary_mutations", 
                                 'y': gene+"_"+specific_omics.name, 'pval': significant_pval_results}
    return formated_data_for_boxplot


In [8]:
en_object = cptac.Endometrial()
desired_cutoff = .1

endometrial_freq_mut = al.get_frequently_mutated(en_object, cutoff=desired_cutoff)
print('\n\nNumber of Frequently Mutated Genes:', len(endometrial_freq_mut), '\n', endometrial_freq_mut.head())

[Kmatting dataframes...linear data.....

Number of Frequently Mutated Genes: 232 
      Gene  Unique_Samples_Mut  Missence_Mut  Truncation_Mut
0  ABCA12            0.147368      0.094737        0.073684
1  ABCA13            0.115789      0.105263        0.042105
2  ACVR2A            0.105263      0.010526        0.094737
3  ADGRG4            0.136842      0.126316        0.021053
4  ADGRV1            0.115789      0.094737        0.052632


In [9]:
omics = en_object.get_proteomics()
#omics = en_object.get_transcriptomics()
#omics = en_object.get_phosphoproteomics()
#omics = en_object.get_acetylproteomics()

In [73]:
mutation_type_comparison = get_missence_truncation_comparison(en_object, omics, 'ARID1A')

Doing t-test comparison

No significant comparisons.
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
