# get_frequently_mutated Final 

In [1]:
import cptac
en = cptac.Endometrial()
#ov = cptac.Ovarian()
#cancer = cptac.Colon()

import pandas as pd
import numpy as np

In [None]:
def get_frequently_mutated(cancer_type, cutoff = 0.1):  
    """
    take cancer object and find the frequently 
    mutated genes in the total tumors compared to the cutoff.

    Parameters:
    cancer_type (object): cancer class object from cptac module 
    cutoff (float): used as comparison to determine the 
                    status of gene mutation frequency

    Returns:
    freq_mutated_df (pd.DataFrame): DataFrame of frequently 
                mutated genes passing the cutoff. Columns 
                contain the fractions of total unique 
                mutations,missence type mutations, 
                and truncation type mutations per gene.
    
    The Missence_Mut column includes: 
        In_Frame_Del, In_Frame_Ins, Missense_Mutation
   
   The Truncation_Mut column includes: 
        Frame_Shift_Del, Frame_Shift_Ins, Splice_Site, 
        Nonsense_Mutation, Nonstop_Mutation
        
    These columns count multiple mutations of one gene in the 
    same sample, so fractions in the last two columns may 
    exceed the Unique_Samples_Mut column which only counts if 
    the gene was mutated once per sample."""    
    
    # get data frame
    somatic_mutations = cancer_type.get_mutations()
    sample_status_map = cancer_type._get_sample_status_map()
         
    '''Get total tumors/patients'''
    sample_status_series = sample_status_map.value_counts().reset_index()
    tumors = sample_status_series.loc[sample_status_series['index'] == 'Tumor']
    total_tumor_patients = tumors.iloc[0,1]
    print(total_tumor_patients)
        
    '''Find frequently mutated genes and their fraction of unique mutated samples.'''
    #move 'Sample_ID' from index to col 
    origin_df = somatic_mutations.reset_index()

    #group by gene and count unique samples
    genes_df = origin_df.groupby(['Gene']).nunique()

    #format
    genes_df = genes_df.rename(columns={"Sample_ID": "Unique_Samples_Mut"})
    genes_df = genes_df.drop(['Gene', 'Mutation','Location'], axis = 1)
    
    #filter using the cutoff and create fraction
    genes_df = genes_df.apply(lambda x: x / total_tumor_patients)
    filtered = genes_df.where(lambda x: x > cutoff)
    filtered_gene_df = filtered.dropna()
    
    '''Create Missence and Trucation data frame'''
    #create two categories in Mutation column
    '''
    if cancer_type.get_tumor_type() == 'Colon':
        mutation_equivalents = {'frameshift substitution': 'T' , 'frameshift deletion': 'T', 
            'frameshift insertion': 'T', 'stopgain': 'T ', 'stoploss':'T',
            'nonsynonymous SNV': 'M','nonframeshift insertion': 'M',
            'nonframeshift deletion': 'M', 'nonframeshift substitution': 'M'}
        replaced_M_T = origin_df.replace(mutation_equivalents)
        
    else:   
    
    '''
    missence_mut = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M', 'Missense_Mutation': 'M'}
    truncation_mut = {'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T', 
                      'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}
    replaced_M = origin_df.replace(missence_mut)
    replaced_M_T = replaced_M.replace(truncation_mut)
    
    # group mutation categories
    miss = replaced_M_T.loc[replaced_M_T['Mutation'] == 'M']
    trunc = replaced_M_T.loc[replaced_M_T['Mutation'] == 'T']

    # group by gene and count unique samples for both categories
    miss_df = miss.groupby(['Gene']).nunique()
    trunc_df = trunc.groupby(['Gene']).nunique()

    #format
    miss_df = miss_df.rename(columns={"Sample_ID": "Missence_Mut"})
    miss_df = miss_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)

    trunc_df = trunc_df.rename(columns={"Sample_ID": "Truncation_Mut"})
    trunc_df = trunc_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)

    #join miss and trunc and change nan to 0, then divide by total tumors
    mut = miss_df.join(trunc_df).fillna(0)
    missence_and_truncation_df = mut.apply(lambda x: x / total_tumor_patients)


    '''Join data frames, keeping only the genes that passed the cutoff''' 
    freq_mutated_df = filtered_gene_df.join(missence_and_truncation_df).reset_index()
    freq_mutated_df.name = 'frequently_mutated'
                   
    return freq_mutated_df

In [4]:
import unittest

class TestGetFreqMut(unittest.TestCase):
    
    def setUp(self):
        self.get_frequently_mutated('endometrial', .3)


unittest.main(argv=[''], verbosity=2, exit=False)

test_colon_input (__main__.Testget_frequently_mutated) ... ERROR
test_wrong_input (__main__.Testget_frequently_mutated) ... 

Please enter a valid cancer type. 
Options: endometrial, ovarian, colon


ok

ERROR: test_colon_input (__main__.Testget_frequently_mutated)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-3-cd61a03f403b>", line 9, in test_colon_input
    self.assertTrue(get_frequently_mutated('colon'))
  File "C:\Users\brittany henderson\Anaconda3\envs\Research\lib\unittest\case.py", line 690, in assertTrue
    if not expr:
  File "C:\Users\brittany henderson\Anaconda3\envs\Research\lib\site-packages\pandas\core\generic.py", line 1478, in __nonzero__
    .format(self.__class__.__name__))
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

----------------------------------------------------------------------
Ran 2 tests in 128.438s

FAILED (errors=1)


<unittest.main.TestProgram at 0x2330e513fd0>