# get_frequently_mutated Final 

In [1]:
def get_frequently_mutated(cancer_type, cutoff=.1):
    """Find the genes mutated in more than `cutoff` of a CPTAC cohort's samples.

    Imports the cptac sub-module for the requested cancer type, loads its somatic
    mutation table and sample status map, and reports every gene whose fraction of
    mutated samples is strictly greater than `cutoff`.

    Parameters:
    cancer_type (string): 'endometrial', 'colon' or 'ovarian' (any letter case)
    cutoff (float): a gene is reported only if its mutated fraction is > cutoff

    Returns:
    freq_mutated_df (pd.DataFrame): one row per gene passing the cutoff, with columns
        'Gene', 'Fraction_Mutated', 'Fraction_Truncation' and 'Fraction_Missence'.
        If cancer_type is not recognized, a message is printed and 0 is returned instead.

    Every fraction uses the same denominator: the count of the most common value in
    the sample status map (assumed to be the tumor samples).

    Mutation types are split into two categories:
    'Fraction_Missence' covers In_Frame_Del, In_Frame_Ins and Missense_Mutation.
    'Fraction_Truncation' covers every other mutation type (e.g. Frame_Shift_Del,
    Frame_Shift_Ins, Splice_Site, Nonsense_Mutation, Nonstop_Mutation).
    A sample carrying both kinds of mutation in the same gene is counted only under
    truncation, so the two category fractions add up to 'Fraction_Mutated'.
    """

    import pandas as pd

    # Accept any letter case; non-string input falls through to the error branch
    # below instead of raising.
    cancer_key = cancer_type.lower() if isinstance(cancer_type, str) else None

    # import the cptac data set for the requested cancer type
    if cancer_key == "endometrial":
        import cptac.endometrial as cptac
    elif cancer_key == "colon":
        import cptac.colon as cptac
    elif cancer_key == "ovarian":
        import cptac.ovarian as cptac
    else:
        str_cancer_options = '\n' + 'Options: endometrial, ovarian, colon'
        print("Please enter a valid cancer type.", str_cancer_options)
        return 0

    # get data frames; the mutation table is indexed by sample, so unique index
    # values are used below to count samples
    somatic_mutations = cptac.get_mutations()
    sample_status_map = cptac.get_sample_status_map()
    merged_mutations = somatic_mutations.join(sample_status_map, how="left")

    # standardize mutation names: the colon data uses a different vocabulary
    if cancer_key == "colon":
        mutation_equivalents = {
            'frameshift substitution': 'Frame_Shift_Del',
            'frameshift deletion': 'Frame_Shift_Del',
            'frameshift insertion': 'Frame_Shift_Ins',
            'stopgain': 'Nonsense_Mutation',
            'stoploss': 'Nonstop_Mutation',
            'nonsynonymous SNV': 'Missense_Mutation',
            'nonframeshift insertion': 'In_Frame_Ins',
            'nonframeshift deletion': 'In_Frame_Del',
            'nonframeshift substitution': 'Missense_Mutation',
        }
        merged_mutations = merged_mutations.replace(to_replace=mutation_equivalents)

    # get total tumors/patients: size of the largest sample-status group.
    # NOTE(review): this assumes tumor samples are the most common status - confirm.
    # .iloc makes the positional lookup explicit (plain [0] on a label index is deprecated).
    total_tumor_patients = sample_status_map.value_counts().iloc[0]

    missense_types = ['In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation']

    # One pass over the table: sort=False keeps genes in order of first appearance.
    rows = []
    for gene, gene_mutated in merged_mutations.groupby('Gene', sort=False):
        fraction_mutated = len(gene_mutated.index.unique()) / total_tumor_patients
        if fraction_mutated <= cutoff:
            continue

        is_missense = gene_mutated['Mutation'].isin(missense_types)

        # samples with at least one non-missense (truncation-type) mutation
        samples_trunc = gene_mutated.loc[~is_missense].index.unique()
        fraction_trunc = len(samples_trunc) / total_tumor_patients

        # samples with missense-type mutations only; samples that also have a
        # truncation are already counted above and are excluded here
        samples_miss = gene_mutated.loc[is_missense].index.unique()
        samples_miss_only = samples_miss.difference(samples_trunc)
        fraction_miss = len(samples_miss_only) / total_tumor_patients

        rows.append((gene, fraction_mutated, fraction_trunc, fraction_miss))

    # create dataframe
    freq_mutated_df = pd.DataFrame(
        rows,
        columns=['Gene', 'Fraction_Mutated', 'Fraction_Truncation', 'Fraction_Missence'])

    return freq_mutated_df

In [4]:
import unittest

class TestGetFreqMut(unittest.TestCase):
    """Test fixture for get_frequently_mutated on the endometrial data set."""

    def setUp(self):
        """Compute the frequently mutated endometrial genes (cutoff 0.3) before each test."""
        # get_frequently_mutated is a module-level function, not an attribute of the
        # TestCase, so it is called directly; the result is kept for test methods to use.
        self.freq_mutated_df = get_frequently_mutated('endometrial', .3)


# Run the test cases from inside the notebook. argv=[''] keeps unittest from parsing
# the Jupyter kernel's own command-line arguments, and exit=False stops it from
# calling sys.exit() when the run finishes, so the kernel stays alive.
unittest.main(argv=[''], verbosity=2, exit=False)

test_colon_input (__main__.Testget_frequently_mutated) ... ERROR
test_wrong_input (__main__.Testget_frequently_mutated) ... 

Please enter a valid cancer type. 
Options: endometrial, ovarian, colon


ok

ERROR: test_colon_input (__main__.Testget_frequently_mutated)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-3-cd61a03f403b>", line 9, in test_colon_input
    self.assertTrue(get_frequently_mutated('colon'))
  File "C:\Users\brittany henderson\Anaconda3\envs\Research\lib\unittest\case.py", line 690, in assertTrue
    if not expr:
  File "C:\Users\brittany henderson\Anaconda3\envs\Research\lib\site-packages\pandas\core\generic.py", line 1478, in __nonzero__
    .format(self.__class__.__name__))
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

----------------------------------------------------------------------
Ran 2 tests in 128.438s

FAILED (errors=1)


<unittest.main.TestProgram at 0x2330e513fd0>