# get_frequently_mutated Final 

In [24]:
def _count_unique_samples(mutations):
    """Return how many distinct index labels (samples) appear in `mutations`."""
    return len(mutations.index.unique())


def get_frequently_mutated(cancer_type, cutoff=.1):
    """Load a cptac dataset and find the genes mutated in more than `cutoff` of the tumors.

        Parameters:
        cancer_type (string): "endometrial", "colon" or "ovarian". Matching ignores
            letter case and surrounding whitespace.
        cutoff (float): a gene is kept only when its mutated fraction is strictly
            greater than this value.

        Returns:
        freq_mutated_df (pd.DataFrame): one row per gene passing the cutoff, with the columns
            'Gene', 'Fraction_Mutated', 'Truncated' and 'Missence'.
            If cancer_type is not recognized, a message is printed and 0 is returned instead.

        Every fraction uses the same denominator: the size of the largest group in the
        sample status map (assumed to be the tumor samples - confirm against the dataset).

        Fraction_Mutated: samples with at least one mutation in the gene.
        Missence (column spelling kept for existing callers): for each of In_Frame_Del,
            In_Frame_Ins and Missense_Mutation the samples carrying that type are counted
            separately and the three counts are added, so a sample carrying two of these
            types is counted twice.
        Truncated: samples with at least one mutation in the gene whose type is NOT one of
            the three types above (e.g. Frame_Shift_Del, Frame_Shift_Ins, Splice_Site,
            Nonsense_Mutation, Nonstop_Mutation).
        Because of this, Truncated + Missence may exceed Fraction_Mutated."""

    # validate before importing anything so that a bad argument fails fast and cheaply
    dataset = cancer_type.strip().lower() if isinstance(cancer_type, str) else None
    if dataset not in ("endometrial", "colon", "ovarian"):
        str_cancer_options = '\n' + 'Options: endometrial, ovarian, colon'
        print("Please enter a valid cancer type.", str_cancer_options)
        return 0

    # import pandas and the requested CPTAC dataset module
    import importlib
    import pandas as pd
    cptac = importlib.import_module("cptac." + dataset)

    # get data frames
    somatic_mutations = cptac.get_mutations()
    sample_status_map = cptac.get_sample_status_map()
    merged_mutations = somatic_mutations.join(sample_status_map, how="left")

    # standardize mutation names (only the colon dataset uses the lower-case vocabulary)
    if dataset == "colon":
        mutation_equivalents = {
            'frameshift substitution': 'Frame_Shift_Del',
            'frameshift deletion': 'Frame_Shift_Del',
            'frameshift insertion': 'Frame_Shift_Ins',
            'stopgain': 'Nonsense_Mutation',
            'stoploss': 'Nonstop_Mutation',
            'nonsynonymous SNV': 'Missense_Mutation',
            'nonframeshift insertion': 'In_Frame_Ins',
            'nonframeshift deletion': 'In_Frame_Del',
            'nonframeshift substitution': 'Missense_Mutation',
        }
        merged_mutations = merged_mutations.replace(to_replace=mutation_equivalents)

    # get list of unique genes (order of first appearance decides the output row order)
    unique_genes = somatic_mutations['Gene'].unique()

    # get total tumors/patients: value_counts() sorts by frequency, so position 0 is the
    # largest status group; .iloc makes the positional access explicit
    total_tumor_patients = sample_status_map.value_counts().iloc[0]

    # count, in one pass, the distinct samples in which each gene is mutated
    gene_sample_pairs = pd.DataFrame({
        'Gene': merged_mutations['Gene'].values,
        'Sample': merged_mutations.index.values,
    }).drop_duplicates()
    samples_per_gene = gene_sample_pairs['Gene'].value_counts().to_dict()

    # find frequently mutated genes and the fraction of tumors in which each is mutated
    freq_mut = []
    total_percent_mutated = []
    for gene in unique_genes:
        percentage = samples_per_gene.get(gene, 0) / total_tumor_patients
        if percentage > cutoff:
            freq_mut.append(gene)
            total_percent_mutated.append(percentage)

    # only the frequently mutated genes are needed from here on
    frequent_mutations = merged_mutations.loc[merged_mutations['Gene'].isin(freq_mut)]
    missense_types = ('In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation')

    # find truncated and missence fractions
    truncated = []
    missence = []
    for gene in freq_mut:
        gene_mutated = frequent_mutations.loc[frequent_mutations['Gene'] == gene]
        mutation_type = gene_mutated['Mutation']

        # truncated: samples with any mutation outside the missense group
        trunc = gene_mutated.loc[~mutation_type.isin(missense_types)]
        truncated.append(_count_unique_samples(trunc) / total_tumor_patients)

        # missence: samples are counted once per missense-group type and then summed
        num_gene_mutated = sum(
            _count_unique_samples(gene_mutated.loc[mutation_type == missense_type])
            for missense_type in missense_types
        )
        missence.append(num_gene_mutated / total_tumor_patients)

    # create dataframe
    merged_lists = list(zip(freq_mut, total_percent_mutated, truncated, missence))
    freq_mutated_df = pd.DataFrame(
        merged_lists, columns=['Gene', 'Fraction_Mutated', 'Truncated', 'Missence'])

    return freq_mutated_df

Test Endometrial

In [25]:
# Endometrial dataset with the default cutoff (.1)
endo_freq_mutated_df = get_frequently_mutated("endometrial")

In [26]:
endo_freq_mutated_df

Unnamed: 0,Gene,Fraction_Mutated,Truncated,Missence
0,RPL22,0.168421,0.157895,0.010526
1,SPEN,0.105263,0.052632,0.063158
2,ARID1A,0.452632,0.400000,0.136842
3,FLG,0.136842,0.010526,0.126316
4,KIF26B,0.105263,0.042105,0.073684
5,PXDN,0.126316,0.042105,0.084211
6,ALK,0.105263,0.031579,0.073684
7,DYSF,0.147368,0.042105,0.105263
8,NEB,0.126316,0.042105,0.115789
9,LRP2,0.136842,0.052632,0.115789


Test Colon

In [9]:
# Colon dataset, keeping only genes whose mutated fraction is above .25
colon_freq_mutated_df = get_frequently_mutated("colon", .25)

You have loaded the cptac colon dataset. To view available dataframes,
use cptac.colon.list_data(). To view available functions for accessing
and manipulating the dataframes, use cptac.colon.list_api().
colon data version: Most recent release

Loading cptac colon data:
Loading clinical data...
Loading miRNA data...
Loading mutation data...
Loading mutation_binary data...
Loading phosphoproteomics_normal data...
Loading phosphoproteomics_tumor data...
Loading proteomics_normal data...
Loading proteomics_tumor data...
Loading transcriptomics data...
['Missense_Mutation' 'Frame_Shift_Del' 'Frame_Shift_Ins'
 'Nonsense_Mutation ' 'In_Frame_Ins' 'In_Frame_Del' 'Nonstop_Mutation']
total_tumor_patients 110


In [10]:
colon_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated,Truncated,Missence
0,APC,0.745455,3.136364,0.381818
1,TP53,0.509091,2.145455,4.845455
2,MUC16,0.354545,0.1,0.627273
3,FAT3,0.254545,0.036364,0.381818
4,SYNE1,0.263636,0.281818,1.018182
5,TTN,0.545455,1.409091,8.318182
6,KRAS,0.318182,0.0,0.636364
7,OBSCN,0.263636,0.190909,1.127273
8,CCDC168,0.263636,0.263636,0.336364


Test Ovarian

In [11]:
# Ovarian dataset, keeping only genes whose mutated fraction is above .25
ovarian_freq_mutated_df = get_frequently_mutated("ovarian", .25)

You have loaded the cptac ovarian dataset. To view available
dataframes, use cptac.ovarian.list_data(). To view available functions
for accessing and manipulating the dataframes, use
cptac.ovarian.list_api().
ovarian data version: Most recent release

Loading cptac ovarian data:
Loading clinical data...
Loading cnv data...
Loading phosphoproteomics data...
Loading proteomics data...
Loading somatic_38 data...
Loading transcriptomics data...
Loading treatment data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until June 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.
total_tumor_patients 111


In [12]:
ovarian_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated,Truncated,Missence
0,TTN,0.279279,0.252252,0.468468
1,TP53,0.693694,0.243243,0.468468
2,MUC4,0.306306,0.153153,0.432432
