# get_frequently_mutated Final 

In [2]:
# Notebook setup: load the CPTAC endometrial dataset into `en`.
import cptac
en = cptac.Endometrial()
# NOTE(review): the two loaders below are disabled, but later cells in this
# notebook reference `ov` and `cancer`; re-enable the matching line before
# running those cells.
#ov = cptac.Ovarian()
#cancer = cptac.Colon()

import pandas as pd
import numpy as np

2.1 not installed. To install, run 'cptac.sync(dataset='endometrial', version='2.1')'.


In [27]:
def get_frequently_mutated(cancer_type, cutoff=0.1):
    """
    Find the genes mutated in more than `cutoff` of the tumor samples.

    Parameters:
    cancer_type (object): cancer class object from the cptac module. It must
                    provide get_mutations(), whose result has 'Gene' and
                    'Mutation' columns and becomes a 'Sample_ID' column
                    after reset_index(), and _get_sample_status_map(),
                    a Series whose values label each sample (tumor
                    samples are labelled 'Tumor').
    cutoff (float): a gene is kept only when the fraction of tumor samples
                    carrying a mutation in it is strictly greater than
                    this value.

    Returns:
    freq_mutated_df (pd.DataFrame): one row per gene passing the cutoff,
                with the columns Gene, Unique_Samples_Mut, Missence_Mut
                and Truncation_Mut. Every value is a count of unique
                samples divided by the total number of tumor samples.

    Raises:
    ValueError: if the sample status map contains no 'Tumor' sample.

    The Missence_Mut column includes:
        In_Frame_Del, In_Frame_Ins, Missense_Mutation

    The Truncation_Mut column includes:
        Frame_Shift_Del, Frame_Shift_Ins, Splice_Site,
        Nonsense_Mutation, Nonstop_Mutation

    The annotation-style equivalents (e.g. 'nonsynonymous SNV', 'stopgain')
    are recognised as well, so data using that naming is categorised too.

    Each category column counts a sample once per gene, however many
    mutations of that category the sample carries, so neither column can
    exceed Unique_Samples_Mut. A sample carrying both a missense and a
    truncating mutation in the same gene is counted in both columns, so
    their sum may exceed Unique_Samples_Mut. Genes without any mutation of
    a category get 0 in that column.
    """
    somatic_mutations = cancer_type.get_mutations()
    sample_status_map = cancer_type._get_sample_status_map()

    # Count the tumor samples directly instead of going through
    # value_counts().reset_index(), whose column names differ between
    # pandas versions.
    total_tumor_patients = int((sample_status_map == 'Tumor').sum())
    if total_tumor_patients == 0:
        raise ValueError("The sample status map contains no 'Tumor' samples.")
    print(total_tumor_patients)

    # Move 'Sample_ID' from the index to a column so it can be counted.
    origin_df = somatic_mutations.reset_index()

    def fraction_of_tumors(rows, column_name):
        # Unique samples per gene among `rows`, as a fraction of all tumors.
        # Selecting 'Sample_ID' explicitly avoids depending on which other
        # columns groupby().nunique() happens to return.
        samples_per_gene = rows.groupby('Gene')['Sample_ID'].nunique()
        return (samples_per_gene / total_tumor_patients).rename(column_name)

    # Keep only the genes whose mutated-sample fraction passes the cutoff.
    unique_fraction = fraction_of_tumors(origin_df, 'Unique_Samples_Mut')
    filtered_gene_df = unique_fraction[unique_fraction > cutoff].to_frame()

    # Mutation categories. The second line of each list holds the
    # annotation-style names that were previously parked in a disabled
    # Colon-only branch.
    missence_types = [
        'In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation',
        'nonsynonymous SNV', 'nonframeshift insertion',
        'nonframeshift deletion', 'nonframeshift substitution',
    ]
    truncation_types = [
        'Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site',
        'Nonsense_Mutation', 'Nonstop_Mutation',
        'frameshift substitution', 'frameshift deletion',
        'frameshift insertion', 'stopgain', 'stoploss',
    ]

    # Classify on the 'Mutation' column only; a frame-wide replace() would
    # also rewrite matching values in unrelated columns.
    is_missence = origin_df['Mutation'].isin(missence_types)
    is_truncation = origin_df['Mutation'].isin(truncation_types)
    missence_fraction = fraction_of_tumors(origin_df[is_missence], 'Missence_Mut')
    truncation_fraction = fraction_of_tumors(origin_df[is_truncation], 'Truncation_Mut')

    # Join each category separately onto the filtered genes so a gene with
    # only truncating mutations keeps its truncation fraction, then turn
    # the remaining gaps (no mutation of that category) into 0.
    freq_mutated_df = (
        filtered_gene_df
        .join(missence_fraction)
        .join(truncation_fraction)
        .fillna(0)
        .reset_index()
    )
    freq_mutated_df.name = 'frequently_mutated'

    return freq_mutated_df

Test Endometrial

In [28]:
# Endometrial run with a 15% cutoff; the call also prints the tumor sample
# count it divides by.
df = get_frequently_mutated(en, .15)

95


In [29]:
df

Unnamed: 0,Gene,Unique_Samples_Mut,Missence_Mut,Truncation_Mut
0,AHNAK,0.157895,0.126316,0.042105
1,ARID1A,0.452632,0.136842,0.4
2,CCDC168,0.168421,0.115789,0.115789
3,CTCF,0.284211,0.094737,0.242105
4,CTNNB1,0.305263,0.305263,0.0
5,DNAH17,0.157895,0.136842,0.031579
6,DOCK3,0.2,0.115789,0.157895
7,FBXW7,0.189474,0.157895,0.042105
8,HERC2,0.178947,0.157895,0.063158
9,HUWE1,0.157895,0.157895,0.031579


In [None]:
#test results
# Manual spot check of one gene against the table produced above.
m = en.get_mutations()
gene = 'DNAH17'
# Rows of the mutation table that belong to the chosen gene.
g = m.loc[m['Gene'] == gene]
# Number of distinct index labels among those rows (presumably sample IDs,
# TODO confirm); dividing it by the printed tumor count should reproduce the
# gene's Unique_Samples_Mut value.
print(len(g.index.unique()))

# Group the gene's rows by mutation type.
r = g.groupby('Mutation')

# Displays, for each mutation type, the index labels of the rows in that group.
r.groups


Test Colon

In [None]:
# NOTE(review): `cancer` is only assigned in a commented-out line of the setup
# cell (`#cancer = cptac.Colon()`); enable it before running this cell.
colon_freq_mutated_df = get_frequently_mutated(cancer, .15)

In [None]:
colon_freq_mutated_df

Test Ovarian

In [32]:
# NOTE(review): `ov` is only assigned in a commented-out line of the setup
# cell (`#ov = cptac.Ovarian()`); enable it before running this cell.
ovarian_freq_mutated_df = get_frequently_mutated(ov,.1)

111


In [33]:
ovarian_freq_mutated_df

Unnamed: 0,Gene,Unique_Samples_Mut,Missence_Mut,Truncation_Mut
0,FCGBP,0.108108,0.063063,0.009009
1,MT-CO1,0.117117,0.09009,0.0
2,MT-ND5,0.144144,0.063063,0.018018
3,MUC12,0.108108,0.108108,0.009009
4,MUC16,0.144144,0.108108,0.009009
5,MUC17,0.126126,0.108108,0.0
6,MUC4,0.306306,0.234234,0.009009
7,MUC5B,0.108108,0.081081,0.0
8,NCOR2,0.108108,0.027027,0.0
9,TP53,0.693694,0.45045,0.243243
