# get_frequently_mutated Final 

In [82]:
import cptac
# Only the RenalCcrcc dataset is loaded below; the other cptac dataset
# constructors are left commented out so they can be swapped in by hand.
#en = cptac.Endometrial()
#co = cptac.Colon()
#ov = cptac.Ovarian()
rc = cptac.RenalCcrcc()  # cancer object passed to get_frequently_mutated in the cells below

import pandas as pd
import numpy as np  # NOTE(review): not referenced in the cells visible here - confirm it is still needed

                                    

In [None]:
def get_frequently_mutated(cancer_object, cutoff: float = 0.1) -> pd.DataFrame:
    """
    Find the genes that are mutated in more than `cutoff` of the tumor samples.

    Parameters:
    cancer_object (object): cancer type from cptac module. It must provide
        join_omics_to_mutations(), get_mutations() and get_cancer_type().
    cutoff (float): a gene is kept only when the fraction of tumor samples
                    carrying a non-silent mutation in it is strictly
                    greater than this value.
    Returns:
    freq_mutated_df (pd.DataFrame): DataFrame of frequently
        mutated genes passing the cutoff, one row per gene, with columns
        Gene, Unique_Samples_Mut, Missence_Mut and Truncation_Mut. Each of
        the last three is a fraction of the total number of tumor samples.

    The Missence_Mut column (missense-type mutations) includes:
        In_Frame_Del, In_Frame_Ins, Missense_Mutation

    The Truncation_Mut column includes:
        Frame_Shift_Del, Frame_Shift_Ins, Splice_Site,
        Nonsense_Mutation, Nonstop_Mutation

    (The colon dataset uses different mutation names; they are mapped to
    the same two categories below.)

    Every column counts a sample at most once per gene. A sample that
    carries both a missense-type and a truncation-type mutation of the same
    gene is counted in both of the last two columns, so their sum may
    exceed the Unique_Samples_Mut column.

    A warning is printed when the data contains a mutation name that is
    not assigned to either category; such mutations still count towards
    Unique_Samples_Mut but towards neither of the other two columns."""

    # Get total tumors/patients
    omics_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = 'TP53', omics_df_name = 'proteomics', omics_genes = 'TP53')
    total_tumor_samples = int((omics_and_mutations['Sample_Status'] == 'Tumor').sum())

    # Get mutations data frame
    somatic_mutations = cancer_object.get_mutations()

    # Drop silent mutations (a no-op for datasets that do not report them)
    # and move Sample_ID out of the index so unique samples can be counted.
    origin_df = somatic_mutations.loc[somatic_mutations['Mutation'] != 'Silent'].reset_index()

    # Two categories of mutation - 'M': Missence, 'T': Truncation
    if cancer_object.get_cancer_type() == 'colon':
        missence_truncation_groups = {'frameshift substitution': 'T',
            'frameshift deletion': 'T', 'frameshift insertion': 'T',
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M',
            'nonframeshift substitution': 'M'}
    else:
        missence_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T',
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}

    # Classify using the Mutation column only, so that values in the other
    # columns (Gene, Location, Sample_ID) can never be rewritten by accident.
    # Names missing from the mapping become NaN.
    category = origin_df['Mutation'].map(missence_truncation_groups)
    # Check that all mutation names are categorized
    if category.isna().any():
        print('Warning: New mutation name not classified. Counts will be affected.')

    # Find frequently mutated genes (total fraction > cutoff)
    fraction_mutated = _fraction_of_tumors_mutated(
        origin_df, total_tumor_samples, 'Unique_Samples_Mut')
    freq_mutated_df = fraction_mutated.loc[fraction_mutated['Unique_Samples_Mut'] > cutoff]

    # Add the Missence and Truncation columns; genes without any mutation
    # of a category get a fraction of 0.
    for code, column_name in (('M', 'Missence_Mut'), ('T', 'Truncation_Mut')):
        fraction = _fraction_of_tumors_mutated(
            origin_df.loc[category == code], total_tumor_samples, column_name)
        freq_mutated_df = freq_mutated_df.join(fraction, how='left').fillna(0)

    return freq_mutated_df.reset_index() #move genes to their own column


def _fraction_of_tumors_mutated(mutations, total_tumor_samples, column_name):
    """
    Compute, per gene, the fraction of tumor samples carrying a mutation.

    Parameters:
    mutations (pd.DataFrame): one row per mutation, with 'Gene' and
        'Sample_ID' columns.
    total_tumor_samples (int): denominator of the fraction.
    column_name (str): name of the single column of the result.
    Returns:
    pd.DataFrame: indexed by Gene, holding the number of unique samples
        per gene divided by total_tumor_samples.
    """
    # Selecting Sample_ID before nunique() keeps the result independent of
    # whether the installed pandas also reports the grouping column.
    samples_per_gene = mutations.groupby('Gene')['Sample_ID'].nunique()
    return (samples_per_gene / total_tumor_samples).to_frame(column_name)

In [84]:
# Frequently mutated genes of the loaded dataset, using a 10% cutoff
df = get_frequently_mutated(rc, cutoff=0.1)
df

Unnamed: 0,Gene,Unique_Samples_Mut,Missence_Mut,Truncation_Mut
0,BAP1,0.154545,0.063636,0.090909
1,KDM5C,0.172727,0.036364,0.145455
2,PBRM1,0.4,0.072727,0.336364
3,SETD2,0.136364,0.018182,0.118182
4,TTN,0.118182,0.090909,0.036364
5,VHL,0.745455,0.3,0.445455


In [85]:
# Manual spot check: list the non-silent mutations of a single gene so the
# fractions returned by get_frequently_mutated can be verified by hand.
#rc = cptac.RenalCcrcc()
m = rc.get_mutations()

m = m[m['Mutation'].ne('Silent')]
gene = 'BAP1'
g = m[m['Gene'].eq(gene)]
print(g)
# Number of distinct samples carrying a mutation in the gene
print(g.index.unique().size)

# Sample IDs grouped by mutation type
r = g.groupby('Mutation')

r.groups


           Gene           Mutation          Location
Sample_ID                                           
S003       BAP1    Frame_Shift_Del      p.S473Wfs*95
S014       BAP1    Frame_Shift_Del       p.P618Lfs*5
S019       BAP1    Frame_Shift_Del      p.S473Vfs*98
S029       BAP1  Missense_Mutation           p.F170C
S030       BAP1  Missense_Mutation           p.F170L
S034       BAP1    Frame_Shift_Ins            p.D73*
S040       BAP1  Missense_Mutation           p.G185E
S051       BAP1  Missense_Mutation            p.T93A
S054       BAP1  Missense_Mutation            p.W52G
S085       BAP1        Splice_Site     p.X686_splice
S100       BAP1    Frame_Shift_Ins      p.L183Sfs*10
S105       BAP1    Frame_Shift_Del      p.S113Tfs*70
S112       BAP1  Missense_Mutation             p.N2I
S118       BAP1  Nonsense_Mutation           p.K337*
S124       BAP1    Frame_Shift_Ins      p.T254Dfs*30
S126       BAP1  Nonsense_Mutation           p.Q436*
S133       BAP1       In_Frame_Del  p.Q28_V29d

{'Frame_Shift_Del': Index(['S003', 'S014', 'S019', 'S105'], dtype='object', name='Sample_ID'),
 'Frame_Shift_Ins': Index(['S034', 'S100', 'S124'], dtype='object', name='Sample_ID'),
 'In_Frame_Del': Index(['S133'], dtype='object', name='Sample_ID'),
 'Missense_Mutation': Index(['S029', 'S030', 'S040', 'S051', 'S054', 'S112'], dtype='object', name='Sample_ID'),
 'Nonsense_Mutation': Index(['S118', 'S126'], dtype='object', name='Sample_ID'),
 'Splice_Site': Index(['S085'], dtype='object', name='Sample_ID')}