# get_frequently_mutated cptac 6.0

In [1]:
import cptac
import pandas as pd
import numpy as np

In [20]:
  """
    Takes a cancer object and find the frequently 
    mutated genes (in the tumor samples) compared to the cutoff.
    Parameters:
    cancer_object (object): cancer type from cptac module 
    cutoff (float): used as a comparison to determine the 
                    status of gene mutation frequency
    Returns:
    freq_mutated_df (pd.DataFrame): DataFrame of frequently 
        mutated genes passing the cutoff. Columns contain the 
        fractions of total unique mutations,missense type 
        mutations, and truncation type mutations per gene.
    
    The Missense_Mut column includes: 
        In_Frame_Del, In_Frame_Ins, Missense_Mutation
   
    The Truncation_Mut column includes: 
        Frame_Shift_Del, Frame_Shift_Ins, Splice_Site, 
        Nonsense_Mutation, Nonstop_Mutation
        
    These columns count multiple mutations of one gene in the 
    same sample, so fractions in the last two columns may 
    exceed the Unique_Samples_Mut column which only counts if 
    the gene was mutated once per sample."""   

def get_frequently_mutated(cancer_object, cutoff = 0.1):  
   
    
    # Get total tumors/patients
    omics_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = 'TP53', omics_df_name = 'proteomics', omics_genes = 'TP53')
    tumors = omics_and_mutations.Sample_Status
    v = tumors.value_counts()
    total_tumors = v['Tumor']
    total_tumor_count = int(total_tumors)
    
    # Get mutations data frame
    somatic_mutations = cancer_object.get_somatic_mutation() 

    # Drop silent mutations for Ovarian and RenalCcrcc dataset 
    if 'Silent' in somatic_mutations['Mutation'].unique():
        origin_df = somatic_mutations.loc[somatic_mutations['Mutation'] != 'Silent'].reset_index()
    else:
        origin_df = somatic_mutations.reset_index() #prepare to count unique samples
        
    # Create two categories in Mutation column - 'M': Missense, 'T': Truncation
    if cancer_object.get_cancer_type() == 'colon':
        missense_truncation_groups = {'frameshift substitution': 'T', 
            'frameshift deletion': 'T', 'frameshift insertion': 'T', 
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M', 
            'nonframeshift substitution': 'M'}
    else: 
        missense_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T', 
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}
    
    mutations_replaced_M_T = origin_df.replace(missense_truncation_groups)
    
    # replace non_coding mutations for Gbm
    unique_mutations = len(mutations_replaced_M_T['Mutation'].unique())
    if cancer_object.get_cancer_type() == 'gbm':
        gbm = True
        non_coding = {'Intron': 'NC', 'RNA': 'NC', "5'Flank": 'NC', "3'Flank": 'NC', 
            "5'UTR": 'NC', "3'UTR": 'NC', 'Splice_Region' : 'NC'}
        mutations_replaced_M_T = mutations_replaced_M_T.replace(non_coding)
        unique_mutations_2 = len(mutations_replaced_M_T['Mutation'].unique())
        
    elif unique_mutations != 2: # Check that all mutation names are catagorized
        print('Warning: New mutation name not classified. Counts will be affected.')
    
    # Find frequently mutated genes (total fraction > cutoff)
    # Same steps will be repeated for finding the missense and truncation mutation frequencies
    # Step 1 - group by gene and count unique samples
    # Step 2 - format
    # Step 3 - filter using the cutoff and create fraction 
    count_mutations = origin_df.groupby(['Gene']).nunique()
    count_mutations = count_mutations.rename(columns={"Patient_ID": "Unique_Samples_Mut"}) # Step 2 
    count_mutations = count_mutations.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    fraction_mutated = count_mutations.apply(lambda x: x / total_tumor_count) # Step 3 
    fraction_greater_than_cutoff = fraction_mutated.where(lambda x: x > cutoff) #na used when not > cutoff
    filtered_gene_df = fraction_greater_than_cutoff.dropna() # drop genes below cutoff
    
    # Create and join Missense column (following similar steps as seen above)
    miss = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'M']
    count_miss = miss.groupby(['Gene']).nunique()
    missense_df = count_miss.rename(columns={"Patient_ID": "Missense_Mut"})
    missense_df = missense_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    fraction_missense = missense_df.apply(lambda x: x / total_tumor_count)
    freq_mutated_df = filtered_gene_df.join(fraction_missense, how='left').fillna(0)
    
    # Create and join Truncation column (following similar steps as seen above)
    trunc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'T']
    count_trunc = trunc.groupby(['Gene']).nunique()
    truncation_df = count_trunc.rename(columns={"Patient_ID": "Truncation_Mut"})
    truncation_df = truncation_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    fraction_truncation = truncation_df.apply(lambda x: x / total_tumor_count)
    freq_mutated_df = freq_mutated_df.join(fraction_truncation, how='left').fillna(0)
    freq_mutated_df = freq_mutated_df.reset_index() #move genes to their own column
    
    if gbm == True:
        # Create and join non-coding column (following similar steps as seen above)
        nc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'NC']
        count_nc = nc.groupby(['Gene']).nunique()
        nc_df = count_nc.rename(columns={"Patient_ID": "Non-Coding"})
        nc_df = nc_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
        fraction_nc = nc_df.apply(lambda x: x / total_tumor_count)
        freq_mutated_df = freq_mutated_df.join(fraction_nc, how='left').fillna(0)
    
    return freq_mutated_df

In [23]:
sm = b.get_somatic_mutation()
sm['Mutation'].unique()
#sm.loc[sm['Gene'] == 'MUC16']

array(['Missense_Mutation', 'Silent', 'In_Frame_Del', 'Splice_Site',
       'Nonsense_Mutation', 'Intron', 'Frame_Shift_Del', 'RNA',
       'Frame_Shift_Ins', "3'Flank", 'In_Frame_Ins', 'Splice_Region',
       "5'UTR", "5'Flank", "3'UTR", 'Nonstop_Mutation'], dtype=object)

In [5]:
#en = cptac.Endometrial()
#co = cptac.Colon()
#ov = cptac.Ovarian()
#rc = cptac.Ccrcc()

b = cptac.Gbm()

                                    

In [19]:
b.get_cancer_type() == 'gbm'

True

In [21]:
df = get_frequently_mutated(b,.05)
df



Name,Gene,Unique_Samples_Mut,Missense_Mut,Truncation_Mut,Non-Coding
0,ARHGAP5,0.07,0.05,0.04,0.0
1,ATRX,0.09,0.02,0.07,0.0
2,BRAF,0.06,0.06,0.0,0.0
3,DOCK6,0.06,0.05,0.01,0.0
4,EGFR,0.16,0.16,0.0,0.0
5,FLG,0.06,0.06,0.0,0.0
6,HCN1,0.07,0.06,0.01,0.0
7,HMCN1,0.07,0.07,0.0,0.0
8,IDH1,0.07,0.07,0.0,0.0
9,MUC16,0.08,0.08,0.0,0.0
