# get_frequently_mutated Final 

In [1]:
import cptac
# Only the ovarian dataset object (`ov`) is created in this cell. The other
# dataset loaders and the sync calls are kept below, commented out.
# NOTE(review): a later cell calls `co.get_mutations()`; `co` is only bound by
# the commented-out `co = cptac.Colon()` line, so that line has to be enabled
# (or `co` defined some other way) before running that cell.
#en = cptac.Endometrial()
#cptac.sync('endometrial', version='2.1')
ov = cptac.Ovarian()
#cptac.sync(dataset='ovarian', version='0.0')
#cptac.sync(dataset='colon', version='0.0')
#co = cptac.Colon()

import pandas as pd
import numpy as np

Loading treatment data...

In [11]:
def _unique_sample_fraction(mutation_rows, total_tumors, column_name):
    """Return, per gene, the fraction of tumors with at least one row.

    Parameters:
    mutation_rows (pd.DataFrame): mutation rows with 'Gene' and
        'Sample_ID' columns.
    total_tumors (int): number of tumor samples used as the denominator.
    column_name (str): name given to the returned Series.

    Returns:
    pd.Series: indexed by gene, named column_name. Each value is the
        number of distinct Sample_ID values for that gene divided by
        total_tumors.
    """
    # Select Sample_ID explicitly so the result does not depend on which
    # columns a DataFrame-wide nunique() would return.
    samples_per_gene = mutation_rows.groupby('Gene')['Sample_ID'].nunique()
    return (samples_per_gene / total_tumors).rename(column_name)


def get_frequently_mutated(cancer_object, cutoff = 0.1):
    """
    Find the genes that are mutated in more than a given fraction of
    the tumor samples of a cancer data set.

    Parameters:
    cancer_object (object): cancer class object from cptac module.
        It must provide get_mutations(), _get_sample_status_map()
        and get_cancer_type().
    cutoff (float): a gene is kept only when the fraction of tumors
        in which it is mutated is strictly greater than this value.

    Returns:
    freq_mutated_df (pd.DataFrame): one row per gene passing the
        cutoff, with the columns Gene, Unique_Samples_Mut,
        Missence_Mut and Truncation_Mut. The frame also carries a
        `name` attribute set to 'frequently_mutated'.

    Raises:
    ValueError: if no sample is labelled 'Tumor'.

    Unique_Samples_Mut is the fraction of tumors with any mutation in
    the gene.

    The Missence_Mut column includes:
        In_Frame_Del, In_Frame_Ins, Missense_Mutation
        (colon: nonsynonymous SNV, nonframeshift insertion,
        nonframeshift deletion, nonframeshift substitution)

    The Truncation_Mut column includes:
        Frame_Shift_Del, Frame_Shift_Ins, Splice_Site,
        Nonsense_Mutation, Nonstop_Mutation
        (colon: frameshift substitution, frameshift deletion,
        frameshift insertion, stopgain, stoploss)

    Each of the three fractions counts a sample at most once per gene.
    A sample holding both a missense type and a truncation type
    mutation of the same gene is counted in both of the last two
    columns, so their sum may exceed Unique_Samples_Mut. Mutation
    types outside the two groups only contribute to
    Unique_Samples_Mut. Genes without any mutation of a group get 0
    in that group's column.
    """

    # Get mutations data frame (whole exome sequencing)
    somatic_mutations = cancer_object.get_mutations()
    sample_status_map = cancer_object._get_sample_status_map()

    # Get total tumors/patients. Count the 'Tumor' labels directly instead
    # of relying on the column names of value_counts().reset_index().
    total_tumor_patients = int((sample_status_map == 'Tumor').sum())
    if total_tumor_patients == 0:
        raise ValueError(
            "No samples are labelled 'Tumor'; cannot compute mutation fractions.")

    # Move 'Sample_ID' from index to column
    origin_df = somatic_mutations.reset_index()

    # Fraction of tumors with any mutation per gene, filtered by the cutoff
    unique_fraction = _unique_sample_fraction(
        origin_df, total_tumor_patients, 'Unique_Samples_Mut')
    filtered_gene_df = unique_fraction[unique_fraction > cutoff].to_frame()

    # Map every mutation type to missense ('M') or truncation ('T')
    if cancer_object.get_cancer_type() == 'colon':
        missence_truncation_groups = {'frameshift substitution': 'T',
            'frameshift deletion': 'T', 'frameshift insertion': 'T',
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M',
            'nonframeshift substitution': 'M'}
    else:
        missence_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T',
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}

    # Translate only the Mutation column, so that values in other columns
    # can never be rewritten; unlisted mutation types become NaN and fall
    # into neither group.
    mutation_group = origin_df['Mutation'].map(missence_truncation_groups)

    missence_fraction = _unique_sample_fraction(
        origin_df.loc[mutation_group == 'M'], total_tumor_patients, 'Missence_Mut')
    truncation_fraction = _unique_sample_fraction(
        origin_df.loc[mutation_group == 'T'], total_tumor_patients, 'Truncation_Mut')

    # Join each group onto the genes that passed the cutoff. Joining them
    # separately keeps the truncation fraction of genes that have no
    # missense type mutation; a missing group is reported as 0.
    freq_mutated_df = (filtered_gene_df
                       .join(missence_fraction)
                       .join(truncation_fraction)
                       .fillna(0)
                       .reset_index())
    freq_mutated_df.name = 'frequently_mutated'

    return freq_mutated_df

Test Ovarian

In [12]:
# Run the function on the ovarian dataset object (`ov`) with a cutoff of 0.1.
df = get_frequently_mutated(ov, .1)

In [13]:
df

Unnamed: 0,Gene,Unique_Samples_Mut,Missence_Mut,Truncation_Mut
0,FCGBP,0.108108,0.063063,0.009009
1,MT-CO1,0.117117,0.09009,0.0
2,MT-ND5,0.144144,0.063063,0.018018
3,MUC12,0.108108,0.108108,0.009009
4,MUC16,0.144144,0.108108,0.009009
5,MUC17,0.126126,0.108108,0.0
6,MUC4,0.306306,0.234234,0.009009
7,MUC5B,0.108108,0.081081,0.0
8,NCOR2,0.108108,0.027027,0.0
9,TP53,0.693694,0.45045,0.243243


In [77]:
#test results
# Manual cross-check: list the raw mutation rows of a single gene so they can
# be compared with the fractions returned by get_frequently_mutated.
# NOTE(review): `co` is only bound by a commented-out line in the first cell
# (`#co = cptac.Colon()`); it must be defined before this cell can run.
m = co.get_mutations()
gene = 'MUC16'
# keep only the rows of the chosen gene
g = m.loc[m['Gene'] == gene]
print(g)
# number of distinct index labels (Sample_ID values) with a mutation in the gene
print(len(g.index.unique()))

# group the gene's rows by mutation type
r = g.groupby('Mutation')

# mapping of mutation type -> index labels of the rows in that group
r.groups


            Gene              Mutation      Location
Sample_ID                                           
S001       MUC16     nonsynonymous SNV       G11040R
S006       MUC16     nonsynonymous SNV        S2024N
S006       MUC16     nonsynonymous SNV        S7242Y
S006       MUC16     nonsynonymous SNV        G7724D
S007       MUC16     nonsynonymous SNV       Y14408C
S009       MUC16     nonsynonymous SNV         H380N
S011       MUC16     nonsynonymous SNV       G13513C
S011       MUC16     nonsynonymous SNV        T5291A
S013       MUC16     nonsynonymous SNV        M4636I
S015       MUC16     nonsynonymous SNV       H11241R
S015       MUC16     nonsynonymous SNV        Q3855K
S018       MUC16     nonsynonymous SNV        V9463F
S019       MUC16     nonsynonymous SNV        Q3855H
S027       MUC16     nonsynonymous SNV       Y13937C
S027       MUC16     nonsynonymous SNV       G10078D
S027       MUC16     nonsynonymous SNV        Q4358R
S029       MUC16     nonsynonymous SNV       F

{'frameshift deletion': Index(['S052', 'S055', 'S062', 'S101'], dtype='object', name='Sample_ID'),
 'frameshift insertion': Index(['S039', 'S082'], dtype='object', name='Sample_ID'),
 'nonsynonymous SNV': Index(['S001', 'S006', 'S006', 'S006', 'S007', 'S009', 'S011', 'S011', 'S013',
        'S015', 'S015', 'S018', 'S019', 'S027', 'S027', 'S027', 'S029', 'S030',
        'S031', 'S032', 'S033', 'S033', 'S035', 'S039', 'S043', 'S046', 'S049',
        'S049', 'S059', 'S065', 'S074', 'S074', 'S074', 'S074', 'S078', 'S080',
        'S080', 'S083', 'S084', 'S084', 'S084', 'S084', 'S084', 'S084', 'S084',
        'S084', 'S084', 'S084', 'S084', 'S084', 'S084', 'S084', 'S084', 'S084',
        'S084', 'S084', 'S084', 'S084', 'S084', 'S084', 'S091', 'S094', 'S101',
        'S101', 'S102', 'S104', 'S106', 'S108', 'S110'],
       dtype='object', name='Sample_ID'),
 'stopgain': Index(['S084', 'S084', 'S084', 'S084', 'S084'], dtype='object', name='Sample_ID')}

Test Colon

In [None]:
# NOTE(review): `cancer` is not defined in any visible cell; presumably this
# should be a colon dataset object (e.g. `co`) — confirm before running.
colon_freq_mutated_df = get_frequently_mutated(cancer, .25)

In [None]:
colon_freq_mutated_df

Test Ovarian

In [None]:
# Frequently mutated genes of the ovarian dataset object (`ov`), cutoff 0.1.
ovarian_freq_mutated_df = get_frequently_mutated(ov,.1)

In [None]:
ovarian_freq_mutated_df