# get_frequently_mutated Final 

In [1]:
# Dataset setup: only the ovarian dataset object is created here; `ov` is the
# object the cells below pass to get_frequently_mutated and query directly.
import cptac
# The endometrial/colon objects and the sync calls are left disabled.
#en = cptac.Endometrial()
#cptac.sync('endometrial', version='2.1')
ov = cptac.Ovarian()
#cptac.sync(dataset='ovarian', version='0.0')
#cptac.sync(dataset='colon', version='0.0')
#co = cptac.Colon()

import pandas as pd
import numpy as np

Loading treatment data...

In [2]:
def get_frequently_mutated(cancer_object, cutoff=0.1):
    """
    Find the genes mutated in more than `cutoff` of the tumor samples.

    Parameters:
    cancer_object (object): cancer class object from the cptac module. It
        must provide get_mutations(), _get_sample_status_map() and
        get_cancer_type().
    cutoff (float): a gene is reported only when the fraction of tumor
        samples carrying a mutation in it is strictly greater than
        this value.

    Returns:
    freq_mutated_df (pd.DataFrame): one row per gene passing the cutoff,
        with the columns Gene, Unique_Samples_Mut, Missence_Mut and
        Truncation_Mut. Every value is a number of unique samples divided
        by the total number of tumor samples. The frame also carries the
        attribute name = 'frequently_mutated'.

    Raises:
    ValueError: if the sample status map contains no 'Tumor' samples, since
        the fractions would be undefined.

    The Missence_Mut column (missense-type mutations; the column keeps its
    historical spelling) includes:
        In_Frame_Del, In_Frame_Ins, Missense_Mutation

    The Truncation_Mut column includes:
        Frame_Shift_Del, Frame_Shift_Ins, Splice_Site,
        Nonsense_Mutation, Nonstop_Mutation

    For the colon dataset the equivalent colon mutation labels are used.

    All three columns count each sample at most once per gene. A sample
    that carries both a missense-type and a truncation-type mutation of a
    gene is counted in both of the last two columns, so their sum may
    exceed Unique_Samples_Mut. Unique_Samples_Mut also counts samples whose
    mutations fall in neither group, so it may exceed that sum as well.
    """
    # Get mutations data frame (whole exome sequencing)
    somatic_mutations = cancer_object.get_mutations()
    sample_status_map = cancer_object._get_sample_status_map()

    # Count tumors directly instead of via value_counts().reset_index():
    # the column names produced by that call differ between pandas versions.
    total_tumor_patients = int((sample_status_map == 'Tumor').sum())
    if total_tumor_patients == 0:
        raise ValueError(
            "No 'Tumor' samples found; cannot compute mutation fractions.")

    # Move 'Sample_ID' from the index to a column so it can be counted.
    origin_df = somatic_mutations.reset_index()

    def fraction_of_tumors(rows, column_name):
        # Unique samples per gene in `rows`, as a fraction of all tumors.
        unique_samples = rows.groupby('Gene')['Sample_ID'].nunique()
        return (unique_samples / total_tumor_patients).rename(column_name)

    # Keep only the genes whose mutated fraction is above the cutoff.
    fraction_mutated = fraction_of_tumors(origin_df, 'Unique_Samples_Mut')
    filtered_gene_df = fraction_mutated[fraction_mutated > cutoff].to_frame()

    # Map each mutation label to missense-type ('M') or truncation-type ('T').
    if cancer_object.get_cancer_type() == 'colon':
        missence_truncation_groups = {
            'frameshift substitution': 'T',
            'frameshift deletion': 'T',
            'frameshift insertion': 'T',
            'stopgain': 'T',
            'stoploss': 'T',
            'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M',
            'nonframeshift deletion': 'M',
            'nonframeshift substitution': 'M',
        }
    else:
        missence_truncation_groups = {
            'In_Frame_Del': 'M',
            'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M',
            'Frame_Shift_Del': 'T',
            'Nonsense_Mutation': 'T',
            'Splice_Site': 'T',
            'Frame_Shift_Ins': 'T',
            'Nonstop_Mutation': 'T',
        }

    # Classify using the Mutation column only, so a Gene or Location value
    # that happens to equal a mutation label is never rewritten. Labels in
    # neither group become NaN and are ignored by the two filters below.
    mutation_class = origin_df['Mutation'].map(missence_truncation_groups)
    missence_fraction = fraction_of_tumors(
        origin_df[mutation_class == 'M'], 'Missence_Mut')
    truncation_fraction = fraction_of_tumors(
        origin_df[mutation_class == 'T'], 'Truncation_Mut')

    # Join each category onto the filtered genes separately so that a gene
    # with only truncation-type mutations keeps its truncation fraction, and
    # fill gaps only after the joins so a gene missing from a category
    # reports 0 rather than NaN.
    freq_mutated_df = filtered_gene_df.join(missence_fraction)
    freq_mutated_df = freq_mutated_df.join(truncation_fraction)
    freq_mutated_df = freq_mutated_df.fillna(
        {'Missence_Mut': 0, 'Truncation_Mut': 0})

    freq_mutated_df = freq_mutated_df.reset_index()
    freq_mutated_df.name = 'frequently_mutated'

    return freq_mutated_df

Test Ovarian

In [3]:
# Frequently mutated genes in the ovarian dataset at a 10% cutoff.
df = get_frequently_mutated(ov, cutoff=0.1)

In [4]:
# Display the table returned by get_frequently_mutated.
df

Unnamed: 0,Gene,Unique_Samples_Mut,Missence_Mut,Truncation_Mut
0,FCGBP,0.108108,0.063063,0.009009
1,MT-CO1,0.117117,0.09009,0.0
2,MT-ND5,0.144144,0.063063,0.018018
3,MUC12,0.108108,0.108108,0.009009
4,MUC16,0.144144,0.108108,0.009009
5,MUC17,0.126126,0.108108,0.0
6,MUC4,0.306306,0.234234,0.009009
7,MUC5B,0.108108,0.081081,0.0
8,NCOR2,0.108108,0.027027,0.0
9,TP53,0.693694,0.45045,0.243243


In [5]:
#test results
m = ov.get_mutations()
gene = 'TP53'
g = m.loc[m['Gene'] == gene]
print(g)
print(len(g.index.unique()))

r = g.groupby('Mutation')

r.groups


           Gene           Mutation        Location
Sample_ID                                         
S002       TP53    Frame_Shift_Del          p.D61*
S006       TP53    Frame_Shift_Del     p.S90Vfs*55
S007       TP53  Missense_Mutation         p.R273C
S009       TP53  Missense_Mutation         p.I195T
S011       TP53    Frame_Shift_Ins    p.C124Lfs*25
S013       TP53        Splice_Site   p.X261_splice
S015       TP53  Missense_Mutation         p.C141Y
S016       TP53    Frame_Shift_Ins     p.T81Nfs*68
S022       TP53  Nonsense_Mutation         p.S183*
S023       TP53        Splice_Site   p.X307_splice
S024       TP53  Missense_Mutation         p.V173M
S026       TP53    Frame_Shift_Del    p.R110Lfs*13
S027       TP53  Missense_Mutation         p.Y205D
S028       TP53    Frame_Shift_Del     p.P75Lfs*48
S029       TP53  Missense_Mutation         p.R282W
S031       TP53  Nonsense_Mutation         p.Q136*
S033       TP53        Splice_Site   p.X187_splice
S035       TP53        Splice_S

{'Frame_Shift_Del': Index(['S002', 'S006', 'S026', 'S028', 'S040', 'S045', 'S052', 'S070', 'S079',
        'S089', 'S091', 'S096', 'S100', 'S107'],
       dtype='object', name='Sample_ID'),
 'Frame_Shift_Ins': Index(['S011', 'S016', 'S038'], dtype='object', name='Sample_ID'),
 'In_Frame_Del': Index(['S037', 'S041', 'S064'], dtype='object', name='Sample_ID'),
 'Missense_Mutation': Index(['S007', 'S009', 'S015', 'S024', 'S027', 'S029', 'S039', 'S043', 'S044',
        'S046', 'S047', 'S049', 'S050', 'S051', 'S053', 'S054', 'S055', 'S056',
        'S057', 'S059', 'S060', 'S061', 'S062', 'S063', 'S065', 'S066', 'S068',
        'S069', 'S071', 'S076', 'S078', 'S084', 'S084', 'S088', 'S090', 'S093',
        'S094', 'S095', 'S097', 'S098', 'S101', 'S102', 'S103', 'S104', 'S106',
        'S106', 'S108', 'S110', 'S111'],
       dtype='object', name='Sample_ID'),
 'Nonsense_Mutation': Index(['S022', 'S031', 'S042', 'S067', 'S080'], dtype='object', name='Sample_ID'),
 'Splice_Site': Index(['S013',

In [15]:
# count wt
omics = ov.get_proteomics()
somatic_mutations = ov.get_mutations()
a = ov.append_mutations_to_omics(mutation_genes = gene, omics_df_name = omics.name, omics_genes = gene)

p53 = a.loc[a['TP53_Mutation_Status'] == 'Wildtype_Tumor']
p53

Unnamed: 0_level_0,TP53_proteomics,TP53_Mutation,TP53_Location,TP53_Mutation_Status,Sample_Status
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S012,-1.01732,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
S019,-1.195933,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
S025,-1.072294,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
S048,-1.429764,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
S074,-0.96324,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
S087,-0.547332,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor


Test Colon

In [None]:
# `cancer` is not defined by any cell above, so load the colon dataset here
# before computing its frequently mutated genes at a 25% cutoff.
co = cptac.Colon()
colon_freq_mutated_df = get_frequently_mutated(co, .25)

In [None]:
# Display the colon result table.
colon_freq_mutated_df

Test Ovarian

In [None]:
# Frequently mutated genes in the ovarian dataset at a 10% cutoff.
ovarian_freq_mutated_df = get_frequently_mutated(ov, cutoff=0.1)

In [None]:
# Display the ovarian result table.
ovarian_freq_mutated_df