# get_frequently_mutated Final 

In [49]:
import cptac
#en = cptac.Endometrial()
#cptac.sync('endometrial', version='2.1')
cancer_object = cptac.Colon()
#cptac.sync(dataset='ovarian', version='0.0')
#cptac.sync(dataset='colon', version='0.0')
#ov = cptac.Ovarian()

import pandas as pd
import numpy as np

[Kmatting dataframes...data.....ata....

In [46]:
def get_frequently_mutated(cancer_object, cutoff = 0.1):  
    """
    take cancer object and find the frequently 
    mutated genes in the total tumors compared to the cutoff.

    Parameters:
    cancer_object (object): cancer class object from cptac module 
    cutoff (float): used as comparison to determine the 
                    status of gene mutation frequency

    Returns:
    freq_mutated_df (pd.DataFrame): DataFrame of frequently 
        mutated genes passing the cutoff. Columns contain the 
        fractions of total unique mutations,missence type 
        mutations, and truncation type mutations per gene.
    
    The Missence_Mut column includes: 
        In_Frame_Del, In_Frame_Ins, Missense_Mutation
   
   The Truncation_Mut column includes: 
        Frame_Shift_Del, Frame_Shift_Ins, Splice_Site, 
        Nonsense_Mutation, Nonstop_Mutation
        
    These columns count multiple mutations of one gene in the 
    same sample, so fractions in the last two columns may 
    exceed the Unique_Samples_Mut column which only counts if 
    the gene was mutated once per sample."""    
    
    # Step 1 - Get data frames and total tumor count
    somatic_mutations = cancer_object.get_mutations() 
    maps = cancer_object._get_sample_status_map()
    t = somatic_mutations.join(maps, how= 'left')

    omics_and_mutations = cancer_object.append_mutations_to_omics(
        mutation_genes = 'TP53', omics_df_name = 'proteomics', omics_genes = 'TP53')
    tumors = omics_and_mutations.loc[omics_and_mutations['Sample_Status'] == 'Tumor']
    total_tumor_patients = len(tumors)
    print(total_tumor_patients)
    
    # Step 2 - Find frequently mutated genes and their fraction of unique mutated samples.

    # Drop silent mutations for Ovarian dataset
    if cancer_object.get_cancer_type() == 'ovarian':
        origin_df = origin_df.loc[origin_df['Mutation'] != 'Silent']
        
    #group by gene and count unique samples
    origin_df = somatic_mutations.reset_index() #move 'Sample_ID' in order to count sample labels
    count_mutations = origin_df.groupby(['Gene']).nunique()
    #format
    count_mutations = count_mutations.rename(columns={"Sample_ID": "Unique_Samples_Mut"})
    count_mutations = count_mutations.drop(['Gene', 'Mutation','Location'], axis = 1)
    #filter using the cutoff and create fraction
    fraction_mutated = count_mutations.apply(lambda x: x / total_tumor_patients)
    fraction_greater_than_cutoff = fraction_mutated.where(lambda x: x > cutoff) #na used when false
    filtered_gene_df = fraction_greater_than_cutoff.dropna()
    freq_mut_gene_list = list(filtered_gene_df.loc[filtered_gene_df['Gene']])
    '''
    # Step 3 - Create Missence and Trucation data frame
    if cancer_object.get_cancer_type() == 'colon':
        missence_truncation_groups = {'frameshift substitution': 'T', 
            'frameshift deletion': 'T', 'frameshift insertion': 'T', 
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M', 
            'nonframeshift substitution': 'M'}
    else: 
        missence_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T', 
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}
    mutations_replaced_M_T = origin_df.replace(missence_truncation_groups)
    
    # group mutation categories
    miss = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'M']
    trunc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'T']
    # group by gene and count unique samples for both categories
    count_miss = miss.groupby(['Gene']).nunique()
    count_trunc = trunc.groupby(['Gene']).nunique()
    #format
    missence_df = count_miss.rename(columns={"Sample_ID": "Missence_Mut"})
    missence_df = missence_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
    truncation_df = count_trunc.rename(columns={"Sample_ID": "Truncation_Mut"})
    truncation_df = truncation_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)

    # Step 4 - Combine the dataframes 
    #join miss and trunc, change nan to 0, then divide by total tumors
    join_mutations = missence_df.join(truncation_df).fillna(0)
    missence_and_truncation_df = join_mutations.apply(lambda x: x / total_tumor_patients)
    #Join data frames, keeping only the genes that passed the cutoff 
    freq_mutated_df = filtered_gene_df.join(missence_and_truncation_df).reset_index()
    freq_mutated_df.name = 'frequently_mutated'
    '''
    return freq_mut_gene_list

In [20]:
somatic_mutations = co.get_mutations() 
maps = co._get_sample_status_map()
t = somatic_mutations.join(maps, how= 'left')
g = t.loc[t['Gene'] == 'TP53']
wt = g.loc[g['Sample_Status'] == 'Tumor']
len(wt.index.unique())

56

In [57]:
cutoff = .2
somatic_mutations = cancer_object.get_mutations() 
maps = cancer_object._get_sample_status_map()
t = somatic_mutations.join(maps, how= 'left')

omics_and_mutations = cancer_object.append_mutations_to_omics(
    mutation_genes = 'TP53', omics_df_name = 'proteomics', omics_genes = 'TP53')
tumors = omics_and_mutations.loc[omics_and_mutations['Sample_Status'] == 'Tumor']
total_tumor_patients = len(tumors)
print(total_tumor_patients)

# Step 2 - Find frequently mutated genes and their fraction of unique mutated samples.

# Drop silent mutations for Ovarian dataset
if cancer_object.get_cancer_type() == 'ovarian':
    origin_df = origin_df.loc[origin_df['Mutation'] != 'Silent']

#group by gene and count unique samples
origin_df = somatic_mutations.reset_index() #move 'Sample_ID' in order to count sample labels
count_mutations = origin_df.groupby(['Gene']).nunique()
#format
count_mutations = count_mutations.rename(columns={"Sample_ID": "Unique_Samples_Mut"})
count_mutations = count_mutations.drop(['Gene', 'Mutation','Location'], axis = 1)
#filter using the cutoff and create fraction
fraction_mutated = count_mutations.apply(lambda x: x / total_tumor_patients)
fraction_greater_than_cutoff = fraction_mutated.where(lambda x: x > cutoff) #na used when false
filtered_gene_df = fraction_greater_than_cutoff.dropna()

filtered_gene_df = filtered_gene_df.reset_index()
freq_mut_gene_list = filtered_gene_df.loc[filtered_gene_df['Gene']]

97


KeyError: "None of [Index(['ACVR2A', 'AHNAK2', 'APC', 'ARID1A', 'CCDC168', 'COL5A1', 'CSMD3',\n       'DNAH5', 'DOCK3', 'FAT3', 'FAT4', 'FSIP2', 'HERC2', 'HMCN1', 'KMT2C',\n       'KRAS', 'MUC16', 'MUC4', 'MUC5B', 'MYCBP2', 'NBEA', 'NCOR2', 'NRXN1',\n       'OBSCN', 'PCLO', 'PIK3CA', 'PLEC', 'RYR1', 'RYR2', 'SACS', 'SLC4A3',\n       'SPEG', 'SSPO', 'SYNE1', 'TCF7L2', 'TP53', 'TTN', 'USF3', 'ZNF469'],\n      dtype='object')] are in the [index]"

In [None]:
# Step 3 - Create Missence and Trucation data frame
if cancer_object.get_cancer_type() == 'colon':
    missence_truncation_groups = {'frameshift substitution': 'T', 
        'frameshift deletion': 'T', 'frameshift insertion': 'T', 
        'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
        'nonframeshift insertion': 'M','nonframeshift deletion': 'M', 
        'nonframeshift substitution': 'M'}
else: 
    missence_truncation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
        'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T', 
        'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}
mutations_replaced_M_T = origin_df.replace(missence_truncation_groups)

# group mutation categories
miss = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'M']
trunc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'T']
# group by gene and count unique samples for both categories
count_miss = miss.groupby(['Gene']).nunique()
count_trunc = trunc.groupby(['Gene']).nunique()
#format
missence_df = count_miss.rename(columns={"Sample_ID": "Missence_Mut"})
missence_df = missence_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)
truncation_df = count_trunc.rename(columns={"Sample_ID": "Truncation_Mut"})
truncation_df = truncation_df.drop(['Gene', 'Mutation', 'Location'], axis = 1)

# Step 4 - Combine the dataframes 
#join miss and trunc, change nan to 0, then divide by total tumors
join_mutations = missence_df.join(truncation_df).fillna(0)
missence_and_truncation_df = join_mutations.apply(lambda x: x / total_tumor_patients)
#Join data frames, keeping only the genes that passed the cutoff 
freq_mutated_df = filtered_gene_df.join(missence_and_truncation_df).reset_index()
freq_mutated_df.name = 'frequently_mutated'

In [47]:
df = get_frequently_mutated(co, .15)

97


KeyError: 'Gene'

{'A1BG': Int64Index([464, 21891, 38480, 90921, 118020], dtype='int64'),
 'A1CF': Int64Index([55281, 55288, 55290, 55291, 55298, 55303], dtype='int64'),
 'A2M': Int64Index([  3512,   3514,   3515,   3521,  17912,  17920,  17933,  17935,
              69994,  69997,  69998,  69999,  86773,  86796,  86815,  86816,
              95297,  95302, 117845, 117997, 118024, 118026, 118275, 118298,
             118326, 118474, 118475, 118483, 118493, 118500, 123627, 123705,
             123828, 123835],
            dtype='int64'),
 'A2ML1': Int64Index([53830, 53844, 77146, 77149, 83582, 90917, 115984], dtype='int64'),
 'A3GALT2': Int64Index([17880, 41785, 119950, 130795], dtype='int64'),
 'A4GALT': Int64Index([36952, 36954], dtype='int64'),
 'A4GNT': Int64Index([115897, 116417, 119226], dtype='int64'),
 'AACS': Int64Index([3471, 3473, 3486, 48412, 48413, 48417, 125504, 125532, 125533], dtype='int64'),
 'AADAC': Int64Index([8289], dtype='int64'),
 'AADACL4': Int64Index([8287, 10157], dtype='int64')

In [18]:
omics_and_mutations = co.append_mutations_to_omics(
        mutation_genes = 'TP53', omics_df_name = 'proteomics', omics_genes = 'TP53')
tumors = omics_and_mutations.loc[omics_and_mutations['Sample_Status'] == 'Tumor']
mut = tumors.loc[tumors['TP53_Mutation_Status'] != 'Wildtype_Tumor']
len(mut)

53

In [34]:
#df.loc[df['Sample_Status']]
df.loc[df['Gene'] == 'TP53']

missence_truncation_groups = {'frameshift substitution': 'T', 
            'frameshift deletion': 'T', 'frameshift insertion': 'T', 
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M', 
            'nonframeshift substitution': 'M'}

mutations_replaced_M_T = df.replace(missence_truncation_groups)
    
# group mutation categories
miss = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'M']
trunc = mutations_replaced_M_T.loc[mutations_replaced_M_T['Mutation'] == 'T']

r = miss.reset_index()
r = r.loc[r['Gene'] == 'TP53']
r['Sample_ID'].unique()

count_miss = r.groupby(['Gene']).nunique()
count_miss

Unnamed: 0_level_0,Sample_ID,Gene,Mutation,Location,Sample_Status
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TP53,38,1,1,102,1


In [6]:
co = cptac.Colon()

[Kmatting dataframes...data.....ata....

In [7]:
colon_freq_mutated_df = get_frequently_mutated(co, .15)

97


In [8]:
colon_freq_mutated_df

Unnamed: 0_level_0,Gene,Mutation,Location,Sample_Status
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S001,SCN2A,nonsynonymous SNV,E999K,Tumor
S001,AKAP13,nonsynonymous SNV,R572Q,Tumor
S001,AKAP13,nonsynonymous SNV,R572Q,Tumor
S001,TSSC4,nonsynonymous SNV,P151L,Tumor
S001,TSSC4,nonsynonymous SNV,P151L,Tumor
S001,TSSC4,nonsynonymous SNV,P151L,Tumor
S001,TSSC4,nonsynonymous SNV,P151L,Tumor
S001,MRC2,nonsynonymous SNV,P1117L,Tumor
S001,NUTM2A,frameshift substitution,T266Hfs*4,Tumor
S001,LRGUK,nonsynonymous SNV,D411G,Tumor


In [9]:
#test results
m = co.get_mutations()
gene = 'CASP5'
g = m.loc[m['Gene'] == gene]
print(g)
print(len(g.index.unique()))

r = g.groupby('Mutation')

r.groups


            Gene                 Mutation    Location
Sample_ID                                            
S006       CASP5  frameshift substitution     K9Nfs*2
S006       CASP5  frameshift substitution    K80Nfs*2
S006       CASP5  frameshift substitution    K67Nfs*2
S009       CASP5  frameshift substitution    K67Nfs*2
S009       CASP5  frameshift substitution   T68Qfs*25
S009       CASP5  frameshift substitution   T10Qfs*25
S009       CASP5  frameshift substitution   T81Qfs*25
S009       CASP5  frameshift substitution    K80Nfs*2
S009       CASP5  frameshift substitution     K9Nfs*2
S014       CASP5  frameshift substitution   T10Qfs*25
S014       CASP5  frameshift substitution   T68Qfs*25
S014       CASP5  frameshift substitution   T81Qfs*25
S018       CASP5  frameshift substitution    K67Nfs*2
S018       CASP5  frameshift substitution   T68Qfs*25
S018       CASP5  frameshift substitution   T81Qfs*25
S018       CASP5  frameshift substitution   T10Qfs*25
S018       CASP5  frameshift

{'frameshift deletion': Index(['S055', 'S055', 'S055', 'S055', 'S074', 'S074', 'S074', 'S080', 'S080',
        'S080', 'S081', 'S081'],
       dtype='object', name='Sample_ID'),
 'frameshift insertion': Index(['S031', 'S031', 'S031', 'S031'], dtype='object', name='Sample_ID'),
 'frameshift substitution': Index(['S006', 'S006', 'S006', 'S009', 'S009', 'S009', 'S009', 'S009', 'S009',
        'S014', 'S014', 'S014', 'S018', 'S018', 'S018', 'S018', 'S018', 'S018',
        'S023', 'S023', 'S023', 'S023', 'S023', 'S023', 'S023', 'S023', 'S028',
        'S028', 'S028', 'S028', 'S030', 'S030', 'S030', 'S033', 'S033', 'S033',
        'S033', 'S035', 'S035', 'S035', 'S035', 'S035', 'S052', 'S052', 'S052',
        'S052', 'S052', 'S052', 'S052', 'S052', 'S052', 'S052', 'S055', 'S055',
        'S055', 'S055', 'S064', 'S064', 'S064', 'S065', 'S065', 'S065', 'S065',
        'S065', 'S065', 'S065', 'S065', 'S074', 'S074', 'S074', 'S080', 'S080',
        'S080', 'S101', 'S101', 'S101', 'S108', 'S108',

In [10]:
en_freq_mutated_df = get_frequently_mutated(en,.1)

NameError: name 'en' is not defined

In [None]:
ovarian_freq_mutated_df