# get_frequently_mutated Final 

In [56]:
import cptac
en = cptac.Endometrial()
import pandas as pd
import numpy as np

Checking that data files are up-to-date...
100% [..................................................................................] 649 / 649
Data check complete.
endometrial data version: 2.1

Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading definitions data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic_binary data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but analysis may not be
published until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter cptac.embargo() to open the webpage for more details.


In [12]:
def get_frequently_mutated(cancer_type, cutoff=.1):  
    """
    take cancer object and find the frequently 
    mutated genes in total tumors compared to the cutoff.
        
        Parameters:
        cancer_type (object): object of cancer class from cptac module 
        cutoff (float): used as comparison to determine status of gene mutation frequency
        
        Returns:
        freq_mutated_df (pd.DataFrame): DataFrame of frequently mutated genes passing the cutoff
            and Total_Mutated (mutated genes / total tumors), percent Truncated, and percent Missence
        
        There are many types of mutations catagorized into the columns Truncated and Missence. 
        The Truncated column includes: Frame_Shift_Del, Frame_Shift_Ins, Splice_Site, Nonsense_Mutation, Nonstop_Mutation
        The Missence column includes: In_Frame_Del, In_Frame_Ins, Missense_Mutation
        These columns count multiple mutations of one gene in the same sample, so percentages in the last two columns may 
        exceed the Total_Mutated column(which only counts if the gene was mutated once)"""    
    
    # get data frames
    somatic_mutations = cancer_type.get_mutations()
    sample_status_map = cancer_type._get_sample_status_map()
    merged_mutations = somatic_mutations.join(sample_status_map, how="left") 
    """
    # standardize mutation names 
    if cancer_type.get_tumor_type() == 'Colon':
        mutation_equivalents = {'frameshift substitution': 'Frame_Shift_Del' , 'frameshift deletion': 'Frame_Shift_Del', 
            'frameshift insertion': 'Frame_Shift_Ins', 'stopgain': 'Nonsense_Mutation ', 'stoploss':'Nonstop_Mutation',
            'nonsynonymous SNV': 'Missense_Mutation','nonframeshift insertion': 'In_Frame_Ins',
            'nonframeshift deletion': 'In_Frame_Del', 'nonframeshift substitution': 'Missense_Mutation'}
        merged_mutations = merged_mutations.replace(to_replace = mutation_equivalents)
    """  
    # get list of unique genes
    unique_genes = somatic_mutations['Gene'].unique()
    
    # get total tumors/patients
    sample_status_series = sample_status_map.value_counts()
    total_tumor_patients = sample_status_series[0]
        
    # find frequently mutated genes and their total mutated fraction. Create lists for frequently mutated genes and fraction.
    freq_mut = [] # .where, pandas already vectorized, faster to use their function than a for loop
    total_fraction_mutated = []
    for gene in unique_genes:
        gene_mutated = merged_mutations.loc[merged_mutations['Gene'] == gene]
        gene_mutated = gene_mutated.index.unique()
        num_gene_mutated = len(gene_mutated)
        fraction = (num_gene_mutated / total_tumor_patients)
        if fraction > cutoff:
            freq_mut.append(gene)
            total_fraction_mutated.append(fraction)
    
    # find truncated fraction
    list_missence_mut = ['In_Frame_Del', 'In_Frame_Ins', 'Mutation']
    
    truncated = []
    missence = []
    for gene in freq_mut:
        gene_mut = merged_mutations.loc[merged_mutations['Gene'] == gene] #possibly change var name
        
        truncated_df = gene_mut.loc[(gene_mut['Mutation'] not in list_missence_mut)]
        samples_trunc = truncated_df.index.unique()
        num_trunc_mut = len(samples_trunc)
        fraction_trunc = (num_trunc_mut / total_tumor_patients)
        truncated.append(fraction_trunc)
        
        missence_mutations = gene_mutated.loc[(gene_mutated['Mutation'] in list_missence_mut)]
        samples_miss = missence_mutations.index.unique()
        num_miss_mut = len(samples_miss)
        fraction_miss = (num_miss_mut / total_tumor_patients)
        missence.append(fraction_miss)
        
        
    # create dataframe
    merged_lists = list(zip(freq_mut, total_fraction_mutated, truncated, missence))
    freq_mutated_df = pd.DataFrame(merged_lists, columns =['Gene', 'Fraction_Mutated', 'Truncation', 'Missence'])
    freq_mutated_df.name = 'frequently_mutated'
                   
    return freq_mutated_df

Test Endometrial

In [69]:
somatic_mutations = en.get_mutations()
somatic_mutations.head()
somatic_mutations.index.drop_duplicates()

Index(['S001', 'S002', 'S003', 'S005', 'S006', 'S007', 'S008', 'S009', 'S010',
       'S011', 'S012', 'S014', 'S016', 'S017', 'S018', 'S019', 'S020', 'S021',
       'S022', 'S023', 'S024', 'S025', 'S026', 'S027', 'S028', 'S029', 'S030',
       'S031', 'S032', 'S033', 'S034', 'S036', 'S037', 'S038', 'S039', 'S040',
       'S041', 'S042', 'S044', 'S045', 'S046', 'S048', 'S049', 'S050', 'S051',
       'S053', 'S054', 'S055', 'S056', 'S057', 'S058', 'S059', 'S060', 'S061',
       'S062', 'S063', 'S064', 'S065', 'S066', 'S067', 'S068', 'S069', 'S070',
       'S071', 'S072', 'S073', 'S074', 'S075', 'S076', 'S077', 'S078', 'S079',
       'S080', 'S081', 'S082', 'S083', 'S084', 'S085', 'S086', 'S087', 'S088',
       'S090', 'S091', 'S092', 'S093', 'S094', 'S095', 'S096', 'S097', 'S098',
       'S099', 'S100', 'S101', 'S102', 'S103'],
      dtype='object', name='Sample_ID')

In [81]:
g = somatic_mutations.groupby('Gene').groups
g
g['PTEN'] = len(g['PTEN'].value_counts())
#.size()


75

In [46]:

df = pd.DataFrame([('bird', 'Falconiformes', 389.0),
                       ('bird', 'Psittaciformes', 24.0),
                       ('mammal', 'Carnivora', 80.2),
                       ('mammal', 'Primates', np.nan),
                       ('mammal', 'Carnivora', 58)],
                      index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'],
                      columns=('class', 'order', 'max_speed'))
df

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [55]:
d = df.groupby(['class'])
d.head(2)

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,


In [15]:
endo_freq_mutated_df = get_frequently_mutated(en, .25)

NameError: name 'my_list' is not defined

In [None]:
endo_freq_mutated_df


Test Colon

In [None]:
colon_freq_mutated_df = get_frequently_mutated("colon")

You have loaded the cptac colon dataset. To view available dataframes,
use cptac.colon.list_data(). To view available functions for accessing
and manipulating the dataframes, use cptac.colon.list_api().
colon data version: Most recent release

Loading cptac colon data:
Loading clinical data...
Loading miRNA data...
Loading mutation data...
Loading mutation_binary data...
Loading phosphoproteomics_normal data...
Loading phosphoproteomics_tumor data...
Loading proteomics_normal data...
Loading proteomics_tumor data...
Loading transcriptomics data...


In [None]:
colon_freq_mutated_df

Test Ovarian

In [4]:
ovarian_freq_mutated_df = get_frequently_mutated("ovarian")

Welcome to the cptac data service package. Available datasets may be
viewed using cptac.list_data(). In order to access a specific data
set, import a cptac subfolder using either 'import cptac.dataset' or
'from cptac import dataset'.
******
Version: 0.4.1
******
You have loaded the cptac ovarian dataset. To view available
dataframes, use cptac.ovarian.list_data(). To view available functions
for accessing and manipulating the dataframes, use
cptac.ovarian.list_api().
ovarian data version: Most recent release

Loading cptac ovarian data:
Loading clinical data...
Loading cnv data...
Loading phosphoproteomics data...
Loading proteomics data...
Loading somatic_38 data...
Loading transcriptomics data...
Loading treatment data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until June 1, 2019. Please 

In [5]:
ovarian_freq_mutated_df

Unnamed: 0,Gene,Fraction_Mutated,Truncation,Missence
0,TTN,0.279279,0.135135,0.198198
1,TP53,0.693694,0.243243,0.45045
2,MUC16,0.144144,0.054054,0.108108
3,MT-CO1,0.117117,0.027027,0.09009
4,MT-ND5,0.144144,0.081081,0.063063
5,FCGBP,0.108108,0.054054,0.063063
6,MUC5B,0.108108,0.027027,0.081081
7,MUC4,0.306306,0.135135,0.234234
8,MUC17,0.126126,0.036036,0.108108
9,NCOR2,0.108108,0.081081,0.027027
