# get_frequently_mutated Final 

In [39]:
def get_frequently_mutated(cancer_type, cutoff=.1):  
    """take cancer type to import cptac and find the frequently mutated genes in total tumors compared to the cutoff.
        
        Parameters:
        cancer_type (string): type of  cancer 
        cutoff (float): used as comparison to determine status of gene mutation frequency
        
        Returns:
        freq_mutated_df (pd.DataFrame): DataFrame of frequently mutated genes passing the cutoff
            and percent mutated (mutated genes / total tumors), percent Truncated, percent Missence
        
        There are many types of mutations catagorized into the columns 'Truncated' and 'Missence'. 
        The Truncated column includes: Frame_Shift_Del, Frame_Shift_Ins, Splice_Site, Nonsense_Mutation, Nonstop_Mutation
        The Missence column includes: In_Frame_Del, In_Frame_Ins, Missense_Mutation
        These columns count multiple mutations of one gene in the same sample, so percentages in the last two columns may 
        exceed the total percent mutated (which only counts if the gene was mutated once)
            """ 
    
    # import CPTAC and pandas
    
    import pandas as pd
    colon = False
    if cancer_type == "endometrial" or cancer_type == "Endometrial":
        import cptac.endometrial as cptac
        
    elif cancer_type == "colon" or cancer_type == "Colon":
        import cptac.colon as cptac
        colon = True
        
    elif cancer_type == "ovarian" or cancer_type == "Ovarian":
        import cptac.ovarian as cptac
    
    else:
        str_cancer_options = '\n' + 'Options: endometrial, ovarian, colon'
        print("Please enter a valid cancer type.", str_cancer_options)
        return 0
    
    # get data frames
    somatic_mutations = cptac.get_mutations()
    proteomics = cptac.get_proteomics()
    sample_status_map = cptac.get_sample_status_map()
    merged_mutations = somatic_mutations.join(sample_status_map, how="left") 
    
    # standardize mutation names 
    if colon == True:
        mutation_equivalents = {'frameshift substitution': 'Frame_Shift_Del' , 'frameshift deletion': 'Frame_Shift_Del', 
            'frameshift insertion': 'Frame_Shift_Ins', 'stopgain': 'Nonsense_Mutation ', 'stoploss':'Nonstop_Mutation',
            'nonsynonymous SNV': 'Missense_Mutation','nonframeshift insertion': 'In_Frame_Ins',
            'nonframeshift deletion': 'In_Frame_Del', 'nonframeshift substitution': 'Missense_Mutation'}
        merged_mutations = merged_mutations.replace(to_replace = mutation_equivalents)
        
    # get list of unique genes
    unique_genes = ['PIK3CA']
    #somatic_mutations['Gene'].unique()
    
    # get total tumors/patients
    sample_status_series = sample_status_map.value_counts()
    total_tumor_patients = sample_status_series[0]
        
    # find frequently mutated genes and their percent mutated. Create lists for frequently mutated genes and percentage.
    freq_mut = []
    total_percent_mutated = []
    for gene in unique_genes:
        gene_mutated = merged_mutations.loc[merged_mutations['Gene'] == gene]
        gene_mutated = gene_mutated.index.unique()
        num_gene_mutated = len(gene_mutated)
        print('num_gene_mutated',num_gene_mutated)
        percentage = (num_gene_mutated / total_tumor_patients)
        if percentage > cutoff:
            freq_mut.append(gene)
            total_percent_mutated.append(percentage)
    
    # find truncated percentage
    truncated = []
    missence = []
    sample_overlap = []
    non_overlap_samples_miss = []
    for gene in freq_mut:
        gene_mutated = merged_mutations.loc[merged_mutations['Gene'] == gene]
        
        # trunc
        truncated_df = gene_mutated.loc[(gene_mutated['Mutation'] != 'In_Frame_Del') & 
            (gene_mutated['Mutation'] != 'In_Frame_Ins') & (gene_mutated['Mutation'] != 'Missense_Mutation')] 
        samples_trunc = truncated_df.index.unique()
        num_trunc_mut = len(samples_trunc)
        print('num_trunc_mut', num_trunc_mut)
        fraction_trunc = (num_trunc_mut / total_tumor_patients)
        truncated.append(fraction_trunc)
        print('trunc', samples_trunc)
        #miss
        missence_mutations = gene_mutated.loc[(gene_mutated['Mutation'] == 'In_Frame_Ins') 
                | (gene_mutated['Mutation'] == 'In_Frame_Del') | (gene_mutated['Mutation'] == 'Missense_Mutation')]
        samples_miss = missence_mutations.index.unique()
        num_miss_mut = len(samples_miss)
        
        #count overlap in truncated
        print('miss mut num', num_miss_mut)
        for sample in samples_miss:
            if sample not in samples_trunc:
                non_overlap_samples_miss.append(sample)
        print(non_overlap_samples_miss)
        num_non_overlap_samples = len(non_overlap_samples_miss)
        print('num_non_overlap_samples', num_non_overlap_samples)
        fraction_miss = (num_non_overlap_samples / total_tumor_patients)
        missence.append(fraction_miss)
        
        """
        print(num_gene_mutated)
        print(missence_mutations.index.unique())
        print(missence_mutations)
    
        print(truncated_df.index.unique())
        print(truncated)
        print('trunc:', num_gene_mutated)
        print(truncated_df)
        """
        
    # create dataframe
    merged_lists = list(zip(freq_mut, total_percent_mutated, truncated, missence))
    freq_mutated_df = pd.DataFrame(merged_lists, columns =['Gene', 'Fraction_Mutated', 'Truncation', 'Missence'])
                   
    return freq_mutated_df

In [40]:
endo_freq_mutated_df = get_frequently_mutated("endometrial", .25)

num_gene_mutated 47
num_trunc_mut 1
trunc Index(['S053'], dtype='object', name='Sample_ID')
miss mut num 46
['S001', 'S003', 'S009', 'S010', 'S012', 'S014', 'S018', 'S021', 'S022', 'S023', 'S024', 'S026', 'S027', 'S028', 'S030', 'S031', 'S032', 'S033', 'S036', 'S038', 'S040', 'S055', 'S060', 'S061', 'S063', 'S064', 'S065', 'S066', 'S067', 'S068', 'S070', 'S074', 'S075', 'S077', 'S081', 'S084', 'S086', 'S088', 'S090', 'S092', 'S095', 'S096', 'S097', 'S098', 'S099', 'S103']
num_non_overlap_samples 46


In [30]:
endo_freq_mutated_df

Unnamed: 0,Gene,Fraction_Mutated,Truncation,Missence
0,ARID1A,0.452632,0.4,0.052632
1,PIK3CA,0.494737,0.010526,0.536842
2,PTEN,0.789474,0.568421,0.757895
3,CTCF,0.284211,0.242105,0.8
4,KRAS,0.326316,0.0,1.126316
5,PIK3R1,0.389474,0.189474,1.326316
6,CTNNB1,0.305263,0.0,1.631579


Test Colon

In [23]:
colon_freq_mutated_df = get_frequently_mutated("colon", .25)

total_tumor_patients 110


In [24]:
colon_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated,Truncated,Missence
0,APC,0.745455,3.136364,0.381818
1,TP53,0.509091,2.145455,4.845455
2,MUC16,0.354545,0.1,0.627273
3,FAT3,0.254545,0.036364,0.381818
4,SYNE1,0.263636,0.281818,1.018182
5,TTN,0.545455,1.409091,8.318182
6,KRAS,0.318182,0.0,0.636364
7,OBSCN,0.263636,0.190909,1.127273
8,CCDC168,0.263636,0.263636,0.336364


Test Ovarian, cutoff = .1 (default)

In [25]:
ovarian_freq_mutated_df = get_frequently_mutated("ovarian")

You have loaded the cptac ovarian dataset. To view available
dataframes, use cptac.ovarian.list_data(). To view available functions
for accessing and manipulating the dataframes, use
cptac.ovarian.list_api().
ovarian data version: Most recent release

Loading cptac ovarian data:
Loading clinical data...
Loading cnv data...
Loading phosphoproteomics data...
Loading proteomics data...
Loading somatic_38 data...
Loading transcriptomics data...
Loading treatment data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until June 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.
total_tumor_patients 111


In [26]:
ovarian_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated,Truncated,Missence
0,TTN,0.279279,0.252252,0.468468
1,TP53,0.693694,0.243243,0.468468
2,MUC16,0.144144,0.072072,0.225225
3,MT-CO1,0.117117,0.027027,0.09009
4,MT-ND5,0.144144,0.081081,0.063063
5,FCGBP,0.108108,0.054054,0.081081
6,MUC5B,0.108108,0.027027,0.099099
7,MUC4,0.306306,0.153153,0.432432
8,MUC17,0.126126,0.162162,0.54955
9,NCOR2,0.108108,0.09009,0.027027


Debug 

In [17]:
#find frequently mutated - check Ovarian PTEN percentage = .9277 
somatic = c.get_mutations()
prot = c.get_proteomics()
gene = 'PTEN'
omics_mutations = c.append_mutations_to_omics(mutation_genes=gene, omics_df=prot, omics_genes=gene)
unique_genes = ['TP53','PTEN']
#somatic['Gene'].unique()
gene_mutated = somatic.loc[somatic['Gene'] == 'PTEN']
gene_and_freq_d = {}
tumors = omics_mutations.loc[omics_mutations['Sample_Status'] == 'Tumor']
total_tumor_patients = len(tumors)
print('total_tumor_patients:', total_tumor_patients)
cutoff = .25
for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene].index
    num_gene_mutated = len(gene_mutated.unique())
    print('num_gene_mutated', gene, ':', num_gene_mutated)
   
    percentage = (num_gene_mutated / total_tumor_patients)
    if percentage > cutoff:
        gene_and_freq_d[gene] = percentage

total_tumor_patients: 83
num_gene_mutated TP53 : 77
num_gene_mutated PTEN : 4


In [19]:
gene_and_freq_d

{'TP53': 0.927710843373494}

In [20]:
gene = 'TP53'
gene_mutated = somatic.loc[somatic['Gene'] == gene].index
num_gene_mutated = float(len(gene_mutated.unique()))
print('gene_mutated: ', num_gene_mutated, '/', total_tumor_patients, 'total_tumor_patients')
percentage = (num_gene_mutated / total_tumor_patients)
print('percent:',percentage)
gene_mutated.unique()

gene_mutated:  77.0 / 83 total_tumor_patients
percent: 0.927710843373494


Index(['S002', 'S006', 'S007', 'S009', 'S011', 'S013', 'S014', 'S015', 'S020',
       'S021', 'S022', 'S024', 'S025', 'S026', 'S027', 'S029', 'S031', 'S033',
       'S035', 'S036', 'S037', 'S038', 'S039', 'S040', 'S041', 'S042', 'S043',
       'S044', 'S045', 'S047', 'S048', 'S049', 'S050', 'S051', 'S052', 'S053',
       'S054', 'S055', 'S057', 'S058', 'S059', 'S060', 'S061', 'S062', 'S063',
       'S064', 'S065', 'S066', 'S067', 'S068', 'S069', 'S073', 'S074', 'S075',
       'S076', 'S082', 'S083', 'S086', 'S087', 'S089', 'S090', 'S093', 'S095',
       'S096', 'S097', 'S098', 'S100', 'S103', 'S106', 'S107', 'S108', 'S109',
       'S111', 'S112', 'S113', 'S115', 'S116'],
      dtype='object', name='Sample_ID')