# get_frequently_mutated Final 

In [12]:
def get_frequently_mutated(cancer_type, cutoff=.1):  
    """take cancer type to import cptac and find the frequently mutated genes in total tumors compared to the cutoff.
        
        Parameters:
        cancer_type (string): type of  cancer 
        cutoff (float): used as comparison to determine status of gene mutation frequency
        
        Returns:
        freq_mutated_df (pd.DataFrame): DataFrame of frequently mutated genes passing the cutoff
            and percent mutated (mutated genes / total tumors), percent Truncated, percent Missence
        
        There are many types of mutations catagorized into the columns 'Truncated' and 'Missence'. 
        The Truncated column includes: Frame_Shift_Del, Frame_Shift_Ins, Splice_Site, Nonsense_Mutation, Nonstop_Mutation
        The Missence column includes: In_Frame_Del, In_Frame_Ins, Missense_Mutation
        These columns count multiple mutations of one gene in the same sample, so percentages in the last two columns may 
        exceed the total percent mutated (which only counts if the gene was mutated once)
            """    
    
    #import CPTAC and pandas
    
    import pandas as pd
    if cancer_type == "endometrial" or cancer_type == "Endometrial":
        import cptac.endometrial as cptac
        
    elif cancer_type == "colon" or cancer_type == "Colon":
        import cptac.colon as cptac
        
    elif cancer_type == "ovarian" or cancer_type == "Ovarian":
        import cptac.ovarian as cptac
    
    else:
        print("Please enter a valid cancer type.")
    
    # get data frames
    somatic_mutations = cptac.get_mutations()
    proteomics = cptac.get_proteomics()
    sample_status_map = cptac.get_sample_status_map()
    merged_mutations = somatic_mutations.join(sample_status_map, how="left") 
    
    # get list of unique genes
    unique_genes = somatic_mutations['Gene'].unique()
    
    # get total tumors/patients
    sample_status_series = sample_status_map.value_counts()
    total_tumor_patients = sample_status_series[0]
    print('total_tumor_patients', total_tumor_patients)
        
    #find frequently mutated and create list for genes to include in dataframe of frequently mutated
    freq_mut = []
    total_percent_mutated = []
    
    for gene in unique_genes:
        gene_mutated = merged_mutations.loc[merged_mutations['Gene'] == gene].index
        num_gene_mutated = len(gene_mutated.unique())
        percentage = (num_gene_mutated / total_tumor_patients)
        if percentage > cutoff:
            freq_mut.append(gene)
            total_percent_mutated.append(percentage)
    
    #find truncated percentage
    truncated = []
    for gene in freq_mut:
        gene_mutated = merged_mutations.loc[merged_mutations['Gene'] == gene]
        trunc = gene_mutated.loc[gene_mutated['Mutation'] != 'In_Frame_Del']
        trunc = trunc.loc[trunc['Mutation'] != 'In_Frame_Ins']
        trunc = trunc.loc[trunc['Mutation'] != 'Missense_Mutation'] 
        num_gene_mutated = len(trunc)
        percentage = (num_gene_mutated / total_tumor_patients)
        truncated.append(percentage)
      
    #find missence percentage 
    missence = []
    for gene in freq_mut:
        num_gene_mutated = 0 
        gene_mutated = merged_mutations.loc[merged_mutations['Gene'] == gene]
        missence_mutations = gene_mutated.loc[gene_mutated['Mutation'] == 'In_Frame_Del']
        num_gene_mutated = len(missence_mutations)
        missence_mutations = gene_mutated.loc[gene_mutated['Mutation'] == 'In_Frame_Ins']
        num_gene_mutated += len(missence_mutations)
        missence_mutations = gene_mutated.loc[gene_mutated['Mutation'] == 'Missense_Mutation']
        num_gene_mutated += len(missence_mutations)
        percentage = (num_gene_mutated / total_tumor_patients)
        missence.append(percentage)
        
    # create dataframe
    merged_lists = list(zip(freq_mut, total_percent_mutated, truncated, missence))
    freq_mutated_df = pd.DataFrame(merged_lists, columns =['Gene', 'Percent Mutated', 'Truncated', 'Missence'])
                   
    return freq_mutated_df

Test Endometrial

In [13]:
endo_freq_mutated_df = get_frequently_mutated("endometrial", .25)

total_tumor_patients 95


In [14]:
endo_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated,Truncated,Missence
0,ARID1A,0.452632,0.536842,0.147368
1,PIK3CA,0.494737,0.010526,0.642105
2,PTEN,0.789474,0.694737,0.631579
3,CTCF,0.284211,0.263158,0.105263
4,KRAS,0.326316,0.0,0.347368
5,PIK3R1,0.389474,0.231579,0.252632
6,CTNNB1,0.305263,0.0,0.315789


Test Colon

In [13]:
colon_freq_mutated_df = get_frequently_mutated("colon", .25)

110


In [14]:
colon_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated,Truncated,Missence
0,APC,0.745455,3.518182,0.0
1,TP53,0.509091,6.990909,0.0
2,MUC16,0.354545,0.727273,0.0
3,FAT3,0.254545,0.418182,0.0
4,SYNE1,0.263636,1.3,0.0
5,TTN,0.545455,9.727273,0.0
6,KRAS,0.318182,0.636364,0.0
7,OBSCN,0.263636,1.318182,0.0
8,CCDC168,0.263636,0.6,0.0


Test Ovarian, cutoff = .1 (default)

In [13]:
ovarian_freq_mutated_df = get_frequently_mutated("ovarian")

In [14]:
ovarian_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated
0,TTN,0.373494
1,TP53,0.927711
2,FMN2,0.108434
3,WDFY4,0.13253
4,MUC16,0.192771
5,MT-CO1,0.156627
6,KMT2D,0.120482
7,HERC1,0.108434
8,MT-ND5,0.192771
9,FCGBP,0.144578


Debug 

In [17]:
#find frequently mutated - check Ovarian PTEN percentage = .9277 
somatic = c.get_mutations()
prot = c.get_proteomics()
gene = 'PTEN'
omics_mutations = c.append_mutations_to_omics(mutation_genes=gene, omics_df=prot, omics_genes=gene)
unique_genes = ['TP53','PTEN']
#somatic['Gene'].unique()
gene_mutated = somatic.loc[somatic['Gene'] == 'PTEN']
gene_and_freq_d = {}
tumors = omics_mutations.loc[omics_mutations['Sample_Status'] == 'Tumor']
total_tumor_patients = len(tumors)
print('total_tumor_patients:', total_tumor_patients)
cutoff = .25
for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene].index
    num_gene_mutated = len(gene_mutated.unique())
    print('num_gene_mutated', gene, ':', num_gene_mutated)
   
    percentage = (num_gene_mutated / total_tumor_patients)
    if percentage > cutoff:
        gene_and_freq_d[gene] = percentage

total_tumor_patients: 83
num_gene_mutated TP53 : 77
num_gene_mutated PTEN : 4


In [19]:
gene_and_freq_d

{'TP53': 0.927710843373494}

In [20]:
gene = 'TP53'
gene_mutated = somatic.loc[somatic['Gene'] == gene].index
num_gene_mutated = float(len(gene_mutated.unique()))
print('gene_mutated: ', num_gene_mutated, '/', total_tumor_patients, 'total_tumor_patients')
percentage = (num_gene_mutated / total_tumor_patients)
print('percent:',percentage)
gene_mutated.unique()

gene_mutated:  77.0 / 83 total_tumor_patients
percent: 0.927710843373494


Index(['S002', 'S006', 'S007', 'S009', 'S011', 'S013', 'S014', 'S015', 'S020',
       'S021', 'S022', 'S024', 'S025', 'S026', 'S027', 'S029', 'S031', 'S033',
       'S035', 'S036', 'S037', 'S038', 'S039', 'S040', 'S041', 'S042', 'S043',
       'S044', 'S045', 'S047', 'S048', 'S049', 'S050', 'S051', 'S052', 'S053',
       'S054', 'S055', 'S057', 'S058', 'S059', 'S060', 'S061', 'S062', 'S063',
       'S064', 'S065', 'S066', 'S067', 'S068', 'S069', 'S073', 'S074', 'S075',
       'S076', 'S082', 'S083', 'S086', 'S087', 'S089', 'S090', 'S093', 'S095',
       'S096', 'S097', 'S098', 'S100', 'S103', 'S106', 'S107', 'S108', 'S109',
       'S111', 'S112', 'S113', 'S115', 'S116'],
      dtype='object', name='Sample_ID')