# get_freq_mutations test colon

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import CPTAC.Colon as co

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
You have loaded the CPTAC Colon dataset. To view available dataframes,
use CPTAC.Colon.list_data(). To view available functions for accessing
and manipulating the dataframes, use CPTAC.Colon.list_api().
Colon Data Version: Most recent release

Loading CPTAC Colon data:
Loading clinical data...
Loading miRNA data...
Loading mutation data...
Loading mutation_binary data...
Loading phosphoproteomics_normal data...
Loading phosphoproteomics_tumor data...
Loading proteomics_normal data...
Loading proteomics_tumor data...
Loading transcriptomics data...


In [8]:
def get_frequently_mutated(somatic_df, omics_mutations_df, cutoff=.1, show_percentage = False):  
    import operator
    unique_genes = somatic_df['Gene'].unique() # get series of all mutated genes
    freq_mutated = []
    gene_and_freq_d = {}
    
    # get total tumor patients
    tumors = omics_mutations_df.loc[omics_mutations_df['Sample_Status'] == 'Tumor']
    total_tumor_patients = len(tumors)
    
    #find sample (col or index) in gene_mutated (samples represent patients)
    #gene_mutated: Endometrial and Colon samples found in column 1, Ovarian found in index
    gene = 'PTEN'
    gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene]
    ovarian = False
    if gene_mutated.columns[0] == 'Gene':
        ovarian = True
        
    if ovarian == True:
        print('ovarian')
        for gene in unique_genes:
            gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene].index
            num_gene_mutated = len(gene_mutated.unique())
            percentage = (num_gene_mutated / total_tumor_patients)
            if percentage > cutoff:
                gene_and_freq_d[gene] = percentage
    
    #find frequently mutated
    else:
        for gene in unique_genes:
            gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene].iloc[:, 0]
            gene_mutated.drop_duplicates(keep='first',inplace=True)
            num_gene_mutated = len(gene_mutated)
            percentage = (num_gene_mutated / total_tumor_patients)
            if percentage > cutoff:
                gene_and_freq_d[gene] = percentage

    
    sorted_d = sorted(gene_and_freq_d.items(), key=operator.itemgetter(1), reverse=True)  
    
    for i in range(0,len(sorted_d)):
        certain_tuple = sorted_d[i]
        gene, percent_mutated = certain_tuple
        if show_percentage == True:
            string_gene_percent = gene + ': %' + str('%.2f'%percent_mutated)
            freq_mutated.append(string_gene_percent)
        else:
            freq_mutated.append(gene)  
                   
    return freq_mutated

# find number of tumor patients

In [3]:
somatic = co.get_mutations()
prot = co.get_proteomics()
clinical = co.get_clinical()
total = clinical.loc[clinical['Tumor.Status'] == 'With tumor']
print(len(total)) #NOT RIGHT TOTAL 16

16


In [4]:
gene_mutated = somatic.loc[somatic['Gene'] == 'PTEN'].iloc[:, 0] #.loc gets rows pten, .iloc gets col 0 (sample)
print(gene_mutated.head())
print('length gene_mutated: ', len(gene_mutated))


gene_mutated.drop_duplicates(keep='first',inplace=True) #subset= colname when df
print('num gene mutated:', len(gene_mutated))

Sample_ID
S035    PTEN
S035    PTEN
S035    PTEN
S052    PTEN
S052    PTEN
Name: Gene, dtype: object
length gene_mutated:  14
num gene mutated: 1


In [27]:
# test to finding sample
gene = 'PTEN'
gene_mutated = somatic.loc[somatic['Gene'] == gene]
sample = 0
if gene_mutated.columns[0] == 'Gene':
    sample = gene_mutated.index
sample

0

In [6]:
#find all tumor patients
gene = 'TP53'
omics_mutations = co.append_mutations_to_omics(mutation_genes=gene, omics_df=prot, omics_genes=gene)
tumors = omics_mutations.loc[omics_mutations['Sample_Status'] == 'Tumor']
total_tumor_count = len(tumors)
print(total_tumor_count) #97
omics_mutations.head() 

97


Unnamed: 0_level_0,TP53_proteomics,TP53_Mutation,TP53_Location,TP53_Mutation_Status,Sample_Status
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S002,-0.489,"[frameshift insertion, frameshift insertion, f...","[E287Wfs*10, E194Wfs*11, E194Wfs*10, E167Wfs*2...",Multiple_mutation,Tumor
S003,-0.796,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
S004,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
S005,-1.39,"[nonsynonymous SNV, nonsynonymous SNV, nonsyno...","[C143Y, C143Y, C116Y, C116Y, C116Y, S260Qfs*29...",Multiple_mutation,Tumor
S006,,"[nonsynonymous SNV, nonsynonymous SNV, nonsyno...","[R273C, R273C, R273C, R273C, R234C, R234C, R23...",Multiple_mutation,Tumor


Test 1 colon: 

In [9]:
list1 = get_frequently_mutated(somatic, omics_mutations, cutoff=.25, show_percentage=True)

ovarian


In [30]:
print('num sig mutated genes:', len(list1))
list1

num sig mutated genes: 16


['APC: %0.85',
 'TTN: %0.62',
 'TP53: %0.58',
 'MUC16: %0.40',
 'KRAS: %0.36',
 'SYNE1: %0.30',
 'OBSCN: %0.30',
 'CCDC168: %0.30',
 'FAT3: %0.29',
 'MUC5B: %0.28',
 'CSMD3: %0.27',
 'FAT4: %0.27',
 'DNAH5: %0.27',
 'ZNF469: %0.26',
 'RYR1: %0.26',
 'ACVR2A: %0.26']

In [26]:
#How many patients in colon df?
tumors = omics_mutations.loc[omics_mutations['Sample_Status'] == 'Tumor']
total_tumor_patients = len(tumors)
print('total_tumor_patients', total_tumor_patients)

print('freq_mutated: ',len(list1))
total_genes = somatic['Gene'].unique()
print('total_genes: ',len(total_genes))

total_tumor_patients 97
freq_mutated:  16
total_genes:  15175


In [21]:
# check genes in freq_mutated list
gene = 'ZNF469'
total_tumor_patients = 97
cutoff = .1
gene_mutated = somatic.loc[somatic['Gene'] == gene].iloc[:, 0]
gene_mutated.drop_duplicates(keep = 'first', inplace = True)
num_gene_mutated = len(gene_mutated)
print(num_gene_mutated, ' / ', total_tumor_patients, ' ', (num_gene_mutated / total_tumor_patients))
if (num_gene_mutated / total_tumor_patients) > cutoff:
    print('true')
else:
    print('false')
gene_mutated.head()

25  /  97   0.25773195876288657
true


130467    01CO014
130435    01CO022
130431    05CO006
130458    05CO007
130459    05CO015
Name: SampleID, dtype: object

In [85]:
#check to see if mulitple mutations in one gene
gene = 'APC'
gene_mutated = somatic.loc[somatic['Gene'] == gene]#series of mutations
gene_mutated.rename(columns={ gene_mutated.iloc[0]: "sample" }, inplace = True)

patients = gene_mutated.loc[gene_mutated['sample']]#series of ids
#print('num of patients with mutation counted once: ',len(patients.unique()))


#check each patient counted once #fixme check omics_mutations? 
#print(gene_mutated['SampleID'].value_counts()) #gives num rows for each patient, descending order

#gene_mutated.columns[0]
#new = patients[patients == 'C3L-00586'] #gets index for certain patient
#print(new)
gene_mutated

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [15]:
#test gene_mutated.columns[sample]].unique())
gene_and_freq_d = {}
unique_genes = ['PTEN']
#somatic['Gene'].unique()

for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene] 
    count_gene_once = gene_mutated.iloc[:,[0]]
    num_gene_mutated = gene_mutated.loc[gene_mutated.columns[sample]]     
    
    """
    percentage = (num_gene_mutated / total_tumor_patients)
    if percentage > cutoff:
        gene_and_freq_d[gene] = percentage
        """

AttributeError: 'str' object has no attribute 'unique'

test2: using list1, test if all returned

In [46]:
list1_somatic = somatic.loc[somatic['Gene'] == list1]

ValueError: Lengths must match to compare