# get_freq_mutations test Endometrial

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import operator

import CPTAC.Endometrial as en

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


Function definition

In [2]:
def get_frequently_mutated(somatic_df, omics_mutations_df, cutoff=.1, show_percentage=False):  
     
    """take DataFrames of somatic mutations and omics_mutations to determine the percent of 
    mutated genes for all tumors. Frequently mutated genes are greater than the cutoff.

    Parameters:
    somatic_df (pandas.core.frame.DataFrame): Somatic mutations dataframe.
    omics_mutations_df (pandas.core.frame.DataFrame): merged dataframe of any gene and proteomics dataframe
    (used to find total_tumor_patients)
    cutoff (float): used as comparison to determine status of gene mutation frequency

    Returns:
    freq_mutated (list): list of frequently mutated genes passing the cutoff"""
    
    import operator
    unique_genes = somatic_df['Gene'].unique() # Get series of all mutated genes
    freq_mutated = []
    gene_and_freq_d = {}
    
    # Get total tumor patients
    tumors = omics_mutations_df.loc[omics_mutations_df['Sample_Status'] == 'Tumor']
    total_tumor_patients = len(tumors)
    
    # Find sample (col or index) in gene_mutated (samples represent patients)
    # gene_mutated: Endometrial and Colon samples found in column 0, Ovarian found in index
    gene = 'PTEN'
    gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene]
    ovarian = False
    if gene_mutated.columns[0] == 'Gene':
        ovarian = True
    
    # Find percentage of gene mutation and add frequently mutated genes to dictionary
    if ovarian == True:
        print('ovarian')
        for gene in unique_genes:
            gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene].index
            num_gene_mutated = len(gene_mutated.unique())
            percentage = (num_gene_mutated / total_tumor_patients)
            if percentage > cutoff:
                gene_and_freq_d[gene] = percentage
    
    else:
        for gene in unique_genes:
            gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene].iloc[:, 0]
            gene_mutated.drop_duplicates(keep='first',inplace=True)
            num_gene_mutated = len(gene_mutated)
            percentage = (num_gene_mutated / total_tumor_patients)
            if percentage > cutoff:
                gene_and_freq_d[gene] = percentage

    # Sort dictionary descending order based on percent mutated
    sorted_d = sorted(gene_and_freq_d.items(), key=operator.itemgetter(1), reverse=True)  
    
    # Add frequently mutated gene to list. Option to include percentage.
    for i in range(0,len(sorted_d)):
        certain_tuple = sorted_d[i]
        gene, percent_mutated = certain_tuple
        if show_percentage == True:
            string_gene_percent = gene + ': %' + str('%.2f'%percent_mutated)
            freq_mutated.append(string_gene_percent)
        else:
            freq_mutated.append(gene)  
                   
    return freq_mutated

Get data

In [3]:
somatic = en.get_mutations()
clinical = en.get_clinical()
prot = en.get_proteomics()
gene = 'PTEN'
omics_mutations = en.append_mutations_to_omics(mutation_genes=gene, omics_df=prot, omics_genes=gene)
gene_mutated = somatic[somatic['Gene'] == 'PTEN']

# Test:  cutoff = .25

In [4]:
list1 = get_frequently_mutated(somatic, omics_mutations, .25, show_percentage=True)

ovarian


In [5]:
print(len(list1))
list1

7


['PTEN: %0.79',
 'PIK3CA: %0.49',
 'ARID1A: %0.45',
 'PIK3R1: %0.39',
 'KRAS: %0.33',
 'CTNNB1: %0.31',
 'CTCF: %0.28']

Test: check genes in freq_mutated list

In [6]:
gene = 'KRAS'
total_tumor_patients = 95
cutoff = 0.25
gene_mutated = somatic.loc[somatic['Gene'] == gene]
patients_gene_mutated = len(gene_mutated['Patient_Id'].unique()) #series of patients with mutated gene

print(patients_gene_mutated, ' / ', total_tumor_patients, ' ', (patients_gene_mutated / total_tumor_patients))
if (patients_gene_mutated / total_tumor_patients) > cutoff: #>24
    print('true')
else:
    print('false')

KeyError: 'Patient_Id'

In [None]:
gene_mutated = somatic.loc[somatic['Gene'] == 'KRAS'].iloc[:, 0] #.loc gets rows pten, .iloc gets col 0 (sample)
print(gene_mutated.head())
print('length gene_mutated (all mutations): ', len(gene_mutated))

gene_mutated.drop_duplicates(keep='first',inplace=True) #subset= colname when df
print('num gene mutated count once per patient:', len(gene_mutated))

Testing ideas

In [None]:
#check to see if mulitple mutations in one gene

patients = gene_mutated['Patient_Id']#series of ids
print('num of patients: ',len(patients))

#check each patient counted once #fixme check omics_mutations? 
print(len(gene_mutated['Patient_Id'].value_counts())) #gives num rows for each patient, descending order


new = patients[patients == 'C3L-00586'] #gets index for certain patient
print(new)


In [None]:
# sort list
#make dictionary
mydict = {}
total_tumor_patients = 95
cutoff = .1
freq_mutated = []
unique_genes = somatic['Gene'].unique()

for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene]
    num_gene_mutated = len(gene_mutated['Patient_Id'].unique())     
    mydict[gene] = num_gene_mutated    
        

sorted_d = sorted(mydict.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:8]

In [None]:
if (num_gene_mutated / total_tumor_patients) > cutoff:
        freq_mutated.append(gene)

In [None]:
unique_genes = somatic['Gene'].unique()
print('unique_genes: ', len(unique_genes))
not_in_proteomics = []
in_proteomics = []

In [None]:
#challenges with append_mutations_to_omics, can't merge with somatic (prot vs gene names)
#prepare list for .append_mutations_to_omics
for gene in unique_genes:
    if gene in proteomics.columns:
        in_proteomics.append(gene)
    else:
        not_in_proteomics.append(gene)

In [None]:
len(in_proteomics) #8265
len(not_in_proteomics) #6549

In [None]:
# test for loop to count genes
total_tumor_patients = 95
cutoff = .1
freq_mutated = []
unique_genes = somatic['Gene'].unique()
for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene]
    num_gene_mutated = len(gene_mutated)
    if (num_gene_mutated / total_tumor_patients) > cutoff:
            freq_mutated.append(gene)