In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import CPTAC.Endometrial as en

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.0
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


In [2]:
def get_frequently_mutated(somatic_df, omics_mutations_df, cutoff = .1):
    """Return the genes whose mutation frequency among tumor samples exceeds a cutoff.

        The frequency of a gene is the number of rows for that gene in somatic_df
        divided by the number of rows in omics_mutations_df whose 'Sample_Status'
        is 'Tumor'. A gene is kept when that ratio is strictly greater than cutoff.

        NOTE(review): the numerator counts mutation rows, not distinct patients, so a
        patient with several mutations in one gene is counted several times. If
        somatic_df carries a patient identifier column, de-duplicating on it would
        give a true per-patient frequency -- confirm which definition is intended.

        Parameters:
        somatic_df (pandas.core.frame.DataFrame): Somatic mutation dataframe; must have a 'Gene' column.
        omics_mutations_df (pandas.core.frame.DataFrame): merged dataframe of any gene and proteomics;
            must have a 'Sample_Status' column (only used to find total_tumor_patients).
        cutoff (float): minimum (exclusive) mutation frequency for a gene to be reported.

        Returns:
        freq_mutated (list): frequently mutated genes passing the cutoff, in order of
            first appearance in somatic_df. Empty when there are no tumor samples."""

    # get number of tumor patients
    tumors = omics_mutations_df.loc[omics_mutations_df['Sample_Status'] == 'Tumor']
    total_tumor_patients = len(tumors)

    # Without tumor samples the frequency is undefined; report nothing rather
    # than dividing by zero.
    if total_tumor_patients == 0:
        return []

    # Use the frame that was passed in (not a notebook-level global) and count
    # every gene in a single pass. Rows without a gene name are ignored.
    genes = somatic_df['Gene'].dropna()
    mutation_counts = genes.value_counts()
    frequencies = mutation_counts / total_tumor_patients
    passing_genes = set(frequencies[frequencies > cutoff].index)

    # unique() keeps first-appearance order, so the result order is stable.
    freq_mutated = [gene for gene in genes.unique() if gene in passing_genes]

    return freq_mutated
           

In [3]:
somatic = en.get_mutations()

# get_frequently_mutated needs a merged omics/mutation frame as its second
# argument (it is only used to count 'Tumor' samples), and the cutoff has to be
# passed as `cutoff`; passing 0.0001 positionally bound it to omics_mutations_df.
# NOTE(review): assumes en.compare_mutations(proteomics, gene) returns a frame
# with a 'Sample_Status' column and that 'PTEN' is present in the proteomics
# data -- confirm against the CPTAC 0.3.0 API.
proteomics = en.get_proteomics()
omics_mutations = en.compare_mutations(proteomics, 'PTEN')

list1 = get_frequently_mutated(somatic, omics_mutations, cutoff=0.0001)

In [4]:
# Report how many genes cleared the mutation-frequency cutoff.
num_frequent_genes = len(list1)
print(num_frequent_genes)

2505
