In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import CPTAC.Endometrial as en

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


In [2]:
def get_frequently_mutated(somatic_df, omics_mutations_df, cutoff = .1):
    """Return the genes whose mutation frequency among tumor samples exceeds a cutoff.

        The frequency of a gene is the number of rows of somatic_df whose 'Gene'
        value is that gene, divided by the number of rows of omics_mutations_df
        whose 'Sample_Status' is 'Tumor'.

        Parameters:
        somatic_df (pandas.core.frame.DataFrame): Somatic mutation dataframe. Must contain a 'Gene' column.
        omics_mutations_df (pandas.core.frame.DataFrame): merged dataframe of any gene and proteomics
            (used to find total_tumor_patients). Must contain a 'Sample_Status' column.
        cutoff (float): a gene is kept only when its frequency is strictly greater than this value.

        Returns:
        freq_mutated (list): frequently mutated genes passing the cutoff, in order of first
            appearance in somatic_df. Empty when there are no tumor samples.

        NOTE(review): every row of somatic_df counts as one mutated sample. If the
        table holds several rows for the same gene in the same sample, the frequency
        is overestimated (and can exceed 1) -- confirm whether unique samples should
        be counted instead."""

    # get number of tumor patients
    tumors = omics_mutations_df.loc[omics_mutations_df['Sample_Status'] == 'Tumor']
    total_tumor_patients = len(tumors)

    # No tumor samples: a frequency cannot be computed, so no gene qualifies.
    if total_tumor_patients == 0:
        return []

    # Use the dataframe that was passed in (not a notebook-level global) and count
    # the rows per gene in a single pass instead of re-scanning the frame per gene.
    gene_column = somatic_df['Gene']
    mutation_counts = gene_column.value_counts(sort=False).to_dict()

    freq_mutated = []
    # unique() keeps the order in which genes first appear in somatic_df
    for gene in gene_column.unique():
        num_gene_mutated = mutation_counts.get(gene, 0)
        if (num_gene_mutated / total_tumor_patients) > cutoff:
            freq_mutated.append(gene)

    return freq_mutated
           

In [4]:
# Load the two tables used by the cells below.
somatic = en.get_mutations()
prot = en.get_proteomics()

# Merge the mutation data for a single gene with that gene's proteomics column;
# the merged frame is what get_frequently_mutated uses to count tumor samples.
gene = 'PTEN'
omics_mutations = en.append_mutations_to_omics(
    mutation_genes=gene,
    omics_df=prot,
    omics_genes=gene,
)


In [5]:
# Genes whose mutation frequency (as defined by get_frequently_mutated) exceeds 0.25.
list1 = get_frequently_mutated(somatic, omics_mutations, cutoff=.25)
print(len(list1))

75


In [6]:
# Display every gene returned by get_frequently_mutated for the 0.25 cutoff.
list1

['ARID1A',
 'NEB',
 'LRP2',
 'PIK3CA',
 'FAT4',
 'DNAH5',
 'ADGRV1',
 'APC',
 'NSD1',
 'DST',
 'SYNE1',
 'MAP3K4',
 'DNAH11',
 'KMT2C',
 'ZFHX4',
 'PTEN',
 'LRRK2',
 'RYR3',
 'CTCF',
 'KMT2B',
 'ADGRG4',
 'USH2A',
 'DNAH14',
 'IGSF10',
 'KRAS',
 'DNAH10',
 'PIK3R1',
 'COL6A3',
 'CTNNB1',
 'FAT2',
 'PCLO',
 'SPTA1',
 'KMT2D',
 'HERC2',
 'ZFHX3',
 'RNF213',
 'OBSCN',
 'APOB',
 'FSIP2',
 'DNAH7',
 'ZDBF2',
 'ANK2',
 'DCHS2',
 'DNAH8',
 'EYS',
 'REV3L',
 'ABCA13',
 'PTPRZ1',
 'RP1',
 'CSMD3',
 'ANK3',
 'DYNC2H1',
 'SACS',
 'CCDC168',
 'AHNAK2',
 'WDR87',
 'DMD',
 'ATRX',
 'TENM1',
 'HMCN1',
 'DOCK3',
 'BRWD3',
 'MUC16',
 'HUWE1',
 'TENM3',
 'LYST',
 'RYR2',
 'LRP1B',
 'XIRP2',
 'SI',
 'MUC17',
 'FAT3',
 'SYNE2',
 'DNAH3',
 'INPPL1']

In [8]:
# Manual spot check of one gene from the frequently-mutated list: recompute
# its frequency by hand and compare it with a cutoff.
gene = 'ATRX'
total_tumor_patients = 97  # NOTE(review): hard-coded tumor count -- confirm it matches omics_mutations
cutoff = .1

# Rows of the somatic table that belong to the gene being checked.
gene_mutated = somatic.loc[somatic['Gene'] == gene]
num_gene_mutated = len(gene_mutated)
mutation_frequency = num_gene_mutated / total_tumor_patients

print(num_gene_mutated, ' / ', total_tumor_patients, ' ', mutation_frequency)
print('true' if mutation_frequency > cutoff else 'false')

25  /  97   0.25773195876288657
true
