In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import CPTAC.Endometrial as c

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
You have loaded the CPTAC Endometrial dataset. To view available
dataframes, use CPTAC.Endometrial.list_data(). To view available
functions for accessing and manipulating the dataframes, use
CPTAC.Endometrial.list_api().
Endometrial Data Version: 2.1

Loading Dictionary...
Loading CPTAC Endometrial data:
Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading definitions data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

 ******PLEASE READ******
CPTAC is a community resource project and da

In [52]:
def get_frequently_mutated(cancer_type, cutoff=.1):
    """Find the genes mutated in more than `cutoff` of a CPTAC cohort's tumors.

        The CPTAC dataset module for the requested cancer is imported on demand.
        For every gene in the somatic mutation table, the number of distinct
        samples carrying at least one mutation in that gene is divided by the
        number of tumor samples.

        Parameters:
        cancer_type (string): "endometrial", "colon" or "ovarian"
            (case-insensitive)
        cutoff (float): fraction (0-1) a gene must strictly exceed to be reported

        Returns:
        freq_mutated_df (pd.DataFrame): one row per gene passing the cutoff, with
            columns 'Gene' and 'Percent Mutated' (mutated samples / total tumors,
            expressed as a fraction). Empty, with the same two columns, when no
            gene passes.

        Raises:
        ValueError: if cancer_type is not one of the supported datasets, or the
            dataset reports no tumor samples."""

    # The dataset modules are imported lazily so that only the requested
    # dataset is loaded.
    dataset = str(cancer_type).strip().lower()
    if dataset == "endometrial":
        import CPTAC.Endometrial as CPTAC
    elif dataset == "colon":
        import CPTAC.Colon as CPTAC
    elif dataset == "ovarian":
        import CPTAC.Ovarian as CPTAC
    else:
        raise ValueError(
            "Please enter a valid cancer type "
            "(endometrial, colon or ovarian); got {!r}.".format(cancer_type))

    # get data frames
    somatic = CPTAC.get_mutations()
    proteomics = CPTAC.get_proteomics()

    # get total tumor patients: the joined frame is only used for its
    # Sample_Status column, so the gene chosen for the join is a placeholder.
    # NOTE(review): assumes PTEN is present in every dataset's proteomics - confirm.
    reference_gene = 'PTEN'
    omics_mutations = CPTAC.append_mutations_to_omics(
        mutation_genes=reference_gene,
        omics_df=proteomics,
        omics_genes=reference_gene)
    total_tumor_patients = int((omics_mutations['Sample_Status'] == 'Tumor').sum())
    if total_tumor_patients == 0:
        raise ValueError("No tumor samples found; cannot compute mutation frequencies.")

    # find frequently mutated: keep one row per (gene, sample) pair so a sample
    # with several mutations in the same gene is counted once, then count the
    # samples per gene in a single pass. sort=False keeps genes in order of
    # first appearance in the mutation table.
    gene_sample_pairs = pd.DataFrame(
        {'Gene': somatic['Gene'].values, 'Sample': somatic.index.values}
    ).drop_duplicates()
    samples_per_gene = gene_sample_pairs.groupby('Gene', sort=False).size()
    fraction_mutated = samples_per_gene / total_tumor_patients
    frequent = fraction_mutated[fraction_mutated > cutoff]

    # create dataframe (explicit columns keep the layout even when empty)
    freq_mutated_df = pd.DataFrame(
        {'Gene': frequent.index.values, 'Percent Mutated': frequent.values},
        columns=['Gene', 'Percent Mutated'])

    return freq_mutated_df

In [53]:
# Genes mutated in more than 25% of endometrial tumors.
dataf = get_frequently_mutated("endometrial", cutoff=.25)

In [54]:
# Display the table of frequently mutated genes held in `dataf`.
dataf

Unnamed: 0,Gene,Percent Mutated
0,ARID1A,0.452632
1,PIK3CA,0.494737
2,PTEN,0.789474
3,CTCF,0.284211
4,KRAS,0.326316
5,PIK3R1,0.389474
6,CTNNB1,0.305263


In [40]:
# Spot-check of the frequency calculation on two genes, run at cell level so
# the intermediate values can be inspected afterwards.
somatic = c.get_mutations()
prot = c.get_proteomics()

# The joined frame is read only for its Sample_Status column (tumor count).
gene = 'PTEN'
omics_mutations = c.append_mutations_to_omics(
    mutation_genes=gene,
    omics_df=prot,
    omics_genes=gene,
)
tumors = omics_mutations.loc[omics_mutations['Sample_Status'] == 'Tumor']
total_tumor_patients = len(tumors)

gene_mutated = somatic.loc[somatic['Gene'] == 'PTEN']

# Restricted to two genes here; the full list would be somatic['Gene'].unique().
unique_genes = ['PIK3CA','PTEN']
cutoff = .25
gene_and_freq_d = {}

for gene in unique_genes:
    # Index labels of every mutation row for this gene; de-duplicated below so
    # each label is counted once.
    gene_mutated = somatic[somatic['Gene'] == gene].index
    num_gene_mutated = len(gene_mutated.unique())
    print('num_gene_mutated:', num_gene_mutated)

    percentage = num_gene_mutated / total_tumor_patients
    if percentage > cutoff:
        gene_and_freq_d[gene] = percentage

num_gene_mutated: 47
num_gene_mutated: 75


In [41]:
# Print the `percentage` currently held for `gene` next to the contents of
# gene_and_freq_d, to check whether the stored value differs from it.
# NOTE(review): presumably both come from the loop over unique_genes - confirm
# the cell order before relying on this comparison.

print('percentage for',gene, ':',percentage)
gene_and_freq_d

percentage for PTEN : 0.7894736842105263


{'PIK3CA': 0.49473684210526314, 'PTEN': 0.7894736842105263}

In [35]:
# Manual check of the PIK3CA count and fraction.
# NOTE(review): relies on `somatic` and `total_tumor_patients` already being
# defined in the kernel; they are not assigned in this cell.
gene = 'PIK3CA'
gene_mutated = somatic[somatic['Gene'] == gene].index
num_gene_mutated = float(len(gene_mutated.unique()))
percentage = num_gene_mutated / total_tumor_patients

print('gene_mutated: ', num_gene_mutated, '/', total_tumor_patients, 'total_tumor_patients')
print('percent:',percentage)

# Distinct index labels that have at least one mutation row for this gene.
gene_mutated.unique()

gene_mutated:  47.0 / 95 total_tumor_patients
percent: 0.49473684210526314


Index(['S001', 'S003', 'S009', 'S010', 'S012', 'S014', 'S018', 'S021', 'S022',
       'S023', 'S024', 'S026', 'S027', 'S028', 'S030', 'S031', 'S032', 'S033',
       'S036', 'S038', 'S040', 'S053', 'S055', 'S060', 'S061', 'S063', 'S064',
       'S065', 'S066', 'S067', 'S068', 'S070', 'S074', 'S075', 'S077', 'S081',
       'S084', 'S086', 'S088', 'S090', 'S092', 'S095', 'S096', 'S097', 'S098',
       'S099', 'S103'],
      dtype='object', name='Sample_ID')