In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import CPTAC.Ovarian as c

In [3]:
def get_frequently_mutated(cancer_type, cutoff=.1):
    """Find the genes mutated in more than `cutoff` of a CPTAC cohort's tumor samples.

        Parameters:
        cancer_type (string): CPTAC dataset to load: "endometrial", "colon" or
            "ovarian" (matched case-insensitively)
        cutoff (float): fraction between 0 and 1; a gene is kept only when its
            mutation frequency is strictly greater than this value

        Returns:
        freq_mutated (pd.DataFrame): one row per frequently mutated gene, with the
            columns 'Gene' and 'Percent Mutated'. Despite the column name, the value
            is a fraction: (distinct samples with a mutation in the gene) /
            (total tumor samples). Genes are listed in the order they first appear
            in the somatic mutation table. When no gene passes the cutoff an empty
            frame with the same two columns is returned.

        Raises:
        ValueError: if cancer_type is not one of the supported datasets, if the
            somatic mutation table does not have 'Gene' as its first column, or
            if the dataset reports no tumor samples."""

    # Import only the requested dataset: judging by the loading messages printed
    # on first use, a CPTAC dataset module loads its data when it is imported.
    dataset_name = cancer_type.lower() if isinstance(cancer_type, str) else None
    if dataset_name == "endometrial":
        import CPTAC.Endometrial as dataset
    elif dataset_name == "colon":
        import CPTAC.Colon as dataset
    elif dataset_name == "ovarian":
        import CPTAC.Ovarian as dataset
    else:
        # Fail here with a clear message instead of printing and then crashing
        # later on an unbound module name.
        raise ValueError(
            'Please enter a valid cancer type ("endometrial", "colon" or "ovarian"); '
            'got {!r}.'.format(cancer_type))

    # get data frames
    somatic = dataset.get_mutations()
    proteomics = dataset.get_proteomics()

    # The counting below relies on the somatic table's index identifying the
    # sample and on a 'Gene' column. The original code only ran when 'Gene' was
    # the first column, so that layout check is kept, but now it fails loudly.
    # NOTE(review): presumably this guards against a differently shaped somatic
    # table in some dataset version -- confirm whether it is still needed.
    if len(somatic.columns) == 0 or somatic.columns[0] != 'Gene':
        raise ValueError(
            "Unexpected somatic mutation table layout: expected 'Gene' as the first column.")

    # get total tumor patients
    # append_mutations_to_omics is called only to obtain a per-sample table with a
    # 'Sample_Status' column; the gene passed in is an arbitrary reference gene.
    reference_gene = 'PTEN'
    omics_mutations = dataset.append_mutations_to_omics(
        mutation_genes=reference_gene, omics_df=proteomics, omics_genes=reference_gene)
    total_tumor_patients = int((omics_mutations['Sample_Status'] == 'Tumor').sum())
    if total_tumor_patients == 0:
        raise ValueError("No tumor samples found; cannot compute mutation frequencies.")

    # find frequently mutated
    # A sample with several mutations in the same gene counts once, so reduce the
    # table to distinct (gene, sample) pairs and count the pairs per gene in a
    # single pass. sort=False keeps genes in order of first appearance.
    gene_sample_pairs = pd.DataFrame({
        'Gene': somatic['Gene'].values,
        'Sample': somatic.index.values,
    }).drop_duplicates()
    samples_per_gene = gene_sample_pairs.groupby('Gene', sort=False).size()
    mutation_frequency = samples_per_gene / total_tumor_patients
    frequent = mutation_frequency[mutation_frequency > cutoff]

    # create dataframe (explicit columns so an empty result is still labelled)
    freq_mutated_df = pd.DataFrame(
        {'Gene': frequent.index.values, 'Percent Mutated': frequent.values},
        columns=['Gene', 'Percent Mutated'])

    return freq_mutated_df

Test Endometrial

In [9]:
endo_freq_mutated_df = get_frequently_mutated("endometrial", .25)

You have loaded the CPTAC Endometrial dataset. To view available
dataframes, use CPTAC.Endometrial.list_data(). To view available
functions for accessing and manipulating the dataframes, use
CPTAC.Endometrial.list_api().
Endometrial Data Version: 2.1

Loading Dictionary...
Loading CPTAC Endometrial data:
Loading acetylproteomics data...
Loading clinical data...
Loading CNA data...
Loading definitions data...
Loading miRNA data...
Loading phosphoproteomics_gene data...
Loading phosphoproteomics_site data...
Loading proteomics data...
Loading somatic data...
Loading somatic data...
Loading transcriptomics_circular data...
Loading transcriptomics_linear data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreem

In [10]:
endo_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated
0,ARID1A,0.452632
1,PIK3CA,0.494737
2,PTEN,0.789474
3,CTCF,0.284211
4,KRAS,0.326316
5,PIK3R1,0.389474
6,CTNNB1,0.305263


Test Colon

In [11]:
colon_freq_mutated_df = get_frequently_mutated("colon", .25)

In [12]:
colon_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated
0,APC,0.845361
1,TP53,0.57732
2,MUC16,0.402062
3,FAT3,0.28866
4,SYNE1,0.298969
5,TTN,0.618557
6,CSMD3,0.268041
7,KRAS,0.360825
8,OBSCN,0.298969
9,MUC5B,0.278351


Test Ovarian, cutoff = .1 (default)

In [13]:
ovarian_freq_mutated_df = get_frequently_mutated("ovarian")

In [14]:
ovarian_freq_mutated_df

Unnamed: 0,Gene,Percent Mutated
0,TTN,0.373494
1,TP53,0.927711
2,FMN2,0.108434
3,WDFY4,0.13253
4,MUC16,0.192771
5,MT-CO1,0.156627
6,KMT2D,0.120482
7,HERC1,0.108434
8,MT-ND5,0.192771
9,FCGBP,0.144578


Debug: hand-check the Ovarian mutation frequencies

In [17]:
# Debug: recompute the mutation frequency by hand for two Ovarian genes to
# cross-check get_frequently_mutated (TP53 should come out at ~0.9277).
somatic = c.get_mutations()
prot = c.get_proteomics()

# append_mutations_to_omics is called only to get a per-sample table with a
# 'Sample_Status' column, from which the tumor samples are counted.
gene = 'PTEN'
omics_mutations = c.append_mutations_to_omics(mutation_genes=gene, omics_df=prot, omics_genes=gene)
tumors = omics_mutations.loc[omics_mutations['Sample_Status'] == 'Tumor']
total_tumor_patients = len(tumors)
print('total_tumor_patients:', total_tumor_patients)

# Only the two genes of interest are checked, not every gene in the table.
unique_genes = ['TP53', 'PTEN']
gene_and_freq_d = {}
cutoff = .25
for gene in unique_genes:
    # The index of the matching rows identifies the samples; a sample with
    # several mutations in the gene is counted once.
    gene_mutated = somatic.loc[somatic['Gene'] == gene].index
    num_gene_mutated = len(gene_mutated.unique())
    print('num_gene_mutated', gene, ':', num_gene_mutated)

    percentage = (num_gene_mutated / total_tumor_patients)
    if percentage > cutoff:
        gene_and_freq_d[gene] = percentage

total_tumor_patients: 83
num_gene_mutated TP53 : 77
num_gene_mutated PTEN : 4


In [19]:
gene_and_freq_d

{'TP53': 0.927710843373494}

In [20]:
# Spot-check a single gene: list the distinct samples that carry a mutation in
# it and the fraction of tumor samples they represent.
# Relies on `somatic` and `total_tumor_patients` from the debug cell above.
gene = 'TP53'
gene_mutated = somatic[somatic['Gene'] == gene].index
# float() is kept so the count prints as e.g. 77.0, matching the recorded output.
num_gene_mutated = float(gene_mutated.unique().size)
print('gene_mutated: ', num_gene_mutated, '/', total_tumor_patients, 'total_tumor_patients')
percentage = num_gene_mutated / total_tumor_patients
print('percent:', percentage)
# Last expression: display the distinct sample IDs.
gene_mutated.unique()

gene_mutated:  77.0 / 83 total_tumor_patients
percent: 0.927710843373494


Index(['S002', 'S006', 'S007', 'S009', 'S011', 'S013', 'S014', 'S015', 'S020',
       'S021', 'S022', 'S024', 'S025', 'S026', 'S027', 'S029', 'S031', 'S033',
       'S035', 'S036', 'S037', 'S038', 'S039', 'S040', 'S041', 'S042', 'S043',
       'S044', 'S045', 'S047', 'S048', 'S049', 'S050', 'S051', 'S052', 'S053',
       'S054', 'S055', 'S057', 'S058', 'S059', 'S060', 'S061', 'S062', 'S063',
       'S064', 'S065', 'S066', 'S067', 'S068', 'S069', 'S073', 'S074', 'S075',
       'S076', 'S082', 'S083', 'S086', 'S087', 'S089', 'S090', 'S093', 'S095',
       'S096', 'S097', 'S098', 'S100', 'S103', 'S106', 'S107', 'S108', 'S109',
       'S111', 'S112', 'S113', 'S115', 'S116'],
      dtype='object', name='Sample_ID')