In [100]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import CPTAC.Endometrial as c

In [92]:
def get_frequently_mutated(cancer_type, cutoff=.1):
    """Find the genes mutated in more than `cutoff` of a CPTAC cohort's tumors.

    The CPTAC dataset module matching `cancer_type` is imported, every gene in
    its somatic mutation table is counted once per sample, and the genes whose
    mutated fraction exceeds `cutoff` are returned.

    Parameters:
    cancer_type (string): "endometrial", "colon" or "ovarian"
        (case-insensitive, surrounding whitespace ignored)
    cutoff (float): fraction of tumor samples a gene must exceed to be kept

    Returns:
    freq_mutated_df (pd.DataFrame): one row per gene passing the cutoff;
        column 0 holds the gene name and column 1 the fraction mutated
        (samples with a mutation in the gene / total tumor samples). The
        frame is empty when no gene passes, or when the first column of the
        mutation table is not 'Gene'.

    Raises:
    ValueError: if `cancer_type` is not a supported dataset, or if the
        dataset reports no tumor samples (the fraction would be undefined).
    """
    # The dataset module is chosen at call time, hence the function-level imports.
    dataset = str(cancer_type).strip().lower()
    if dataset == "endometrial":
        import CPTAC.Endometrial as CPTAC
    elif dataset == "colon":
        import CPTAC.Colon as CPTAC
    elif dataset == "ovarian":
        import CPTAC.Ovarian as CPTAC
    else:
        # Fail here rather than printing and then crashing on the unbound module.
        raise ValueError("Please enter a valid cancer type.")

    # get data frames
    somatic = CPTAC.get_mutations()
    proteomics = CPTAC.get_proteomics()

    # get total tumor patients
    # The merged omics/mutation table is only needed for its 'Sample_Status'
    # column; PTEN is just the gene used to build that table.
    # NOTE(review): assumes PTEN is present in every supported dataset - confirm.
    reference_gene = 'PTEN'
    omics_mutations = CPTAC.append_mutations_to_omics(
        mutation_genes=reference_gene,
        omics_df=proteomics,
        omics_genes=reference_gene,
    )
    tumors = omics_mutations.loc[omics_mutations['Sample_Status'] == 'Tumor']
    total_tumor_patients = len(tumors)
    if total_tumor_patients == 0:
        raise ValueError("No tumor samples found; cannot compute mutation frequencies.")

    # find frequently mutated
    gene_and_freq_d = {}
    # Samples are counted through the index, so this only applies when 'Gene'
    # is the first column of the mutation table.
    # NOTE(review): other layouts yield an empty result - confirm that is intended.
    if len(somatic.columns) > 0 and somatic.columns[0] == 'Gene':
        # One grouping pass instead of re-filtering the whole table per gene;
        # sort=False keeps genes in order of first appearance.
        for gene, gene_rows in somatic.groupby('Gene', sort=False):
            num_gene_mutated = len(gene_rows.index.unique())
            percentage = num_gene_mutated / total_tumor_patients
            if percentage > cutoff:
                gene_and_freq_d[gene] = percentage

    # create dataframe (default integer column labels 0 and 1 are kept so
    # existing callers that index by position keep working)
    freq_mutated_df = pd.DataFrame(list(gene_and_freq_d.items()))
    return freq_mutated_df

In [97]:
# Frequently mutated genes in the endometrial cohort, using a 25% cutoff.
dataf = get_frequently_mutated(cancer_type="endometrial", cutoff=0.25)

{'ARID1A': 0.5, 'PIK3CA': 0.5465116279069767, 'PTEN': 0.872093023255814, 'CTCF': 0.313953488372093, 'KMT2B': 0.26744186046511625, 'KRAS': 0.36046511627906974, 'PIK3R1': 0.43023255813953487, 'CTNNB1': 0.3372093023255814}


In [98]:
# Display the result table: column 0 is the gene, column 1 the mutated fraction.
dataf

Unnamed: 0,0,1
0,ARID1A,0.5
1,PIK3CA,0.546512
2,PTEN,0.872093
3,CTCF,0.313953
4,KMT2B,0.267442
5,KRAS,0.360465
6,PIK3R1,0.430233
7,CTNNB1,0.337209


In [75]:
# Small gene -> mutated-fraction mapping used to try out the DataFrame conversion.
d = {
    'TTN': 0.30097087378640774,
    'TP53': 0.7475728155339806,
    'MUC4': 0.3300970873786408,
}

In [91]:
# Build a two-column table from the (gene, fraction) pairs; with no column
# names given, the columns are labelled 0 and 1.
df = pd.DataFrame(data=d.items())
df

Unnamed: 0,0,1
0,TTN,0.300971
1,TP53,0.747573
2,MUC4,0.330097


In [16]:
# Find frequently mutated genes (cell-level draft of get_frequently_mutated).
# Load the mutation table in this cell so it does not depend on a `somatic`
# variable left over from an earlier kernel session.
somatic = c.get_mutations()
unique_genes = somatic['Gene'].unique()
gene_and_freq_d = {}
# NOTE(review): hard-coded cohort size; the tumor count computed further down
# in this notebook displays 95 - confirm which value is intended.
total_tumor_patients = 97
cutoff = .25
for gene in unique_genes:
    # Count each sample once per gene. Sample IDs are the index of the
    # mutation table, so de-duplicate the index rather than the first column.
    gene_mutated = somatic.loc[somatic['Gene'] == gene].index.unique()
    num_gene_mutated = len(gene_mutated)
    percentage = (num_gene_mutated / total_tumor_patients)
    if percentage > cutoff:
        gene_and_freq_d[gene] = percentage

In [116]:
# Build the combined proteomics/mutation table for PTEN and pull PTEN's rows
# out of the somatic mutation table.
gene = 'PTEN'
proteomics = c.get_proteomics()
omics_mutations = c.append_mutations_to_omics(
    omics_df=proteomics,
    mutation_genes=gene,
    omics_genes=gene,
)
gene_mutated = somatic[somatic['Gene'].eq(gene)]

In [117]:
# Keep only the rows whose Sample_Status is 'Tumor' and count them.
tumors = omics_mutations[omics_mutations['Sample_Status'].eq('Tumor')]
total_tumor_patients = tumors.shape[0]
total_tumor_patients

95

In [121]:
# Fraction of tumor samples with at least one ARID1A mutation: each sample ID
# in the mutation table's index is counted once.
gene = 'ARID1A'
gene_mutated = somatic[somatic['Gene'].eq(gene)].index
num_gene_mutated = float(gene_mutated.unique().size)
print('gene_mutated: ', num_gene_mutated, '/', total_tumor_patients, 'total_tumor_patients')
percentage = num_gene_mutated / total_tumor_patients
print('percent:', percentage)
# Show the distinct sample IDs that carry the mutation.
gene_mutated.unique()

gene_mutated:  23.0 / 95 total_tumor_patients
percent: 0.24210526315789474


Index(['S003', 'S005', 'S010', 'S014', 'S015', 'S016', 'S018', 'S023', 'S026',
       'S029', 'S033', 'S035', 'S038', 'S045', 'S055', 'S060', 'S062', 'S068',
       'S080', 'S082', 'S091', 'S101', 'S108'],
      dtype='object', name='Sample_ID')

In [134]:
# Rebuild the combined proteomics/mutation table for ARID1A, then try to select
# rows using the 'ARID1A_Mutation' column.
# NOTE(review): this cell currently fails with
# "TypeError: unhashable type: 'list'" on the .loc line. Presumably the
# 'ARID1A_Mutation' column holds a list per sample, which .loc cannot use as
# row labels or as a boolean mask - confirm the column contents and build an
# explicit boolean mask for the rows that should be kept.
gene = 'ARID1A'
proteomics = c.get_proteomics()
omics_mutations = c.append_mutations_to_omics(mutation_genes=gene, omics_df=proteomics, omics_genes=gene)
ari = omics_mutations.loc[omics_mutations['ARID1A_Mutation']]
ari

TypeError: unhashable type: 'list'