# get_freq_mutations test Endometrial

In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import operator

import CPTAC.Endometrial as en

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


Function definition

In [14]:
def get_frequently_mutated(somatic_df, omics_mutations_df, cutoff=.1, show_percentage=False):  
    import operator
    unique_genes = somatic_df['Gene'].unique() # get series of all mutated genes
    freq_mutated = []
    gene_and_freq_d = {}
    percentage = 0.0
    
    # get total tumor patients
    tumors = omics_mutations_df.loc[omics_mutations_df['Sample_Status'] == 'Tumor']
    total_tumor_patients = len(tumors)
    
    #find sample (col or index) in gene_mutated (samples represent patients)
    #gene_mutated: Endometrial and Colon samples found in column 1, Ovarian found in index
    gene = 'PTEN'
    gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene]
    ovarian = False
    sample = 0
    if gene_mutated.columns[0] == 'Gene':
        sample = gene_mutated.index
        ovarian = True
        
    #find frequently mutated
    unique_genes = somatic['Gene'].unique()
    for gene in unique_genes:
        gene_mutated = somatic.loc[somatic['Gene'] == gene]
        num_gene_mutated = len(gene_mutated['Patient_Id'].unique())     
        percentage = (num_gene_mutated / total_tumor_patients)
        if percentage > cutoff:
            gene_and_freq_d[gene] = percentage

    sorted_d = sorted(gene_and_freq_d.items(), key=operator.itemgetter(1), reverse=True)  
    
    
    for i in range(0,len(sorted_d)):
        certain_tuple = sorted_d[i]
        gene, percent_mutated = certain_tuple
        if show_percentage == True:
            string_gene_percent = gene + ': %' + str('%.2f'%percent_mutated)
            freq_mutated.append(string_gene_percent)
        else:
            freq_mutated.append(gene)
    
  
    
             
    return freq_mutated

Get data

In [8]:
somatic = en.get_mutations()
clinical = en.get_clinical()
prot = en.get_proteomics()
gene = 'PTEN'
omics_mutations = en.append_mutations_to_omics(mutation_genes=gene, omics_df=prot, omics_genes=gene)
omics_mutations.head()
gene_mutated = somatic[somatic['Gene'] == 'PTEN']
gene_mutated.head()
omics_mutations.head()

Unnamed: 0,PTEN_proteomics,PTEN_Mutation,PTEN_Location,Sample_Status
S001,-0.526,"[Missense_Mutation, Nonsense_Mutation]","[p.R130Q, p.R233*]",Tumor
S002,-0.83,[Missense_Mutation],[p.G127R],Tumor
S003,-0.941,[Nonsense_Mutation],[p.W111*],Tumor
S005,0.73,[Missense_Mutation],[p.R130G],Tumor
S006,-0.379,[Wildtype_Tumor],[No_mutation],Tumor


In [32]:
#get column of sample = patient
gene = 'PTEN'
gene_mutated = somatic.loc[somatic['Gene'] == gene]
col_of_sample = 0
if gene_mutated.columns[0] == 'Gene':
    col_of_sample = gene_mutated.index
col_of_sample

0

Test cutoff = .25

In [15]:
list1 = get_frequently_mutated(somatic, omics_mutations, .25, show_percentage=True)

In [16]:
print(len(list1))
list1

7


['PTEN: %0.79',
 'PIK3CA: %0.49',
 'ARID1A: %0.45',
 'PIK3R1: %0.39',
 'KRAS: %0.33',
 'CTNNB1: %0.31',
 'CTCF: %0.28']

test: check genes in freq_mutated list

In [16]:
gene = 'PIK3CA'
total_tumor_patients = 95.0
cutoff = 0.25
gene_mutated = somatic.loc[somatic['Gene'] == gene]
patients_gene_mutated = len(gene_mutated['Patient_Id'].unique()) #series of patients with mutated gene

print(patients_gene_mutated, ' / ', total_tumor_patients, ' ', (patients_gene_mutated / total_tumor_patients))
if (num_gene_mutated / total_tumor_patients) > cutoff: #>24
    print('true')
else:
    print('false')
gene_mutated.head()

47  /  95.0   0.49473684210526314
true


Unnamed: 0,Clinical_Patient_Key,Patient_Id,Gene,Mutation,Location
160,S001,C3L-00006,PIK3CA,Missense_Mutation,p.E545K
1147,S003,C3L-00032,PIK3CA,Missense_Mutation,p.E545K
1357,S009,C3L-00139,PIK3CA,Missense_Mutation,p.Q546P
1505,S010,C3L-00143,PIK3CA,Missense_Mutation,p.R115L
2191,S012,C3L-00156,PIK3CA,Missense_Mutation,p.L628R


In [20]:
#check to see if mulitple mutations in one gene

patients = gene_mutated['Patient_Id']#series of ids
print('num of patients: ',len(patients))

#check each patient counted once #fixme check omics_mutations? 
print(len(gene_mutated['Patient_Id'].value_counts())) #gives num rows for each patient, descending order


new = patients[patients == 'C3L-00586'] #gets index for certain patient
print(new)

gene_mutated


num of patients:  62
47
7089    C3L-00586
7090    C3L-00586
7091    C3L-00586
7092    C3L-00586
Name: Patient_Id, dtype: object


Unnamed: 0,Clinical_Patient_Key,Patient_Id,Gene,Mutation,Location
160,S001,C3L-00006,PIK3CA,Missense_Mutation,p.E545K
1147,S003,C3L-00032,PIK3CA,Missense_Mutation,p.E545K
1357,S009,C3L-00139,PIK3CA,Missense_Mutation,p.Q546P
1505,S010,C3L-00143,PIK3CA,Missense_Mutation,p.R115L
2191,S012,C3L-00156,PIK3CA,Missense_Mutation,p.L628R
2192,S012,C3L-00156,PIK3CA,Missense_Mutation,p.T1025A
3315,S014,C3L-00161,PIK3CA,Missense_Mutation,p.R38C
3316,S014,C3L-00161,PIK3CA,Missense_Mutation,p.V344M
4521,S018,C3L-00362,PIK3CA,Missense_Mutation,p.H1047R
4629,S021,C3L-00563,PIK3CA,Missense_Mutation,p.G118D


In [97]:
# sort list
#make dictionary
mydict = {}
total_tumor_patients = 95
cutoff = .1
freq_mutated = []
unique_genes = somatic['Gene'].unique()

for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene]
    num_gene_mutated = len(gene_mutated['Patient_Id'].unique())     
    mydict[gene] = num_gene_mutated    
        

sorted_d = sorted(mydict.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:8]

[('PTEN', 75),
 ('PIK3CA', 47),
 ('ARID1A', 43),
 ('PIK3R1', 37),
 ('KRAS', 31),
 ('CTNNB1', 29),
 ('CTCF', 27),
 ('KMT2B', 23)]

In [None]:
if (num_gene_mutated / total_tumor_patients) > cutoff:
        freq_mutated.append(gene)

In [25]:
unique_genes = somatic['Gene'].unique()
print('unique_genes: ', len(unique_genes))
not_in_proteomics = []
in_proteomics = []

unique_genes:  14814


In [15]:
#challenges with append_mutations_to_omics, can't merge with somatic (prot vs gene names)
#prepare list for .append_mutations_to_omics
for gene in unique_genes:
    if gene in proteomics.columns:
        in_proteomics.append(gene)
    else:
        not_in_proteomics.append(gene)

In [20]:
len(in_proteomics) #8265
len(not_in_proteomics) #6549

6549

In [93]:
# test for loop to count genes
total_tumor_patients = 95
cutoff = .1
freq_mutated = []
unique_genes = somatic['Gene'].unique()
for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene]
    num_gene_mutated = len(gene_mutated)
    if (num_gene_mutated / total_tumor_patients) > cutoff:
            freq_mutated.append(gene)

In [46]:
freq_mutated = freq_mutated.sort(reverse = True)
len(freq_mutated)

814