# get_freq_mutations test Endometrial

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import operator

import CPTAC.Endometrial as en

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


Function definition

In [25]:
def get_frequently_mutated(somatic_df, omics_mutations_df, cutoff=.1):  
     
    """take DataFrames of somatic mutations and omics_mutations to determine the percent of 
    mutated genes for all tumors. Frequently mutated genes are greater than the cutoff.
        
    Parameters:
    somatic_df (pandas.core.frame.DataFrame): Somatic mutations dataframe.
    omics_mutations_df (pandas.core.frame.DataFrame): merged dataframe of any gene and proteomics dataframe
    (used to find total_tumor_patients)
    cutoff (float): used as comparison to determine status of gene mutation frequency
        
    Returns:
    freq_mutated (pandas.core.frame.DataFrame): dataframe of frequently mutated genes passing the cutoff 
    as columns and their percent mutated over total tumors as the first row"""
    
    import operator
    unique_genes = somatic_df['Gene'].unique() # Get series of all mutated genes
    gene_and_freq_d = {}
    sorted_gene_and_freq_d = {}
    
    # Get total tumor patients
    tumors = omics_mutations_df.loc[omics_mutations_df['Sample_Status'] == 'Tumor']
    total_tumor_patients = len(tumors)
    
    # Find sample (col or index) in gene_mutated (samples represent patients)
    # gene_mutated: Endometrial and Colon samples found in column 0, Ovarian found in index
    gene = 'PTEN'
    gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene]
    ovarian = False
    if gene_mutated.columns[0] == 'Gene':
        ovarian = True
    
    # Find percentage of gene mutation and add frequently mutated genes to dictionary
    if ovarian == True:
        for gene in unique_genes:
            gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene].index
            num_gene_mutated = len(gene_mutated.unique())
            percentage = (num_gene_mutated / total_tumor_patients)
            if percentage > cutoff:
                gene_and_freq_d[gene] = percentage
    
    else:
        for gene in unique_genes:
            gene_mutated = somatic_df.loc[somatic_df['Gene'] == gene].iloc[:, 0]
            gene_mutated.drop_duplicates(keep='first',inplace=True)
            num_gene_mutated = len(gene_mutated)
            percentage = (num_gene_mutated / total_tumor_patients)
            if percentage > cutoff:
                gene_and_freq_d[gene] = percentage

    # Sort dictionary descending order based on percent mutated
    sorted_d = sorted(gene_and_freq_d.items(), key=operator.itemgetter(1), reverse=True)  
    for i in range(0,len(sorted_d)):
        certain_tuple = sorted_d[i]
        gene, percent_mutated = certain_tuple
        sorted_gene_and_freq_d[gene] = percent_mutated
        
    # Make dataframe of frequently mutated genes and percent mutated as index[0]
    freq_mutated_df = pd.DataFrame(sorted_gene_and_freq_d, index=['percent_mutated'])
                   
    return freq_mutated_df

Get data

In [19]:
somatic = en.get_mutations()
clinical = en.get_clinical()
prot = en.get_proteomics()
gene = 'PTEN'
omics_mutations = en.append_mutations_to_omics(mutation_genes=gene, omics_df=prot, omics_genes=gene)
gene_mutated = somatic[somatic['Gene'] == 'PTEN']

# Test:  cutoff = .1

In [28]:
df = get_frequently_mutated(somatic, omics_mutations, .1)

In [29]:
df

Unnamed: 0,PTEN,PIK3CA,ARID1A,PIK3R1,KRAS,CTNNB1,CTCF,KMT2B,TP53,ZFHX3,...,ASCC3,MGAM,CUBN,WDFY4,KCNH5,SYNE2,MAP1A,PKD1,PIEZO2,HELZ2
percent_mutated,0.789474,0.494737,0.452632,0.389474,0.326316,0.305263,0.284211,0.242105,0.221053,0.221053,...,0.105263,0.105263,0.105263,0.105263,0.105263,0.105263,0.105263,0.105263,0.105263,0.105263


Test: check genes in freq_mutated list

In [6]:
unique_genes = somatic['Gene'].unique() # Get series of all mutated genes
gene_and_freq_d = {}
total_tumor_patients = 97
cutoff = .1

for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene].iloc[:, 0]
    gene_mutated.drop_duplicates(keep='first',inplace=True)
    num_gene_mutated = len(gene_mutated)
    percentage = (num_gene_mutated / total_tumor_patients)
    if percentage > cutoff:
        gene_and_freq_d[gene] = percentage
        

In [13]:
freq_mutated_df = pd.DataFrame(gene_and_freq_d, index=['percent_mutated'])
freq_mutated_df['ARID1A']

percent_mutated    0.443299
Name: ARID1A, dtype: float64

In [6]:
gene = 'KRAS'
total_tumor_patients = 95
cutoff = 0.25
gene_mutated = somatic.loc[somatic['Gene'] == gene]
patients_gene_mutated = len(gene_mutated['Patient_Id'].unique()) #series of patients with mutated gene

print(patients_gene_mutated, ' / ', total_tumor_patients, ' ', (patients_gene_mutated / total_tumor_patients))
if (patients_gene_mutated / total_tumor_patients) > cutoff: #>24
    print('true')
else:
    print('false')

31  /  95.0   0.3263157894736842
true


In [8]:
gene_mutated = somatic.loc[somatic['Gene'] == 'KRAS'].iloc[:, 0] #.loc gets rows pten, .iloc gets col 0 (sample)
print(gene_mutated.head())
print('length gene_mutated (all mutations): ', len(gene_mutated))

gene_mutated.drop_duplicates(keep='first',inplace=True) #subset= colname when df
print('num gene mutated count once per patient:', len(gene_mutated))

1004    S002
1005    S002
1161    S003
1695    S010
3584    S014
Name: Clinical_Patient_Key, dtype: object
length gene_mutated (all mutations):  33
num gene mutated count once per patient: 31


Testing ideas

In [12]:
#check to see if mulitple mutations in one gene

patients = gene_mutated['Patient_Id']#series of ids
print('num of patients: ',len(patients))

#check each patient counted once #fixme check omics_mutations? 
print(len(gene_mutated['Patient_Id'].value_counts())) #gives num rows for each patient, descending order


new = patients[patients == 'C3L-00586'] #gets index for certain patient
print(new)


num of patients:  27
23
14133    C3L-00586
Name: Patient_Id, dtype: object


In [97]:
# sort list
#make dictionary
mydict = {}
total_tumor_patients = 95
cutoff = .1
freq_mutated = []
unique_genes = somatic['Gene'].unique()

for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene]
    num_gene_mutated = len(gene_mutated['Patient_Id'].unique())     
    mydict[gene] = num_gene_mutated    
        

sorted_d = sorted(mydict.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:8]

[('PTEN', 75),
 ('PIK3CA', 47),
 ('ARID1A', 43),
 ('PIK3R1', 37),
 ('KRAS', 31),
 ('CTNNB1', 29),
 ('CTCF', 27),
 ('KMT2B', 23)]

In [None]:
if (num_gene_mutated / total_tumor_patients) > cutoff:
        freq_mutated.append(gene)

In [25]:
unique_genes = somatic['Gene'].unique()
print('unique_genes: ', len(unique_genes))
not_in_proteomics = []
in_proteomics = []

unique_genes:  14814


In [15]:
#challenges with append_mutations_to_omics, can't merge with somatic (prot vs gene names)
#prepare list for .append_mutations_to_omics
for gene in unique_genes:
    if gene in proteomics.columns:
        in_proteomics.append(gene)
    else:
        not_in_proteomics.append(gene)

In [20]:
len(in_proteomics) #8265
len(not_in_proteomics) #6549

6549

In [14]:
# test for loop to count genes
total_tumor_patients = 95
cutoff = .1
freq_mutated = []
unique_genes = somatic['Gene'].unique()
for gene in unique_genes:
    gene_mutated = somatic.loc[somatic['Gene'] == gene]
    num_gene_mutated = len(gene_mutated)
    if (num_gene_mutated / total_tumor_patients) > cutoff:
            freq_mutated.append(gene)