In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u


In [47]:
def _fraction_of_tumors_mutated(mutations, column_name, total_tumor_count):
    """Per gene, the fraction of tumors that have at least one row in `mutations`.

    Parameters:
        mutations (pandas.DataFrame): One row per mutation call, with at least
            'Gene' and 'Patient_ID' columns.
        column_name (str): Name given to the single column of the result.
        total_tumor_count (int): Denominator of the fraction.

    Returns:
        pandas.DataFrame: Indexed by Gene, with one column named `column_name`.
    """
    # Select Patient_ID explicitly instead of counting every column and then
    # dropping the unwanted ones: whether the grouping column appears in the
    # nunique() result differs between pandas versions, so the drop was fragile.
    unique_patients = mutations.groupby('Gene')[['Patient_ID']].nunique()
    unique_patients = unique_patients.rename(columns={'Patient_ID': column_name})
    return unique_patients / total_tumor_count


def get_frequently_mutated(cancer_object, cutoff = 0.1):
    """Find genes mutated in more than `cutoff` of the tumors of a dataset.

    Silent mutations are ignored. The remaining mutation names are grouped
    into missense-like ('M') and truncation-like ('T') classes; for the 'gbm'
    cancer type non-coding names are grouped into a third class ('NC').

    Parameters:
        cancer_object: Dataset object providing join_omics_to_mutations(),
            get_somatic_mutation() and get_cancer_type(). The somatic mutation
            table is expected to be indexed by Patient_ID and to have 'Gene'
            and 'Mutation' columns (the original code relied on the same layout).
        cutoff (float): A gene is kept when the fraction of tumors carrying a
            non-silent mutation in it is strictly greater than this value.

    Returns:
        pandas.DataFrame: One row per retained gene with the columns 'Gene',
        'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut' and, for gbm
        only, 'Non-Coding'. All values are fractions of the tumor count; a
        class with no mutation in a gene is reported as 0.

    Raises:
        KeyError: If no sample has Sample_Status 'Tumor'.
    """
    # Get total tumors/patients
    omics_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = 'TP53', omics_df_name = 'proteomics', omics_genes = 'TP53')
    tumors = omics_and_mutations.Sample_Status
    if isinstance(tumors, pd.DataFrame):
        # A column multiindex on the proteomics table makes the selection a
        # DataFrame instead of a Series; any one of its columns carries the status.
        tumors = tumors.iloc[:, 0]
    total_tumor_count = int(tumors.value_counts()['Tumor'])

    # Drop silent mutations under either naming style ('synonymous SNV' belongs
    # to the same vocabulary as the colon names below).
    somatic_mutations = cancer_object.get_somatic_mutation()
    silent_names = ['Silent', 'synonymous SNV']
    is_silent = somatic_mutations['Mutation'].isin(silent_names)
    origin_df = somatic_mutations.loc[~is_silent].reset_index() # Patient_ID becomes a column

    # Create two categories in Mutation column - 'M': Missense, 'T': Truncation
    cancer_type = cancer_object.get_cancer_type()
    # `in ('colon')` was a substring test on a plain string; compare for equality.
    if cancer_type == 'colon':
        mutation_groups = {'frameshift substitution': 'T',
            'frameshift deletion': 'T', 'frameshift insertion': 'T',
            'stopgain': 'T', 'stoploss': 'T', 'nonsynonymous SNV': 'M',
            'nonframeshift insertion': 'M','nonframeshift deletion': 'M',
            'nonframeshift substitution': 'M'}
    else:
        mutation_groups = {'In_Frame_Del': 'M', 'In_Frame_Ins': 'M',
            'Missense_Mutation': 'M', 'Frame_Shift_Del': 'T','Nonsense_Mutation': 'T',
            'Splice_Site': 'T', 'Frame_Shift_Ins': 'T','Nonstop_Mutation':'T'}

    class_columns = [('M', 'Missense_Mut'), ('T', 'Truncation_Mut')]

    # Gbm additionally reports non-coding mutations as their own class
    is_gbm = cancer_type == 'gbm'
    if is_gbm:
        mutation_groups.update({'Intron': 'NC', 'RNA': 'NC', "5'Flank": 'NC',
            "3'Flank": 'NC', "5'UTR": 'NC', "3'UTR": 'NC', 'Splice_Region' : 'NC'})
        class_columns.append(('NC', 'Non-Coding'))

    # Classify the Mutation column only. A frame-wide replace would also rewrite
    # a Gene or Location value that happens to equal a mutation name.
    mutation_class = origin_df['Mutation'].map(lambda name: mutation_groups.get(name, name))
    classified_df = origin_df.assign(Mutation = mutation_class)

    # Check that all mutation names are categorized. Comparing against the
    # expected labels (rather than counting distinct values) avoids a false
    # warning when a class is simply absent and catches an unknown name even
    # when the number of distinct values happens to look right.
    expected_labels = {label for label, _ in class_columns}
    if set(mutation_class.dropna().unique()) - expected_labels:
        print('Warning: New mutation name not classified. Counts will be affected.')

    # Find frequently mutated genes (total fraction > cutoff)
    fraction_mutated = _fraction_of_tumors_mutated(
        origin_df, 'Unique_Samples_Mut', total_tumor_count)
    freq_mutated_df = fraction_mutated.loc[fraction_mutated['Unique_Samples_Mut'] > cutoff]

    # Join one fraction column per mutation class; genes without a mutation of
    # that class get 0.
    for label, column_name in class_columns:
        class_rows = classified_df.loc[mutation_class == label]
        class_fraction = _fraction_of_tumors_mutated(
            class_rows, column_name, total_tumor_count)
        freq_mutated_df = freq_mutated_df.join(class_fraction, how='left').fillna(0)

    return freq_mutated_df.reset_index() #move genes to their own column


In [3]:
# Instantiate one cptac dataset object per cancer type. Other cells refer to
# these objects by the short names assigned here, so the names must stay as is.
# NOTE(review): the single-letter names (especially `c` and `l`) are easy to
# clobber by accident, e.g. with a loop variable of the same name.
e = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
c = cptac.Ccrcc()
col = cptac.Colon()
g = cptac.Gbm()
b = cptac.Brca()

Checking that lscc index is up-to-date...       



Checking that brca index is up-to-date...   



                                         

In [46]:
# Collect the mutation names each dataset uses in its somatic mutation table.
cancers = [g,h,l,ls,o,e,c,col,b]

# One list of unique mutation names per dataset, in the order of `cancers`.
# The iteration name is deliberately not `c`: a `for c in cancers` loop rebinds
# the global `c` (the Ccrcc dataset) to the last element of the list, which
# silently changes what `c` means for later cells and for re-runs of this cell.
a = [list(dataset.get_somatic_mutation().Mutation.unique()) for dataset in cancers]

flat_list = [item for sublist in a for item in sublist] #change list of lists, to just one list
m = set(flat_list) # remove duplicates

# Only the last expression of a cell is displayed, so show the per-dataset lists.
a

[['Missense_Mutation',
  'Silent',
  'In_Frame_Del',
  'Splice_Site',
  'Nonsense_Mutation',
  'Intron',
  'Frame_Shift_Del',
  'RNA',
  'Frame_Shift_Ins',
  "3'Flank",
  'In_Frame_Ins',
  'Splice_Region',
  "5'UTR",
  "5'Flank",
  "3'UTR",
  'Nonstop_Mutation'],
 ['Frame_Shift_Del',
  'Missense_Mutation',
  'Silent',
  'Splice_Site',
  'Nonsense_Mutation',
  'Frame_Shift_Ins',
  'In_Frame_Del',
  'In_Frame_Ins',
  'Nonstop_Mutation'],
 ['Missense_Mutation',
  'Nonsense_Mutation',
  'Silent',
  'Frame_Shift_Del',
  'In_Frame_Del',
  'Frame_Shift_Ins',
  'Splice_Site',
  'Nonstop_Mutation',
  'In_Frame_Ins'],
 ['Missense_Mutation',
  'Silent',
  'Nonsense_Mutation',
  'Splice_Site',
  'Frame_Shift_Del',
  'Frame_Shift_Ins',
  'In_Frame_Ins',
  'Nonstop_Mutation',
  'In_Frame_Del'],
 ['Missense_Mutation',
  'In_Frame_Ins',
  'Silent',
  'Nonsense_Mutation',
  'In_Frame_Del',
  'Splice_Site',
  'Frame_Shift_Del',
  'Frame_Shift_Ins',
  'Nonstop_Mutation'],
 ['Missense_Mutation',
  'Frame_

In [4]:
# Run the packaged implementation from cptac.utils (imported as `u`) on the
# Hnscc dataset `h`. This is not the notebook-local get_frequently_mutated
# defined in an earlier cell.
u.get_frequently_mutated(h)





Name,Gene,Unique_Samples_Mut,Missense_Mut,Truncation_Mut
0,ABCA13,0.12844,0.0,0.0
1,ADGRV1,0.110092,0.0,0.0
2,AHNAK,0.100917,0.0,0.0
3,AJUBA,0.110092,0.0,0.0
4,APOB,0.110092,0.0,0.0
5,ASPM,0.100917,0.0,0.0
6,ATP10A,0.119266,0.0,0.0
7,CCDC168,0.100917,0.0,0.0
8,CDKN2A,0.266055,0.0,0.0
9,COL22A1,0.100917,0.0,0.0
