# Create Figure 1 for AKT

This notebook creates a histogram that shows the different PTEN mutations found in 9 cancers.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as pu

# Step 1: Create data frames with Mutation Types

For each cancer type, create a data frame that has the mutation type for each sample.

First, load in the cancer data sets from cptac.

In [2]:
en = cptac.Endometrial()
hn = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
c = cptac.Ccrcc()
col = cptac.Colon()
g = cptac.Gbm()
b = cptac.Brca()

Checking that luad index is up-to-date...       



Checking that lscc index is up-to-date...



Checking that ovarian index is up-to-date...



Checking that brca index is up-to-date...   



                                         

Second, call get_geneotype_all_vars for PTEN for each cancer type. This returns a df with columns for Mutation (type of mutation), Location (location of the mutation), and Mutation_Status (wildtype or mutation count).

In [3]:
gene = "AKT"

In [4]:
m = ls.get_somatic_mutation()
m.loc[m['Gene'] == 'MCM7']

Name,Gene,Mutation,Location
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [5]:
cnv = l.get_CNV()
cnv['MCM7']

Patient_ID
C3L-00001   -0.0446
C3L-00009    0.0728
C3L-00080    0.1830
C3L-00083    0.0002
C3L-00093    0.2076
              ...  
C3N-02729    0.2709
X11LU013     0.1610
X11LU016     0.2907
X11LU022     0.1332
X11LU035     0.4829
Name: MCM7, Length: 109, dtype: float64

In [6]:
#gbm = g.get_genotype_all_vars(gene) # no somatic mutations, but there is cnv
#ld = l.get_genotype_all_vars(gene)
#lscc = ls.get_genotype_all_vars(gene)
#cc = c.get_genotype_all_vars(gene)

In [7]:
endo = en.get_genotype_all_vars(gene) 
hnscc = hn.get_genotype_all_vars(gene)
ovar = o.get_genotype_all_vars(gene)
colon = col.get_genotype_all_vars(gene)
brca = b.get_genotype_all_vars(gene)

KeyError: 'AKT'

# Step 2: Match Mutation Labels

The Colon dataset labeled their mutations slightly differently. Nonsynonymous snv in this case is a missense mutation. Nonframeshift insertion was changed to match In_Frame_Ins. Frameshift deletion was canged to match Frame_Shift_Del.

In [None]:
colon["Mutation"] = colon['Mutation'].replace(['nonsynonymous SNV'], 'Missense_Mutation')

colon["Mutation"] = colon['Mutation'].replace(['nonframeshift insertion'], 'In_Frame_Ins')

colon["Mutation"] = colon['Mutation'].replace(['frameshift deletion'], 'Frame_Shift_Del')

The get_genotype_all_var function created the No_Mutation label when no somatic mutations were found for PTEN in the Luad dataset. This is the same as Wildtype_Tumor. 

In [None]:
#ld["Mutation"] = ld['Mutation'].replace(['No_Mutation'], 'Wildtype_Tumor')

In [None]:
def plot_mutations(dflist = None, names_of_df=None):
    number_of_df = len(dflist)
    
    allLabels = []
    for df in dflist:
        #get the labels for each and make a combined label that they'll all use
        mutation = df["Mutation"]
        labels = set(mutation)
        allLabels.append(labels)

    
    flat_list = [item for sublist in allLabels for item in sublist]
    all_labels = list(set(flat_list))
    all_labels.sort()
    allLabels = all_labels
    
#     For each df, add na to their labels if it doesn't exist in all_labels
    labels_for_each_df = []
    frequencies_for_each_df = []
    for df in dflist:
        mutation = df["Mutation"].tolist()
        mutationlist = list(set(mutation))
        mutationlist.sort()
        ordered_mut_list = []
        match = True
        mutPosition = 0
        
        for position in range(len(all_labels)):
            try:
                
                if mutationlist[mutPosition] == all_labels[position]:
                    ordered_mut_list.append(mutationlist[mutPosition])
                    mutPosition += 1

                else:
                    ordered_mut_list.append("na")

            except IndexError:  
                ordered_mut_list.append("na")
       
        
        labels_for_each_df.append(ordered_mut_list)

        #get the freq of each mutation type
        freq = []
        for mutation_type in ordered_mut_list:
            freq.append(mutation.count(mutation_type))

        PercentFreq = [x*100 / sum(freq) for x in freq]
        frequencies_for_each_df.append(PercentFreq)
        
        
#     import pdb; pdb.set_trace()
    #Now plot it using arrays
    width = 0.1
    x = np.arange(len(allLabels))
    a4_dims = (13, 10) #dimensions for bigger plot
    fig, ax = plt.subplots(figsize=a4_dims)
    for position in range(0, number_of_df):
        r = ax.bar(x+(width*position), frequencies_for_each_df[position], 
                   width,label=names_of_df[position], alpha=.5, linewidth=0)

    ax.set_ylabel('Percent Sample')
    ax.set_title('Mutation Frequency and Effect')
    ax.set_xticks(x)
    ax.set_xticklabels(allLabels)
    ax.legend()
    
    fig.tight_layout()
    plt.setp(ax.get_xticklabels(),rotation='vertical')
    plt.show()
    
    #save fig
    #fig.savefig("Step_1_PTEN.png", bbox_inches="tight")
    

# Step 3: Create Figure

Create a list of the mutation data frames. Create a list of cancer names for the figure legend. Call the plot_mutations function.

In [None]:
dfs = [endo, hnscc, ovar, colon,brca]
names = ['Endo','Hnscc', 'Ovar', 'Colon', 'Brca']
plot_mutations(dfs, names)