# MMS19 Pancancer Boxplot - PTEN trans effect

This notebook creates a boxplot for the trans effect of PTEN CNV deletions on MMS19 proteomics in 8 cancers. These 8 cancers were chosen because they have enough samples with PTEN CNV deletions to do a t-test. Only Ccrcc was left out, because it does not have enough samples with deletions. 

Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import sys
import plot_utils as p



Functions:

format_df prepares a data frame with the proteomics of the trans gene (here MMS19) and the PTEN mutation types from get_genotype_all_vars.

format_pval_annotation is used to add significance marks to the boxplot. Stars represent a significant p-value and "ns" represents a nonsignificant p-value.

In [2]:
# Returns a dataframe with proteomics and mutation type

def format_df(cancer_object, trans_gene, gene_in = 'PTEN'):
    """Return tumor samples with trans-gene proteomics and a binary mutation label.

    Parameters
    ----------
    cancer_object : cptac dataset object
        Must provide get_genotype_all_vars, get_cancer_type and the join
        helpers used below.
    trans_gene : str
        Gene whose proteomics column(s) are kept (e.g. 'MMS19').
    gene_in : str, default 'PTEN'
        Gene whose mutation / CNV status defines the two groups.

    Returns
    -------
    pandas.DataFrame
        Proteomics column(s) for `trans_gene` plus a 'Mutation' column that
        contains only 'Wildtype_Tumor' or 'Deletion'.
    """
    mut_type = cancer_object.get_genotype_all_vars(gene_in)

    # Exact comparison. The previous `not in ('luad')` was a substring test:
    # ('luad') is just the string 'luad', not a one-element tuple.
    if cancer_object.get_cancer_type() != 'luad':
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = trans_gene)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

        # Reduce a multiindex
        if isinstance(prot_and_mutations.columns, pd.MultiIndex):
            prot_and_mutations = u.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']]
        # Keep only proteomics: drops the last four columns, presumably the
        # <gene>_Mutation, <gene>_Location, <gene>_Mutation_Status and
        # Sample_Status columns appended by the join -- TODO confirm.
        prot_df = prot_and_mutations.iloc[:, :-4]
        merged = prot_df.join(mut_type)

        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor', 'Deletion']
        del_wt = merged[merged['Mutation'].isin(compare)]

    # Luad has no somatic mutations for PTEN which changes some things
    else:
        # get_genotype_all_vars adds cnv data under a column named after the gene
        mut_type = mut_type.drop(columns = gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name = 'proteomics', genes1 = gene_in,
            genes2 = trans_gene)
        omics = u.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        # Follow gene_in instead of a hard-coded 'PTEN_CNV'
        omics = omics.drop(columns = gene_in + '_CNV')
        # Get only tumor samples (local name no longer shadows the plot_utils alias `p`)
        tumor_proteomics = cancer_object.get_proteomics(tissue_type = 'tumor')
        omics = omics[omics.index.isin(list(tumor_proteomics.index))]
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type)
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation', 'Deletion']
        # .copy() so the relabelling below writes to an independent frame
        # rather than to a slice of `merged` (avoids SettingWithCopyWarning).
        del_wt = merged[merged['Mutation'].isin(compare)].copy()
        del_wt['Mutation'] = np.where(
            del_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    return del_wt


In [3]:
def format_pval_annotation(pval_symbol, x1, x2, line_start = .05, line_height=.05):
    """Draw a significance bracket on the current matplotlib axes.

    The bracket rises from `line_start` at `x1`, runs across at
    `line_start + line_height`, and drops back to `line_start` at `x2`.
    `pval_symbol` is written centred between `x1` and `x2`, sitting on the
    top of the bracket. The positional arguments are meant to be tuned by
    hand for each figure.
    """
    bracket_top = line_start + line_height

    # Bracket outline: up at x1, across, down at x2
    bracket_x = [x1, x1, x2, x2]
    bracket_y = [line_start, bracket_top, bracket_top, line_start]
    plt.plot(bracket_x, bracket_y, lw=1.5, color= '.3')

    # Label at the midpoint of the bracket
    label_x = (x1+x2)*.5
    plt.text(label_x, bracket_top, pval_symbol,
             horizontalalignment='center', verticalalignment='bottom', color = "black")


# Step 1: Get corrected p-values  

In [7]:
# Table of corrected p-values for all proteins, produced by the Make_Tables
# notebooks. Forward slashes resolve on Windows, macOS and Linux alike; the
# previous backslash path only worked on Windows.
df_fdr_pvals = pd.read_csv("../Make_Tables/csv/all_proteins.csv")

In [8]:
# Isolate the MMS19 row of the p-value table
is_mms19 = df_fdr_pvals['Proteomics'] == 'MMS19'
df = df_fdr_pvals.loc[is_mms19]

# Gene label plus every second column starting at position 1
# (the *_P_Value columns in the displayed result)
gene = df.loc[:, ['Proteomics']]
pvals = df.iloc[:, 1::2]
pval_df = gene.join(pvals)
pval_df

Unnamed: 0,Proteomics,Gbm_P_Value,Hnscc_P_Value,Luad_P_Value,Lscc_P_Value,Brca_P_Value,Ov_P_Value,Endo_P_Value,Colon_P_Value
66,MMS19,0.000629,0.14501,0.027361,0.366903,0.403055,0.00011,0.519462,0.624087


# Step 2: Create data frames with PTEN cnv deletion and Proteomics

Each cancer needs a data frame containing only tumor samples that have either a PTEN CNV deletion or wildtype PTEN, together with the proteomics of the trans gene (MMS19). Use format_df to create the specific data frame for each cancer.  

First, load in cancer data sets from cptac. 

In [9]:
# Instantiate one cptac data set object per cancer. The short names below are
# the handles passed to format_df in the per-cancer cells that follow.
# Each line is an independent statement, so if one data set fails to load the
# ones above it are still assigned.
en = cptac.Endometrial()  # Endometrial (handled with truncation mutations below)
h = cptac.Hnscc()         # Hnscc (plotted in its own figure)
l = cptac.Luad()          # Luad (format_df uses its CNV-based branch)
ls = cptac.Lscc()         # Lscc
o = cptac.Ovarian()       # Ovarian
col = cptac.Colon()       # Colon
g = cptac.Gbm()           # Gbm
b = cptac.Brca()          # Brca

Loading hnscc v2.0........                      

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

# Gbm

In [None]:
# t_gene: protein measured for the trans effect; gene: gene whose status defines the groups
t_gene, gene = 'MMS19', 'PTEN'
g_del_wt = format_df(cancer_object = g, trans_gene = t_gene)
print(g_del_wt.head())

# Luad

In [None]:
# Luad: wildtype vs. deletion frame for the trans gene
l_del_wt = format_df(cancer_object = l, trans_gene = t_gene)

# Lscc

In [None]:
# Lscc: wildtype vs. deletion frame for the trans gene
ls_del_wt = format_df(cancer_object = ls, trans_gene = t_gene)

# Ovarian

In [None]:
# Ovarian: wildtype vs. deletion frame for the trans gene
o_del_wt = format_df(cancer_object = o, trans_gene = t_gene)

# Brca

In [None]:
# Brca: wildtype vs. deletion frame for the trans gene
b_del_wt = format_df(cancer_object = b, trans_gene = t_gene)

# Colon

In [None]:
# Colon: wildtype vs. deletion frame for the trans gene
c_del_wt = format_df(cancer_object = col, trans_gene = t_gene)

#  Hnscc

In [None]:
# Hnscc: wildtype vs. deletion frame for the trans gene
h_del_wt = format_df(cancer_object = h, trans_gene = t_gene)

# Endometrial

The Endometrial data set does not have enough CNV deletions to perform a t-test; however, it does have enough truncation-type mutations (nonsense and frameshift). Different code is needed to create the data frame for Endometrial.

In [None]:
# Step 1 - Create proteomics and truncations dataframe
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = t_gene, 
    mutations_filter = ['Nonsense_Mutation','Frame_Shift_Del','Frame_Shift_Ins']) 
keep = ['Nonsense_Mutation','Frame_Shift_Del','Frame_Shift_Ins','Wildtype_Tumor']
in_keep = prot_and_mutations['PTEN_Mutation'].isin(keep)
trunc_mutations = prot_and_mutations[in_keep]
print(trunc_mutations['PTEN_Mutation'].value_counts())

# Step 2 - Create binary column 
trunc_mutations['Mutation'] = np.where(
            trunc_mutations[gene+'_Mutation_Status'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')

# Step 3 - Format the dataframe correctly for the t-test(just omics and binary columns for tumors)
tumors = trunc_mutations[trunc_mutations.Sample_Status == 'Tumor'] #drop Normal samples
columns_to_drop = [gene+"_Mutation", gene+"_Location", gene+"_Mutation_Status", "Sample_Status"]
e_trunc_wt = tumors.drop(columns_to_drop, axis = 1)
e_trunc_wt = e_trunc_wt.dropna(axis=1,how='all')
e_trunc_wt['Mutation'].value_counts()

# Step 2: Create a long dataframe for the boxplot

The boxplot will take three columns: Proteomics, Mutation, and Cancer. We need to append all the individual cancer dfs into one long pancancer df. 

First create the Cancer column.

In [None]:
def _label_cancer(frame, cancer_name):
    """Return `frame` with an added constant 'cancer' column (via DataFrame.assign)."""
    return frame.assign(cancer = cancer_name)


g_del_wt = _label_cancer(g_del_wt, 'Gbm')
l_del_wt = _label_cancer(l_del_wt, 'Luad')
ls_del_wt = _label_cancer(ls_del_wt, 'Lscc')
b_del_wt = _label_cancer(b_del_wt, 'Brca')
o_del_wt = _label_cancer(o_del_wt, 'Ovarian')
c_del_wt = _label_cancer(c_del_wt, 'Colon')
h_del_wt = _label_cancer(h_del_wt, 'Hnscc') # higher scale
e_trunc_wt = _label_cancer(e_trunc_wt, 'Endometrial')

Next append the dfs.

In [None]:
# Stack the per-cancer frames into one long frame for the boxplot.
# pd.concat replaces the chained DataFrame.append calls: append is deprecated
# and was removed in pandas 2.0. The row order matches the original chain.
# Hnscc (h_del_wt) is left out because of high proteomics numbers.
pancancer_frames = [g_del_wt, l_del_wt, ls_del_wt, b_del_wt, o_del_wt, c_del_wt, e_trunc_wt]
df6 = pd.concat(pancancer_frames)

df6.cancer.unique()

# Step 3: Create the Pancancer Boxplot

In [None]:
gene = 'PTEN'
plt.rcParams['figure.figsize']=(15,10) #size of plot
sns.set(font_scale = 2)

# Boxes per cancer, split by mutation group; outliers are hidden because the
# strip plot below already draws every sample.
boxplot = sns.boxplot(x='cancer', y=t_gene+"_proteomics", data = df6, hue = 'Mutation',
                      hue_order = ["Wildtype_Tumor", "Deletion",'Truncation'], showfliers = False)    
boxplot.set_title('Pancancer trans effect of PTEN CNV Deletions on '+t_gene)
boxplot = sns.stripplot(x='cancer', y=t_gene+"_proteomics", data = df6, jitter = True, 
                           color = ".3", hue = 'Mutation', hue_order = ["Wildtype_Tumor", "Deletion", 'Truncation'],dodge = True)
boxplot.set(xlabel = "\n"+gene+" Wildtype/CNV Deletion", ylabel = t_gene+' Proteomics')

# format legend: keep only the first three entries so the strip plot's
# duplicate hue entries are not listed again
handles, labels = boxplot.get_legend_handles_labels()
plt.legend(handles[0:3], labels[0:3])

# Keys must match the '<key>_P_Value' column prefixes of pval_df. The
# Endometrial column is 'Endo_P_Value', so the key is 'Endo' (the previous
# 'En' raised a KeyError).
cancer_list = ['Gbm','Hnscc','Luad','Lscc','Brca','Ov','Endo','Colon']
# create pval annotations
symbols = {}
for cancer in cancer_list:
    # pval_df holds a single row; .iloc[0] extracts the scalar explicitly
    # (float() on a one-element Series is deprecated in recent pandas).
    pval = float(pval_df[cancer+'_P_Value'].iloc[0])
    if pval  <= 0.001:
        symbols[cancer] = '***'
    elif pval  <= 0.01:
        symbols[cancer] = '**'  
    elif pval <= 0.05:
        symbols[cancer] = '*'
    else:
        symbols[cancer] = 'ns'

# Bracket positions (x1, x2, height) are hand-tuned for this figure
format_pval_annotation(symbols['Gbm'], -.3, 0, 0.7) # Gbm
format_pval_annotation(symbols['Luad'], .7, 1, 1.8) # Luad
format_pval_annotation(symbols['Lscc'], 1.7, 2, 1.9) # Lscc
format_pval_annotation(symbols['Brca'], 2.7, 3, 1.5) # Brca
format_pval_annotation(symbols['Ov'], 3.7, 4, 1.1) # Ovarian
format_pval_annotation(symbols['Colon'], 4.7, 5, 1)  # Colon
format_pval_annotation(symbols['Endo'], 5.7, 6.2, 1) # Endometrial

boxplot.figure.savefig("PTEN_Supplemental_MMS19_boxplot.png", dpi = 300)

plt.show()
plt.clf()
plt.close()

In [None]:
# Show the p-values that the significance symbols on the figure are derived from
pval_df

In [None]:
# Figure for Hnscc (higher scale)
gene = 'PTEN'
plt.rcParams['figure.figsize']=(8,5) #size of plot
sns.set(font_scale = 1.2)

boxplot = sns.boxplot(x='cancer', y=t_gene+"_proteomics", data = h_del_wt, hue = 'Mutation',
                      hue_order = ["Wildtype_Tumor", "Deletion"], showfliers = False)    
boxplot.set_title('trans effect of PTEN CNV Deletions on '+t_gene+' in Hnscc')
boxplot = sns.stripplot(x='cancer', y=t_gene+"_proteomics", data = h_del_wt, jitter = True, 
                           color = ".3", hue = 'Mutation', hue_order = ["Wildtype_Tumor", "Deletion"],dodge = True)
boxplot.set(xlabel = "\n"+gene+" Wildtype/CNV Deletion", ylabel = t_gene+' Proteomics')

# format legend
handles, labels = boxplot.get_legend_handles_labels()
plt.legend(handles[0:2], labels[0:2])

format_pval_annotation(symbols['Hnscc'], -.2, .2, 27) 

plt.show()
plt.clf()
plt.close()