# Pancancer Boxplot for PTEN trans effect on MCM2 phosphoproteomics

This notebook creates a boxplot for the trans effect of PTEN cnv deletions in 8 cancers. These 8 cancers were chosen because they have enough samples with PTEN cnv deletions to do a t-test. Only Ccrcc does not have enough samples with deletions. 

Imports

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import sys
import plot_utils as ut

Functions:

format_df prepares a data frame with PTEN proteomics and PTEN mutation types from get_genotype_all_vars.

format_pval_annotation is used to add marks to the boxplot. A star represents significant p-value and "ns" represents a nonsignificant p-value.

In [69]:
# Returns a dataframe with proteomics and mutation type

def format_df(cancer_object, trans_gene, gene_in = 'PTEN', drop_level = 3):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'phosphoproteomics', omics_genes = trans_gene)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
        
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = cancer_object.reduce_multiindex(prot_and_mutations,
                                                                 levels_to_drop=[drop_level], flatten=True)

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    else: 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='phosphoproteomics',genes1= gene_in, 
            genes2=trans_gene)
        omics = cancer_object.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = omics.drop(columns='PTEN_CNV__')
        # Get only tumor samples
        p = l.get_proteomics(tissue_type='tumor')
        tumor_ids = list(p.index)
        get = omics.index.isin(tumor_ids)
        omics = omics[get]
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
        del_wt['Mutation'] = np.where(
            del_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    return del_wt


In [34]:
prot_and_mutations = l.join_omics_to_mutations(
    mutations_genes = ['PTEN'], omics_df_name = 'phosphoproteomics', omics_genes = 'MCM2')
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

# Reduce a multiindex 
if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
    prot_and_mutations = ls.reduce_multiindex(prot_and_mutations, levels_to_drop=[3], flatten=True)
prot_and_mutations



Name,MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDEDVEELTAsQR,MCM2_phosphoproteomics_S139_GLLYDsDEEDEERPAR,MCM2_phosphoproteomics_S170_ATEDGEEDEEMIEsIENLEDLK,MCM2_phosphoproteomics_S26S27_RGNDPLTssPGR,MCM2_phosphoproteomics_S27_GNDPLTSsPGR,MCM2_phosphoproteomics_S381_IQEsPGK,MCM2_phosphoproteomics_S40S41_TDALTssPGR,MCM2_phosphoproteomics_S41M77_RTDALTSsPGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGmER,MCM2_phosphoproteomics_T158_QVERAtEDGEEDEEMIESIENLEDLK,MCM2_phosphoproteomics_T25_RRGNDPLtSSPGR,PTEN_Mutation__,PTEN_Location__,PTEN_Mutation_Status__,Sample_Status__
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C3L-00081,,1.0437,,-0.0374,0.5615,,0.3830,0.3563,,1.9446,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00415,1.4818,0.7056,1.1065,,,-1.2280,0.9578,0.7101,-0.3497,0.3243,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00445,0.0843,0.5319,-0.0160,0.5257,1.3807,1.1300,0.6222,1.7907,,0.3802,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00568,-0.1885,0.6663,0.4402,,,1.1167,-0.1636,1.4081,1.3506,0.8407,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00603,0.0742,1.4027,2.1734,2.3722,1.4256,1.1982,1.1453,0.7978,,0.8536,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03886,1.8236,2.6035,2.4655,1.9592,,3.3977,2.6647,2.9886,1.8224,3.0354,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-04124,-0.4913,-0.4429,-0.4319,1.7152,,-1.1387,-0.4164,-1.0705,-1.2665,-0.3636,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-04127,0.1444,-0.6074,0.4294,-1.2400,,-0.3851,-0.2032,-1.5512,,-0.4215,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-04155,0.8447,1.8117,1.3487,,,1.3865,0.2179,1.3739,2.0212,1.2022,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor


In [50]:
omics = l.join_omics_to_omics(df1_name = 'CNV', df2_name='phosphoproteomics',genes1= 'PTEN', 
            genes2='MCM2')
omics = l.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
omics = omics.drop(columns='PTEN_CNV__')
omics



Name,MCM2_phosphoproteomics_AIPELDAYEAEGLALDDEDVEELTAsQR_NP_004517.2,MCM2_phosphoproteomics_GLLYDsDEEDEERPAR_NP_004517.2,MCM2_phosphoproteomics_RGNDPLTSsPGR_NP_004517.2,MCM2_phosphoproteomics_IQEsPGK_NP_004517.2,MCM2_phosphoproteomics_TDALTssPGR_NP_004517.2,MCM2_phosphoproteomics_RTDALtSSPGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGmER_NP_004517.2
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C3L-00001,-0.3442,-2.1796,-1.0381,-2.8939,,-0.9930
C3L-00009,-1.3712,-0.0050,-0.3403,0.2616,,-0.6235
C3L-00080,0.7855,2.4530,2.5073,4.6866,,2.4019
C3L-00083,-0.2696,-1.4357,0.0017,,,-0.2083
C3L-00093,-0.4590,0.8262,-0.2096,-1.7823,-0.1550,-0.8178
...,...,...,...,...,...,...
C3N-02582.N,0.5485,-1.2192,-0.6149,,-0.1586,0.2833
C3N-02586.N,-0.6541,-3.0871,-1.9174,,-1.3776,-1.1223
C3N-02587.N,-1.4323,-1.7396,-2.3258,,,0.2958
C3N-02588.N,0.6357,-1.7150,-1.7763,,,-1.4599


In [36]:
def format_pval_annotation(pval_symbol, x1, x2, line_start = .05, line_height=.05):
    # for manual adjustment to pval annotations
    
    y, h = line_start, line_height
    plt.plot([x1, x1, x2, x2], #draw horizontal line
             [y, y+h, y+h, y], #vertical line
             lw=1.5, color= '.3')
    plt.text((x1+x2)*.5, # half between x coord
             y+h, pval_symbol, horizontalalignment='center', verticalalignment='bottom', color = "black")


# Step 1: Create data frames with PTEN cnv deletion and Proteomics

Each cancer needs a data frame containing only samples that have PTEN cnv deletions or PTEN wildtype tumors and PTEN proteomics.

First, load in cancer data sets from cptac. Note: Keep the variable names the same because they are used in format_df to reduce multiindexs found in certain cancer data sets. 

In [29]:
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
g = cptac.Gbm()
b = cptac.Brca()

Checking that luad index is up-to-date...       



Checking that lscc index is up-to-date...



Checking that ovarian index is up-to-date...



Checking that brca index is up-to-date...   



                                         

Next, use format_df to create the specific data frame for each cancer. Use wrap_ttest to test PTEN deletion vs. wildtype. 

# Gbm

In [43]:
gene = 'PTEN'
t_gene = 'MCM2'
g_del_wt = format_df(g, t_gene)
g_del_wt.head()



Name,MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDEDVEELTAS*QR,MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDEDVEELTAS*QREAAER,MCM2_phosphoproteomics_S139_GLLYDS*DEEDEERPAR,MCM2_phosphoproteomics_S139_GLLYDS*DEEDEERPARK,MCM2_phosphoproteomics_S139_RGLLYDS*DEEDEERPAR,MCM2_phosphoproteomics_S139_RGLLYDS*DEEDEERPARK,MCM2_phosphoproteomics_S26S27_GNDPLTS*S*PGR,MCM2_phosphoproteomics_S27_GNDPLTSS*PGR,MCM2_phosphoproteomics_S27_RGNDPLTSS*PGR,MCM2_phosphoproteomics_S27S31_GNDPLTSS*PGRS*SR,MCM2_phosphoproteomics_S27S31_RGNDPLTSS*PGRS*SR,MCM2_phosphoproteomics_S40S41_RTDALTS*S*PGR,MCM2_phosphoproteomics_S40S41_RTDALTS*S*PGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGMER,MCM2_phosphoproteomics_S41_RTDALTSS*PGR,MCM2_phosphoproteomics_S41_TDALTSS*PGR,MCM2_phosphoproteomics_S41_TDALTSS*PGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGMER,MCM2_phosphoproteomics_T39_TDALT*SSPGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGMER,MCM2_phosphoproteomics_T59_RTDALTSSPGRDLPPFEDESEGLLGT*EGPLEEEEDGEELIGDGMER,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C3L-00104,0.250213,0.469123,0.553357,1.007214,0.116577,,,0.526739,,,,,,,0.846266,,,0.743703,Deletion
C3L-00365,0.680272,,0.592142,1.128073,,,,,0.940525,,,0.48957,,1.212528,,,,0.843078,Deletion
C3L-00674,-1.100628,1.074279,-0.417157,-0.105052,,,,-0.412701,-0.455602,,,,,-0.108264,-0.367349,,,1.049988,Deletion
C3L-00677,,0.539857,,0.118192,1.192966,,,-0.194149,-0.680688,,,,,,,,,-0.80254,Deletion
C3L-01040,-1.149843,-1.26967,-1.141344,-0.707192,-0.493904,,,-0.524123,,,,,,,-1.339159,,,-1.55689,Deletion


In [38]:
cols = list(g_del_wt.columns[:-1])
cols

['MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDEDVEELTAS*QR',
 'MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDEDVEELTAS*QREAAER',
 'MCM2_phosphoproteomics_S139_GLLYDS*DEEDEERPAR',
 'MCM2_phosphoproteomics_S139_GLLYDS*DEEDEERPARK',
 'MCM2_phosphoproteomics_S139_RGLLYDS*DEEDEERPAR',
 'MCM2_phosphoproteomics_S139_RGLLYDS*DEEDEERPARK',
 'MCM2_phosphoproteomics_S26S27_GNDPLTS*S*PGR',
 'MCM2_phosphoproteomics_S27_GNDPLTSS*PGR',
 'MCM2_phosphoproteomics_S27_RGNDPLTSS*PGR',
 'MCM2_phosphoproteomics_S27S31_GNDPLTSS*PGRS*SR',
 'MCM2_phosphoproteomics_S27S31_RGNDPLTSS*PGRS*SR',
 'MCM2_phosphoproteomics_S40S41_RTDALTS*S*PGR',
 'MCM2_phosphoproteomics_S40S41_RTDALTS*S*PGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGMER',
 'MCM2_phosphoproteomics_S41_RTDALTSS*PGR',
 'MCM2_phosphoproteomics_S41_TDALTSS*PGR',
 'MCM2_phosphoproteomics_S41_TDALTSS*PGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGMER',
 'MCM2_phosphoproteomics_T39_TDALT*SSPGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGMER',
 'MCM2_phosphoproteomics_T59_RTDALTSSPGRDLPPFEDESEGLL

In [39]:
g_pval = u.wrap_ttest(g_del_wt, 'Mutation', cols, return_all = True)
gp = g_pval['P_Value'][0]
g_pval

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Comparison,P_Value
0,MCM2_phosphoproteomics_S139_GLLYDS*DEEDEERPARK,0.000192
1,MCM2_phosphoproteomics_S139_RGLLYDS*DEEDEERPAR,0.001449
2,MCM2_phosphoproteomics_S139_GLLYDS*DEEDEERPAR,0.006748
3,MCM2_phosphoproteomics_S27_GNDPLTSS*PGR,0.007865
4,MCM2_phosphoproteomics_S41_RTDALTSS*PGR,0.034238
5,MCM2_phosphoproteomics_T59_RTDALTSSPGRDLPPFEDE...,0.043905
6,MCM2_phosphoproteomics_S26S27_GNDPLTS*S*PGR,0.04689
7,MCM2_phosphoproteomics_S27S31_RGNDPLTSS*PGRS*SR,0.062649
8,MCM2_phosphoproteomics_T39_TDALT*SSPGRDLPPFEDE...,0.207886
9,MCM2_phosphoproteomics_S40S41_RTDALTS*S*PGRDLP...,0.255586


# Luad

In [52]:
l_del_wt = format_df(l, t_gene)

# t-test
cols = list(l_del_wt.columns[:-1])
l_pval = u.wrap_ttest(l_del_wt, 'Mutation', cols, return_all = True)
lp = l_pval['P_Value'][0]
l_pval

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Comparison,P_Value
0,MCM2_phosphoproteomics_RTDALtSSPGRDLPPFEDESEGL...,0.232525
1,MCM2_phosphoproteomics_TDALTssPGR_NP_004517.2,0.340057
2,MCM2_phosphoproteomics_GLLYDsDEEDEERPAR_NP_004...,0.348227
3,MCM2_phosphoproteomics_RGNDPLTSsPGR_NP_004517.2,0.364915
4,MCM2_phosphoproteomics_IQEsPGK_NP_004517.2,0.67716
5,MCM2_phosphoproteomics_AIPELDAYEAEGLALDDEDVEEL...,0.728188


# Lscc

In [53]:
ls_del_wt = format_df(ls, t_gene)

# t-test
cols = list(ls_del_wt.columns[:-1])
ls_pval = u.wrap_ttest(ls_del_wt, 'Mutation', cols, return_all = True)
lsp = ls_pval['P_Value'][0]
ls_pval



Unnamed: 0,Comparison,P_Value
0,MCM2_phosphoproteomics_T25_RRGNDPLtSSPGR,0.000268
1,MCM2_phosphoproteomics_S41M77_RTDALTSsPGRDLPPF...,0.000956
2,MCM2_phosphoproteomics_S139_GLLYDsDEEDEERPAR,0.001413
3,MCM2_phosphoproteomics_S381_IQEsPGK,0.030974
4,MCM2_phosphoproteomics_S40S41_TDALTssPGR,0.081259
5,MCM2_phosphoproteomics_S170_ATEDGEEDEEMIEsIENL...,0.210675
6,MCM2_phosphoproteomics_S27_GNDPLTSsPGR,0.292314
7,MCM2_phosphoproteomics_T158_QVERAtEDGEEDEEMIES...,0.34454
8,MCM2_phosphoproteomics_S26S27_RGNDPLTssPGR,0.46278
9,MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDE...,0.906169


# Ovarian

In [54]:
o_del_wt = format_df(o, t_gene)

# t-test
cols = list(o_del_wt.columns[:-1])
o_pval = u.wrap_ttest(o_del_wt, 'Mutation', cols, return_all = True)
op = o_pval['P_Value'][0]
o_pval

  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Comparison,P_Value
0,MCM2_phosphoproteomics_S139_G.LLYDS*DEEDEERPAR.K,0.093656
1,MCM2_phosphoproteomics_S139_R.GLLYDS*DEEDEERPAR.K,0.106059
2,MCM2_phosphoproteomics_S139_R.GLLYDS*DEEDEER.P,0.156596
3,MCM2_phosphoproteomics_S27_R.GNDPLTSS*PGR.S,0.373581
4,MCM2_phosphoproteomics_S41_R.RTDALTSS*PGR.D,0.473149
5,MCM2_phosphoproteomics_S139_L.YDS*DEEDEERPAR.K,0.581648
6,MCM2_phosphoproteomics_S139_R.GLLYDS*DEEDEERPA.R,0.61675
7,MCM2_phosphoproteomics_S41_R.TDALTSS*PGR.D,0.637227
8,MCM2_phosphoproteomics_S27_R.RGNDPLTSS*PGR.S,0.663227
9,MCM2_phosphoproteomics_S13_F.TMASS*PAQR.R,


# Brca

In [55]:
b_del_wt = format_df(b, t_gene)

# t-test
cols = list(b_del_wt.columns[:-1])
b_pval = u.wrap_ttest(b_del_wt, 'Mutation', cols, return_all = True)
bp = b_pval['P_Value'][0]
b_pval



Unnamed: 0,Comparison,P_Value
0,MCM2_phosphoproteomics_S381_IQEsPGK,0.227976
1,MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDE...,0.376895
2,MCM2_phosphoproteomics_S26S27_RGNDPLTssPGR,0.382376
3,MCM2_phosphoproteomics_S40S41_RTDALTssPGR,0.58141
4,MCM2_phosphoproteomics_S41_RTDALTSsPGRDLPPFEDE...,0.728327
5,MCM2_phosphoproteomics_T158_AtEDGEEDEEMIESIENL...,0.761973
6,MCM2_phosphoproteomics_S139_GLLYDsDEEDEERPAR,0.773645
7,MCM2_phosphoproteomics_S170_ATEDGEEDEEMIEsIENL...,0.808375
8,MCM2_phosphoproteomics_S27_RGNDPLTSsPGR,0.830809
9,MCM2_phosphoproteomics_S41_TDALTSsPGRDLPPFEDES...,0.849992


# Colon

In [58]:
c_del_wt = format_df(col, t_gene, drop_level = 2)

# t-test
cols = list(c_del_wt.columns[:-1])
c_pval = u.wrap_ttest(c_del_wt, 'Mutation', cols, return_all = True)
cp = c_pval['P_Value'][0]
c_pval

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Comparison,P_Value
0,MCM2_phosphoproteomics_S53,0.345397
1,MCM2_phosphoproteomics_S27,0.656717
2,MCM2_phosphoproteomics_S139,0.87957
3,MCM2_phosphoproteomics_S41,0.98534
4,MCM2_phosphoproteomics_S108,
5,MCM2_phosphoproteomics_T59,


#  Hnscc

In [68]:
h_del_wt = format_df(h, t_gene, drop_level=3)

# t-test
cols = list(h_del_wt.columns[:-1])
h_pval = u.wrap_ttest(h_del_wt, 'Mutation', cols, return_all = True)
hp = h_pval['P_Value'][0]
h_pval

  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Comparison,P_Value
0,MCM2_phosphoproteomics_S41_RTDALTSsPGR,0.031083
1,MCM2_phosphoproteomics_T39_RTDALtSSPGRDLPPFEDE...,0.0337
2,MCM2_phosphoproteomics_S381_IQEsPGK,0.049361
3,MCM2_phosphoproteomics_S27_RRGNDPLTSsPGR,0.055931
4,MCM2_phosphoproteomics_S170_ATEDGEEDEEMIEsIENL...,0.119922
5,MCM2_phosphoproteomics_S139_RGLLYDsDEEDEERPARK,0.189839
6,MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDE...,0.203755
7,MCM2_phosphoproteomics_S27S31_RGNDPLTSsPGRsSR,0.216745
8,MCM2_phosphoproteomics_S700_EEEGLANGsAAEPAMPNT...,0.244018
9,MCM2_phosphoproteomics_S53_DLPPFEDEsEGLLGTEGPL...,0.281392


# Endometrial

The Endometrial data set does not have enough cnv deletions to perform a t-test, however the data set does have enough truncation type mutations (nonsense and frame shifts). Different code is needed to create the data frame for Endometrial.

In [86]:
# Step 1 - Create proteomics and truncations dataframe
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'phosphoproteomics', omics_genes = t_gene, 
    mutations_filter = ['Nonsense_Mutation','Frame_Shift_Del','Frame_Shift_Ins']) 
prot_and_mutations = en.reduce_multiindex(prot_and_mutations, flatten = True)
keep = ['Nonsense_Mutation','Frame_Shift_Del','Frame_Shift_Ins','Wildtype_Tumor']
in_keep = prot_and_mutations['PTEN_Mutation_'].isin(keep) 
trunc_mutations = prot_and_mutations[in_keep]
print(trunc_mutations['PTEN_Mutation_'].value_counts()) # may need to change extra char

# Step 2 - Create binary column 
trunc_mutations['Mutation'] = np.where(
            trunc_mutations[gene+'_Mutation_Status_'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')

# Step 3 - Format the dataframe correctly for the t-test(just omics and binary columns for tumors)
tumors = trunc_mutations[trunc_mutations.Sample_Status_ == 'Tumor'] #drop Normal samples
columns_to_drop = [gene+"_Mutation_", gene+"_Location_", gene+"_Mutation_Status_", "Sample_Status_"]
e_trunc_wt = tumors.drop(columns_to_drop, axis = 1)
e_trunc_wt = e_trunc_wt.dropna(axis=1,how='all')
e_trunc_wt['Mutation'].value_counts()

# t-test
cols = list(e_trunc_wt.columns[:-1])
e_pval = u.wrap_ttest(e_trunc_wt, 'Mutation', cols, return_all=True)
ep = e_pval['P_Value'][0]
e_pval

Nonsense_Mutation    25
Frame_Shift_Del      22
Wildtype_Tumor       20
Frame_Shift_Ins       6
Name: PTEN_Mutation_, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Comparison,P_Value
0,MCM2_phosphoproteomics_S139,0.012954
1,MCM2_phosphoproteomics_S41,0.028419
2,MCM2_phosphoproteomics_S27,0.033836
3,MCM2_phosphoproteomics_T59,0.076825
4,MCM2_phosphoproteomics_S40,0.149126
5,MCM2_phosphoproteomics_S170,0.187925
6,MCM2_phosphoproteomics_T39,0.448562
7,MCM2_phosphoproteomics_S108,0.476061
8,MCM2_phosphoproteomics_Y137,0.62966
9,MCM2_phosphoproteomics_S26,0.667774


# Step 2: Create a long dataframe for the boxplot

The boxplot will take three columns: Proteomics, Mutation, and Cancer. We need to append all the individual cancer dfs into one long pancancer df. 

First create the Cancer column.

In [87]:
g_del_wt = g_del_wt.assign(cancer = 'Gbm')
l_del_wt = l_del_wt.assign(cancer = 'Luad')
ls_del_wt = ls_del_wt.assign(cancer = 'Lscc')
b_del_wt = b_del_wt.assign(cancer = 'Brca')
o_del_wt = o_del_wt.assign(cancer = 'Ovarian')
c_del_wt = c_del_wt.assign(cancer = 'Colon')
h_del_wt = h_del_wt.assign(cancer = 'Hnscc') # higher scale
e_trunc_wt = e_trunc_wt.assign(cancer = 'Endometrial')

Next append the dfs.

In [88]:
df = g_del_wt.append(l_del_wt)
df2 = df.append(ls_del_wt)
df3 = df2.append(b_del_wt)
df4 = df3.append(o_del_wt)
df5 = df4.append(c_del_wt)
df6 = df5.append(e_trunc_wt)
#df7 = df6.append(h_del_wt) # Leave out Hnscc because of high proteomics numbers

df6

Unnamed: 0_level_0,MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDEDVEELTAS*QR,MCM2_phosphoproteomics_S108_AIPELDAYEAEGLALDDEDVEELTAS*QREAAER,MCM2_phosphoproteomics_S139_GLLYDS*DEEDEERPAR,MCM2_phosphoproteomics_S139_GLLYDS*DEEDEERPARK,MCM2_phosphoproteomics_S139_RGLLYDS*DEEDEERPAR,MCM2_phosphoproteomics_S139_RGLLYDS*DEEDEERPARK,MCM2_phosphoproteomics_S26S27_GNDPLTS*S*PGR,MCM2_phosphoproteomics_S27_GNDPLTSS*PGR,MCM2_phosphoproteomics_S27_RGNDPLTSS*PGR,MCM2_phosphoproteomics_S27S31_GNDPLTSS*PGRS*SR,...,MCM2_phosphoproteomics_S41,MCM2_phosphoproteomics_S53,MCM2_phosphoproteomics_T59,MCM2_phosphoproteomics_S170,MCM2_phosphoproteomics_S26,MCM2_phosphoproteomics_S31,MCM2_phosphoproteomics_S40,MCM2_phosphoproteomics_T39,MCM2_phosphoproteomics_Y137,MCM2_phosphoproteomics_Y90
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,0.250213,0.469123,0.553357,1.007214,0.116577,,,0.526739,,,...,,,,,,,,,,
C3L-00365,0.680272,,0.592142,1.128073,,,,,0.940525,,...,,,,,,,,,,
C3L-00674,-1.100628,1.074279,-0.417157,-0.105052,,,,-0.412701,-0.455602,,...,,,,,,,,,,
C3L-00677,,0.539857,,0.118192,1.192966,,,-0.194149,-0.680688,,...,,,,,,,,,,
C3L-01040,-1.149843,-1.269670,-1.141344,-0.707192,-0.493904,,,-0.524123,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01520,,,,,,,,,,,...,-0.9590,-1.3700,,,,,,,,
C3N-01521,,,,,,,,,,,...,0.4945,0.1825,,,,,,,,
C3N-01537,,,,,,,,,,,...,,,0.4560,0.265,,,,,0.199,
C3N-01802,,,,,,,,,,,...,-0.0144,,,,,,,,,


# Step 3: Create the Pancancer Boxplot

In [89]:
gene = 'PTEN'
plt.rcParams['figure.figsize']=(15,10) #size of plot
sns.set(font_scale = 1.2)

boxplot = sns.boxplot(x='cancer', y=t_gene+"_proteomics", data = df6, hue = 'Mutation',
                      hue_order = ["Wildtype_Tumor", "Deletion",'Truncation'], showfliers = False)    
boxplot.set_title('Pancancer trans effect of PTEN CNV Deletions on '+t_gene)
boxplot = sns.stripplot(x='cancer', y=t_gene+"_proteomics", data = df6, jitter = True, 
                           color = ".3", hue = 'Mutation', hue_order = ["Wildtype_Tumor", "Deletion", 'Truncation'],dodge = True)
boxplot.set(xlabel = "\n"+gene+" Wildtype/CNV Deletion", ylabel = t_gene+' Phosphoproteomics')

# format legend
handles, labels = boxplot.get_legend_handles_labels()
plt.legend(handles[0:3], labels[0:3])


# Create significance symbols:
# * P ≤ 0.05   ** P ≤ 0.01   *** P ≤ 0.001 
pvals = [gp, lp, lsp, bp, op, cp, ep] # same order as appended in long df
symbols = {}

for p in pvals:
    if p <= 0.001:
        symbols[p] = '***'
    elif p <= 0.01:
        symbols[p] = '**'
    elif p <= .05:
        symbols[p] = '*'
    else:
        symbols[p] = 'ns'
#print(symbols)

format_pval_annotation(symbols[gp], -.3, 0, 4) # Gbm
format_pval_annotation(symbols[lp], .7, 1, 6) # Luad
format_pval_annotation(symbols[lsp], 1.7, 2, 5) # Lscc
format_pval_annotation(symbols[bp], 2.7, 3, 4) # Brca
format_pval_annotation(symbols[op], 3.7, 4, 2.6) # Ovarian
format_pval_annotation(symbols[cp], 4.7, 5, 2.6)  # Colon
format_pval_annotation(symbols[ep], 5.7, 6.2, 2.6) # Endometrial

plt.show()
plt.clf()
plt.close()

ValueError: Could not interpret input 'MCM2_proteomics'

In [17]:
# Save figure
#fig = boxplot.get_figure()
#fig.savefig(".png")