# Create p-value and correlation tables for multiple cancers (includes figures)

Create a dataframe with p-value results from t-tests for all proteins (trans gene proteomics when PTEN has cnv deletions compared to PTEN wildtype). The dataframe also includes the change in medians (correlation) between deletions and wildtype. Create Heat Maps for interacting proteins.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p

In [2]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN'):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot)
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = cancer_object.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    else: 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = l.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot)
        omics = l.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = omics.drop(columns='PTEN_CNV')
        # Get only tumor samples
        p = l.get_proteomics(tissue_type='tumor')
        tumor_ids = list(p.index)
        get = omics.index.isin(tumor_ids)
        omics = omics[get]
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
        del_wt['Mutation'] = np.where(
            del_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    return del_wt


In [3]:
'''
Params
cancer_object: Object. Variable for the loaded cancer data set.
cancer_name: Str. name to add to the created dataframe.
del_wt_df: DataFrame. Only samples with deletions and wildtype for PTEN. 
all_prot_list: List. All proteins in proteomics data frame. 

Returns a dataframe with the difference in medians between proteomics with PTEN wt and del (wt - del). 
'''

def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    d = del_wt_df[del_wt_df.Mutation == "Deletion"]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    del_med = d.median()
    wt_med = wt.median()

    med_dict = {}
    

    for prot in all_prot_list:
        dif = del_med[prot] - wt_med[prot]
        med_dict[prot] = dif
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})
    
    return df
        

#  Step 1: Create Data frames with p-values and differences in median

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run many T-tests for all genes in the proteomics data frame. Use get_change_in_medians_df to create the data frame with change in median values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [4]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that lscc index is up-to-date...       



                                            

In [5]:
ip = u.get_interacting_proteins('PTEN')

# RAS-RAF-MEK-ERK MAPK Pathway
ip.append('GRB2')
ip.append('SHC1') # (SHC in paper)
ip.append('SOS1') # (SOS in the paper) *GEF for RAS "recruitment to the plasma membrane requires GRB2, PIP2, PA" 
ip.append('KRAS') # (RAS in paper) possibly add "RAS regulators such as PHLPP, SHP-2, and NF-2"
ip.append('RAF1') # (RAF-1 in paper)
ip.append('MAP2K1') #(MEK1 protein name in paper)
ip.append('MAP2K2 ') # (MEK2 in paper)
ip.append('MAPK3') # (ERK1 in paper) https://www.uniprot.org/uniprot/P27361
ip.append('MAPK1') # (ERK2 in paper)

#mTOR Pathway
ip.append('GAB1')
#ip.append('PI3K')
ip.append('AKT1') # (AKT in paper) *binds PIP3 (see info for isoform AKT1 at: https://www.uniprot.org/uniprot/P31749)
ip.append('PDK1') #activate AKT
ip.append('MTORC2') #activate AKT
ip.append('GSK3B') # (GSK-3B in paper)inhibit cyclin D (akt inhibits GSK3B)
ip.append('FOXO1') #(FOXO in paper)
ip.append('TSC2') # repress mTOR, AKT inhibits
ip.append('MTOR') # promote cyclin D
ip.append('BAD')
ip.append('CAS9')

ip.append('CCND1') # Cyclin D1 -> G1/S 


singles = set(ip)
ip = list(singles)
print(len(ip))

46


# Gbm

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only PTEN cnv deletions and wildtype tumors. 

In [6]:
gene = 'PTEN'

g_del_wt = all_prot_format_df(g, ip)
g_del_wt = g_del_wt.dropna(axis='columns', how='all')
g_del_wt.head()



Name,FOXO1_proteomics,SOS1_proteomics,PIK3CB_proteomics,CSNK2A2_proteomics,RAF1_proteomics,MAST2_proteomics,PIK3CA_proteomics,TP53_proteomics,MVP_proteomics,SHC1_proteomics,...,AKT1_proteomics,PIK3CD_proteomics,XIAP_proteomics,EGFR_proteomics,INPP4B_proteomics,TSC2_proteomics,SLC9A3R1_proteomics,CCND1_proteomics,MAPK1_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,-0.08552,0.086085,-0.550558,-0.273672,-0.005546,,0.124542,0.044573,0.147936,-0.095985,...,-0.06593,-0.315003,-0.069611,1.054962,0.060124,-0.049127,0.616424,-0.135982,0.364629,Deletion
C3L-00365,0.06686,0.198497,0.191167,-0.035616,-0.148462,0.213109,-0.211025,0.117856,0.371665,0.003049,...,-0.285929,-0.629943,0.028903,2.219562,-0.639545,-0.273339,0.450206,,0.419605,Deletion
C3L-00674,0.185222,-0.166658,-0.548042,-0.105403,-0.283019,-0.706997,-0.573919,-0.120887,0.329753,0.051279,...,-0.222607,0.085276,-0.479773,0.137329,0.669283,-0.118501,0.307356,-0.092478,-0.369258,Deletion
C3L-00677,0.251746,-0.01403,-0.397225,-0.189861,-0.258855,0.123021,-0.360962,1.832557,-0.518407,0.288725,...,-0.370584,-0.382652,-0.205255,-0.768478,0.451016,-0.098653,-0.040643,-0.454459,-0.090202,Deletion
C3L-01040,-0.19003,-0.269299,-0.395284,0.239919,-0.263888,,0.137414,-0.288708,-0.741119,-0.337079,...,0.004573,-0.537787,-0.123213,-1.249435,-0.024038,-0.137478,-0.155468,0.272387,-0.919902,Deletion


Part 2: Run T-tests. Create a data frame of just significant comparisons and another data frame for all comparisons. A moderately stringent correction is used (FDR_BH).

In [7]:
cols = list(g_del_wt.columns[:-1])

# Get only sig genes
g_sig = u.wrap_ttest(g_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
# Create list of sig genes
if g_sig is not None:
    g_sig_list = list(g_sig.Comparison)
else: 
    g_sig_list = None
print('significant pvals: \n',g_sig)

# Get all pvals
g = u.wrap_ttest(g_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
g_pval = g.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Gbm_P_Value'})

significant pvals: 
           Comparison       P_Value
0    PTEN_proteomics  1.219523e-07
1   MAPK3_proteomics  1.020660e-06
2  PIK3CD_proteomics  1.030815e-03
3    GRB2_proteomics  1.089951e-03
4  PIK3C3_proteomics  1.225949e-03
5    USP7_proteomics  4.017792e-03
6  MAP2K1_proteomics  4.138456e-03
7  PIK3R1_proteomics  4.282296e-03


Part 3: Create the differences in median df. (median of wildtype tumors - median of cnv deletions)

In [8]:
g_med = get_change_in_medians_df(g, "Gbm", g_del_wt, cols)

Part 4: Merge the p-values and the differences in median dfs.

In [9]:
g_merged = g_pval.merge(g_med, on='Proteomics',how='outer')

g_merged.head()

Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median
0,PTEN_proteomics,1.219523e-07,-0.511726
1,MAPK3_proteomics,1.02066e-06,-0.665105
2,PIK3CD_proteomics,0.001030815,-0.492099
3,GRB2_proteomics,0.001089951,-0.312146
4,PIK3C3_proteomics,0.001225949,-0.102518


# Repeat for other cancers.

# Ovarian

In [10]:
gene = 'PTEN'

o_del_wt = all_prot_format_df(o, ip)
o_del_wt = o_del_wt.dropna(axis='columns', how='all')
# Drop duplicate columns - FIX
o_del_wt = o_del_wt.loc[:,~o_del_wt.columns.duplicated()]



In [11]:
cols = list(o_del_wt.columns[:-1])

# Get only sig sites
o_sig = u.wrap_ttest(o_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if o_sig is not None:
    o_sig_list = list(o_sig.Comparison)
else: 
    o_sig_list = None
print('significant pvals: \n',o_sig)

# Get all pvals
o = u.wrap_ttest(o_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
o_pval = o.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Ov_P_Value'})

significant pvals: 
         Comparison   P_Value
0  PTEN_proteomics  0.000909


In [12]:
o_med = get_change_in_medians_df(o, "Ov", o_del_wt, cols)

In [13]:
o_merged = o_pval.merge(o_med, on='Proteomics',how='outer')

# Breast

In [14]:
gene = 'PTEN'

b_del_wt = all_prot_format_df(b, ip)
b_del_wt = b_del_wt.dropna(axis='columns', how='all')
b_del_wt = b_del_wt.loc[:,~b_del_wt.columns.duplicated()]
b_del_wt.head()



Name,AKT1_proteomics,BAD_proteomics,CCND1_proteomics,CSNK2A2_proteomics,EGFR_proteomics,FOXO1_proteomics,GAB1_proteomics,GRB2_proteomics,GSK3B_proteomics,INPP4B_proteomics,...,ROCK1_proteomics,SHC1_proteomics,SLC9A3R1_proteomics,SOS1_proteomics,TP53_proteomics,TSC2_proteomics,USP13_proteomics,USP7_proteomics,XIAP_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT000814,-1.3052,-3.4476,-0.9314,0.2406,-0.4834,-3.7568,-1.1271,-0.2427,1.5534,-3.0446,...,-1.1252,1.5515,-1.7943,0.174,2.0719,-0.3132,0.8608,0.1114,-0.2036,Wildtype_Tumor
CPT001846,-1.0153,-2.9549,-1.3726,0.3962,4.1062,1.4313,-0.7836,0.0424,-0.2608,-1.6078,...,-0.0308,1.1821,-2.6813,0.0843,-1.099,0.1296,0.1383,-0.5048,0.6489,Wildtype_Tumor
X01BR001,-0.9244,-0.0662,-0.4825,-0.4782,1.2191,0.1388,0.2541,-0.5892,-0.5551,-1.8425,...,0.8348,1.3002,-1.4497,0.0299,0.8946,-0.8476,-1.0077,-0.0277,0.3672,Wildtype_Tumor
X01BR009,-1.7569,1.8181,-1.8982,0.498,1.6974,0.3465,-0.1929,-0.3572,0.4441,-4.2738,...,0.4158,0.0152,-1.852,-0.1543,2.4935,-1.0712,-0.3624,-0.4882,-0.9145,Wildtype_Tumor
X01BR010,0.3344,-0.5176,-0.6633,0.2158,2.4878,1.4308,-0.6238,-1.256,0.1244,0.2529,...,-0.2732,0.2677,-0.488,-0.1472,-1.545,0.4307,-2.2463,-0.8683,0.0849,Deletion


In [15]:
cols = list(b_del_wt.columns[:-1])

# Get only sig sites
b_sig = u.wrap_ttest(b_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if b_sig is not None:
    b_sig_list = list(b_sig.Comparison)
else: 
    b_sig_list = None
print('significant pvals: \n',b_sig)

# Get all pvals
b_pval = u.wrap_ttest(b_del_wt, 'Mutation', cols, return_all = True)
b_pval = b_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Brca_P_Value'})

significant pvals: 
         Comparison   P_Value
0  PTEN_proteomics  0.000004


In [16]:
b_med = get_change_in_medians_df(b, "Brca", b_del_wt, cols)

In [17]:
b_merged = b_pval.merge(b_med, on='Proteomics',how='outer')

# Colon

In [18]:
gene = 'PTEN'

c_del_wt = all_prot_format_df(col, ip)
c_del_wt = c_del_wt.dropna(axis='columns', how='all')
c_del_wt = c_del_wt.loc[:,~c_del_wt.columns.duplicated()]



In [19]:
cols = list(c_del_wt.columns[:-1])

# Get only sig sites
c_sig = u.wrap_ttest(c_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if c_sig is not None:
    c_sig_list = list(c_sig.Comparison)
else: 
    c_sig_list = None
print('significant pvals: \n',c_sig)

# Get all pvals
c_pval = u.wrap_ttest(c_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
c_pval = c_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Colon_P_Value'})

significant pvals: 
 None


In [20]:
c_med = get_change_in_medians_df(col, "Colon", c_del_wt, cols)

In [21]:
c_merged = c_pval.merge(c_med, on='Proteomics',how='outer')

# Hnscc

In [22]:
gene = 'PTEN'

h_del_wt = all_prot_format_df(h, ip)
h_del_wt = h_del_wt.dropna(axis='columns', how='all')



In [23]:
cols = list(h_del_wt.columns[:-1])

# Get only sig sites
h_sig = u.wrap_ttest(h_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if h_sig is not None:
    h_sig_list = list(h_sig.Comparison)
else: 
    h_sig_list = None
print('significant pvals: \n',h_sig)

# Get all pvals
h_pval = u.wrap_ttest(h_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
h_pval = h_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Hnscc_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1
  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
           Comparison   P_Value
0    PTEN_proteomics  0.000023
1  INPP4B_proteomics  0.000676
2  PIK3CA_proteomics  0.001677


In [24]:
h_med = get_change_in_medians_df(h, "Hnscc", h_del_wt, cols)

In [25]:
h_merged = h_pval.merge(h_med, on='Proteomics',how='outer')
#del_wt[['PREX2_proteomics','Mutation']].dropna() # in median, not in pval (only 1 value for deletion)

# Luad

In [26]:
gene = 'PTEN'

l_del_wt = all_prot_format_df(l, ip)
l_del_wt = l_del_wt.dropna(axis='columns', how='all')
l_del_wt = l_del_wt.loc[:,~l_del_wt.columns.duplicated()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
cols = list(l_del_wt.columns[:-2])

# Get only sig sites
l_sig = u.wrap_ttest(l_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if l_sig is not None:
    l_sig_list = list(l_sig.Comparison)
else: 
    l_sig_list = None
print('significant pvals: \n',l_sig)

# Get all pvals
l_pval = u.wrap_ttest(l_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
l_pval = l_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Luad_P_Value'})
# isoforms for some proteins

significant pvals: 
           Comparison       P_Value
0    EGFR_proteomics  9.121167e-07
1  PIK3CD_proteomics  4.312883e-04


In [28]:
#Isoform code - fix later
#del_wt.replace(to_replace = r'_NP_.*$', value = '', regex = True)

In [29]:
l_med = get_change_in_medians_df(l, "Luad", l_del_wt, cols)

In [30]:
l_merged = l_pval.merge(l_med, on='Proteomics',how='outer')

# Lscc

In [31]:
gene = 'PTEN'

ls_del_wt = all_prot_format_df(ls, ip)
ls_del_wt = ls_del_wt.dropna(axis='columns', how='all')
ls_del_wt = ls_del_wt.loc[:,~ls_del_wt.columns.duplicated()]



In [32]:
cols = list(ls_del_wt.columns[:-2])

# Get only sig sites
ls_sig = u.wrap_ttest(ls_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if ls_sig is not None:
    ls_sig_list = list(ls_sig.Comparison)
else: 
    ls_sig_list = None
print('significant pvals: \n',ls_sig)

# Get all pvals
ls_pval = u.wrap_ttest(ls_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
ls_pval = ls_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Lscc_P_Value'})
#ls_pval # isoforms for some proteins

significant pvals: 
            Comparison   P_Value
0     PTEN_proteomics  0.000056
1   PIK3CD_proteomics  0.000292
2  CSNK2A1_proteomics  0.000684
3    GSK3B_proteomics  0.002182
4    USP13_proteomics  0.002580
5      MVP_proteomics  0.003226


In [33]:
ls_med = get_change_in_medians_df(ls, "Lscc", ls_del_wt, cols)

In [34]:
ls_merged = ls_pval.merge(ls_med, on='Proteomics',how='outer')

# Endometrial

The Endometrial data set does not have enough cnv deletions to perform a t-test, however the data set does have enough truncation type mutations (nonsense and frame shifts). Different code is needed to create the data frame for Endometrial.

In [36]:
en_mut_type = en.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = ip)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
en_mut_type = en_mut_type[['Mutation']] 
merged = ip_df.join(en_mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
get = merged['Mutation'].isin(compare)
trunc_wt = merged[get]
trunc_wt['Mutation'] = np.where(
                trunc_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
trunc_wt['Mutation'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Truncation        53
Wildtype_Tumor    15
Name: Mutation, dtype: int64

In [37]:
e_sig = u.wrap_ttest(trunc_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if e_sig is not None:
    e_sig_list = list(e_sig.Comparison)
else: 
    e_sig_list = None
print('significant pvals: \n',e_sig)

# Get all pvals
e_pval = u.wrap_ttest(trunc_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
e_pval = e_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'En_P_Value'})

significant pvals: 
            Comparison   P_Value
0     PTEN_proteomics  0.000640
1     TP53_proteomics  0.001246
2   PIK3CA_proteomics  0.001761
3  CSNK2A1_proteomics  0.002364
4     EGFR_proteomics  0.006128


Differences in median with adaption to trunctation mutations.

In [77]:
t = trunc_wt[trunc_wt.Mutation == "Truncation"]
wt = trunc_wt[trunc_wt.Mutation == "Wildtype_Tumor"]
trunc_med = t.median()
wt_med = wt.median()

en_d = {}

for prot in ip:
    dif = trunc_med[prot+'_proteomics'] - wt_med[prot+'_proteomics']
    en_d[prot+'_proteomics'] = dif
    #print(trunc_med[prot+'_proteomics'], '-', wt_med[prot+'_proteomics'], ' = ', en_d[prot+'_proteomics'])
    

en_med = pd.DataFrame.from_dict(en_d, orient='index', columns=['En_Median'])
en_med = en_med.reset_index().rename(columns={'index':'Proteomics'})

In [39]:
e_merged = e_pval.merge(en_med, on='Proteomics',how='outer')

# Get a list of significant genes in at least one cancer

In [50]:
# Keep only genes significant in > 1 cancer
all_sig = []

sig_lists = [g_sig_list, e_sig_list, b_sig_list, o_sig_list, c_sig_list, h_sig_list, l_sig_list, ls_sig_list]

for next_list in sig_lists:
    if next_list is not None:
        all_sig.append(next_list)
        

flat_list = [item for sublist in all_sig for item in sublist] #change list of lists, to just one list
sig = list(set(flat_list)) # remove duplicates
len(sig)

sig_genes = list(map(lambda x: str.replace(x, "_proteomics", ""), sig)) 
len(sig_genes)

16

# Get a list of significant genes in multiple cancers

In [41]:
# sig in multiple (more than 1) cancers
mult = list(set(i for i in flat_list if flat_list.count(i) > 1)) # Keep genes sig in more than 1 cancer
len(mult)
mult


['PIK3CA_proteomics',
 'PTEN_proteomics',
 'PIK3CD_proteomics',
 'EGFR_proteomics',
 'CSNK2A1_proteomics']

In [42]:
# see sig in cancer
cancer = ['Gbm','En','Brca','Ov','Colon','Hnscc','Luad']
i = 0
for next_list in sig_lists:

    print(cancer[i], ':')
    if next_list is not None: 
        print(len(next_list),'\n')
    #print(next_list, '\n')
    if (i < 6):
        i += 1
    

Gbm :
8 

En :
5 

Brca :
1 

Ov :
1 

Colon :
Hnscc :
3 

Luad :
2 

Luad :
6 



# Heat Maps

Interacting Genes - Signficant in > 1 Cancer

In [43]:
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'
sig_df = pd.read_csv(root+R'\sig_pval_heatmap.csv')

In [57]:
# sig > 1 cancer
bool_df = sig_df.Proteomics.isin(sig_genes)
plot_df = sig_df[bool_df]
plot_df.Proteomics.unique()

array(['PTEN', 'MAPK3', 'PIK3CD', 'GRB2', 'PIK3C3', 'USP7', 'MAP2K1',
       'PIK3R1', 'INPP4B', 'EGFR', 'CSNK2A1', 'PIK3CA', 'TP53'],
      dtype=object)

In [58]:
# log p-vals for right scale in plot (bigger circle, smaller pval)
plot_df['size'] = plot_df['P_Value'].apply(lambda x: -1*(np.log(x)/10))
plot_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Proteomics,P_Value,Medians,Cancer,size
12,PTEN,1.219523e-07,-0.511726,Gbm,1.591964
32,MAPK3,1.020660e-06,-0.665105,Gbm,1.379506
840,PIK3CD,1.030815e-03,-0.492099,Gbm,0.687741
865,GRB2,1.089951e-03,-0.312146,Gbm,0.682162
913,PIK3C3,1.225949e-03,-0.102518,Gbm,0.670404
...,...,...,...,...,...
18076,EGFR,3.668857e-01,-0.023500,Colon,0.100270
18728,MAP2K1,7.170253e-01,-0.026750,Colon,0.033264
18946,GRB2,8.274353e-01,0.016800,Colon,0.018942
19232,TP53,9.688700e-01,0.049050,Colon,0.003162


In [56]:
p.plotCircleHeatMap(plot_df, circle_var = 'P_Value', color_var='Medians', x_axis= 'Proteomics', y_axis = 'Cancer',
                   graph_width=700)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['size'] = np.where(df[circle_var]<0, np.abs(df[circle_var]), df[circle_var])*50


Pos/Neg Correlations

In [59]:
pos_neg_df = pd.read_csv(root+R'\pos_neg_df.csv')

In [69]:
get = pos_neg_df.Proteomics.isin(sig_genes) # bool df where True has both pos and neg
genes_pn = pos_neg_df[get] # Keep only genes with pos and neg
genes_pn.Proteomics.unique()

array(['MAPK3', 'PIK3R1', 'INPP4B', 'TP53'], dtype=object)

In [72]:
# log p-vals for right scale in plot (bigger circle, smaller pval)
genes_pn['size'] = genes_pn['P_Value'].apply(lambda x: -1*(np.log(x)/10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [71]:
p.plotCircleHeatMap(genes_pn, circle_var = 'size', color_var='Medians', x_axis= 'Proteomics', y_axis = 'Cancer',
                   graph_height=500, graph_width=500)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['size'] = np.where(df[circle_var]<0, np.abs(df[circle_var]), df[circle_var])*50
