# PIK3CA trans effects on known interacting proteins

We investigated the effects of PIK3CA mutation on the proteomics and phosphoproteomics of all proteins in our dataframes. 

We make two comparisons of proteomics and phosphoproteomics: patients with mutations in the PIK3CA hotspot versus cancer patients with no PIK3CA mutation (cancer wildtype), and all patients with any PIK3CA mutation versus cancer wildtype.

## Standard Imports

In [6]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns
import json
import operator

#import the Endometrial data from CPTAC package
import CPTAC.Endometrial as CPTAC

# 1. Set up initial dataframes and variables

In [7]:
# Gene under study and the default p-value cutoff used to find significance
gene = 'PIK3CA'
pcutoff = 0.05

# Load the somatic-mutation, proteomics and phosphoproteomics tables from CPTAC
somatic = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

# Mutations present in the hotspot according to our Hotspot3D output
hotspot_mutations = [
    'p.E545A', 'p.E545K', 'p.E545V',
    'p.Q546P', 'p.Q546R', 'p.E542K',
]

# Somatic-mutation rows for `gene` whose 'Location' is one of the hotspot mutations
is_hotspot_change = somatic['Location'].isin(hotspot_mutations)
is_target_gene = somatic['Gene'] == gene
mutated_hotspot = somatic.loc[is_hotspot_change & is_target_gene]

# Keep both patient identifiers side by side, then take the clinical key,
# which the later cells use to index the comparison dataframes
hotspot_patients_both_keys = mutated_hotspot[['Clinical_Patient_Key', 'Patient_Id']]
hotspot_patients = hotspot_patients_both_keys['Clinical_Patient_Key']
print(hotspot_patients)


160      S001
1147     S003
1426     S009
4757     S021
16613    S023
19188    S024
21386    S032
21643    S033
22656    S038
37618    S061
38352    S063
39719    S066
43588    S067
43687    S068
46859    S084
47557    S088
48398    S090
49903    S097
50016    S098
Name: Clinical_Patient_Key, dtype: object


# 2. Find number of patients with PIK3CA mutations in/outside of the hotspot

In [8]:
# Number of hotspot entries found in the somatic-mutation table
print("Mutations inside hotspot: " + str(len(hotspot_patients)))

# Every somatic-mutation row for PIK3CA.
# Note: the gene PIK3CA encodes the protein p110α, hence the variable name 'p110_mutated'
p110_mutated = somatic.loc[somatic['Gene'] == 'PIK3CA']

# Some patients carry more than one mutation, so count distinct patient keys.
# The key column also includes an 'NA' value, so one is subtracted from the count.
unique_patient_keys = set(p110_mutated['Clinical_Patient_Key'])
num_mutated = len(unique_patient_keys) - 1
print("Total number of patients with mutations: " + str(num_mutated))

Mutations inside hotspot: 19
Total number of patients with mutations: 48


# 3. Create list for all proteins in proteomics dataframe and all phosphosites in phosphoproteomics dataframe

In [9]:
# Names to iterate over in the tests below: every column of the proteomics
# dataframe and every column of the phosphoproteomics dataframe.
# protList may be replaced by a shorter list (even a single protein) if desired.
protList, phosprotList = proteomics.columns, phos.columns

# 4. Proteomic Abundance

## 4a. Hotspot mutations vs cancer wildtype

In [None]:
print("Mutations inside hotspot: " + str(len(hotspot_patients)))

# 1. Use the Bonferroni correction to adjust the p-value cutoff by dividing alpha (0.05)
#    by the number of tests. One test is attempted per entry of protList.
pcutoff = 0.05/len(protList)

# 2. Create a dictionary for significant proteins with their respective p-values
#    Create another dictionary for the non-significant results
sigResults = {}
unsigResults = {}

# 3. Loop through each protein to test for a significant difference in protein level
for protein in protList:

    # protList can be swapped for a hand-picked list, so skip any name that is
    # not a column of the proteomics dataframe we are reading from
    if protein not in proteomics.columns:
        continue

    # 4. Set up a new column describing the mutation group of each sample.
    #    Applied in this order: False for every row, True for the rows indexed by
    #    hotspot_patients, 'Wildtype' wherever the 'Mutation' column is 'Wildtype'.
    p10_mutations = CPTAC.compare_mutations(proteomics, protein, gene)
    p10_mutations['Within_Hotspot'] = False
    p10_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
    p10_mutations.loc[p10_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
    # Keep tumor samples only and drop rows that contain any missing value
    p10_mutations = p10_mutations.loc[p10_mutations['Patient_Type'] == 'Tumor'].dropna(axis=0)

    in_hotspot = p10_mutations.loc[p10_mutations['Within_Hotspot'] == True]
    wt = p10_mutations.loc[p10_mutations['Within_Hotspot'] == 'Wildtype']

    # 5. Perform a two-sample t-test for mutations in the hotspot vs cancer wildtype
    ttest = scipy.stats.ttest_ind(in_hotspot[protein], wt[protein])

    # A NaN p-value compares False here and therefore lands in unsigResults
    if ttest[1] < pcutoff:
        sigResults[protein] = ttest[1]
    else:
        unsigResults[protein] = ttest[1]

# 6. Sort both result sets by ascending p-value.
#    NaN never compares as smaller or larger, which would leave a plain sort in an
#    unreliable order, so NaN p-values are explicitly placed after all real ones.
sigResults = sorted(sigResults.items(), key=lambda item: (np.isnan(item[1]), item[1]))
unsigResults = sorted(unsigResults.items(), key=lambda item: (np.isnan(item[1]), item[1]))

# 7. Print the results
print('SIGNIFICANT RESULTS:')
# json.dumps with indent prints each (protein, p-value) pair on its own lines
print(json.dumps(sigResults, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigResults, indent = 2))


Mutations inside hotspot: 19


## 4b. Plot the data for the most significant result

## 4c. All mutations vs cancer wildtype

In [None]:
print("Mutations inside hotspot: " + str(len(hotspot_patients)))

# 1. Use the Bonferroni correction to adjust the p-value cutoff by dividing alpha (0.05)
#    by the number of tests. One test is attempted per entry of protList.
pcutoff = 0.05/len(protList)

# 2. Create a dictionary for significant proteins with their respective p-values
#    Create another dictionary for the non-significant results
sigResults = {}
unsigResults = {}

# 3. Loop through each protein to test for a significant difference in protein level
for protein in protList:

    # protList can be swapped for a hand-picked list, so skip any name that is
    # not a column of the proteomics dataframe we are reading from
    if protein not in proteomics.columns:
        continue

    # 4. Set up a new column describing the mutation group of each sample.
    #    Applied in this order: False for every row, True for the rows indexed by
    #    hotspot_patients, 'Wildtype' wherever the 'Mutation' column is 'Wildtype'.
    p10_mutations = CPTAC.compare_mutations(proteomics, protein, gene)
    p10_mutations['Within_Hotspot'] = False
    p10_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
    p10_mutations.loc[p10_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
    # Keep tumor samples only and drop rows that contain any missing value
    p10_mutations = p10_mutations.loc[p10_mutations['Patient_Type'] == 'Tumor'].dropna(axis=0)

    # Everything that is not labelled 'Wildtype' (hotspot and non-hotspot alike)
    all_mut = p10_mutations.loc[p10_mutations['Within_Hotspot'] != 'Wildtype']
    wt = p10_mutations.loc[p10_mutations['Within_Hotspot'] == 'Wildtype']

    # 5. Perform a two-sample t-test for all mutations vs cancer wildtype
    ttest = scipy.stats.ttest_ind(all_mut[protein], wt[protein])

    # A NaN p-value compares False here and therefore lands in unsigResults
    if ttest[1] < pcutoff:
        sigResults[protein] = ttest[1]
    else:
        unsigResults[protein] = ttest[1]

# 6. Sort both result sets by ascending p-value, matching section 4a.
#    dict.iteritems() only exists in Python 2; dict.items() is used instead.
#    NaN never compares as smaller or larger, which would leave a plain sort in an
#    unreliable order, so NaN p-values are explicitly placed after all real ones.
sigResults = sorted(sigResults.items(), key=lambda item: (np.isnan(item[1]), item[1]))
unsigResults = sorted(unsigResults.items(), key=lambda item: (np.isnan(item[1]), item[1]))

# 7. Print the results
print('SIGNIFICANT RESULTS:')
# json.dumps with indent prints each (protein, p-value) pair on its own lines
print(json.dumps(sigResults, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigResults, indent = 2))


## 4d. Plot the data for the most significant result

# 5. Phosphoproteomic Abundance

## 5a. Hotspot mutation vs cancer wildtype

In [None]:
# Parallel lists: sites[i] is the phosphosite whose t-test p-value is pvalues[i]
sites = []
pvalues = []
# Proteins whose processing raised an error (for example, no phosphoproteomics data)
skipped_proteins = []

# 1. Loop through each of the proteins in protList
for protein in protList:
    # 2. Loop through each phosphorylation site in that protein to find any that show a significant change in phosphorylation level
    #    Some proteins do not have recorded data in the phosphoproteomics data. If this is the case, we move on to the next protein in the list
    try:

        # 3. Set up a new column describing the mutation group of each sample.
        #    Applied in this order: False for every row, True for the rows indexed by
        #    hotspot_patients, 'Wildtype' wherever the 'Mutation' column is 'Wildtype'.
        phos_mutations = CPTAC.compare_mutations(phos, protein, gene)
        phos_mutations['Within_Hotspot'] = False
        phos_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
        phos_mutations.loc[phos_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

        # 4. Select for tumor samples only
        phos_mutations = phos_mutations.loc[phos_mutations['Patient_Type'] == 'Tumor']

        # 5. Create a list of the phosphosites by selecting all the columns in the dataframe,
        #    excluding the last three, which are the categorical variables
        #    'Mutation', 'Patient_Type', and 'Within_Hotspot'
        phos_sites = list(phos_mutations)[:-3]

        # 6. Create two dataframes for the two cases: phosphorylation levels when PIK3CA is mutated at a hotspot, and when PIK3CA is wildtype
        #    Drop any column that contains a missing value (NaN) in that group
        hotspot_mut = phos_mutations.loc[phos_mutations['Within_Hotspot'] == True].dropna(axis = 1)
        wt = phos_mutations.loc[phos_mutations['Within_Hotspot'] == 'Wildtype'].dropna(axis = 1)

        # 7. Loop through each site to test for a significant difference in phosphorylation levels between the two cases
        for site in phos_sites:

            # 8. Check that the site is in both dataframes. The column may have been dropped in step 6
            if site in hotspot_mut.columns and site in wt.columns:

                # 9. Perform a two-sample t-test for mutations in hotspot vs cancer wildtype
                ttest = scipy.stats.ttest_ind(hotspot_mut[site], wt[site])

                # 10. Record site name and p-value; both lists feed the Bonferroni correction below
                sites.append(site)
                pvalues.append(ttest[1])
    # Best-effort: a protein that cannot be processed is recorded and skipped.
    # 'except Exception' (not a bare except) so that Ctrl-C still interrupts the loop.
    except Exception:
        skipped_proteins.append(protein)

print('Proteins skipped because of errors: ' + str(len(skipped_proteins)))

# 11. After looping through all of the phosphorylation sites of each protein, split the sites by significance
#     Create a dictionary to pair the significant sites with their p-values
#     Create another dictionary for the non-significant results
sigSites = {}
unsigSites = {}

#     The Bonferroni correction is alpha/number of sites tested.
#     When nothing was tested the loop below does not run, so fall back to alpha
#     rather than dividing by zero.
alpha = 0.05
pcutoff = alpha/len(sites) if sites else alpha

#     Check every recorded p-value (including the last one) against the cutoff
for site, pvalue in zip(sites, pvalues):
    if pvalue < pcutoff:
        sigSites[site] = pvalue
    else:
        unsigSites[site] = pvalue

# 12. Print your results
print('SIGNIFICANT RESULTS:')
print(json.dumps(sigSites, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigSites, indent = 2))

## 5b. Plot the data for the most significant result

In [None]:
# Phosphosite to plot and the protein it belongs to
site = 'RPS6KC1-S423'
protein = 'RPS6KC1'

# Set up a new column describing the mutation group of each sample.
# Applied in this order: False for every row, True for the rows indexed by
# hotspot_patients, 'Wildtype' wherever the 'Mutation' column is 'Wildtype'.
phos_mutations = CPTAC.compare_mutations(phos, protein, gene)
phos_mutations['Within_Hotspot'] = False
phos_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
phos_mutations.loc[phos_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

# Select for tumor samples only
phos_mutations = phos_mutations.loc[phos_mutations['Patient_Type'] == 'Tumor']

# Create two dataframes for the two cases: phosphorylation levels when PIK3CA is mutated at a hotspot, and when PIK3CA is wildtype
# Drop any column that contains a missing value (NaN) in that group
hotspot_mut = phos_mutations.loc[phos_mutations['Within_Hotspot'] == True].dropna(axis = 1)
wt = phos_mutations.loc[phos_mutations['Within_Hotspot'] == 'Wildtype'].dropna(axis = 1)

# Two-sample t-test for hotspot mutations vs cancer wildtype at this site; the p-value goes in the title
ttest = scipy.stats.ttest_ind(hotspot_mut[site], wt[site])

# Box plot with the individual samples overlaid, both drawn on the same axes
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize = a4_dims)
boxplt = sns.boxplot(data = phos_mutations, x = 'Within_Hotspot', y = site, ax = ax)
boxplt = sns.stripplot(data = phos_mutations, x = 'Within_Hotspot', y = site, jitter = True, color = '.3', ax = ax)
boxplt.set_xlabel('Mutation Location', fontsize='20')
boxplt.set_ylabel('Protein Expression', fontsize='20')
# The plotted column is a phosphosite, so the title says phosphoproteomics
boxplt.set_title("Effect of " + gene + " mutation on " + site + " phosphoproteomics" + '\nP-value = ' + str(round(ttest[1],  6)), fontsize='22')
plt.show()



## 5c. All mutations vs cancer wildtype

In [None]:
# Parallel lists: sites[i] is the phosphosite whose t-test p-value is pvalues[i]
sites = []
pvalues = []
# Proteins whose processing raised an error (for example, no phosphoproteomics data)
skipped_proteins = []

# 1. Loop through each of the proteins in protList
for protein in protList:
    # 2. Loop through each phosphorylation site in that protein to find any that show a significant change in phosphorylation level
    #    Some proteins do not have recorded data in the phosphoproteomics data. If this is the case, we move on to the next protein in the list
    try:

        # 3. Set up a new column describing the mutation group of each sample.
        #    Applied in this order: False for every row, True for the rows indexed by
        #    hotspot_patients, 'Wildtype' wherever the 'Mutation' column is 'Wildtype'.
        phos_mutations = CPTAC.compare_mutations(phos, protein, gene)
        phos_mutations['Within_Hotspot'] = False
        phos_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
        phos_mutations.loc[phos_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

        # 4. Select for tumor samples only
        phos_mutations = phos_mutations.loc[phos_mutations['Patient_Type'] == 'Tumor']

        # 5. Create a list of the phosphosites by selecting all the columns in the dataframe,
        #    excluding the last three, which are the categorical variables
        #    'Mutation', 'Patient_Type', and 'Within_Hotspot'
        phos_sites = list(phos_mutations)[:-3]

        # 6. Create two dataframes for the two cases: phosphorylation levels when PIK3CA carries any mutation, and when PIK3CA is wildtype
        #    Drop any column that contains a missing value (NaN) in that group
        all_mut = phos_mutations.loc[phos_mutations['Within_Hotspot'] != 'Wildtype'].dropna(axis = 1)
        wt = phos_mutations.loc[phos_mutations['Within_Hotspot'] == 'Wildtype'].dropna(axis = 1)

        # 7. Loop through each site to test for a significant difference in phosphorylation levels between the two cases
        for site in phos_sites:

            # 8. Check that the site is in both dataframes. The column may have been dropped in step 6
            if site in all_mut.columns and site in wt.columns:

                # 9. Perform a two-sample t-test for all mutations vs cancer wildtype
                ttest = scipy.stats.ttest_ind(all_mut[site], wt[site])

                # 10. Record site name and p-value; both lists feed the Bonferroni correction below
                sites.append(site)
                pvalues.append(ttest[1])
    # Best-effort: a protein that cannot be processed is recorded and skipped.
    # 'except Exception' (not a bare except) so that Ctrl-C still interrupts the loop.
    except Exception:
        skipped_proteins.append(protein)

print('Proteins skipped because of errors: ' + str(len(skipped_proteins)))

# 11. After looping through all of the phosphorylation sites of each protein, split the sites by significance
#     Create a dictionary to pair the significant sites with their p-values
#     Create another dictionary for the non-significant results
sigSites = {}
unsigSites = {}

#     The Bonferroni correction is alpha/number of sites tested.
#     When nothing was tested the loop below does not run, so fall back to alpha
#     rather than dividing by zero.
alpha = 0.05
pcutoff = alpha/len(sites) if sites else alpha

#     Check every recorded p-value (including the last one) against the cutoff
for site, pvalue in zip(sites, pvalues):
    if pvalue < pcutoff:
        sigSites[site] = pvalue
    else:
        unsigSites[site] = pvalue

# 12. Print your results
print('SIGNIFICANT RESULTS:')
print(json.dumps(sigSites, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigSites, indent = 2))

## 5d. Plot the data for the most significant results

In [None]:
# Phosphosite to plot and the protein name handed to compare_mutations.
# NOTE(review): the site is prefixed 'LAMTOR1' but the protein is 'LAMTOR' --
# confirm which name compare_mutations expects.
site = 'LAMTOR1-S27'
protein = 'LAMTOR'

# Build the comparison frame and tag each sample's mutation group, in this order:
# False everywhere, True for rows indexed by hotspot_patients, then 'Wildtype'
# wherever the 'Mutation' column says 'Wildtype'.
phos_mutations = CPTAC.compare_mutations(phos, protein, gene)
phos_mutations['Within_Hotspot'] = False
phos_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
wildtype_rows = phos_mutations['Mutation'] == 'Wildtype'
phos_mutations.loc[wildtype_rows, 'Within_Hotspot'] = 'Wildtype'

# Restrict to tumor samples
tumor_rows = phos_mutations['Patient_Type'] == 'Tumor'
phos_mutations = phos_mutations.loc[tumor_rows]

# Phosphosite columns: everything except the trailing three categorical columns
# ('Mutation', 'Patient_Type', and 'Within_Hotspot')
phos_sites = list(phos_mutations)[:-3]

# Split into the two groups being compared. Here 'hotspot_mut' holds every
# sample not labelled 'Wildtype' (any PIK3CA mutation), and 'wt' holds the
# wildtype samples. Columns containing NaN within a group are dropped.
group_label = phos_mutations['Within_Hotspot']
hotspot_mut = phos_mutations.loc[group_label != 'Wildtype'].dropna(axis = 1)
wt = phos_mutations.loc[group_label == 'Wildtype'].dropna(axis = 1)

# Two-sample t-test between the groups at the chosen site
ttest = scipy.stats.ttest_ind(hotspot_mut[site], wt[site])
title_text = ("Effect of " + gene + " mutation on " + site + " phosphoproteomics"
              + '\nP-value = ' + str(round(ttest[1],  6)))

# Box plot with the individual samples overlaid as a jittered strip plot
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize = a4_dims)
boxplt = sns.boxplot(data = phos_mutations, x = 'Within_Hotspot', y = site, ax = ax)
boxplt = sns.stripplot(data = phos_mutations, x = 'Within_Hotspot', y = site, jitter = True, color = '.3')
boxplt.set_xlabel('Mutation Location', fontsize='20')
boxplt.set_ylabel('Protein Expression', fontsize='20')
boxplt.set_title(title_text, fontsize='22')
plt.show()


### We can conclude from these results that PIK3CA mutations, whether inside or outside the hotspot, do not have a significant effect on the proteomics or phosphoproteomics of its interacting proteins