# PIK3CA trans effects on known interacting proteins

We first investigated the effects of PIK3CA mutation on the proteomic and phosphoproteomic abundance of known interacting proteins. This list primarily includes proteins in the PI3K/AKT/MTOR signaling cascade, which plays a key role in cell survival and cell proliferation.

## Standard Imports

In [56]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns
import json

#import the Endometrial data from CPTAC package
import CPTAC.Endometrial as CPTAC

### Set up initial dataframes and variables

In [57]:
# Gene under study and the CPTAC endometrial tables used by the cells below.
gene = 'PIK3CA'
somatic = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

# Protein-level changes that fall inside the hotspot, according to our Hotspot3D output.
hotspot_mutations = ['p.E545A', 'p.E545K', 'p.E545V', 'p.Q546P', 'p.Q546R', 'p.E542K']

# Somatic-mutation rows that are both in the gene of interest and inside the hotspot.
is_hotspot_change = somatic['Location'].isin(hotspot_mutations)
is_target_gene = somatic['Gene'] == gene
mutated_hotspot = somatic.loc[is_hotspot_change & is_target_gene]

# Keep both identifier columns for reference; the clinical key is what the
# later cells use to label samples via `.loc[hotspot_patients, ...]`.
hotspot_patients_both_keys = mutated_hotspot[['Clinical_Patient_Key', 'Patient_Id']]
hotspot_patients = hotspot_patients_both_keys['Clinical_Patient_Key']
print(hotspot_patients)

# Default p-value cutoff for significance (the analysis cells replace it with a corrected value).
pcutoff = 0.05


160      S001
1147     S003
1426     S009
4757     S021
16613    S023
19188    S024
21386    S032
21643    S033
22656    S038
37618    S061
38352    S063
39719    S066
43588    S067
43687    S068
46859    S084
47557    S088
48398    S090
49903    S097
50016    S098
Name: Clinical_Patient_Key, dtype: object


## Find number of patients with PIK3CA mutations in/outside of the hotspot

In [58]:
# Report how many hotspot mutation rows were selected in the setup cell.
print(f"Mutations inside hotspot: {len(hotspot_patients)}")

# Note: the gene PIK3CA encodes the protein p110α, hence the variable name 'p110_mutated'.
p110_mutated = somatic.loc[somatic['Gene'] == 'PIK3CA']

# Some patients carry more than one mutation, so collapse to distinct keys first.
# Per the original author's note, the distinct keys include one 'NA' entry,
# which is why one is subtracted from the count.
distinct_patient_keys = set(p110_mutated['Clinical_Patient_Key'])
num_mutated = len(distinct_patient_keys) - 1
print(f"Total number of patients with mutations: {num_mutated}")

Mutations inside hotspot: 19
Total number of patients with mutations: 48


# Proteomic Abundance

### List of proteins that interact with PIK3CA (according to Uniprot and STRING)

In [59]:
# Candidate interacting proteins to test; a single-element list works too.
# Names with no matching column in the data are skipped by the analysis cells.
protList = [
    'IRS1', 'IRS2', 'RRAS', 'AKT2', 'NRAS',
    'PTEN', 'AKT1', 'MRAS', 'HRAS', 'RPS6KB1',
    'PIK3R1', 'PKC', 'MTOR', 'S6K', 'MAPK',
    'ERBB3', 'P85A', 'P55G', 'CDK5',
]

## Hotspot mutation vs non-hotspot mutation

In [74]:
print("Mutations inside hotspot: " + str(len(hotspot_patients)))
p110_mutated = somatic.loc[somatic['Gene'] == 'PIK3CA']

# 1. Run one t-test per protein that is actually present in the proteomics
#    dataframe and remember its p-value. The significance cutoff is computed
#    afterwards, once the real number of tests is known.
tested_pvalues = {}

for protein in protList:

    # Proteins from protList that have no column in the proteomics dataframe are not tested
    if protein not in proteomics.columns:
        continue

    # 2. Label every sample: True = PIK3CA hotspot mutation, False = other PIK3CA
    #    mutation, 'Wildtype' = no PIK3CA mutation
    p10_mutations = CPTAC.compare_mutations(proteomics, protein, gene)
    p10_mutations['Within_Hotspot'] = False
    p10_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
    p10_mutations.loc[p10_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

    # 3. Keep tumor samples only and drop samples with missing values
    p10_mutations = p10_mutations.loc[p10_mutations['Patient_Type'] == 'Tumor'].dropna(axis=0)

    in_hotspot = p10_mutations.loc[p10_mutations['Within_Hotspot'] == True]
    out_hotspot = p10_mutations.loc[p10_mutations['Within_Hotspot'] == False]

    # 4. Perform a two-sample ttest for mutations in hotspot vs mutations outside of the hotspot
    ttest = scipy.stats.ttest_ind(in_hotspot[protein], out_hotspot[protein])
    tested_pvalues[protein] = ttest[1]

# 5. Use the Bonferroni correction: divide alpha (0.05) by the number of tests
#    that were actually performed. Proteins skipped above are not tests, so they
#    must not shrink the cutoff. Fall back to alpha if nothing was tested.
num_tests = len(tested_pvalues)
pcutoff = 0.05 / num_tests if num_tests else 0.05

# 6. Split the results into significant and non-significant proteins with their
#    respective p-values
sigResults = {}
unsigResults = {}
for tested_protein, pvalue in tested_pvalues.items():
    if pvalue < pcutoff:
        sigResults[tested_protein] = pvalue
    else:
        unsigResults[tested_protein] = pvalue

# 7. Print the results
print('SIGNIFICANT RESULTS:')
# json.dumps with indent prints the dictionary one entry per line
print(json.dumps(sigResults, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigResults, indent = 2))


Mutations inside hotspot: 19
SIGNIFICANT RESULTS:
{}
 
UNSIGNIFICANT RESULTS:
{
  "IRS1": 0.8570734994915308,
  "IRS2": 0.09674094347736001,
  "RRAS": 0.025709117294969015,
  "AKT2": 0.24486222824644044,
  "NRAS": 0.579841867910798,
  "PTEN": 0.0676976304099718,
  "AKT1": 0.39764976429801524,
  "MRAS": 0.17544373477828829,
  "HRAS": 0.3114823476147295,
  "RPS6KB1": 0.44079322936855947,
  "PIK3R1": 0.9013685831466293,
  "MTOR": 0.11458881422507551,
  "ERBB3": 0.4097077317602019,
  "CDK5": 0.6123211223904736
}


## Hotspot mutation vs other mutations and cancer wildtype

In [61]:
print("Mutations inside hotspot: " + str(len(hotspot_patients)))
p110_mutated = somatic.loc[somatic['Gene'] == 'PIK3CA']

# 1. Run one t-test per protein that is actually present in the proteomics
#    dataframe and remember its p-value. The significance cutoff is computed
#    afterwards, once the real number of tests is known.
tested_pvalues = {}

for protein in protList:

    # Proteins from protList that have no column in the proteomics dataframe are not tested
    if protein not in proteomics.columns:
        continue

    # 2. Label every sample: True = PIK3CA hotspot mutation, False = other PIK3CA
    #    mutation, 'Wildtype' = no PIK3CA mutation
    p10_mutations = CPTAC.compare_mutations(proteomics, protein, gene)
    p10_mutations['Within_Hotspot'] = False
    p10_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
    p10_mutations.loc[p10_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

    # 3. Keep tumor samples only and drop samples with missing values
    p10_mutations = p10_mutations.loc[p10_mutations['Patient_Type'] == 'Tumor'].dropna(axis=0)

    in_hotspot = p10_mutations.loc[p10_mutations['Within_Hotspot'] == True]
    # Despite its name, 'wt' here holds every tumor sample that is NOT a hotspot
    # mutation: both non-hotspot PIK3CA mutations and wildtype samples.
    wt = p10_mutations.loc[p10_mutations['Within_Hotspot'] != True]

    # 4. Perform a two-sample ttest for hotspot mutations vs all other tumor samples
    #    (non-hotspot mutations and cancer wildtype together)
    ttest = scipy.stats.ttest_ind(in_hotspot[protein], wt[protein])
    tested_pvalues[protein] = ttest[1]

# 5. Use the Bonferroni correction: divide alpha (0.05) by the number of tests
#    that were actually performed. Proteins skipped above are not tests, so they
#    must not shrink the cutoff. Fall back to alpha if nothing was tested.
num_tests = len(tested_pvalues)
pcutoff = 0.05 / num_tests if num_tests else 0.05

# 6. Split the results into significant and non-significant proteins with their
#    respective p-values
sigResults = {}
unsigResults = {}
for tested_protein, pvalue in tested_pvalues.items():
    if pvalue < pcutoff:
        sigResults[tested_protein] = pvalue
    else:
        unsigResults[tested_protein] = pvalue

# 7. Print the results
print('SIGNIFICANT RESULTS:')
# json.dumps with indent prints the dictionary one entry per line
print(json.dumps(sigResults, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigResults, indent = 2))


Mutations inside hotspot: 19
SIGNIFICANT RESULTS:
{}
 
UNSIGNIFICANT RESULTS:
{
  "IRS1-S1005": 0.1538874422084835,
  "IRS1-S270": 0.273498035871859,
  "IRS1-S527": 0.6576950639533548,
  "IRS2-S1100": 0.7932403796403827,
  "IRS2-S365": 0.2190537914574149,
  "RRAS2-S186": 0.06560223604200595,
  "PTEN-S467": 0.5573473385330219,
  "AKT1-S126": 0.7527767230674126,
  "AKT1-S129": 0.6225282061459592,
  "AKT1S1-S222": 0.40990754364672066,
  "AKT1S1-S223": 0.5472780915404707,
  "AKT1S1-S232": 0.890724203697261,
  "LAMTOR1-S27": 0.8062980865678622,
  "RPS6KA1-S372": 0.10432653425215277,
  "RPS6KA3-S369": 0.44692002190398494,
  "RPS6KA3-S415": 0.4922566425312921,
  "RPS6KA3-T365": 0.5308755964582641,
  "RPS6KA4-S343": 0.50715754989591,
  "RPS6KA4-S347": 0.4913203854756516,
  "RPS6KA4-S682": 0.12699833922172207,
  "RPS6KA4-S745": 0.5186282931858306,
  "RPS6KC1-S423": 0.004588149271431776,
  "RPS6KC1-S667": 0.5232936781331718,
  "MAPK1-Y187": 0.8677251063910387,
  "MAPK14-Y182": 0.9778508055313739

## Hotspot mutations vs cancer wildtype

In [68]:
print("Mutations inside hotspot: " + str(len(hotspot_patients)))
p110_mutated = somatic.loc[somatic['Gene'] == 'PIK3CA']

# 1. Run one t-test per protein that is actually present in the proteomics
#    dataframe and remember its p-value. The significance cutoff is computed
#    afterwards, once the real number of tests is known.
tested_pvalues = {}

for protein in protList:

    # Proteins from protList that have no column in the proteomics dataframe are not tested
    if protein not in proteomics.columns:
        continue

    # 2. Label every sample: True = PIK3CA hotspot mutation, False = other PIK3CA
    #    mutation, 'Wildtype' = no PIK3CA mutation
    p10_mutations = CPTAC.compare_mutations(proteomics, protein, gene)
    p10_mutations['Within_Hotspot'] = False
    p10_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
    p10_mutations.loc[p10_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

    # 3. Keep tumor samples only and drop samples with missing values
    p10_mutations = p10_mutations.loc[p10_mutations['Patient_Type'] == 'Tumor'].dropna(axis=0)

    in_hotspot = p10_mutations.loc[p10_mutations['Within_Hotspot'] == True]
    wt = p10_mutations.loc[p10_mutations['Within_Hotspot'] == 'Wildtype']

    # 4. Perform a two-sample ttest for hotspot mutations vs cancer wildtype
    #    (non-hotspot mutations are left out of this comparison)
    ttest = scipy.stats.ttest_ind(in_hotspot[protein], wt[protein])
    tested_pvalues[protein] = ttest[1]

# 5. Use the Bonferroni correction: divide alpha (0.05) by the number of tests
#    that were actually performed. Proteins skipped above are not tests, so they
#    must not shrink the cutoff. Fall back to alpha if nothing was tested.
num_tests = len(tested_pvalues)
pcutoff = 0.05 / num_tests if num_tests else 0.05

# 6. Split the results into significant and non-significant proteins with their
#    respective p-values
sigResults = {}
unsigResults = {}
for tested_protein, pvalue in tested_pvalues.items():
    if pvalue < pcutoff:
        sigResults[tested_protein] = pvalue
    else:
        unsigResults[tested_protein] = pvalue

# 7. Print the results
print('SIGNIFICANT RESULTS:')
# json.dumps with indent prints the dictionary one entry per line
print(json.dumps(sigResults, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigResults, indent = 2))


Mutations inside hotspot: 19
         IRS1  Mutation Patient_Type Within_Hotspot
S002 -0.79300  Wildtype        Tumor       Wildtype
S004 -0.18000  Wildtype        Tumor       Wildtype
S005 -0.51900  Wildtype        Tumor       Wildtype
S006 -1.46000  Wildtype        Tumor       Wildtype
S007  1.24000  Wildtype        Tumor       Wildtype
S008 -0.03510  Wildtype        Tumor       Wildtype
S011  0.18600  Wildtype        Tumor       Wildtype
S013 -0.83500  Wildtype        Tumor       Wildtype
S015  0.15000  Wildtype        Tumor       Wildtype
S016 -1.01000  Wildtype        Tumor       Wildtype
S017 -1.49000  Wildtype        Tumor       Wildtype
S019  1.65000  Wildtype        Tumor       Wildtype
S020  0.16900  Wildtype        Tumor       Wildtype
S025  0.51300  Wildtype        Tumor       Wildtype
S029 -0.08520  Wildtype        Tumor       Wildtype
S034  0.61500  Wildtype        Tumor       Wildtype
S035  0.45400  Wildtype        Tumor       Wildtype
S037  0.13300  Wildtype        Tumo

         AKT1  Mutation Patient_Type Within_Hotspot
S002  0.04530  Wildtype        Tumor       Wildtype
S004 -1.07000  Wildtype        Tumor       Wildtype
S005  0.09680  Wildtype        Tumor       Wildtype
S006 -0.64100  Wildtype        Tumor       Wildtype
S007 -0.41900  Wildtype        Tumor       Wildtype
S008  0.14300  Wildtype        Tumor       Wildtype
S011 -0.05910  Wildtype        Tumor       Wildtype
S013 -0.61500  Wildtype        Tumor       Wildtype
S015 -1.16000  Wildtype        Tumor       Wildtype
S016 -0.69900  Wildtype        Tumor       Wildtype
S017 -0.16200  Wildtype        Tumor       Wildtype
S019 -0.09330  Wildtype        Tumor       Wildtype
S020  0.25700  Wildtype        Tumor       Wildtype
S025 -0.29600  Wildtype        Tumor       Wildtype
S029  0.40000  Wildtype        Tumor       Wildtype
S034  0.01540  Wildtype        Tumor       Wildtype
S035  0.76500  Wildtype        Tumor       Wildtype
S037  0.01140  Wildtype        Tumor       Wildtype
S039 -0.0365

         CDK5  Mutation Patient_Type Within_Hotspot
S002 -0.01720  Wildtype        Tumor       Wildtype
S004 -0.33400  Wildtype        Tumor       Wildtype
S005 -0.01950  Wildtype        Tumor       Wildtype
S006 -0.29700  Wildtype        Tumor       Wildtype
S007  0.80700  Wildtype        Tumor       Wildtype
S008 -0.02650  Wildtype        Tumor       Wildtype
S011  0.17200  Wildtype        Tumor       Wildtype
S013 -0.55300  Wildtype        Tumor       Wildtype
S015 -0.06400  Wildtype        Tumor       Wildtype
S016  0.18600  Wildtype        Tumor       Wildtype
S017  0.45800  Wildtype        Tumor       Wildtype
S019  0.52800  Wildtype        Tumor       Wildtype
S020 -0.13700  Wildtype        Tumor       Wildtype
S025  0.18300  Wildtype        Tumor       Wildtype
S029  0.47600  Wildtype        Tumor       Wildtype
S034  0.00183  Wildtype        Tumor       Wildtype
S035 -0.21600  Wildtype        Tumor       Wildtype
S037  0.01830  Wildtype        Tumor       Wildtype
S039  0.0166

## All mutations vs cancer wildtype

In [73]:
print("Mutations inside hotspot: " + str(len(hotspot_patients)))
p110_mutated = somatic.loc[somatic['Gene'] == 'PIK3CA']

# 1. Run one t-test per protein that is actually present in the proteomics
#    dataframe and remember its p-value. The significance cutoff is computed
#    afterwards, once the real number of tests is known.
tested_pvalues = {}

for protein in protList:

    # Proteins from protList that have no column in the proteomics dataframe are not tested
    if protein not in proteomics.columns:
        continue

    # 2. Label every sample: True = PIK3CA hotspot mutation, False = other PIK3CA
    #    mutation, 'Wildtype' = no PIK3CA mutation
    p10_mutations = CPTAC.compare_mutations(proteomics, protein, gene)
    p10_mutations['Within_Hotspot'] = False
    p10_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
    p10_mutations.loc[p10_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

    # 3. Keep tumor samples only and drop samples with missing values
    p10_mutations = p10_mutations.loc[p10_mutations['Patient_Type'] == 'Tumor'].dropna(axis=0)

    # Every sample that is not wildtype, i.e. hotspot and non-hotspot mutations together
    all_mut = p10_mutations.loc[p10_mutations['Within_Hotspot'] != 'Wildtype']
    wt = p10_mutations.loc[p10_mutations['Within_Hotspot'] == 'Wildtype']

    # 4. Perform a two-sample ttest for all PIK3CA mutations vs cancer wildtype
    ttest = scipy.stats.ttest_ind(all_mut[protein], wt[protein])
    tested_pvalues[protein] = ttest[1]

# 5. Use the Bonferroni correction: divide alpha (0.05) by the number of tests
#    that were actually performed. Proteins skipped above are not tests, so they
#    must not shrink the cutoff. Fall back to alpha if nothing was tested.
num_tests = len(tested_pvalues)
pcutoff = 0.05 / num_tests if num_tests else 0.05

# 6. Split the results into significant and non-significant proteins with their
#    respective p-values
sigResults = {}
unsigResults = {}
for tested_protein, pvalue in tested_pvalues.items():
    if pvalue < pcutoff:
        sigResults[tested_protein] = pvalue
    else:
        unsigResults[tested_protein] = pvalue

# 7. Print the results
print('SIGNIFICANT RESULTS:')
# json.dumps with indent prints the dictionary one entry per line
print(json.dumps(sigResults, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigResults, indent = 2))


Mutations inside hotspot: 19
0.3035716066390899
0.07027134773500125
0.3698729088560708
0.0037875613802568534
0.8373527469808126
0.20302505456125847
0.8022219785434666
0.6105204981755263
0.5580623705425318
0.031195095038079188
0.03475975535096763
0.12697356066680146
0.26427162128979986
0.0158825119935403
SIGNIFICANT RESULTS:
{}
 
UNSIGNIFICANT RESULTS:
{
  "IRS1": 0.3035716066390899,
  "IRS2": 0.07027134773500125,
  "RRAS": 0.3698729088560708,
  "AKT2": 0.0037875613802568534,
  "NRAS": 0.8373527469808126,
  "PTEN": 0.20302505456125847,
  "AKT1": 0.8022219785434666,
  "MRAS": 0.6105204981755263,
  "HRAS": 0.5580623705425318,
  "RPS6KB1": 0.031195095038079188,
  "PIK3R1": 0.03475975535096763,
  "MTOR": 0.12697356066680146,
  "ERBB3": 0.26427162128979986,
  "CDK5": 0.0158825119935403
}


# Phosphoproteomic Abundance

## Hotspot mutation vs non-hotspot mutation

In [64]:
# Parallel lists: sites[k] is a tested phosphosite and pvalues[k] is its t-test p-value.
# The Bonferroni cutoff is computed after the loop, once the number of tested sites is known.
sites = []
pvalues = []

# 1. Loop through each of the proteins in the list of interacting proteins
for protein in protList:
    # 2. Some proteins do not have recorded data in the phosphoproteomics data.
    #    If this is the case, we move on to the next protein in the list
    try:

        # 3. Label every sample: True = PIK3CA hotspot mutation, False = other PIK3CA
        #    mutation, 'Wildtype' = no PIK3CA mutation
        phos_mutations = CPTAC.compare_mutations(phos, protein, gene)
        phos_mutations['Within_Hotspot'] = False
        phos_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
        phos_mutations.loc[phos_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

        # 4. Select for tumor samples only
        phos_mutations = phos_mutations.loc[phos_mutations['Patient_Type'] == 'Tumor']

        # 5. Create a list of the phosphosites by taking all the columns in the dataframe
        #    except the last three, which are the categorical variables
        #    'Mutation', 'Patient_Type', and 'Within_Hotspot'
        phos_sites = list(phos_mutations)
        phos_sites = phos_sites[:(len(phos_sites) - 3)]

        # 6. Create two dataframes for the two cases: PIK3CA mutated at a hotspot, and
        #    PIK3CA mutated outside the hotspot. Within each group, drop every column
        #    (site) that contains a NaN ('Not a Number') value.
        hotspot_mut = phos_mutations.loc[phos_mutations['Within_Hotspot'] == True].dropna(axis = 1)
        non_hotspot_mut = phos_mutations.loc[phos_mutations['Within_Hotspot'] == False].dropna(axis = 1)

        # 7. Loop through each site to test for a significant difference in phosphorylation level
        for site in phos_sites:

            # 8. Only test sites present in both dataframes; a column may have been dropped in step 6
            if site in hotspot_mut.columns and site in non_hotspot_mut.columns:

                # 9. Perform a two-sample ttest for mutations in hotspot vs mutations outside of the hotspot
                ttest = scipy.stats.ttest_ind(hotspot_mut[site], non_hotspot_mut[site])

                # 10. Record the site name and p-value for the Bonferroni step below
                sites.append(site)
                pvalues.append(ttest[1])
    # Error caught in the case that the protein had no data in the phosphoproteomics data.
    # 'Exception' instead of a bare 'except' so Ctrl-C / interpreter exit are not swallowed.
    except Exception:
        print('')


# 11. After looping through all of the phosphorylation sites of each protein, pair each
#     site with its p-value and sort it into significant or non-significant results
sigSites = {}
unsigSites = {}

#     The Bonferroni correction is alpha/number of sites tested
#     (fall back to alpha when no site was tested, to avoid dividing by zero)
alpha = 0.05
pcutoff = alpha / len(sites) if sites else alpha

#     Check every tested site, including the last one
for tested_site, pvalue in zip(sites, pvalues):
    if pvalue < pcutoff:
        sigSites[tested_site] = pvalue
    else:
        unsigSites[tested_site] = pvalue

# 12. Print your results
print('SIGNIFICANT RESULTS:')
print(json.dumps(sigSites, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigSites, indent = 2))

    


Gene MRAS not found in phosphoproteomics data

Gene HRAS not found in phosphoproteomics data

Gene P85A not found in phosphoproteomics data

Gene P55G not found in phosphoproteomics data

SIGNIFICANT RESULTS:
{}
 
UNSIGNIFICANT RESULTS:
{
  "IRS1-S1005": 0.17528948470055017,
  "IRS1-S270": 0.03007352043700034,
  "IRS1-S527": 0.8131561848131362,
  "IRS2-S1100": 0.7645665883505008,
  "IRS2-S365": 0.4781886740079353,
  "RRAS2-S186": 0.17119748527873915,
  "PTEN-S467": 0.9294994804576602,
  "AKT1-S126": 0.7486950797373193,
  "AKT1-S129": 0.5916370158032049,
  "AKT1S1-S222": 0.5918285139058572,
  "AKT1S1-S223": 0.43267755057076984,
  "AKT1S1-S232": 0.7772468446134322,
  "LAMTOR1-S27": 0.18293732469580826,
  "RPS6KA1-S372": 0.05339162019092151,
  "RPS6KA3-S369": 0.4210133831570302,
  "RPS6KA3-S415": 0.5628514553056778,
  "RPS6KA3-T365": 0.4558365802220815,
  "RPS6KA4-S343": 0.7768822975837044,
  "RPS6KA4-S347": 0.9362529697434308,
  "RPS6KA4-S682": 0.28305324636110585,
  "RPS6KA4-S745": 0.26

## Hotspot mutation vs cancer wildtype

In [65]:
# Parallel lists: sites[k] is a tested phosphosite and pvalues[k] is its t-test p-value.
sites = []
pvalues = []


# 1. Loop through each of the proteins in the list of interacting proteins
for protein in protList:
    # 2. Some proteins do not have recorded data in the phosphoproteomics data.
    #    If this is the case, we move on to the next protein in the list
    try:

        # 3. Label every sample: True = PIK3CA hotspot mutation, False = other PIK3CA
        #    mutation, 'Wildtype' = no PIK3CA mutation
        phos_mutations = CPTAC.compare_mutations(phos, protein, gene)
        phos_mutations['Within_Hotspot'] = False
        phos_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
        phos_mutations.loc[phos_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

        # 4. Select for tumor samples only
        phos_mutations = phos_mutations.loc[phos_mutations['Patient_Type'] == 'Tumor']

        # 5. Create a list of the phosphosites by taking all the columns in the dataframe
        #    except the last three, which are the categorical variables
        #    'Mutation', 'Patient_Type', and 'Within_Hotspot'
        phos_sites = list(phos_mutations)
        phos_sites = phos_sites[:(len(phos_sites) - 3)]

        # 6. Create two dataframes for the two cases: PIK3CA mutated at a hotspot, and
        #    PIK3CA wildtype. Within each group, drop every column (site) that
        #    contains a NaN ('Not a Number') value.
        hotspot_mut = phos_mutations.loc[phos_mutations['Within_Hotspot'] == True].dropna(axis = 1)
        wt = phos_mutations.loc[phos_mutations['Within_Hotspot'] == 'Wildtype'].dropna(axis = 1)

        # 7. Loop through each site to test for a significant difference in phosphorylation level
        for site in phos_sites:

            # 8. Only test sites present in both dataframes; a column may have been dropped in step 6
            if site in hotspot_mut.columns and site in wt.columns:

                # 9. Perform a two-sample ttest for mutations in hotspot vs cancer wildtype
                ttest = scipy.stats.ttest_ind(hotspot_mut[site], wt[site])

                # 10. Record the site name and p-value for the Bonferroni step below
                sites.append(site)
                pvalues.append(ttest[1])
    # Error caught in the case that the protein had no data in the phosphoproteomics data.
    # 'Exception' instead of a bare 'except' so Ctrl-C / interpreter exit are not swallowed.
    except Exception:
        print('')


# 11. After looping through all of the phosphorylation sites of each protein, pair each
#     site with its p-value and sort it into significant or non-significant results
sigSites = {}
unsigSites = {}

#     The Bonferroni correction is alpha/number of sites tested
#     (fall back to alpha when no site was tested, to avoid dividing by zero)
alpha = 0.05
pcutoff = alpha / len(sites) if sites else alpha

#     Check every tested site, including the last one
for tested_site, pvalue in zip(sites, pvalues):
    if pvalue < pcutoff:
        sigSites[tested_site] = pvalue
    else:
        unsigSites[tested_site] = pvalue

# 12. Print your results
print('SIGNIFICANT RESULTS:')
print(json.dumps(sigSites, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigSites, indent = 2))

Gene MRAS not found in phosphoproteomics data

Gene HRAS not found in phosphoproteomics data

Gene P85A not found in phosphoproteomics data

Gene P55G not found in phosphoproteomics data

SIGNIFICANT RESULTS:
{}
 
UNSIGNIFICANT RESULTS:
{
  "IRS1-S1005": 0.19417303251270715,
  "IRS1-S270": 0.6123346440221398,
  "IRS1-S527": 0.6202579866122413,
  "IRS2-S1100": 0.5593582798259669,
  "IRS2-S365": 0.1480060586799261,
  "RRAS2-S186": 0.0681929188803574,
  "PTEN-S467": 0.3443244276059154,
  "AKT1-S126": 0.7883661163452153,
  "AKT1-S129": 0.6794351192511748,
  "AKT1S1-S222": 0.3638299279998196,
  "AKT1S1-S223": 0.6732770531436207,
  "AKT1S1-S232": 0.962703662241644,
  "LAMTOR1-S27": 0.7161195316132374,
  "RPS6KA1-S372": 0.2096652012691004,
  "RPS6KA3-S369": 0.5043949130038693,
  "RPS6KA3-S415": 0.5282218763149932,
  "RPS6KA3-T365": 0.6360891945377386,
  "RPS6KA4-S343": 0.4171864166971315,
  "RPS6KA4-S347": 0.32314014370600697,
  "RPS6KA4-S682": 0.11122422076457351,
  "RPS6KA4-S745": 0.7614522

## All mutations vs cancer wildtype

In [66]:
# Parallel lists: sites[k] is a tested phosphosite and pvalues[k] is its t-test p-value.
sites = []
pvalues = []


# 1. Loop through each of the proteins in the list of interacting proteins
for protein in protList:
    # 2. Some proteins do not have recorded data in the phosphoproteomics data.
    #    If this is the case, we move on to the next protein in the list
    try:

        # 3. Label every sample: True = PIK3CA hotspot mutation, False = other PIK3CA
        #    mutation, 'Wildtype' = no PIK3CA mutation
        phos_mutations = CPTAC.compare_mutations(phos, protein, gene)
        phos_mutations['Within_Hotspot'] = False
        phos_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
        phos_mutations.loc[phos_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

        # 4. Select for tumor samples only
        phos_mutations = phos_mutations.loc[phos_mutations['Patient_Type'] == 'Tumor']

        # 5. Create a list of the phosphosites by taking all the columns in the dataframe
        #    except the last three, which are the categorical variables
        #    'Mutation', 'Patient_Type', and 'Within_Hotspot'
        phos_sites = list(phos_mutations)
        phos_sites = phos_sites[:(len(phos_sites) - 3)]

        # 6. Create two dataframes for the two cases: any PIK3CA mutation (hotspot or not),
        #    and PIK3CA wildtype. Within each group, drop every column (site) that
        #    contains a NaN ('Not a Number') value.
        all_mut = phos_mutations.loc[phos_mutations['Within_Hotspot'] != 'Wildtype'].dropna(axis = 1)
        wt = phos_mutations.loc[phos_mutations['Within_Hotspot'] == 'Wildtype'].dropna(axis = 1)

        # 7. Loop through each site to test for a significant difference in phosphorylation level
        for site in phos_sites:

            # 8. Only test sites present in both dataframes; a column may have been dropped in step 6
            if site in all_mut.columns and site in wt.columns:

                # 9. Perform a two-sample ttest for all mutations vs cancer wildtype
                ttest = scipy.stats.ttest_ind(all_mut[site], wt[site])

                # 10. Record the site name and p-value for the Bonferroni step below
                sites.append(site)
                pvalues.append(ttest[1])
    # Error caught in the case that the protein had no data in the phosphoproteomics data.
    # 'Exception' instead of a bare 'except' so Ctrl-C / interpreter exit are not swallowed.
    except Exception:
        print('')


# 11. After looping through all of the phosphorylation sites of each protein, pair each
#     site with its p-value and sort it into significant or non-significant results
sigSites = {}
unsigSites = {}

#     The Bonferroni correction is alpha/number of sites tested
#     (fall back to alpha when no site was tested, to avoid dividing by zero)
alpha = 0.05
pcutoff = alpha / len(sites) if sites else alpha

#     Check every tested site, including the last one
for tested_site, pvalue in zip(sites, pvalues):
    if pvalue < pcutoff:
        sigSites[tested_site] = pvalue
    else:
        unsigSites[tested_site] = pvalue

# 12. Print your results
print('SIGNIFICANT RESULTS:')
print(json.dumps(sigSites, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigSites, indent = 2))

Gene MRAS not found in phosphoproteomics data

Gene HRAS not found in phosphoproteomics data

Gene P85A not found in phosphoproteomics data

Gene P55G not found in phosphoproteomics data

SIGNIFICANT RESULTS:
{}
 
UNSIGNIFICANT RESULTS:
{
  "IRS1-S1005": 0.7819473288725783,
  "IRS1-S270": 0.420671984317987,
  "IRS1-S527": 0.6092510914703817,
  "IRS2-S1100": 0.29051612765254076,
  "IRS2-S365": 0.17536896672599964,
  "RRAS2-S186": 0.17220819173708915,
  "PTEN-S467": 0.19026606659216647,
  "AKT1-S126": 0.9132179738567242,
  "AKT1-S129": 0.890024535674317,
  "AKT1S1-S222": 0.5206482315169443,
  "AKT1S1-S223": 0.8293345010001284,
  "AKT1S1-S232": 0.8776613495892818,
  "LAMTOR1-S27": 0.11619202055339348,
  "RPS6KA1-S372": 0.9288019759523318,
  "RPS6KA3-S369": 0.8187216070129072,
  "RPS6KA3-S415": 0.6959015001149395,
  "RPS6KA3-T365": 0.8319894488679778,
  "RPS6KA4-S343": 0.43504258422238407,
  "RPS6KA4-S347": 0.2202764845546568,
  "RPS6KA4-S682": 0.14167192742230666,
  "RPS6KA4-S745": 0.6311

## Hotspot mutation vs other mutations and cancer wildtype

In [67]:
# Parallel lists: sites[k] is a tested phosphosite and pvalues[k] is its t-test p-value.
sites = []
pvalues = []


# 1. Loop through each of the proteins in the list of interacting proteins
for protein in protList:
    # 2. Some proteins do not have recorded data in the phosphoproteomics data.
    #    If this is the case, we move on to the next protein in the list
    try:

        # 3. Label every sample: True = PIK3CA hotspot mutation, False = other PIK3CA
        #    mutation, 'Wildtype' = no PIK3CA mutation
        phos_mutations = CPTAC.compare_mutations(phos, protein, gene)
        phos_mutations['Within_Hotspot'] = False
        phos_mutations.loc[hotspot_patients, 'Within_Hotspot'] = True
        phos_mutations.loc[phos_mutations['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'

        # 4. Select for tumor samples only
        phos_mutations = phos_mutations.loc[phos_mutations['Patient_Type'] == 'Tumor']

        # 5. Create a list of the phosphosites by taking all the columns in the dataframe
        #    except the last three, which are the categorical variables
        #    'Mutation', 'Patient_Type', and 'Within_Hotspot'
        phos_sites = list(phos_mutations)
        phos_sites = phos_sites[:(len(phos_sites) - 3)]

        # 6. Create two dataframes for the two cases: PIK3CA mutated at a hotspot, and
        #    every other tumor sample. Despite its name, 'wt' here holds both
        #    non-hotspot PIK3CA mutations and wildtype samples. Within each group,
        #    drop every column (site) that contains a NaN ('Not a Number') value.
        hotspot_mut = phos_mutations.loc[phos_mutations['Within_Hotspot'] == True].dropna(axis = 1)
        wt = phos_mutations.loc[phos_mutations['Within_Hotspot'] != True ].dropna(axis = 1)

        # 7. Loop through each site to test for a significant difference in phosphorylation level
        for site in phos_sites:

            # 8. Only test sites present in both dataframes; a column may have been dropped in step 6
            if site in hotspot_mut.columns and site in wt.columns:

                # 9. Perform a two-sample ttest for mutations in hotspot vs mutations outside of the hotspot and cancer wildtype
                ttest = scipy.stats.ttest_ind(hotspot_mut[site], wt[site])

                # 10. Record the site name and p-value for the Bonferroni step below
                sites.append(site)
                pvalues.append(ttest[1])
    # Error caught in the case that the protein had no data in the phosphoproteomics data.
    # 'Exception' instead of a bare 'except' so Ctrl-C / interpreter exit are not swallowed.
    except Exception:
        print('')


# 11. After looping through all of the phosphorylation sites of each protein, pair each
#     site with its p-value and sort it into significant or non-significant results
sigSites = {}
unsigSites = {}

#     The Bonferroni correction is alpha/number of sites tested
#     (fall back to alpha when no site was tested, to avoid dividing by zero)
alpha = 0.05
pcutoff = alpha / len(sites) if sites else alpha

#     Check every tested site, including the last one
for tested_site, pvalue in zip(sites, pvalues):
    if pvalue < pcutoff:
        sigSites[tested_site] = pvalue
    else:
        unsigSites[tested_site] = pvalue

# 12. Print your results
print('SIGNIFICANT RESULTS:')
print(json.dumps(sigSites, indent = 2))
print(' ')
print('UNSIGNIFICANT RESULTS:')
print(json.dumps(unsigSites, indent = 2))


Gene MRAS not found in phosphoproteomics data

Gene HRAS not found in phosphoproteomics data

Gene P85A not found in phosphoproteomics data

Gene P55G not found in phosphoproteomics data

SIGNIFICANT RESULTS:
{}
 
UNSIGNIFICANT RESULTS:
{
  "IRS1-S1005": 0.1538874422084835,
  "IRS1-S270": 0.273498035871859,
  "IRS1-S527": 0.6576950639533548,
  "IRS2-S1100": 0.7932403796403827,
  "IRS2-S365": 0.2190537914574149,
  "RRAS2-S186": 0.06560223604200595,
  "PTEN-S467": 0.5573473385330219,
  "AKT1-S126": 0.7527767230674126,
  "AKT1-S129": 0.6225282061459592,
  "AKT1S1-S222": 0.40990754364672066,
  "AKT1S1-S223": 0.5472780915404707,
  "AKT1S1-S232": 0.890724203697261,
  "LAMTOR1-S27": 0.8062980865678622,
  "RPS6KA1-S372": 0.10432653425215277,
  "RPS6KA3-S369": 0.44692002190398494,
  "RPS6KA3-S415": 0.4922566425312921,
  "RPS6KA3-T365": 0.5308755964582641,
  "RPS6KA4-S343": 0.50715754989591,
  "RPS6KA4-S347": 0.4913203854756516,
  "RPS6KA4-S682": 0.12699833922172207,
  "RPS6KA4-S745": 0.51862829

### We can conclude from these results that PIK3CA mutations, whether inside or outside the hotspot, do not have a significant effect on the proteomics or phosphoproteomics of its interacting proteins.