# Statistical Testing Templates for CPTAC Data

<b>Standard imports for playing with and plotting data frames.</b>

In [40]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
#import altair as alt
#alt.renderers.enable('notebook') #Necessary for Altair to work

<b>Note:</b> You need to run this code (only once) in the command line before the above will work: 
<b>conda install -c conda-forge altair vega_datasets notebook vega</b>

In [41]:
import CPTAC

In [42]:
somatic_mutations = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

### Test for protein levels (mutation in a gene with protein levels of any protein, including itself)
<b>Note:</b> You can put any number of proteins in the list and it will compare that gene with the protein levels of all of them

In [34]:
#Build the protein list; this may have only a single protein if desired
protList = ['IRS1', 'IRS2', 'RRAS', 'AKT2', 'NRAS', 'PTEN', 'AKT1', 'MRAS', 'HRAS', 'RPS6KB1', 'PIK3R1', 'PKC', 'MTOR', 'S6K', 'MAPK']

In [35]:
gene = 'PIK3CA'

significantResults = [];
for protein in protList: 
    if protein in proteomics.columns:
        cross = CPTAC.merge_mutations(proteomics, protein, gene)
        cross = cross[["Mutation", protein]].dropna(axis=0)
        mutated = cross.loc[cross["Mutation"] != "Wildtype"]
        wt = cross.loc[cross["Mutation"] == "Wildtype"]
        ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
        if(ttest[1] <= pcutoff):
            significantResults.append(protein)
            print("Test for " + protein + ": ")
            print(ttest) 

Test for IRS1: 
Ttest_indResult(statistic=-3.624921787264356, pvalue=0.00040553855122213927)
Test for IRS2: 
Ttest_indResult(statistic=-4.305242968682799, pvalue=3.4083077260524325e-05)
Test for RRAS: 
Ttest_indResult(statistic=-4.788321134119203, pvalue=4.279593710269613e-06)
Test for AKT2: 
Ttest_indResult(statistic=-4.5103219547783135, pvalue=1.3713846510248371e-05)
Test for PTEN: 
Ttest_indResult(statistic=-4.112896462551923, pvalue=6.679197651007237e-05)
Test for MTOR: 
Ttest_indResult(statistic=-3.270682419291185, pvalue=0.0013550626865203245)


### List of significantly affected proteins

In [36]:
print(sigList)

[]


### Plot protein levels of any protein vs. mutation at any gene

In [37]:
#Specify the gene and protein you wish to plot
gene = 'PIK3CA'
protein = 'CTNNB1'

#Create the dataframe
genedf = somatic_mutations[gene].to_frame()
proteindf = proteomics[protein].to_frame()
proteindfName = protein + " protein" #Technically only necessary when the gene and the protein have the same name
proteindf.columns = [proteindfName]
cross = genedf.add(proteindf, fill_value=0).dropna(axis=0)

#Plot the data
somatic_boxplot = sns.boxplot(data=cross, x=gene ,y=proteindfName)
somatic_boxplot.set_title(gene + " gene mutation and " + protein + " protein abundance")
somatic_boxplot = sns.stripplot(data=cross, x=gene, y=proteindfName,jitter=True, color=".3")
somatic_boxplot.set(xlabel="Somatic Gene Mutation",ylabel="Proteomics")

KeyError: 'PIK3CA'

### Test for phosphorylation levels (mutation in any gene with phosphorylation of any protein)

In [43]:
#List of proteins (will test all phosphorylation sites on these proteins)
phosProtList = protList

In [46]:
phos = phos[1:100]

In [47]:
sites = phos.columns;
significantPhosResults = [];
totalSites = 0


for protein in phosProtList:
    matchesdf = phos.filter(regex=protein)
    totalSites += len(matchesdf.columns)

pcutoff = 0.05 / totalSites

    

for protein in phosProtList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.merge_mutations(phos, protein, gene)
        for site in phosphositesdf.columns:
            if (site is not 'Mutation'):
                sitedf = CPTAC.merge_mutations(phos, site, gene)
                mutateddf = sitedf.loc[sitedf['Mutation'] == 'Missense_Mutation'].dropna(axis=0)
                wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)
                ttest = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])
                if(ttest[1] <= pcutoff):
                    #significantResults.append(site)
                    print("Test for " + site + ": ")
                    print(ttest) 

AttributeError: 'DataFrame' object has no attribute 'name'

### List of significantly affected phosphorylation sites

In [None]:
print(sigPhosResults)

### Plot phosphorylation levels and gene mutation
<b>Note:</b> There may be fewer data points due to NA values

In [None]:
#Specify the gene and the site; you may use a string to specify the site or reference the significant results above
gene = 'CTNNB1'
site = sigPhosResults[0]

#Build the dataframe for plotting
genedf = somatic_mutations[gene].to_frame()
sites = phos.filter(regex=site)
genedf = genedf.add(sites, fill_value=0)

phos_boxplot = sns.boxplot(data=genedf, x=gene ,y=site)
phos_boxplot.set_title(gene + " gene mutation and " + site + " phosphorylation levels")
phos_boxplot = sns.stripplot(data=genedf, x=gene, y=site,jitter=True, color=".3")
phos_boxplot.set(xlabel="Somatic Gene Mutation",ylabel="Phosphoproteomics")

### Test for phosphorylation levels and protein abundance

In [None]:
#Specify the pattern for all phosphorylation sites as well as the protein of interest
phosProt = phos.filter(regex='CTNNB1') 
protein = 'CTNNB1'
sigPhosProtResults = []

#Build dataframe with protein levels and phosphorylation levels
proteindf = proteomics[protein].to_frame()
phosProtCross = proteindf.add(phosProt, fill_value=0)

#Bonferroni correction
pcutoff = 0.05/len(cross.columns)

#Test each site for significance and print the results
for loc in phosProtCross.columns:
    if not loc == protein:
        oneSitedf = phosProtCross[[loc, protein]].dropna(axis=0)
        pearsonresult = pearsonr(oneSitedf[loc], oneSitedf[protein])
        if(pearsonresult[1] < pcutoff):
            sigPhosProtResults.append(loc)
            print("Results for " + loc)
            print(pearsonresult)

### Scatterplot of phosphorylation levels vs. protein abundance (mutation included)

In [None]:
#Specify a protein, a gene and a site; you may use a string to specify the site or reference sigPhosProtResults above
protein = 'CTNNB1'
gene = 'CTNNB1'
site = sigPhosProtResults[1]

#Build the dataframe for plotting
protdf = proteomics[protein].to_frame()
protdfName = protein + " protein levels" #Technically only necessary when the gene and the protein have the same name
protdf.columns = [protdfName]
sites = phos.filter(regex=site)
protdf = protdf.add(sites, fill_value=0)

genedf = somatic_mutations[gene].to_frame()
genedfName = gene + " mutation"
genedf.columns = [genedfName]
finaldf = protdf.add(genedf, fill_value=0)

alt.Chart(finaldf).mark_point().encode(
    alt.X(protdfName),
    alt.Y(site),
    alt.Color(genedfName, type='nominal')
).interactive()