# Statistical Testing Templates for CPTAC Data

<b>Standard imports for playing with and plotting data frames.</b>

In [63]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
import statsmodels.stats.multitest
#import altair as alt
#alt.renderers.enable('notebook') #Necessary for Altair to work

In [64]:
import CPTAC

In [65]:
# Load the three CPTAC tables used throughout this notebook.
# NOTE(review): the shape and indexing of these tables are not visible here;
# later cells treat them as pandas DataFrames and assume the first 100 rows
# are the non-control patients - confirm against the CPTAC package.
somatic_mutations = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

### Remove the control samples (patients 101–140)

In [66]:
# Keep only the first 100 rows (by position) of the proteomics table.
proteomics = proteomics.iloc[:100]

## t-test for effects of all mutations combined
<b>Note:</b> You can put any number of proteins in the list; the mutation status of the chosen gene is tested against the abundance of each listed protein in turn.

### List of interacting proteins (according to STRING and Uniprot)

In [74]:
# Candidate proteins to test; a single-element list is fine too.
protList = [
    'IRS1', 'IRS2', 'RRAS', 'AKT2', 'NRAS',
    'PTEN', 'AKT1', 'MRAS', 'HRAS', 'RPS6KB1',
    'PIK3R1', 'PKC', 'MTOR', 'S6K', 'MAPK',
    'ERBB3', 'P85A', 'P55G', 'CDK5',
]

In [75]:
# Compare the abundance of every protein in protList between samples where
# `gene` is mutated and samples where it is wildtype (two-sample t-test).
# Results are collected in two parallel lists:
#   tested - proteins for which a test was actually run
#   p_vals - the matching uncorrected p-values
gene = 'PIK3CA'
tested = []
p_vals = []
for protein in protList:
    # Names without a proteomics column cannot be tested.
    if protein not in proteomics.columns:
        continue

    cross = CPTAC.compare_mutations(proteomics, protein, gene)
    # Keep the first 100 rows only, mirroring the slice applied to proteomics.
    cross = cross[:100]
    cross = cross[["Mutation", protein]].dropna(axis=0)

    is_wildtype = cross["Mutation"] == "Wildtype"
    mutated = cross.loc[~is_wildtype]
    wt = cross.loc[is_wildtype]

    # With fewer than two observations in a group the t-test yields NaN,
    # and a NaN p-value would corrupt the FDR correction applied to p_vals
    # afterwards; skip the protein so tested and p_vals stay aligned.
    if len(mutated) < 2 or len(wt) < 2:
        continue

    ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
    tested.append(protein)
    p_vals.append(ttest[1])

In [76]:
# Benjamini-Hochberg FDR correction over all p-values gathered in p_vals.
# fdrcorrection returns (reject flags, corrected p-values); call it once and
# unpack both instead of repeating the computation for each element.
areSignificant, pvals = statsmodels.stats.multitest.fdrcorrection(p_vals)

# Keep only the proteins (and their corrected p-values) flagged as significant.
significant_proteins = np.array(tested)[areSignificant]
significant_pvals = pvals[areSignificant]

In [77]:
# Show the significant proteins first, then their corrected p-values.
for result in (significant_proteins, significant_pvals):
    print(result)

[]
[]


### Test for phosphorylation levels (mutation in any gene with phosphorylation of any protein)

In [71]:
#List of proteins (will test all phosphorylation sites on these proteins)
# Copy rather than alias, so editing this list in place (append/remove)
# does not silently change protList used by the proteomics t-test.
phosProtList = list(protList)

In [72]:
# Keep only the first 100 rows (by position) of the phosphoproteomics table.
phos = phos.iloc[:100]

In [73]:
# For every protein in phosProtList, t-test each of its phosphorylation sites
# between Missense_Mutation and Wildtype samples of `gene`, and collect the
# sites passing a Bonferroni-corrected cutoff in significantPhosResults.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'DataFrame' object has no attribute 'name'"; presumably it
# is raised inside CPTAC.compare_mutations - confirm against the package.
sites = phos.columns
significantPhosResults = []
totalSites = 0

# Count every phosphosite column whose name matches one of the proteins;
# this is the number of tests used for the Bonferroni correction.
for protein in phosProtList:
    matchesdf = phos.filter(regex=protein)
    totalSites += len(matchesdf.columns)

# Bonferroni cutoff. When nothing matches there is nothing to test, so avoid
# dividing by zero (the loop below then performs no comparison at all).
pcutoff = 0.05 / max(totalSites, 1)

for protein in phosProtList:
    pattern = re.compile(protein)
    # Skip proteins that have no phosphosite column at all.
    if not any(pattern.search(column) for column in sites):
        continue

    phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
    for site in phosphositesdf.columns:
        # Compare by value: "is not" tests object identity, which is not a
        # reliable way to compare a column label with a string literal.
        if site == 'Mutation':
            continue

        sitedf = CPTAC.compare_mutations(phos, site, gene)
        # NOTE(review): only missense mutations count as "mutated" here,
        # unlike the proteomics test, which uses every non-wildtype sample.
        mutateddf = sitedf.loc[sitedf['Mutation'] == 'Missense_Mutation'].dropna(axis=0)
        wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)
        ttest = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])
        if ttest[1] <= pcutoff:
            significantPhosResults.append(site)
            print("Test for " + site + ": ")
            print(ttest)

AttributeError: 'DataFrame' object has no attribute 'name'

### List of significantly affected phosphorylation sites

In [None]:
# The phosphosite test stores its hits in significantPhosResults; the name
# sigPhosResults is never defined and would raise NameError.
print(significantPhosResults)

### Plot phosphorylation levels and gene mutation
<b>Note:</b> There may be fewer data points due to NA values

In [None]:
#Specify the gene and the site; you may use a string to specify the site or reference the significant results above
gene = 'PIK3CA'
# The phosphosite test stores its hits in significantPhosResults; the name
# sigPhosResults is never defined and would raise NameError.
site = significantPhosResults[0]

#Build the dataframe for plotting
genedf = somatic_mutations[gene].to_frame()
sites = phos.filter(regex=site)
# NOTE(review): add(..., fill_value=0) is used here to combine the mutation
# column with the site column(s); this presumably relies on the two frames
# having different column names - confirm.
genedf = genedf.add(sites, fill_value=0)

# Box plot of site levels per mutation status, with the individual samples
# overlaid as a jittered strip plot on the same axes.
phos_boxplot = sns.boxplot(data=genedf, x=gene, y=site)
phos_boxplot.set_title(gene + " gene mutation and " + site + " phosphorylation levels")
phos_boxplot = sns.stripplot(data=genedf, x=gene, y=site, jitter=True, color=".3")
phos_boxplot.set(xlabel="Somatic Gene Mutation", ylabel="Phosphoproteomics")

### Test for phosphorylation levels and protein abundance

In [None]:
#Specify the pattern for all phosphorylation sites as well as the protein of interest
phosProt = phos.filter(regex='PIK3CA')
protein = 'PIK3CA'
sigPhosProtResults = []

#Build dataframe with protein levels and phosphorylation levels
proteindf = proteomics[protein].to_frame()
phosProtCross = proteindf.add(phosProt, fill_value=0)

# Every column except the protein itself is a site to be tested.
siteColumns = [loc for loc in phosProtCross.columns if loc != protein]

#Bonferroni correction
# Divide by the number of sites tested in this cell. The previous divisor,
# len(cross.columns), came from a variable left over from the t-test cell
# and did not reflect the number of tests. max(..., 1) avoids dividing by
# zero when no site matches.
pcutoff = 0.05 / max(len(siteColumns), 1)

#Test each site for significance and print the results
for loc in siteColumns:
    oneSitedf = phosProtCross[[loc, protein]].dropna(axis=0)
    # pearsonr needs at least two paired observations.
    if len(oneSitedf) < 2:
        continue
    pearsonresult = pearsonr(oneSitedf[loc], oneSitedf[protein])
    if pearsonresult[1] < pcutoff:
        sigPhosProtResults.append(loc)
        print("Results for " + loc)
        print(pearsonresult)

### Scatterplot of phosphorylation levels vs. protein abundance (mutation included)

In [None]:
# Altair is imported in this cell because the notebook-level import is
# commented out; without it every alt.* call below raises NameError.
# NOTE(review): the commented-out setup also calls
# alt.renderers.enable('notebook'); whether that is still needed depends on
# the installed Altair version - confirm.
import altair as alt

#Specify a protein, a gene and a site; you may use a string to specify the site or reference sigPhosProtResults above
protein = 'CTNNB1'
gene = 'CTNNB1'
site = sigPhosProtResults[1]

#Build the dataframe for plotting
protdf = proteomics[protein].to_frame()
protdfName = protein + " protein levels" #Technically only necessary when the gene and the protein have the same name
protdf.columns = [protdfName]
sites = phos.filter(regex=site)
protdf = protdf.add(sites, fill_value=0)

# Rename the mutation column so it cannot clash with the protein column.
genedf = somatic_mutations[gene].to_frame()
genedfName = gene + " mutation"
genedf.columns = [genedfName]
finaldf = protdf.add(genedf, fill_value=0)

# Interactive scatterplot: protein abundance (x) against site
# phosphorylation (y), coloured by the mutation column.
alt.Chart(finaldf).mark_point().encode(
    alt.X(protdfName),
    alt.Y(site),
    alt.Color(genedfName, type='nominal')
).interactive()