# Use Case 4: Comparing Mutation Protein Abundance

<b>Standard imports for playing with and plotting data frames.</b>

In [2]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

<b>Import CPTAC data</b>

In [3]:
import CPTAC

Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Data...

 ******PLEASE READ******


In [4]:
somatic_mutations = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

### Test for protein levels (mutation in a gene with protein levels of any protein)

In [22]:
gene = 'ERBB4'
protList = ['EGFR', 'ERBB2']

pcutoff = 0.05/len(protList)
print(pcutoff)

genedf = somatic_mutations[gene].to_frame()
for protein in protList:
    if protein in proteomics.columns:
        proteindf = proteomics[protein].to_frame()
        cross = genedf.add(proteindf, fill_value=0).dropna(axis=0)
        mutated = cross.loc[cross[gene] == 1.0]
        wt = cross.loc[cross[gene] == 0.0]
        ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
        print("Test for " + protein + ": ")
        print(ttest)

0.025
Test for EGFR: 
Ttest_indResult(statistic=-0.7259814913173891, pvalue=0.4695803987766183)
Test for ERBB2: 
Ttest_indResult(statistic=-0.929006841844551, pvalue=0.3551681634936623)


### Test for phosphorylation levels (mutation in a gene with phosphorylation of any protein)

In [15]:
gene = 'ERBB4'
sites = phos.filter(regex='ERBB2')

genedf = somatic_mutations[gene].to_frame()
cross = genedf.add(sites, fill_value=0)
mutated = cross.loc[cross[gene] == 1.0]
wt = cross.loc[cross[gene] == 0.0]

pcutoff = 0.05/len(cross.columns)
print(pcutoff)

for loc in cross.columns:
    if not loc == gene:
        mutsitedf = mutated[[gene, loc]].dropna()
        wtsitedf = wt[[gene, loc]].dropna()
        ttest = scipy.stats.ttest_ind(mutsitedf[loc], wtsitedf[loc])
        print('Results for ' + loc + ': ')
        print(ttest)

0.005555555555555556
Results for ERBB2-S1054: 
Ttest_indResult(statistic=0.8256257579504211, pvalue=0.4119036683903391)
Results for ERBB2-S1078: 
Ttest_indResult(statistic=-0.5605086483554743, pvalue=0.5766608206471877)
Results for ERBB2-S1083: 
Ttest_indResult(statistic=-0.23672641038688677, pvalue=0.8133631592078125)
Results for ERBB2-S1107: 
Ttest_indResult(statistic=0.7883269597878024, pvalue=0.4326985743618985)
Results for ERBB2-S1151: 
Ttest_indResult(statistic=-0.23361645356844535, pvalue=0.8161661314804745)
Results for ERBB2-S998: 
Ttest_indResult(statistic=-0.2576591943595221, pvalue=0.7976144295454702)
Results for ERBB2-T1166: 
Ttest_indResult(statistic=0.6956765358947242, pvalue=0.4887815138840478)
Results for ERBB2-T701: 
Ttest_indResult(statistic=0.06284196915612855, pvalue=0.9500740354038221)


### Test for phosphorylation levels and protein abundance

In [27]:
phosProt = phos.filter(regex='FGFR') 
protein = 'STAT1'

proteindf = proteomics[protein].to_frame()
cross = proteindf.add(phosProt, fill_value=0)
for loc in cross.columns:
    if not loc == gene:
        oneSitedf = cross[[loc, protein]].dropna(axis=0)
        pearsonresult = scipy.stats.pearsonr(oneSitedf[loc], oneSitedf[protein])
        print("Results for " + loc)
        print(pearsonresult)

Results for FGFR1OP-S152
(-0.03333259196372111, 0.7592285091138733)
Results for FGFR1OP-S156
(0.15158405081358645, 0.07486026784097914)
Results for FGFR1OP-S160
(0.22275628733958308, 0.008395276554038449)
Results for FGFR1OP-S279
(0.1048119063268295, 0.25458624353413445)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()