# PIK3CA trans protein pathway data munging 

This notebook performs t-tests of protein abundance between PIK3CA hotspot mutations (E542K,E545K,and H1047R)and wildtype tumors (no PIK3CA mutation of any kind) for a proteins within the PI3K-AKT wiki pathway

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats
import matplotlib.pyplot as plt
import sys 
import re
import cptac
import statsmodels.stats.multitest
import operator
import cptac.utils as u
import warnings
warnings.filterwarnings('ignore')

In [4]:
#Load CPTAC data sets
br = cptac.Brca()
endo = cptac.Endometrial()
col = cptac.Colon()

                                                

In [5]:
#cptac version
cptac.version()

'0.8.5'

In [6]:
def rename_duplicate_cols(df):
    cols = pd.Series(df.columns[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.columns=cols
    return df

In [7]:
def get_prot_mutations_df(cancer_type, prot_list):
    prot = cancer_type.join_omics_to_mutations(omics_df_name = "proteomics",mutations_genes="PIK3CA", tissue_type="tumor")
    if cancer_type == br:
        prot= u.reduce_multiindex(df=prot, levels_to_drop=["Database_ID"])
    genefilter = prot.columns.get_level_values("Name").isin(prot_list)
    prot_pik3ca = prot[prot.columns[genefilter]]
    prot_pik3ca = cptac.utils.reduce_multiindex(prot_pik3ca, flatten=True)
    prot_pik3ca['PIK3CA_Mutation'] = [','.join(map(str, l)) for l in prot_pik3ca['PIK3CA_Mutation']]
    prot_pik3ca['PIK3CA_Location'] = [','.join(map(str, l)) for l in prot_pik3ca['PIK3CA_Location']]
    prot_pik3ca = rename_duplicate_cols(prot_pik3ca)
    
    hotspot= prot_pik3ca[(prot_pik3ca.PIK3CA_Location.str.contains('E542K'))| 
                    prot_pik3ca.PIK3CA_Location.str.contains('E545K')| 
                   prot_pik3ca.PIK3CA_Location.str.contains('H1047R')]
    hotspot["PIK3CA_Mutation"] = "Hotspot"
    wt = prot_pik3ca[(prot_pik3ca.PIK3CA_Mutation.str.contains('Wildtype'))]
    hotspot_wt = pd.concat([hotspot, wt])
    cols = hotspot_wt.columns.to_list()
    cols.remove("PIK3CA_Mutation")
    cols.remove("PIK3CA_Location")
    prot_pval = u.wrap_ttest(hotspot_wt, 'PIK3CA_Mutation', cols,return_all=True, pval_return_corrected= True, correction_method= "FDR_bh")
    return prot_pval

In [27]:
prot = u.get_proteins_in_pathways('PI3K-Akt Signaling Pathway', 'wikipathways')
proteins = list(prot.member)

prot_list = [prot + "_proteomics" for prot in proteins]
prot_list.append("PIK3CA_Mutation")
prot_list.append("PIK3CA_Location")


In [28]:
#Wiki pathway
brca_results = get_prot_mutations_df(br, prot_list)
brca_results.head(10)

Unnamed: 0,Comparison,P_Value
0,HSP90B1_proteomics,0.257892
1,TNN_proteomics,0.257892
2,FN1_proteomics_1,0.257892
3,TSC1_proteomics,0.257892
4,FN1_proteomics_3,0.257892
5,FOXO3_proteomics,0.330394
6,ITGB5_proteomics,0.330394
7,RHEB_proteomics,0.35865
8,COL1A2_proteomics,0.35865
9,TP53_proteomics,0.389823


# Endo

In [29]:
endo_results = get_prot_mutations_df(endo, prot_list)
endo_results

Unnamed: 0,Comparison,P_Value
0,JAK1_proteomics,0.281678
1,LAMA1_proteomics,0.281678
2,PPP2R2D_proteomics,0.281678
3,PIK3CA_proteomics,0.281678
4,COL1A1_proteomics,0.281678
...,...,...
192,FN1_proteomics,0.987913
193,MAP2K1_proteomics,0.987913
194,PPP2R1A_proteomics,0.987913
195,FGFR3_proteomics,0.987913


# Colon 

In [30]:
col_results = get_prot_mutations_df(col, prot_list)
col_results

Unnamed: 0,Comparison,P_Value
0,AKT1_proteomics,0.983575
1,PIK3R2_proteomics,0.983575
2,PIK3R1_proteomics,0.983575
3,PIK3AP1_proteomics,0.983575
4,PDPK1_proteomics,0.983575
...,...,...
130,CSF1R_proteomics,0.994204
131,PKN2_proteomics,0.994204
132,AKT2_proteomics,0.994204
133,PPP2R5C_proteomics,0.994204
