In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [4]:
import CPTAC.Endometrial as CPTAC
CPTAC.list_data()

Below are the available endometrial data frames contained in this package:
	 clinical
	 	 Dimensions: (144, 26)
	 derived_molecular
	 	 Dimensions: (144, 144)
	 acetylproteomics
	 	 Dimensions: (144, 10862)
	 proteomics
	 	 Dimensions: (144, 10999)
	 transcriptomics_linear
	 	 Dimensions: (109, 28057)
	 transcriptomics_circular
	 	 Dimensions: (109, 4945)
	 CNA
	 	 Dimensions: (95, 28057)
	 phosphoproteomics_site
	 	 Dimensions: (144, 73212)
	 phosphoproteomics_gene
	 	 Dimensions: (144, 8466)
	 somatic binary
	 	 Dimensions: (95, 51559)
	 somatic MAF
	 	 Dimensions: (52560, 5)


In [16]:
gene = 'PIK3CA'
somatic = CPTAC.get_mutations() #changed get_somatic to mutations
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

#A list of mutations present in the hotspot according to our Hotspot3D output
hotspot_mutations = ['p.E545A', 'p.E545K', 'p.E545V', 'p.Q546P', 'p.Q546R', 'p.E542K']
mutated_hotspot = somatic.loc[(somatic['Location'].isin(hotspot_mutations)) & (somatic['Gene'] == gene)]
hotspot_patients = mutated_hotspot['Clinical_Patient_Key']
#print('Patients with hotspot mutations: \n')
#print(hotspot_patients)

#The pvalue cutoff used to find significance
pcutoff = 0.05
#mutated_hotspot.head()

In [30]:
#my practice .loc: get all missence mut for PIK3CA
missence = somatic.loc[(somatic['Mutation'] == 'Missense_Mutation') & (somatic['Gene'] == 'PIK3CA')] 
# use () because & higher precedence than == 
missence.head()
# find patient mutations
patient = somatic.loc[somatic['Patient_Id'] == 'C3L-00143']
#patient = somatic.loc[(somatic['Patient_Id'] == 'C3L-00143') & (somatic['Gene'] == 'PIK3CA')]

#find patients using iloc
somatic.index
patient_group = somatic.iloc[2:6]
patient_group

Unnamed: 0,Clinical_Patient_Key,Patient_Id,Gene,Mutation,Location
2,S001,C3L-00006,RPL22,Missense_Mutation,p.V72M
3,S001,C3L-00006,CASZ1,Missense_Mutation,p.R233Q
4,S001,C3L-00006,PRAMEF9,Missense_Mutation,p.L30M
5,S001,C3L-00006,SPEN,Missense_Mutation,p.V2741I


In [56]:
print('{}''{}'.format('Patients with mutations inside hotspot: ', len(hotspot_patients)))
p110_mutated = somatic.loc[somatic['Gene'] == 'PIK3CA']
#note: the gene PIK3CA encodes for the protein p110α, hence the variable 'p110_mutated'

# This naturally includes a value that is 'NA' so we subtract 1
# Some people have more than one mutation so we need to grab the unique values
num_mutated = len(set(p110_mutated['Clinical_Patient_Key'])) - 1
print('{}' '{}'.format('Total number of patients with PIK3CA mutations: ', num_mutated))

notavail = ['nan', 'NAN']
na = p110_mutated['Clinical_Patient_Key'].isin(notavail)
type(na)
na_df = na.to_frame()

new = na_df.loc[na_df['Clinical_Patient_Key'] == True]
new.head()

#using .find with series -- series.string.find(sub,start,end)
p110_s = p110_mutated['Clinical_Patient_Key']
p110_s.str.find('na',0,200)


Patients with mutations inside hotspot: 19
Total number of patients with PIK3CA mutations: 46


160     -1
1147    -1
1357    -1
1505    -1
2191    -1
2192    -1
3315    -1
3316    -1
4521    -1
4629    -1
4630    -1
7089    -1
7090    -1
7091    -1
7092    -1
16486   -1
16487   -1
19061   -1
19163   -1
19398   -1
19399   -1
20051   -1
20378   -1
21159   -1
21259   -1
21516   -1
21690   -1
22497   -1
22587   -1
22588   -1
        ..
37215   -1
37420   -1
37421   -1
38155   -1
38204   -1
38248   -1
39521   -1
39522   -1
43390   -1
43391   -1
43490   -1
44714   -1
44715   -1
45236   -1
45272   -1
45393   -1
46358   -1
46661   -1
46662   -1
47106   -1
47360   -1
47857   -1
48304   -1
49287   -1
49308   -1
49361   -1
49362   -1
49475   -1
50300   -1
52535   -1
Name: Clinical_Patient_Key, Length: 62, dtype: int64

In [None]:
# Count the number of instances of various mutation types in the column 'Mutation' in the p110_mutated dataframe.
mut_counts = p110_mutated['Mutation'].value_counts()

# The previous function returns a series. Convert the series to a dataframe for better visual
counts_df = mut_counts.to_frame()

#Rename the column to clarify the number of mutations
counts_df = counts_df.rename(index=str, columns = { 'Mutation' : 'Mutation Count'})
print(counts_df)

In [None]:
protein = 'PIK3CA'
# 1. Merge the mutation status of PIK3CA and the protemics data of PIK3CA
p110_mutations_protein = CPTAC.compare_mutations(proteomics, protein)
#note - CPTAC will return a dataframe of the mutation status and proteomics of the same gene 
#   if only two parameters are provided: the dataframe and the mutation/protein being analyzed

# 2. Set up a new column for mutations that are within the hotspot
p110_mutations_protein['Within_Hotspot'] = 'Outside Hotspot'
p110_mutations_protein.loc[hotspot_patients, 'Within_Hotspot'] = 'Within Hotspot'
p110_mutations_protein.loc[p110_mutations_protein['Mutation'] == 'Wildtype_Tumor', 'Within_Hotspot'] = 'Wildtype_Tumor'

# 3. Select for tumor samples only 
p110_mutations_protein = p110_mutations_protein.loc[p110_mutations_protein['Sample_Status'] == 'Tumor'].dropna(axis=0)

# 4. Make three new dataframes, one for people with a hotspot mutation, one for those with mutations outside the hotspot, and  one for all the wildtype cancer patients
hotspot_mut = p110_mutations_protein.loc[p110_mutations_protein['Within_Hotspot'] == 'Within Hotspot'] 
non_hotspot_mut = p110_mutations_protein.loc[p110_mutations_protein['Within_Hotspot'] == 'Outside Hotspot']
wt = p110_mutations_protein.loc[p110_mutations_protein['Within_Hotspot'] == 'Wildtype_Tumor']


# 5. ttest for proteomics of PIK3CA between wildtype cancer and hotspot mutation
tstat, pvalue = scipy.stats.ttest_ind(hotspot_mut[protein], wt[protein])
print('{}' '{}' '{}' '{}'.format(protein, ' proteomics hotspot mutations vs wildtype: ', pvalue, '\n'))

#Perform a ttest for non-hotspot mutations vs wildtype 
tstat, pvalue = scipy.stats.ttest_ind(non_hotspot_mut[protein], wt[protein])
print('{}' '{}' '{}'.format('Non-hotspot mutations vs wildtype: ', pvalue, '\n'))

# 6. Repeat for all mutations vs wildtype cancer
all_mutated = p110_mutations_protein.loc[p110_mutations_protein['Within_Hotspot'] != 'Wildtype']
tstat, pvalue = scipy.stats.ttest_ind(all_mutated[protein], wt[protein])
print('{}' '{}' '{}'.format(protein, " proteomics all mutation vs wildtype: " , pvalue))