In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import re
import seaborn as sns
import sys
import urllib3
import json
import operator
import collections

import CPTAC.Endometrial as CPTAC

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list(). In order to access a specific data set,
import a CPTAC subfolder using either 'import CPTAC.Dataset' or 'from
CPTAC import Dataset'.
******
Version: 0.2.5
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


In [2]:
somatic = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

# Part 1: Formatting Data

## Step 1: Enter gene name
# INPUT: 

In [3]:
gene = "PTEN" #Name of any gene of which you are interested in examing the effects of mutation

## Step 2: Select type of data
Set "data_type" to "proteomics" for proteomic abundance data <br><br>
Set "data_type" to "phosphoproteomics" for phosphoproteomic data <br><br>
Set "data_type" to "transcriptomics" for transcription data
# INPUT:

In [4]:
data_type = "phosphoproteomics" #Options are "proteomics", "phosphoproteomics", "transcriptomics"

## Step 3: Select scope of data analysis
Set "scope" to "cis" to examine cis effect of gene mutation on the protein it codes for <br> <br>
Set "scope" to "interacting" to examine the effect of gene mutation on known interacting proteins (interacting proteins are automatically generated through interacting with STRING database API). If using scope="interacting", set "number_of_interacting_proteins" to the number of interacting proteins you want to pull from the STRING database <br><br>
Set "scope" to "all" to look at the effect on all proteins in our dataset
# INPUT:

In [5]:
scope = "cis" #Options are "cis", "interacting", "all"
number_of_interacting_proteins = 20 #Parameter only necessary if examining "interacting", it will otherwise be ignored

## Step 4: Select Question

Set "question" to "mutated_vs_wildtype" to compare mutated and wildtype groups<br><br>
Set "question" to "hotspot_vs_wildtype" to compare individuals with a hotspot mutation <br><br>
Set "question" to "truncation_vs_wildtype" to compare individuals with a truncating mutation
# INPUT:

In [6]:
question = "truncation_vs_wildtype" #Options are .....

## Step 5: Set Dataframe to Selected Data

In [7]:
if data_type == "proteomics":
    dataframe = CPTAC.get_proteomics()
    
elif data_type == "phosphoproteomics":
    dataframe = CPTAC.get_phosphoproteomics()
    
elif data_type == "transcriptomics":
    dataframe = CPTAC.get_transcriptomics()
    
else:
    print("Invalid data_type value entered! See step 2 above.")

## Step 6: Generate Protein List Based on the Selected Scope

In [8]:
protein_list = []    

'''Just add the gene itself if cis'''
if scope == "cis":
    protein_list = [gene]
    
    '''If interacting, get interacting proteins from STRING database'''
elif scope == "interacting":
    
    '''Use urllib3 to access the string database api, gather list of interacting proteins'''
    urllib3.disable_warnings()
    string_api_url = "https://string-db.org/api"
    output_format = "json"
    method = "network"

    '''Use the specified gene and homo sapiens species code'''
    my_gene = [gene]
    species = "9606"

    '''Format the api request to collect the appropriate information'''
    request_url = string_api_url + "/" + output_format + "/" + method + "?"
    request_url += "identifiers=%s" % "%0d".join(my_gene)
    request_url += "&" + "species=" + species
    request_url += "&" + "limit=" + str(number_of_interacting_proteins)

    '''Send a request to the API, print the response status'''
    try:
        http = urllib3.PoolManager()
        response = http.request('GET',request_url)
        #print("Accessing String database, response status: ", response.status)
    except urllib3.HTTPError as err:
        error_message = err.read()
        print(error_message)
        sys.exit()

    interacting_proteins = []
    if response.status == 200: 
        '''Get the data from the API's response'''
        data = response.data
        y = json.loads(data)

        '''Make a list of the resulting interacting proteins'''
        for entry in y:
            if entry["preferredName_A"] not in interacting_proteins:
                interacting_proteins.append(entry["preferredName_A"])
            if entry["preferredName_B"] not in interacting_proteins:
                interacting_proteins.append(entry["preferredName_B"])
                
        for ip in interacting_proteins:
            protein_list.append(ip)
    else:
        print("\nSpecified gene was not found in String database, double check that you have it correctly!")
        protein_list.append(gene)

    '''If all, add all proteins in our dataset'''
elif scope == "all":
    protein_list = proteomics.columns

    '''If none of these, invalid'''
else:
    print("Invalid scope value entered! See step 3 above.")
    
'''Display list of proteins selected'''
print("Protein List: \n")
for pro in protein_list:
    print(pro)

Protein List: 

PTEN


## Step 7: Format Dataframe Based on Question

This will generate a dataframe with the gene mutation, sample status, and columns for either proteomics, phosphoproteomics, or transcriptomics for every protein (or every site in every protein, if phosphorylation).

#### Question: Mutated vs. Wildtype
Places either a 'Mutated' or 'Wildtype' value in the category column, based on gene mutation

In [9]:
if question == "mutated_vs_wildtype":
    '''Get dataframe for mutations, select sample status and gene mutation only'''
    genedf = CPTAC.compare_mutations(dataframe, protein_list[0], gene)
    genedf = genedf[[gene+"_Mutation", "Sample_Status"]]
    '''Loop through every protein in protein list, add in omics'''
    for protein in protein_list:
        protdf = CPTAC.compare_mutations(dataframe, protein, gene)
        if protdf is not None:
            for col in protdf.columns:
                if col not in genedf.columns:
                    genedf[col] = protdf[col]
                    
    '''Define category variable for mutated or wildtype'''                
    for ind, row in genedf.iterrows():
        if row[gene+"_Mutation"] != 'Wildtype_Tumor' and row[gene+"_Mutation"] != 'Wildtype':
            genedf.at[ind, 'Category'] = 'Mutated'
        else:
            genedf.at[ind, 'Category'] = 'Wildtype'
        

#### Question: Truncation Mutation vs Wildtype
Places either 'Truncating_Mutation', 'Non_Truncating_Mutation', or 'Wildtype' in the Category column, depending on gene mutation

In [10]:
if question == "truncation_vs_wildtype":
    '''Get dataframe for mutations, select sample status and gene mutation only'''
    genedf = CPTAC.compare_mutations(dataframe, gene, gene)
    genedf = genedf[[gene+"_Mutation", "Sample_Status"]]
    '''Loop through every protein in protein list, add in omics'''
    for protein in protein_list:
        protdf = CPTAC.compare_mutations(dataframe, protein, gene)
        if protdf is not None:
            for col in protdf.columns:
                if col not in genedf.columns:
                    genedf[col] = protdf[col]
                    
    truncating_mutations = ['Nonsense_Mutation', 'Frame_Shift_Del','Frame_Shift_Ins',]
    for ind, row in genedf.iterrows():
        if row[gene+"_Mutation"] in truncating_mutations:
            genedf.at[ind, 'Category'] = 'Truncating_Mutation'
        elif row[gene+"_Mutation"] == 'Wildtype' or row[gene+"_Mutation"] == 'Wildtype_Tumor':
            genedf.at[ind, 'Category'] = 'Wildtype'
        else:
            genedf.at[ind, 'Category'] = 'Non_Truncating_Mutation'

#### Show formatting

In [11]:
genedf.head()

Unnamed: 0,PTEN_Mutation,Sample_Status,PTEN-S467,PTEN-S475,PTEN-S478,PTEN-S537,PTEN-S543,PTEN-S558,PTEN-T539,PTEN-T555,PTEN-T556,Category
S001,Nonsense_Mutation,Tumor,0.195,0.1175,0.175,,,-0.428,,,,Truncating_Mutation
S002,Missense_Mutation,Tumor,-0.0611,-0.0751,,,-0.256,-0.0996,,,-0.2375,Non_Truncating_Mutation
S003,Nonsense_Mutation,Tumor,0.416,,,,,-0.8655,,,-0.551,Truncating_Mutation
S005,Missense_Mutation,Tumor,1.77,,,,,1.02,,,0.86,Non_Truncating_Mutation
S006,Wildtype_Tumor,Tumor,0.0258,,,-0.128,-0.0901,-0.2725,-0.0901,,-0.291,Wildtype


## Step 8: Select Desired Information from Dataframe
Set "drop_non_tumor" to "True" to look only at tumor patients. Set it to false to keep them in our comparison.
# INPUT: 

In [12]:
'''Select whether or not you want to drop non-tumor patients'''
drop_non_tumor = True #Options are True, False

In [13]:
'''Select only the Tumor patients if they have selected "drop_non_tumor"'''
if drop_non_tumor:
    genedf = genedf.loc[genedf['Sample_Status'] == 'Tumor']
    
'''Drop Columns that arent "Category" or omics data'''
genedf = genedf.drop('Sample_Status', axis = 1)
genedf = genedf.drop(gene+"_Mutation", axis = 1)

## Step 9: Select Variables for Comparison
Set "variable_a" and "variable_b" to two of the Options listed below for comparison

In [14]:
possible_choices = list(set(genedf['Category']))
print("Options for comparison (and number of): \n")
for choice in possible_choices:
    choicedf = genedf.loc[genedf['Category']==choice]
    print(choice, " (", len(choicedf), ")")

Options for comparison (and number of): 

Non_Truncating_Mutation  ( 22 )
Truncating_Mutation  ( 53 )
Wildtype  ( 20 )


# INPUT: 

In [15]:
'''select 2 variables to do comparison'''
variable_a = 'Truncating_Mutation' #See options from list above
variable_b = 'Wildtype' #See options from list above

# Part 2: Comparison

## Step 1: Collect P-Values

In [16]:
sites = genedf.columns
p_values = {}
a_df = genedf.loc[genedf['Category'] == variable_a]
b_df = genedf.loc[genedf['Category'] == variable_b]
for site in sites:
    if site != 'Category':
        test_stat, p_val = scipy.stats.ttest_ind(a_df[site], b_df[site])
        if not np.isnan(p_val):
            p_values[site] = p_val
          
print("P-Values: \n")
print("{\n" + "\n".join("{}: {}".format(k, v) for k, v in p_values.items()) + "\n}")

P-Values: 

{
PTEN-S467: 0.1904318008037339
}


## Step 2: Adjust for multiple testing with a bonferroni correction

In [17]:
alpha = .05
bonferroni_cutoff = alpha / len(p_values)
print("Bonferroni Adjusted P-Value Cutoff for Significance: ")
print(bonferroni_cutoff)

Bonferroni Adjusted P-Value Cutoff for Significance: 
0.05


## Step 3: Sort into significant and insignificant groups

In [18]:
significant = {}
insignificant = {}
for key, val in p_values.items():
    if val <= bonferroni_cutoff:
        significant[key] = val
    else:
        insignificant[key] = val
        
print("Significant Values: \n")
print("{\n" + "\n".join("{}: {}".format(k, v) for k, v in significant.items()) + "\n}")

print("\nInsignificant Values: \n")
print("{\n" + "\n".join("{}: {}".format(k, v) for k, v in insignificant.items()) + "\n}")

Significant Values: 

{

}

Insignificant Values: 

{
PTEN-S467: 0.1904318008037339
}
