In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
import statsmodels.stats.multitest
import sys
import urllib3
import json
import operator
import collections

import CPTAC.Endometrial as CPTAC

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list(). In order to access a specific data set,
import a CPTAC subfolder using either 'import CPTAC.Dataset' or 'from
CPTAC import Dataset'.
******
Version: 0.2.5
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


In [23]:
# Fetch the somatic-mutation, proteomics and phosphoproteomics data from the
# CPTAC Endometrial package imported above, using its getter functions.
somatic = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

## Step 1: Enter gene name

In [24]:
gene = "ARID1A" # Name of the gene whose mutation effects you want to examine

## Step 2: Select type of data
Set "data_type" to "proteomics" for proteomic abundance data <br><br>
Set "data_type" to "phosphoproteomics" for phosphoproteomic data <br><br>
Set "data_type" to "transcriptomics" for transcription data

In [25]:
data_type = "proteomics" # Kind of data to analyse; options are "proteomics", "phosphoproteomics", "transcriptomics"

## Step 3: Select scope of data analysis
Set "scope" to "cis" to examine cis effect of gene mutation on the protein it codes for <br> <br>
Set "scope" to "interacting" to examine the effect of gene mutation on known interacting proteins (the list of interacting proteins is generated automatically by querying the STRING database API). If using "interacting", set "number_of_interacting_proteins" to the number of interacting proteins you want to pull from the STRING database <br><br>
Set "scope" to "all" to look at the effect on all proteins in our dataset

In [34]:
scope = "interacting" # Which proteins to examine; options are "cis", "interacting", "all"
number_of_interacting_proteins = 25 # Sent as the "limit" of the STRING request; only needed when scope is "interacting", else set to 0

## Step 4: Select Question
Set "question" to "mutated_vs_wildtype" to compare mutated and wildtype groups <br><br>
Set "question" to "hotspot_vs_wildtype" to compare individuals with a hotspot mutation with wildtype <br><br>
Set "question" to "truncation_vs_wildtype" to compare individuals with a truncating mutation with wildtype

In [35]:
question = "mutated_vs_wildtype" # Options are "mutated_vs_wildtype", "hotspot_vs_wildtype", "truncation_vs_wildtype"

### Set dataframe to that of the selected data type

In [36]:
# Load the data that matches the selected data_type.
# The CPTAC getters have to be *called* (note the parentheses); without the
# call, `dataframe` would hold the getter function itself instead of its data.
if data_type == "proteomics":
    dataframe = CPTAC.get_proteomics()

elif data_type == "phosphoproteomics":
    dataframe = CPTAC.get_phosphoproteomics()

elif data_type == "transcriptomics":
    dataframe = CPTAC.get_transcriptomics()

else:
    # Stop here rather than continuing with `dataframe` undefined (or left
    # over from an earlier run of this cell with a different data_type).
    raise ValueError("Invalid data_type value entered! See step 2 above.")

### Generate protein list based on the selected scope

In [37]:
# Build `protein_list`, the proteins to analyse, according to `scope`:
#   "cis"         -> only the selected gene itself
#   "interacting" -> the gene's interaction partners, fetched from STRING
#   "all"         -> every column of the proteomics data
# Explanations are written as `#` comments: a bare string statement placed
# between an `if` body and its `elif`/`else` ends the `if` statement and
# makes the cell a syntax error.
protein_list = []

if scope == "cis":
    # Just add the gene itself.
    protein_list = [gene]

elif scope == "interacting":
    # Use urllib3 to query the STRING database API for interacting proteins.
    urllib3.disable_warnings()
    string_api_url = "https://string-db.org/api"
    output_format = "json"
    method = "network"

    # Use the specified gene and the Homo sapiens species code.
    my_gene = [gene]
    species = "9606"

    # Format the API request to collect the appropriate information.
    request_url = string_api_url + "/" + output_format + "/" + method + "?"
    request_url += "identifiers=%s" % "%0d".join(my_gene)
    request_url += "&" + "species=" + species
    request_url += "&" + "limit=" + str(number_of_interacting_proteins)

    # Send the request. urllib3's exception base class lives in
    # urllib3.exceptions (there is no top-level urllib3.HTTPError), and its
    # exceptions have no .read() method, so report the exception itself.
    try:
        http = urllib3.PoolManager()
        response = http.request('GET', request_url)
    except urllib3.exceptions.HTTPError as err:
        print(err)
        # Abort the cell with a traceback; nothing below can run without a response.
        raise

    interacting_proteins = []
    if response.status == 200:
        # Parse the JSON body returned by the API.
        network_edges = json.loads(response.data)

        # Collect both partner names of every entry, keeping first-seen order
        # and skipping names that were already collected.
        for entry in network_edges:
            for name_key in ("preferredName_A", "preferredName_B"):
                partner = entry[name_key]
                if partner not in interacting_proteins:
                    interacting_proteins.append(partner)

        protein_list.extend(interacting_proteins)
    else:
        print("\nSpecified gene was not found in String database, double check that you have it correctly!")
        protein_list.append(gene)

elif scope == "all":
    # Add every column of the proteomics data, as a plain list so that
    # protein_list has the same type in every branch.
    # NOTE(review): this uses the proteomics columns even when data_type is
    # not "proteomics" -- confirm that is intended.
    protein_list = list(proteomics.columns)

else:
    # None of the supported scopes was given.
    print("Invalid scope value entered! See step 3 above.")

# Display the list of proteins selected.
print("Protein List: \n")
for pro in protein_list:
    print(pro)

Protein List: 

SMARCC1
SUPT16H
CCND1
SMARCD3
CREBBP
SMARCB1
KAT2B
DNMT3A
SMARCA2
SMARCC2
KMT2D
ARID1A
SMARCE1
PHF10
BAZ1B
ARID1B
NF1
CDC5L
SMARCD1
SMARCD2
SMARCA4
ACTL6A
BCL7B
SS18
DPF2
DPF3
