In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
import statsmodels.stats.multitest
import sys
import urllib3
import json
import operator
import collections

import CPTAC.Endometrial as CPTAC

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


In [2]:
'''
@Param protein:
    The name of the protein that you want to generate a list of interacting proteins for.

@Param number (default=25):
    The number of interacting proteins that you want to get.
    
@Return:
    A list of proteins known by the String api to be interacting partners with the specified protein.
    Returns None if specified protein isn't found in String database, or connection to String api fails.
    
    
This method takes as a parameter the name of a protein. It then accesses the STRING database, through
a call to their public API, and generates a list of proteins known to be interacting partners with the specified
protein. Optional second parameter is number (which by default is 25), which specifies in the API call how many
interacting partners to retrieve from the database. The list of interacting proteins is returned to the caller
as a python list.
'''

def get_interacting_proteins_string(protein, number=25):
    '''Use urllib3 to access the string database api, gather list of interacting proteins'''
    urllib3.disable_warnings()
    string_api_url = "https://string-db.org/api"
    output_format = "json"
    method = "network"

    '''Use the specified gene and homo sapiens species code'''
    my_protein = [protein]
    species = "9606"

    '''Format the api request to collect the appropriate information'''
    request_url = string_api_url + "/" + output_format + "/" + method + "?"
    request_url += "identifiers=%s" % "%0d".join(my_protein)
    request_url += "&" + "species=" + species
    request_url += "&" + "limit=" + str(number)

    '''Send a request to the API, print the response status'''
    try:
        http = urllib3.PoolManager()
        response = http.request('GET',request_url)
        '''Catch exception if it fails while accessing the api'''
    except urllib3.HTTPError as err:
        error_message = err.read()
        print("Error accessing STRING api, " , error_message)
        sys.exit()
    
    '''Get the data from the api response'''
    interacting_proteins = []
    if response.status == 200: 
        '''Get the data from the API's response'''
        data = response.data
        y = json.loads(data)

        '''Make a list of the resulting interacting proteins'''
        for entry in y:
            if entry["preferredName_A"] not in interacting_proteins:
                interacting_proteins.append(entry["preferredName_A"])
            if entry["preferredName_B"] not in interacting_proteins:
                interacting_proteins.append(entry["preferredName_B"])
        
        return interacting_proteins
        
        '''If we didnt get a successful response from the api, notify the caller and return None'''
    else:
        print("\nSpecified gene was not found in String database, double check that you have it correctly!")
        return None

In [None]:
def get_iteracting_proteins_biogrid(gene_name, number=25):
    '''Store interacting proteins in a list'''
    interacting_proteins = []
    urllib3.disable_warnings()
    
    '''Configure url for request'''
    request_url = "https://webservice.thebiogrid.org/interactions/?searchNames=true&excludeGenes=true&geneList=" + gene_name +"&includeInteractors=true&format=json&taxId=9606&start=0&max=" + str(number) + "&accesskey=0ff59dcf3511928e78aad499688381c9"
    try:
        '''Send request, get response'''
        http = urllib3.PoolManager()
        response = http.request('GET',request_url)
        
        '''If response was successful'''
        if response.status == 200: 
            '''Get the data from the API's response'''
            data = response.data
            y = json.loads(data)
            
            '''Add name of each protein to list of interacting proteins'''
            for entry in y:
                if y[entry]['OFFICIAL_SYMBOL_A'] not in interacting_proteins:
                    interacting_proteins.append(y[entry]['OFFICIAL_SYMBOL_A'])
            
            '''Return this list to caller'''
            return interacting_proteins
        
        else:
            '''If response was not successful, notify caller of error, return None'''
            print("Error accessing api!")
            return None
        
        '''Catch exception, notify caller of errorm return None'''
    except Exception as err:
        print("Error accessing api, " , err)
        return None

## Test 1

In [3]:
ips = get_interacting_proteins('PTEN', number=20)
print(ips)

['PDGFRB', 'PIK3R2', 'PIK3C3', 'CSNK2A2', 'CSNK2A1', 'SLC9A3R1', 'USP13', 'PIK3CA', 'TP53', 'PIK3CB', 'PTK2', 'USP7', 'XIAP', 'PTEN', 'PREX2', 'MAST2', 'ROCK1', 'NEDD4', 'INPP4B', 'PIK3R1', 'AKT1']


## Test 2

In [4]:
ips = get_interacting_proteins('ARID1A', number=50)
print(ips)

['CCND1', 'ACTL6B', 'NR3C1', 'WDR77', 'SMARCC1', 'SUPT16H', 'CDK4', 'SMARCD3', 'CREBBP', 'SMARCB1', 'EP300', 'KAT2B', 'DNMT3A', 'SMARCA2', 'SMARCC2', 'GTF2E1', 'RUNX1', 'KMT2D', 'CHAF1A', 'PRMT5', 'ARID1A', 'SMARCE1', 'PHF10', 'BAZ1B', 'ARID1B', 'HMGB1', 'ACTB', 'NF1', 'GTF2B', 'JUN', 'CDC5L', 'BCL7C', 'IRF4', 'IRF2', 'GTF2F1', 'PBRM1', 'SMARCD1', 'RELA', 'SMARCD2', 'SMARCA4', 'TOP2B', 'ACTL6A', 'BCL7B', 'SS18', 'CBFB', 'RXRA', 'DPF2', 'BCL7A', 'VDR', 'DPF3', 'POLR2A']


## Test 3

In [5]:
ips = get_interacting_proteins('BADNAME', number=50)
print(ips)


Specified gene was not found in String database, double check that you have it correctly!
None


## Comparison with BioGrid

In [16]:
ips = get_interacting_proteins('PIK3CA', number=25)
print("STRING:")
for i in ips:
    print(i)
    
ips2 = generate_iteracting_from_BioGrid('PIK3CA', number=25)
print("\nBioGrid:")
for i in ips2:
    print(i)
    
shared = 0
for i in ips2:
    if i not in ips:
        shared+=1
        
print("\n" + str(shared) + " in 2nd not in first.")

STRING:
PIK3R2
TNS1
RPS6KB1
KRAS
PDGFRA
PIK3R3
AKT3
PIK3CA
ERBB3
IGF1R
ERBB2
EGFR
GNAQ
KIT
MRAS
IRS1
CTNNB1
NRAS
PTEN
IRS2
AKT2
CDC42
ESR1
HRAS
PIK3R1
AKT1

BioGrid:
MAP2K4
MYPN
ACVR1
GATA2
RPA2
ARF1
ARF3
XRN1
APP
APLP1
CITED2
EP300
APOB
ARRB2
CSF1R
PRRC2A
LSM1
SLC4A1
BCL3
ADRB1
BRCA1

21 in 2nd not in first.
