In [31]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
import statsmodels.stats.multitest
import sys
import urllib3
import json
import operator
import collections

import CPTAC.Endometrial as CPTAC

In [40]:
'''
@Param protein:
    The name of the protein that you want to generate a list of interacting proteins for.

@Param number (default=25):
    The number of interacting proteins that you want to get.
    
@Return:
    A list of proteins known by the String api to be interacting partners with the specified protein.
    Returns None if specified protein isn't found in String database, or connection to String api fails.
    
    
This method takes as a parameter the name of a protein. It then accesses the STRING database, through
a call to their public API, and generates a list of proteins known to be interacting partners with the specified
protein. Optional second parameter is number (which by default is 25), which specifies in the API call how many
interacting partners to retrieve from the database. The list of interacting proteins is returned to the caller
as a python list.
'''

def get_interacting_proteins_string(protein, number=25):
    '''Use urllib3 to access the string database api, gather list of interacting proteins'''
    urllib3.disable_warnings()
    string_api_url = "https://string-db.org/api"
    output_format = "json"
    method = "network"

    '''Use the specified gene and homo sapiens species code'''
    my_protein = [protein]
    species = "9606"

    '''Format the api request to collect the appropriate information'''
    request_url = string_api_url + "/" + output_format + "/" + method + "?"
    request_url += "identifiers=%s" % "%0d".join(my_protein)
    request_url += "&" + "species=" + species
    request_url += "&" + "limit=" + str(number)

    '''Send a request to the API, print the response status'''
    try:
        http = urllib3.PoolManager()
        response = http.request('GET',request_url)
        '''Catch exception if it fails while accessing the api'''
    except urllib3.HTTPError as err:
        error_message = err.read()
        print("Error accessing STRING api, " , error_message)
        sys.exit()
    
    '''Get the data from the api response'''
    interacting_proteins = []
    if response.status == 200: 
        '''Get the data from the API's response'''
        data = response.data
        y = json.loads(data)

        '''Make a list of the resulting interacting proteins'''
        for entry in y:
            if entry["preferredName_A"] not in interacting_proteins:
                interacting_proteins.append(entry["preferredName_A"])
            if entry["preferredName_B"] not in interacting_proteins:
                interacting_proteins.append(entry["preferredName_B"])
        
        return interacting_proteins
        
        '''If we didnt get a successful response from the api, notify the caller and return None'''
    else:
        print("\nSpecified gene was not found in String database, double check that you have it correctly!")
        return None

In [66]:
'''
@Param protein:
    The name of the protein that you want to generate a list of interacting proteins for.

@Param number (default=25):
    The number of interacting proteins that you want to get.
    
@Return:
    A list of proteins known by the biogrid api to be interacting partners with the specified protein.
    Returns None if specified protein isn't found in biogrid database, or connection to biogrid api fails.
    
    
This method takes as a parameter the name of a protein. It then accesses the biogrid database, through
a call to their public API, and generates a list of proteins known to be interacting partners with the specified
protein. Optional second parameter is number (which by default is 25), which specifies in the API call how many
interacting partners to retrieve from the database. The list of interacting proteins is returned to the caller
as a python list.
'''
def get_interacting_proteins_biogrid(protein, number=25):
    '''Store interacting proteins in a list'''
    interacting_proteins = []
    urllib3.disable_warnings()
    
    '''Configure url for request'''
    request_url = "https://webservice.thebiogrid.org/interactions/?searchNames=true&geneList=" + protein +"&includeInteractors=true&format=json&taxId=9606&start=0&max=" + str(number) + "&accesskey=0ff59dcf3511928e78aad499688381c9"
    try:
        '''Send request, get response'''
        http = urllib3.PoolManager()
        response = http.request('GET',request_url)
        
        '''If response was successful'''
        if response.status == 200: 
            '''Get the data from the API's response'''
            data = response.data
            y = json.loads(data)
            
            '''Add name of each protein to list of interacting proteins'''
            for entry in y:
                if y[entry]['OFFICIAL_SYMBOL_A'] not in interacting_proteins:
                    interacting_proteins.append(y[entry]['OFFICIAL_SYMBOL_A'])
            
            '''Return this list to caller'''
            return interacting_proteins
        
        else:
            '''If response was not successful, notify caller of error, return None'''
            print("Error accessing api!")
            return None
        
        '''Catch exception, notify caller of errorm return None'''
    except Exception as err:
        print("Error accessing api, " , err)
        return None

In [42]:
'''
@Param protein:
    The name of the protein that you want to generate a list of interacting proteins for.

@Param number (default=25):
    The number of interacting proteins that you want to get from both STRING and BioGrid(used by uniprot). This 
    number of proteins will be generated by both String and BioGrid, and the two will be combined. The actual number of 
    proteins in the list returned by this method will be between the number specified and 2 times the number specified, 
    depending on how many of the interacting proteins the two APIs 'agree' on.
    
@Return:
    A list of proteins known by the String and BioGrid APIs to be interacting partners with the specified protein.
    Returns None if specified protein isn't found in either database, or both API calls fail.
    
    
This method takes as a parameter the name of a protein. It then accesses the STRING and BioGrid databases, through
a call to their public API, and generates a list of proteins known to be interacting partners with the specified
protein. Optional second parameter is number (which by default is 25), which specifies in the API call how many
interacting partners to retrieve from the database. The list of interacting proteins is returned to the caller
as a python list.
'''
def get_interacting_proteins(protein, number=25):
    string_list = get_interacting_proteins_string(protein, number)
    biogrid_list = get_interacting_proteins_biogrid(protein, number)
    
    if string_list == None and biogrid_list == None:
        return None
    
    else:
        interacting_proteins = []
        for prot in string_list:
            if prot not in interacting_proteins:
                interacting_proteins.append(prot)
        for prot in biogrid_list:
            if prot not in interacting_proteins:
                interacting_proteins.append(prot)

        return interacting_proteins

## Test 1

In [43]:
ips = get_interacting_proteins_string('PTEN', number=20)
print(ips)

['PDGFRB', 'PIK3R2', 'PIK3C3', 'CSNK2A2', 'CSNK2A1', 'SLC9A3R1', 'USP13', 'PIK3CA', 'TP53', 'PIK3CB', 'PTK2', 'USP7', 'XIAP', 'PTEN', 'PREX2', 'MAST2', 'ROCK1', 'NEDD4', 'INPP4B', 'PIK3R1', 'AKT1']


## Test 2

In [44]:
ips = get_interacting_proteins_string('ARID1A', number=50)
print(ips)

['CCND1', 'ACTL6B', 'NR3C1', 'WDR77', 'SMARCC1', 'SUPT16H', 'CDK4', 'SMARCD3', 'CREBBP', 'SMARCB1', 'EP300', 'KAT2B', 'DNMT3A', 'SMARCA2', 'SMARCC2', 'GTF2E1', 'RUNX1', 'KMT2D', 'CHAF1A', 'PRMT5', 'ARID1A', 'SMARCE1', 'PHF10', 'BAZ1B', 'ARID1B', 'HMGB1', 'ACTB', 'NF1', 'GTF2B', 'JUN', 'CDC5L', 'BCL7C', 'IRF4', 'IRF2', 'GTF2F1', 'PBRM1', 'SMARCD1', 'RELA', 'SMARCD2', 'SMARCA4', 'TOP2B', 'ACTL6A', 'BCL7B', 'SS18', 'CBFB', 'RXRA', 'DPF2', 'BCL7A', 'VDR', 'DPF3', 'POLR2A']


## Test 3

In [45]:
ips = get_interacting_proteins_string('BADNAME', number=50)
print(ips)


Specified gene was not found in String database, double check that you have it correctly!
None


## Comparison with BioGrid

In [77]:
ips = get_interacting_proteins_string('ARID1A', number=25)
print("STRING:")
for i in ips:
    print(i)
    
ips2 = get_interacting_proteins_biogrid('ARID1A', number=25)
print("\nBioGrid:")
for i in ips2:
    print(i)
    
shared = 0
for i in ips2:
    if i not in ips:
        shared+=1
        
print("\n" + str(shared) + " in 2nd not in first.")

STRING:
SMARCC1
SUPT16H
CCND1
SMARCD3
CREBBP
SMARCB1
KAT2B
DNMT3A
SMARCA2
SMARCC2
KMT2D
ARID1A
SMARCE1
PHF10
BAZ1B
ARID1B
NF1
CDC5L
SMARCD1
SMARCD2
SMARCA4
ACTL6A
BCL7B
SS18
DPF2
DPF3

BioGrid:
KLF1
GATA1
SMARCB1
SMARCA4
SMARCA2
ING1
SMARCE1
ARID1A
SMARCC2
SMARCC1
PGR
BCL7C
HIC1

6 in 2nd not in first.


In [76]:
all_ips = get_interacting_proteins('LAMB1')
print(all_ips)
print(len(all_ips))

['ITGB4', 'ITGA3', 'HGF', 'LAMB1', 'NID2', 'LAMA4', 'LAMA5', 'LAMC1', 'ITGAV', 'LAMC2', 'NID1', 'ITGA1', 'ITGA2', 'LAMA3', 'FBN1', 'FN1', 'COL18A1', 'LAMC3', 'HSPG2', 'AGRN', 'LAMA1', 'ITGB1', 'CYR61', 'LAMA2', 'DAG1', 'ITGB3', 'TGFB1', 'ATXN7', 'CACNA1A', 'Ogg1', 'Lama1', 'UBC', 'ATXN7L2', 'GFI1B', 'FBXO6', 'DPY30', 'HSPD1', 'PDIA3']
38
