In [16]:
pip install cptac

Collecting cptac
[?25l  Downloading https://files.pythonhosted.org/packages/00/ce/a0766d558f651af95b59b1219e1f211301340179a35b2ed8d56da68c802a/cptac-0.6.4-py3-none-any.whl (4.9MB)
[K     |████████████████████████████████| 4.9MB 1.6MB/s eta 0:00:01
Installing collected packages: cptac
Successfully installed cptac-0.6.4
Note: you may need to restart the kernel to use updated packages.


In [38]:
import pandas as pd
import numpy as np
import re
import sys
import operator
import collections
import os
import cptac
from datetime import date

In [39]:
#size=-1 to get all genes, but takes a long time
#if statements might need to be added in case keys not in dict when parsing

#this function will query wikipathways for all interactions of a given gene for the Homo Sapiens species
#some interactions won't be genes

import urllib3
import json

def findInteractions(geneName): #retuns list of interacting elements
    interactionList = []
    
    try:
        requestURL = "http://webservice.wikipathways.org/findInteractions?query=" + geneName + "&format=json" 
        http = urllib3.PoolManager()
        response = http.request('GET', requestURL)

        if response.status == 200: #if success, get data from API response
            responseData = response.data
            #print(type(responseData))
            responseDataDict = json.loads(responseData)
            #print(responseDataDict)
            resultList = list(responseDataDict.values())
            interactionsList = resultList[0]

            for interactionDict in interactionsList:

                if interactionDict["species"] == "Homo sapiens":
                    rightValuesList = interactionDict["fields"]["right"]["values"]
                    for value in rightValuesList:
                        if (value not in interactionList) and (geneName.casefold() not in value.casefold()):
                            interactionList.append(value)

                    leftValuesList = interactionDict["fields"]["left"]["values"]
                    for value in leftValuesList:
                        if (value not in interactionList) and (geneName.casefold() not in value.casefold()):
                            interactionList.append(value)
                else:
                    continue       
        else:
            print("Something went wrong with the response status")

        return interactionList
    except:
        print("There was a problem getting a response from the WikiPathways API")

In [40]:
#function to split each item with a newline into two items

#fix if genes are in parenthesis

def fixParsingErrors(interactingList):
    newlineFreeList = []
    for item in interactingList:
        item = item.strip()
        if "\n" in item:
            nList = item.split("\n")
            for n in nList:
                newlineFreeList.append(n)
        elif "\t" in item:
            tList = item.split("\t")
            for t in tList:
                newlineFreeList.append(t)
        elif item.startswith("(") and item.endswith(")"):
            item = item[1:(len(item) - 1)]
            print("HERE")
            newlineFreeList.append(item)
        else:
            newlineFreeList.append(item)
    return set(newlineFreeList)

In [41]:
def getUniprotGeneList():
    filePath = '/Users/coribushman/github/cptac/cptac/utils/Uniprot_Proteome.tsv'
    bioplex_interactions = pd.read_csv(filePath, sep='\t')
    uniprotList = []
    for ind, row in bioplex_interactions.iterrows():
        prots = str(row['Gene names'])
        #print(prots)
        prot_arr = prots.split(' ')
        #print(prot_arr)
        for prot in prot_arr:
            if prot not in uniprotList:
                uniprotList.append(prot)
    return uniprotList

In [46]:
def intersectWithUniprot(interactingGenes):
    #get uniprot genes from file
    uniprotGenes = getUniprotGeneList()
    
    #intersect
    geneSet = set(interactingGenes)
    uniprotSet = set(uniprotGenes)   #convert to set
    intersectSet = geneSet.intersection(uniprotSet)
    
    today = date.today()
    todayDate = today.strftime("%m/%d/%y")
    
    print("WikiPathways Interacting Proteins as of " + str(todayDate))
    return intersectSet

In [47]:
wikiPathwaysInterGenes = findInteractions("p53")
filteredInterGenes = fixParsingErrors(wikiPathwaysInterGenes)
finalGeneSet = intersectWithUniprot(filteredInterGenes)
print(finalGeneSet)

WikiPathways Interacting Proteins as of 10/25/19
{'NOXA1', 'SIRT1', 'CHK1', 'PTEN', 'ATR', 'EP300', 'IL2', 'MDM2', 'CCND1', 'CDKN1A', 'HIPK2', 'DCAKD', 'PRKDC', 'SFN', 'PIAS1', 'RELA', 'SIAH1', 'GADD45A', 'HDAC1', 'ATM', 'CHK2', 'CBP', 'BRCA1', 'MDM4', 'CSNK1D', 'IL6', 'PML', 'AURKA', 'FADD', 'PUMA'}


OLD FUNCTIONS

In [7]:
#METHOD 2: GO THROUGH EACH GENE IN MY UNFILTERED LIST AND SEE IF IT IS IN UNIPROT
        
import certifi
import json
import urllib3
#import requests, sys

def geneInUniprot(geneName):  #pass in the interacting list from findInteractions
    try:
        requestURL = "https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&gene=" + geneName + "&organism=homo%20sapiens&format=json"
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        response = http.request('GET', requestURL)
    except:
        return "n"

    if response.status == 200: #if success, get data from API response
        responseData = response.data
        responseDataList = json.loads(responseData)
        
        
        
        if (len(responseDataList) != 0) and ("accession" in responseDataList[0].keys()) and (geneName != "DNA"):
            return geneName
        else:
            return "n"
    else:
        #print("Something went wrong with the response status")
        return "n"
        
#p = geneInUniprot("TP53")
#print(p)

In [8]:
#NEW
#REMOVE DNA LATER

import certifi
import json
import urllib3
#import requests, sys

def geneInUniprotNew(geneList):  #pass in the interacting list from findInteractions
    try:
        requestURL = "https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&exact_gene=" + geneList + "&organism=homo%20sapiens&format=json"
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        response = http.request('GET', requestURL)
    except:
        return ["API request didn't work"]

    if response.status == 200: #if success, get data from API response
        responseData = response.data
        responseDataList = json.loads(responseData)
        
        filteredList = []
        for data in responseDataList:
            filteredList.append(data["gene"][0]["name"]["value"])
            #if "accession" in data.keys():
                #filteredList.append(data["gene"]["name"]["value"])
                
        filteredSet = set(filteredList)
        #print("geneList: " + geneList)
        #print("filteredSet: " + str(filteredSet))
        return filteredSet
        
    else:
        #print("Something went wrong with the response status")
        return ["response status is not 200"]

In [9]:
import time
start_time = time.time()

interactingList = findInteractions("TP53")

fixedInteractingList = fixNewlines(interactingList)
#print(sorted(fixedInteractingList))
#print("----------------------------------------------------------------")

goodList = []    
listForFunct = []
print(fixedInteractingList)

for gene in fixedInteractingList:
    listForFunct.append(gene)
    if len(listForFunct) == 20:
        inputStr = ",".join(listForFunct)
        print("inputStr: " + inputStr)
        goodList += (geneInUniprotNew(inputStr))
        listForFunct = []

if len(listForFunct) > 0:
    inputStr = ",".join(listForFunct)
    goodList += (geneInUniprotNew(inputStr))

print(goodList)

print("--- %s seconds ---" % (time.time() - start_time))

['Ubiquitin-mediated', 'degradation', 'SNAI2', 'Calcium', 'Substrate &eacute;', 'ATM', 'ATR', 'BLM', 'CDC25C', 'MDM2', 'del17p', '', 'MIR15A', 'MIR16-1', 'Chk2', 'DNA', 'CREBBP', 'ARF', '(CDKN2A)', 'MTA2', 'Accumulation DNA damage', 'SREBF1', 'Nuclear abnormalities', 'RCHY1', 'DNA damage', 'DNA Damage', 'MYC', 'HMGB1', 'APC', 'TGFB', 'OTUD5', 'RFC4', 'CTNNB1', 'SNURF', 'PLAC8', 'CHEK2', 'Hypoxia', 'nitric oxide', 'TP73 Delta Np73', 'SUMO-1', 'OTX2', 'TNFSF10', 'MAPK14', 'MAPK13', 'MAPK12', 'MAPK11', 'Chk1', 'UCHL1', 'MEG3', 'P85A_HUMAN', 'CDC42', 'CAPN1', 'CAPN2', 'MIR1285-2', 'GRIN1', 'RHOC', 'RHOA', 'ROCK2', 'RHOB', 'cenersen sodium', 'PRKAB1', 'PRKAG1', 'PRKAA1', 'IFI16', 'GSK3B', 'ING1', 'BCL2', 'ROS', 'EP300', 'MDM4', 'p14 ARF ', '(CDKN2A)', 'HPV E6', 'PCNA', 'MIR34B', 'MIR34A', 'MIR34C', 'Senescence', 'p21', 'Apoptosis', 'Cyclin B', 'CDK1', 'GADD45A', 'p53 pathway', 'BAX', 'CDKN1A', 'GADD45G', 'BAK1', 'POLK', 'DDB2', 'GADD45B', 'uc339', '(Entraper1)', 'NANOG', 'hsa-mir-143/145 cl

In [110]:
#Method 2: check individually if each gene in interacting list is a uniprot gene
#FASTER

import time
start_time = time.time()

interactingList = findInteractions("TP53")

#filteredGenes = [geneInUniprot(x) for x in interactingList if geneInUniprot(x) != "n"]

fixedInteractingList = fixNewlines(interactingList)

filteredGenes = []
for gene in fixedInteractingList:
    if geneInUniprot(gene) != "n":
        filteredGenes.append(gene)
print(len(filteredGenes))
print(filteredGenes)

print("--- %s seconds ---" % ((time.time() - start_time)))

123
['SNAI2', 'ATM', 'ATR', 'BLM', 'CDC25C', 'MDM2', 'Chk2', 'CREBBP', 'ARF', 'MTA2', 'SREBF1', 'RCHY1', 'MYC', 'HMGB1', 'APC', 'TGFB', 'OTUD5', 'RFC4', 'CTNNB1', 'SNURF', 'PLAC8', 'CHEK2', 'OTX2', 'TNFSF10', 'MAPK14', 'MAPK13', 'MAPK12', 'MAPK11', 'Chk1', 'UCHL1', 'CDC42', 'CAPN1', 'CAPN2', 'GRIN1', 'RHOC', 'RHOA', 'ROCK2', 'RHOB', 'PRKAB1', 'PRKAG1', 'PRKAA1', 'IFI16', 'GSK3B', 'ING1', 'BCL2', 'ROS', 'EP300', 'MDM4', 'PCNA', 'CDK1', 'GADD45A', 'BAX', 'CDKN1A', 'GADD45G', 'BAK1', 'POLK', 'DDB2', 'GADD45B', 'NANOG', 'PMAIP1', 'BBC3', 'BOK', 'THBS1', 'BCL2L11', 'CDKN1C', 'CDKN1B', 'ATAD2', 'RAD17', 'UBE2T', 'COL9A3', 'COL9A1', 'MTDH', 'LBR', 'EGFR', 'RFC3', 'UBE2C', 'FANCI', 'CEBPZ', 'RNF144B', 'S100A6', 'PIDD', 'CDK5R1', 'SLC11A2', 'ING2', 'SERPINE1', 'SESN1', 'RRM2B', 'FAS', 'TNFRSF10B', 'CCNB3', 'CCNB2', 'CCNB1', 'SFN', 'SAT2', 'SAT1', 'CASP8', 'SESN2', 'DDIT4', 'CDC2', 'ATF2', 'ELK1', 'p38', 'MAPK8', 'JNK', 'MAPK9', 'MAPK10', 'BMF', 'BID', 'BNIP3L', 'BNIP3', 'HIF1A', 'ARNT', 'RB1', 

In [5]:
#this function will filter out all interactions that aren't proteins in UNIPROT
#DON'T DO THIS, TAKES TOOO LONG
import certifi
import json
import urllib3
#import requests, sys

def filterInteractionList(interactionList):  #pass in the interacting list from findInteractions
    geneInteractionsUNIPROT = []
    filteredGenes = []
    try:
        #requestURL = "https://www.ebi.ac.uk/proteins/api/proteins?"
        requestURL = "https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&organism=homo%20sapiens&format=json"
        #response = requests.get(requestURL)
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        print("1")
        response = http.request('GET', requestURL)
        print("2")
    except:
        print("There was a problem getting a response from the UNIPROT API")
        return

    if response.status == 200: #if success, get data from API response

        responseData = response.data
        responseDataList = json.loads(responseData)
        #print("num total genes: " + str(len(responseDataList)))
        for accessionDict in responseDataList:
            if "gene" in accessionDict.keys():
                aList = accessionDict["gene"]
                for aDict in aList:
                    if ("name" in aDict.keys()) and ("value" in aDict["name"]):
                        aGene = aDict["name"]["value"]
                        if aGene not in geneInteractionsUNIPROT:
                            geneInteractionsUNIPROT.append(aGene)
        for gene in interactionList:
            if gene in geneInteractionsUNIPROT:
                filteredGenes.append(gene)
        return filteredGenes
    else:
        print("Something went wrong with the response status")

In [6]:
#Method 1: get a list of all uniprot genes and intersect them with interacting genes to filter out non-genes
#time it
#save uniprot proteins to a file and put it in cptac
interactingList = findInteractions("TP53")
fixedInteractingList = fixNewlines(interactingList)
finalGeneList = filterInteractionList(fixedInteractingList)
#print(len(finalGeneList))
print(finalGeneList)

1
There was a problem getting a response from the UNIPROT API
None


Wiki Pathways: query for interacting genes given one gene

In [35]:
import certifi
import json
import urllib3
#import requests, sys

def tryMe():
    try:
        requestURL = "https://www.uniprot.org/uniprot/?query=homo%20sapiens&columns=id%2Centry%20name%2Creviewed%2Cprotein%20names%2Cgenes%2Corganism%2Clength&sort=score&format=json"
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        print("1")
        response = http.request('GET', requestURL)
        print("2")
    except:
        print("There was a problem getting a response from the UNIPROT API")
        return
    if response.status == 200:
        responseData = response.data
        responseDataList = json.loads(responseData)
        print(str(reponseDataList))
        
tryMe()

1
2
