In [32]:
import json
import requests
from tqdm import tqdm

import rdflib
from rdflib import Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD

def load_text_file(path):
    content = None
    with open(path, encoding="utf8") as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    return content

def load_tsv_id_file(path):
    content = load_text_file(path)
    ids2label = dict()
    for line in content:
        try:
            code, label = line.split("\t")
        except:
            print(line)
        ids2label[code] = label
    return ids2label

def add_node(node):
    all_nodes.add(node)
    if node.startswith("NCBIGENE:"):
        ncbigenes.add(node[len("NCBIGENE:"):])
    elif node.startswith("PUBCHEM.COMPOUND:"):
        pubchemcompounds.add(node[len("PUBCHEM.COMPOUND:"):])
    elif node.startswith("CL:"):
        clss.add(node)
    elif node.startswith("GO:"):
        gos.add(node)
    elif node.startswith("UBERON:"):
        uberons.add(node)
    elif node.startswith("DOID:"):
        doids.add(node)
    elif node.startswith("HP:"):
        hps.add(node)
    elif node.startswith("KEGG:"):
        keggs.add(node)
    elif node.startswith("REACTOME:"):
        reactomes.add(node)
        
def read_nodes(path):
    content = None
    with open(path) as f:
        content = f.readlines()
    content = [x.strip() for x in content]

    for line in content:
        head,rel,tail,_,_ = line.split("\t")
        add_node(head)
        add_node(tail)
        
def divide_chunks(l, n):
    l = list(l)
    for i in range(0, len(l), n): 
        yield l[i:i + n]

In [2]:
labels_path = r"./workspace/labels"
graph_path = r"./workspace/graphs"
edge_path = r"C:\Users\ottsi\Downloads\HQ_DIR\graph_files\edges.csv"

train_path = r"C:\Users\ottsi\OneDrive\MedUni\OpenBioLink\SAFRAN\Evaluations\OBL\train.txt"
valid_path = r"C:\Users\ottsi\OneDrive\MedUni\OpenBioLink\SAFRAN\Evaluations\OBL\valid.txt"
test_path = r"C:\Users\ottsi\OneDrive\MedUni\OpenBioLink\SAFRAN\Evaluations\OBL\test.txt"

## Read nodes from graph

In [3]:
all_nodes = set()
ncbigenes = set()
clss = set()
uberons = set()
gos = set()
doids = set()
hps = set()
pubchemcompounds = set()
keggs = set()
reactomes = set()

read_nodes(edge_path)

# APIs

## NCBIGENE, PUBCHEM.COMPOUND

In [7]:
labels_file = open(f"{labels_path}/obl_labels.txt", "w", encoding="utf8")
descr_file = open(f"{labels_path}/obl_descriptions.txt", "w", encoding="utf8")

## GENE

In [11]:

ontology = "gene"
step_size = 500

results = dict()
count = 0
ids_str = ""
for x in tqdm(ncbigenes):
    ids_str = ids_str + str(x) + ","
    count = count + 1
    if count >= step_size:
        ids_str = ids_str[0:-1]
        response = requests.post(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={ontology}&rettype=docsum&retmode=json", data={'id':f'{ids_str}'}, headers={'content-type': 'application/x-www-form-urlencoded'})
        response_json = response.json()
        for key,value in response_json["result"].items():
            if key != 'uids':
                results[key] = value
        ids_str = ""
        count = 0

if count > 0:
    response = requests.post(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={ontology}&rettype=docsum&retmode=json", data={'id':f'{ids_str}'}, headers={'content-type': 'application/x-www-form-urlencoded'})
    response_json = response.json()
    for key,value in response_json["result"].items():
            if key != 'uids':
                results[key] = value

for x in results:
    labels_file.write("NCBIGENE:" + x + "\t" + results[x]["description"] + "\n")
    if results[x]["summary"] != "":
        descr_file.write("NCBIGENE:" + x + "\t" + results[x]["summary"] + "\n")


100%|██████████| 19598/19598 [02:59<00:00, 108.92it/s]


## COMPOUND

In [101]:


ids_str = []
step_size = 10000
count = 0

for x in tqdm(pubchemcompounds):
    ids_str.append('{"cid":"' + x + '"}')
    count = count + 1
    if count >= step_size:
        ids_str = ",".join(ids_str)
        response = requests.post('https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=json', data={'query':'{"select":"*","collection":"compound","where":{"ors":[' + ids_str + ']},"order":["cid,asc"],"start":1,"limit":10000,"width":1000000,"listids":0}'}, headers={'content-type': 'application/x-www-form-urlencoded'})
        response_json = response.json()
        for entry in response_json["SDQOutputSet"][0]["rows"]:
            labels_file.write("PUBCHEM.COMPOUND:" + str(entry["cid"]) + "\t" + entry["cmpdname"] + "\n")

        ids_str = []
        count = 0

if count > 0:
    ids_str = ",".join(ids_str)
    response = requests.post('https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=json', data={'query':'{"select":"*","collection":"compound","where":{"ors":[' + ids_str + ']},"order":["cid,asc"],"start":1,"limit":10000,"width":1000000,"listids":0}'}, headers={'content-type': 'application/x-www-form-urlencoded'})
    response_json = response.json()

    print(len(response_json["SDQOutputSet"][0]["rows"]))
    for entry in response_json["SDQOutputSet"][0]["rows"]:
        labels_file.write("PUBCHEM.COMPOUND:" + str(entry["cid"]) + "\t" + entry["cmpdname"] + "\n")



100%|██████████| 77635/77635 [02:37<00:00, 491.83it/s]
7402


In [6]:
"""
PUBCHEM.COMPOUND
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/1983,1984/description/JSON
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/1983,1984/synonyms/JSON
"""

json_file = open(f"{labels_path}/compound.txt", "w")
json_file.write("[\n")
results = []
for x in tqdm(divide_chunks(pubchemcompounds, 10), total=int(len(pubchemcompounds)/10)):
    ids_str = ",".join(x)
    response = requests.get(f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{ids_str}/description/JSON")
    response_json = response.json()

    results.extend(response_json["InformationList"]["Information"])
    json_file.write(json.dumps(response_json["InformationList"]["Information"]) + ",\n")
    json_file.flush()

json_file.write("]\n")
json_file.close()


7764it [3:39:33,  1.70s/it]


In [15]:
from collections import defaultdict

jayson = None
with open(f"{labels_path}/compound.txt") as infile:
    jayson = json.load(infile)


cids = defaultdict(dict)

for part in jayson:
    for ele in part:
        assert ("Title" in ele and "Description" not in ele) or ("Title" not in ele and "Description" in ele), "Big oof"
        if "Title" in ele:
            cids[ele["CID"]]["Title"] = ele["Title"]
        elif "Description" in ele:
            if "Description" not in cids[ele["CID"]]:
                cids[ele["CID"]]["Description"] = []
            cids[ele["CID"]]["Description"].append(ele["Description"])
        else:
            print("HM")
            print(ele)

import json
with open(f'{labels_path}/result.json', 'w') as fp:
    json.dump(cids, fp, indent=4)


In [10]:
json_file = None
with open(f'{labels_path}/result.json') as fp:
    json_file = json.load(fp)

for x in json_file:
    labels_file.write("PUBCHEM.COMPOUND:" + x + "\t" + json_file[x]["Title"] + "\n")
    if "Description" in json_file[x] and len(json_file[x]["Description"]) > 0:
        descr_file.write("PUBCHEM.COMPOUND:" + x + "\t" + json_file[x]["Description"][0] + "\n")


## UBERON, HP, GO, DOID, CL

In [9]:


import requests
import json


"""
uberon, hp, go, doid, cl

# to prettify
python -m json.tool uberon.txt uberon.json

"""

for ontology, ids in [("hp", hps), ("uberon", uberons), ("go", gos), ("doid", doids), ("cl", clss)]:
    print(ontology)
    response = requests.get(f"http://www.ebi.ac.uk/ols/api/ontologies/{ontology}/terms?page=0&size=500")
    totalPages = int(response.json()["page"]["totalPages"])
    for page in tqdm(range(0,totalPages)):
        response = requests.get(f"http://www.ebi.ac.uk/ols/api/ontologies/{ontology}/terms?page={page}&size=500")
        response_json = response.json()
        for x in response_json["_embedded"]["terms"]:
            if x["obo_id"] in ids:
                labels_file.write(x["obo_id"] + "\t" + x["label"] + "\n")
                if "description" in x and x["description"] != None and len(x["description"]) > 0:
                    descr_file.write(x["obo_id"] + "\t" + x["description"][0] + "\n")
                elif "annotation" in x and x["annotation"] != None:
                    if "definition" in x["annotation"] and x["annotation"]["definition"] != None and len(x["annotation"]["definition"]) > 0:
                        descr_file.write(x["obo_id"] + "\t" + x["annotation"]["definition"][0] + "\n")
                elif "obo_definition_citation" in x and x["obo_definition_citation"] != None and len(x["obo_definition_citation"]) > 0:
                        descr_file.write(x["obo_id"] + "\t" + x["obo_definition_citation"][0]["definition"] + "\n")


hp
100%|██████████| 53/53 [01:59<00:00,  2.26s/it]
uberon
100%|██████████| 32/32 [01:33<00:00,  2.93s/it]
go
100%|██████████| 102/102 [03:34<00:00,  2.10s/it]
doid
100%|██████████| 36/36 [01:47<00:00,  3.00s/it]
cl
100%|██████████| 21/21 [00:51<00:00,  2.46s/it]


## KEGG

In [8]:
"""
KEGG
KEGG:hsa00232 

"""
ontology = "kegg"
response = requests.get(f"http://rest.kegg.jp/list/pathway/hsa")
for x in tqdm(response.text.split("\n")):
    if x != "":
        code, label = x.split("\t")
        code = code.replace("path:", "KEGG:")
        label = label.split(" - ")[0]
        if code in keggs:
            response = requests.get(f"http://rest.kegg.jp/get/{code.replace('KEGG:', '')}")
            response_text = response.text
            response_text = response_text.split("\n")
            for line in response_text:
                if line.startswith("DESCRIPTION"):
                    descr_file.write(code + "\t" + line[11:].strip() + "\n")
            labels_file.write(code + "\t" + label + "\n")

100%|██████████| 345/345 [02:50<00:00,  2.03it/s]


## REACTOME

In [5]:
for code in tqdm(reactomes):
    code = code.replace("REACTOME:", "")
    response = requests.get(f"https://reactome.org/ContentService/data/query/{code}")
    try:
        response_json = response.json()
        labels_file.write("REACTOME:" + code + "\t" + response_json["name"][0] + "\n")

        if "summation" in response_json and response_json["summation"] != None:
            descr_file.write("REACTOME:" + code + "\t" + response_json["summation"][0]["text"] + "\n")
        else:
            print(code)
    except Exception as e:
        print(f"EXCEPTION {e} {code}")

  4%|▍         | 71/1860 [00:32<14:12,  2.10it/s]EXCEPTION 'name' R-HSA-977442
  6%|▌         | 104/1860 [00:45<11:38,  2.51it/s]EXCEPTION 'name' R-HSA-1980148
  6%|▌         | 108/1860 [00:46<10:18,  2.83it/s]EXCEPTION 'name' R-HSA-629602
  6%|▌         | 113/1860 [00:48<11:07,  2.62it/s]EXCEPTION 'name' R-HSA-69298
  6%|▋         | 119/1860 [00:51<11:52,  2.44it/s]EXCEPTION 'name' R-HSA-71182
  7%|▋         | 122/1860 [00:52<11:35,  2.50it/s]EXCEPTION 'name' R-HSA-194223
  7%|▋         | 135/1860 [00:57<11:19,  2.54it/s]EXCEPTION 'name' R-HSA-73847
  7%|▋         | 136/1860 [00:58<10:56,  2.63it/s]EXCEPTION 'name' R-HSA-70153
  8%|▊         | 143/1860 [01:00<10:54,  2.62it/s]EXCEPTION 'name' R-HSA-1222352
  9%|▉         | 167/1860 [01:09<10:08,  2.78it/s]EXCEPTION 'name' R-HSA-428808
 10%|█         | 191/1860 [01:19<11:30,  2.42it/s]EXCEPTION 'name' R-HSA-174800
 11%|█         | 196/1860 [01:21<11:07,  2.49it/s]EXCEPTION 'name' R-HSA-1980150
 11%|█         | 209/1860 [01:26<10:21,  2

In [12]:
labels_file.close()
descr_file.close()

# RDF graph

In [40]:
identifiers = Namespace("http://identifiers.org/")
OBO = Namespace("http://www.geneontology.org/formats/oboInOwl#")
ai = Namespace("http://ai-strategies.org/ns/")

g = rdflib.Graph()

g.bind("id", identifiers)
g.bind("rdfs", RDFS)
g.bind("oboInOwl", OBO)
g.bind("ai", ai)

In [42]:

ids2label = load_tsv_id_file("./workspace/labels/obl_labels.txt")
ids2descr = load_tsv_id_file("./workspace/labels/obl_descriptions.txt")

for node in tqdm(all_nodes):

    type = ""
    if node.startswith("NCBIGENE:"):
        type = "Gene"
    elif node.startswith("PUBCHEM.COMPOUND:"):
        type = "Drug"
    elif node.startswith("UBERON:"):
        type = "Anatomy"
    elif node.startswith("DOID:"):
        type = "Disease"
    elif node.startswith("GO:"):
        type = "GO"
    elif node.startswith("HP:"):
        type = "Phenotype"
    elif node.startswith("CL:"):
        type = "Anatomy"
    elif node.startswith("KEGG:"):
        type = "Pathway"
    elif node.startswith("REACTOME:"):
        type = "Pathway"
    
    if node in ids2label:
        g.add((
            identifiers.term(node),
            RDFS.label,
            rdflib.Literal(ids2label[node], datatype=XSD.string)
        ))

        g.add((
            identifiers.term(node),
            RDF.type,
            rdflib.Literal(type, datatype=XSD.string)
        ))

        g.add((
            identifiers.term(node),
            ai.wwwresource,
            rdflib.Literal("http://identifiers.org/" + node, datatype=XSD.string)
        ))

        if node in ids2descr:
            g.add((
                identifiers.term(node),
                RDFS.comment,
                rdflib.Literal(ids2descr[node], datatype=XSD.string)
            ))

  1%|          | 1219/184667 [00:00<00:30, 6108.13it/s]
100%|██████████| 184667/184667 [00:33<00:00, 5550.32it/s]


In [43]:
relations = {
    "GENE_PHENOTYPE": "associated with",
    "GENE_EXPRESSED_ANATOMY": "is expressed in",
    "GENE_BINDING_GENE": "is in an binding interaction with",
    "GENE_UNDEREXPRESSED_ANATOMY": "can be underexpressed in",
    "GENE_GENE": "is in an interaction with",
    "GENE_REACTION_GENE": "is in a reaction with",
    "DRUG_REACTION_GENE": "is reacting with",
    "GENE_GO": "associated with",
    "GENE_PATHWAY": "is part of pathway",
    "GENE_OVEREXPRESSED_ANATOMY": "can be overexpressed in",
    "GENE_DRUG": "associated with",
    "DRUG_CATALYSIS_GENE": "is catalyzed by",
    "DRUG_BINDING_GENE": "is binding to",
    "PART_OF": "is part of",
    "GENE_INHIBITION_GENE": "is in an inhibition interaction with",
    "DRUG_INHIBITION_GENE": "is inhibiting",
    "DRUG_PHENOTYPE": "can cause",
    "IS_A": "is a",
    "GENE_CATALYSIS_GENE": "is in an catalysis interaction with",
    "GENE_ACTIVATION_GENE": "is in an activation interaction with",
    "DIS_DRUG": "treated with (indication)",
    "DRUG_ACTIVATION_GENE": "is activating",
    "DIS_PHENOTYPE": "has overservable characteristic",
    "GENE_PTMOD_GENE": "is in an ptmod interaction with",
    "DRUG_BINDINH_GENE": "is binding to and inhibiting",
    "GENE_DIS": "associated with",
    "DRUG_BINDACT_GENE": "is binding to and activating",
    "GENE_EXPRESSION_GENE": "is in an expression interaction with"
}

for relation, label in relations.items():
    g.add((
        identifiers.term(relation),
        RDFS.label,
        rdflib.Literal(label, datatype=XSD.string)
    ))

## Serialize graph

In [44]:

g.serialize(graph_path + "/obl_with_labels.ttl",format="turtle")

In [45]:
outfile = open(graph_path + "/obl_with_labels.ttl", 'a')

def read_set(path, typ):
    content = None
    with open(path) as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    for line in content:
        head,rel,tail = line.split("\t")
        outfile.write(f"<<<http://identifiers.org/{head}> <http://identifiers.org/{rel}> <http://identifiers.org/{tail}>>> ai:split ai:{typ} . " + "\n")

read_set(train_path, 'train')
read_set(test_path, 'test')
read_set(valid_path, 'valid')

outfile.close()