In [8]:
import json
import requests
from tqdm import tqdm

import rdflib
from rdflib import Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD

In [21]:
labels_path = r"C:\Users\Simon\Desktop\SAFRANExplorer\labelgraph\Labels"
graph_path = r"C:\Users\Simon\Desktop\SAFRANExplorer\labelgraph\Graphs"
edge_path = r"C:\Users\Simon\Downloads\HQ_DIR\HQ_DIR\graph_files\edges.csv"

train_path = r"C:\Users\Simon\Desktop\SAFRANExplorer\workspace\data\OBL\train.txt"
valid_path = r"C:\Users\Simon\Desktop\SAFRANExplorer\workspace\data\OBL\valid.txt"
test_path = r"C:\Users\Simon\Desktop\SAFRANExplorer\workspace\data\OBL\test.txt"

In [10]:
def add_node(node):
    if node.startswith("NCBIGENE:"):
        ncbigenes.add(node[len("NCBIGENE:"):])
    elif node.startswith("PUBCHEM.COMPOUND:"):
        pubchemcompounds.add(node[len("PUBCHEM.COMPOUND:"):])
    elif node.startswith("CL:"):
        clss.add(node)
    elif node.startswith("GO:"):
        gos.add(node)
    elif node.startswith("UBERON:"):
        uberons.add(node)
    elif node.startswith("DOID:"):
        doids.add(node)
    elif node.startswith("HP:"):
        hps.add(node)
    elif node.startswith("KEGG:"):
        keggs.add(node)
    elif node.startswith("REACTOME:"):
        reactomes.add(node)
        
def read_nodes(path):
    content = None
    with open(path) as f:
        content = f.readlines()
    content = [x.strip() for x in content]

    for line in content:
        head,rel,tail,_,_ = line.split("\t")
        add_node(head)
        add_node(tail)
        
def divide_chunks(l, n):
    l = list(l)
    for i in range(0, len(l), n): 
        yield l[i:i + n]

## Read nodes from graph

In [11]:
ncbigenes = set()
clss = set()
uberons = set()
gos = set()
doids = set()
hps = set()
pubchemcompounds = set()
keggs = set()
reactomes = set()

read_nodes(edge_path)

# APIs

## NCBIGENE, PUBCHEM.COMPOUND

In [88]:
def download_ncbi(ontology, ids, step_size):
    results = dict()

    count = 0
    ids_str = ""
    for x in tqdm(ids):
        ids_str = ids_str + str(x) + ","
        count = count + 1
        if count >= step_size:
            ids_str = ids_str[0:-1]
            response = requests.get(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={ontology}&rettype=docsum&retmode=json&id={ids_str}")
            response_json = response.json()
            for key,value in response_json["result"].items():
                if key != 'uids':
                    results[key] = value
            ids_str = ""
            count = 0

    if count > 0:
        response = requests.get(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={ontology}&rettype=docsum&retmode=json&id={ids_str}")
        response_json = response.json()
        for key,value in response_json["result"].items():
                if key != 'uids':
                    results[key] = value

    file = open(f"{root}/{ontology}.txt", "w")
    json.dump(results, file)
    file.close()

In [None]:
"""
ncbigene, ontology = "gene", step_size = 500
pubchemcompound, ontology = "pccompound", step_size = 200
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=2&rettype=docsum&retmode=json
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pccompound&id=2&rettype=docsum&retmode=json
"""

ontology = "pccompound"
ids = pubchemcompounds
step_size = 200
download_ncbi(ontology, ids, step_size)

ontology = "gene"
ids = ncbigenes
step_size = 500
download_ncbi(ontology, ids, step_size)

## UBERON, HP, GO, DOID, CL

In [59]:


import requests
import json


"""
uberon, hp, go, doid, cl

# to prettify
python -m json.tool uberon.txt uberon.json

"""

ontology = "hp"
ids = hps



file = open(f"{root}/{ontology}.txt", "w", encoding="utf-8")
file.write("{\"terms\":[")

first = True
stop = False
page = 1
while not stop:
    response = requests.get(f"http://www.ebi.ac.uk/ols/api/ontologies/{ontology}/terms?page={page}&size=500")
    if response.text.startswith("{\n  \"_links\""):
        stop = True
        break
    
    response_json = response.json()

    for x in response_json["_embedded"]["terms"]:
        if x["obo_id"] in ids:
            if not first:
                file.write(",")
            else:
                first = False
            json.dump(x,file)
    
    page = page + 1
    # backstop
    if page == 200:
        stop = True

file.write("]}")
file.close()
        
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52


## KEGG

In [50]:
"""
KEGG
KEGG:hsa00232 

"""
ontology = "kegg"
response = requests.get(f"http://rest.kegg.jp/list/pathway/hsa")
with open(f"{root}/{ontology}.txt", "w", encoding="utf-8") as file:
    for x in response.text.split("\n"):
        if x != "":
            code, label = x.split("\t")
            code = code.replace("path:", "KEGG:")
            label = label.split(" - ")[0]
            if code in keggs:
                file.write(code + "\t" + label + "\n")

## REACTOME

In [52]:
"""
REACTOME
https://reactome.org/download/current/ReactomePathways.txt

"""
ontology = "reactome"
response = requests.get(f"https://reactome.org/download/current/ReactomePathways.txt")
with open(f"{root}/{ontology}.txt", "w", encoding="utf-8") as file:
    for x in response.text.split("\n"):
        if x != "":
            code, label, _ = x.split("\t")
            code = "REACTOME:" + code
            label = label.split(" - ")[0]
            if code in reactomes:
                file.write(code + "\t" + label + "\n")


## Old Pubchem.Compound, very slow

In [55]:
"""
PUBCHEM.COMPOUND
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/1983,1984/description/JSON
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/1983,1984/synonyms/JSON
"""

result = {
    "descriptions": []
}

for x in tqdm(divide_chunks(pubchemcompounds, 200), total=int(len(pubchemcompounds)/200)):
    ids_str = ",".join(x)
    response = requests.get(f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{ids_str}/description/JSON")
    response_json = response.json()
    for x in response_json["InformationList"]["Information"]:
        if "Description" in x:
            result["descriptions"].append(x)
with open(f"{labels_path}/pccompound_descriptions.txt", "w") as outfile:
    json.dump(result, outfile, indent=4, sort_keys=True)


  9%|▉         | 35/388 [14:20<2:24:38, 24.59s/it]


KeyError: 'InformationList'

# RDF graph

In [11]:
def load_text_file(path):
    content = None
    with open(path) as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    return content

def load_json_file(path):
    with open(path) as json_file:
        return json.load(json_file)

In [14]:
identifiers = Namespace("http://identifiers.org/")
obl = Namespace("http://ai-strategies.org/openbiolink/")
OBO = Namespace("http://www.geneontology.org/formats/oboInOwl#")

g = rdflib.Graph()

g.bind("obl", obl)
g.bind("id", identifiers)
g.bind("rdfs", RDFS)
g.bind("oboInOwl", OBO)

## NCBIGENE

In [15]:
ncbigene_path = labels_path + "\\gene.json"
ncbigene = load_json_file(ncbigene_path)
for x in tqdm(ncbigene):
    nodeinfo = ncbigene[x]
    
    # label
    assert (nodeinfo["description"] != "" and type(nodeinfo["description"]) == str), "Error label not str or empty"
    g.add((
        identifiers.term("NCBIGENE:" + x),
        RDFS.label,
        rdflib.Literal(nodeinfo["description"], datatype=XSD.string)
    ))

    g.add((
        identifiers.term("NCBIGENE:" + x),
        RDF.type,
        rdflib.Literal("Gene", datatype=XSD.string)
    ))
    
    # synonyms
    synonyms = set()
    synonyms.add(nodeinfo["name"])
    synonyms.add(nodeinfo["nomenclaturesymbol"])
    synonyms.add(nodeinfo["nomenclaturename"])
    if len(nodeinfo["otheraliases"]) > 0:
        for y in ncbigene[x]["otheraliases"].split(", "):
            synonyms.add(y)
    if len(nodeinfo["otherdesignations"]) > 0:
        for y in nodeinfo["otherdesignations"].split("|"):
            synonyms.add(y)
    for synonym in synonyms:
        g.add((
            identifiers.term("NCBIGENE:" + x),
            OBO.hasExactSynonym,
            rdflib.Literal(synonym, datatype=XSD.string)

        ))
        
    # description
    g.add((
        identifiers.term("NCBIGENE:" + x),
        RDFS.comment,
        rdflib.Literal(nodeinfo["summary"], datatype=XSD.string)
    ))


  0%|          | 0/19604 [00:00<?, ?it/s][A
  1%|          | 136/19604 [00:00<00:14, 1358.25it/s][A
  1%|▏         | 272/19604 [00:00<00:25, 755.41it/s] [A
  2%|▏         | 408/19604 [00:00<00:20, 947.75it/s][A
  3%|▎         | 541/19604 [00:00<00:17, 1064.08it/s][A
  3%|▎         | 661/19604 [00:00<00:17, 1091.49it/s][A
  4%|▍         | 799/19604 [00:00<00:15, 1176.89it/s][A
  5%|▍         | 933/19604 [00:00<00:15, 1219.47it/s][A
  5%|▌         | 1060/19604 [00:00<00:15, 1176.12it/s][A
  6%|▌         | 1182/19604 [00:01<00:16, 1099.36it/s][A
  7%|▋         | 1307/19604 [00:01<00:16, 1140.61it/s][A
  7%|▋         | 1431/19604 [00:01<00:15, 1167.43it/s][A
  8%|▊         | 1557/19604 [00:01<00:15, 1191.38it/s][A
  9%|▊         | 1695/19604 [00:01<00:14, 1244.15it/s][A
  9%|▉         | 1821/19604 [00:01<00:14, 1200.85it/s][A
 10%|▉         | 1953/19604 [00:01<00:14, 1232.67it/s][A
 11%|█         | 2083/19604 [00:01<00:14, 1250.02it/s][A
 11%|█▏        | 2212/19604 [00:01

## PCCOMPOUND

In [16]:
ncbigene_path = labels_path + "\\pccompound.json"
ncbigene = load_json_file(ncbigene_path)
for x in tqdm(ncbigene):
    nodeinfo = ncbigene[x]
    
    if "error" not in nodeinfo:
        
        #label
        assert ((len(nodeinfo["meshheadinglist"]) >= 1
                and len(nodeinfo["meshheadinglist"][0]) >= 0
                and type(nodeinfo["meshheadinglist"][0]) == str)
                or (len(nodeinfo["iupacname"]) >= 0
                and type(nodeinfo["iupacname"]) == str)
                ), "Error label not str or empty"

        if(len(nodeinfo["meshheadinglist"]) >= 1):
            g.add((
                identifiers.term("PUBCHEM.COMPOUND:" + x),
                RDFS.label,
                rdflib.Literal(nodeinfo["meshheadinglist"][0], datatype=XSD.string)
            ))
        else:
            g.add((
                identifiers.term("PUBCHEM.COMPOUND:" + x),
                RDFS.label,
                rdflib.Literal(nodeinfo["iupacname"], datatype=XSD.string)
            ))
        
        g.add((
            identifiers.term("PUBCHEM.COMPOUND:" + x),
            RDF.type,
            rdflib.Literal("Drug", datatype=XSD.string)
        ))
        
        #synonyms
        synonyms = set()
        synonyms.add(nodeinfo["iupacname"])
        synonyms.add(nodeinfo["canonicalsmiles"])
        synonyms.add(nodeinfo["molecularformula"])
        for y in nodeinfo["synonymlist"]:
            if y != "":
                synonyms.add(y)
        for y in nodeinfo["meshtermlist"]:
            if y != "":
                synonyms.add(y)
        if len(nodeinfo["meshheadinglist"]) > 1:
            for y in range(1, len(nodeinfo["meshheadinglist"])):
                if y != "":
                    synonyms.add(y)

        for synonym in synonyms:
            g.add((
                identifiers.term("PUBCHEM.COMPOUND:" + x),
                OBO.hasExactSynonym,
                rdflib.Literal(synonym, datatype=XSD.string)
        ))

██▎   | 48781/77727 [01:08<00:43, 671.14it/s][A
 63%|██████▎   | 48849/77727 [01:08<00:46, 615.26it/s][A
 63%|██████▎   | 48912/77727 [01:08<00:46, 614.28it/s][A
 63%|██████▎   | 48998/77727 [01:08<00:42, 681.92it/s][A
 63%|██████▎   | 49068/77727 [01:08<00:42, 673.75it/s][A
 63%|██████▎   | 49137/77727 [01:08<00:42, 666.49it/s][A
 63%|██████▎   | 49217/77727 [01:08<00:40, 703.51it/s][A
 63%|██████▎   | 49298/77727 [01:08<00:39, 721.09it/s][A
 64%|██████▎   | 49371/77727 [01:09<02:20, 202.05it/s][A
 64%|██████▎   | 49441/77727 [01:09<01:51, 253.29it/s][A
 64%|██████▎   | 49530/77727 [01:09<01:24, 334.24it/s][A
 64%|██████▍   | 49602/77727 [01:10<01:13, 384.37it/s][A
 64%|██████▍   | 49686/77727 [01:10<01:01, 459.09it/s][A
 64%|██████▍   | 49756/77727 [01:10<01:00, 466.14it/s][A
 64%|██████▍   | 49822/77727 [01:10<00:55, 505.29it/s][A
 64%|██████▍   | 49886/77727 [01:10<00:52, 528.27it/s][A
 64%|██████▍   | 49972/77727 [01:10<00:45, 604.32it/s][A
 64%|██████▍   | 50041/

## uberon, hp, go, doid, cl

In [17]:


def add_ebi_onto(json_path, type_):
    json_data = load_json_file(json_path)
    for x in tqdm(json_data["terms"]):
        
        # label
        assert (len(x["label"]) >= 0 and type(x["label"]) == str), "Error label not str or empty"
        g.add((
            identifiers.term(x["obo_id"]),
            RDFS.label,
            rdflib.Literal(x["label"], datatype=XSD.string)
        ))

        g.add((
            identifiers.term(x["obo_id"]),
            RDF.type,
            rdflib.Literal(type_, datatype=XSD.string)
        ))
        
        #synonyms
        if x["synonyms"] is not None:
            synonyms = set(x["synonyms"])
            for synonym in synonyms:
                g.add((
                    identifiers.term(x["obo_id"]),
                    OBO.hasExactSynonym,
                    rdflib.Literal(synonym, datatype=XSD.string)
            ))

        # description
        if x["description"] is not None:
            g.add((
                identifiers.term(x["obo_id"]),
                RDFS.comment,
                rdflib.Literal(x["description"][0], datatype=XSD.string)
            ))

add_ebi_onto(labels_path + "\\cl.json", "Anatomy")
add_ebi_onto(labels_path + "\\doid.json", "Disease")
add_ebi_onto(labels_path + "\\go.json", "GO")
add_ebi_onto(labels_path + "\\hp.json", "Phenotype")
add_ebi_onto(labels_path + "\\uberon.json", "Anatomy")
    


  0%|          | 0/2103 [00:00<?, ?it/s][A
 22%|██▏       | 464/2103 [00:00<00:00, 4600.28it/s][A
 44%|████▍     | 925/2103 [00:00<00:00, 4375.34it/s][A
 65%|██████▌   | 1376/2103 [00:00<00:00, 4416.29it/s][A
100%|██████████| 2103/2103 [00:00<00:00, 4008.48it/s]

  0%|          | 0/9270 [00:00<?, ?it/s][A
  4%|▎         | 343/9270 [00:00<00:02, 3264.43it/s][A
  7%|▋         | 670/9270 [00:00<00:02, 3140.40it/s][A
 11%|█         | 1024/9270 [00:00<00:02, 3306.45it/s][A
 15%|█▍        | 1378/9270 [00:00<00:02, 3395.92it/s][A
 19%|█▉        | 1746/9270 [00:00<00:02, 3484.99it/s][A
 23%|██▎       | 2095/9270 [00:00<00:02, 3376.34it/s][A
 27%|██▋       | 2488/9270 [00:00<00:01, 3547.08it/s][A
 32%|███▏      | 2923/9270 [00:00<00:01, 3792.43it/s][A
 36%|███▌      | 3348/9270 [00:00<00:01, 3921.20it/s][A
 41%|████      | 3809/9270 [00:01<00:01, 4127.31it/s][A
 46%|████▌     | 4248/9270 [00:01<00:01, 4204.07it/s][A
 50%|█████     | 4671/9270 [00:01<00:01, 4209.12it/s][A
 55%|

## reactome

In [18]:
path = labels_path + "\\reactome.txt"
content = load_text_file(path)

for line in content:
    code, label = line.split("\t")
    g.add((
        identifiers.term(code),
        RDFS.label,
        rdflib.Literal(label, datatype=XSD.string)
    ))

    g.add((
        identifiers.term(code),
        RDF.type,
        rdflib.Literal("Pathway", datatype=XSD.string)
    ))
    


## kegg

In [19]:
path = labels_path + "\\kegg.txt"
content = load_text_file(path)

for line in content:
    code, label = line.split("\t")
    g.add((
        identifiers.term(code),
        RDFS.label,
        rdflib.Literal(label, datatype=XSD.string)
    ))

    g.add((
        identifiers.term(code),
        RDF.type,
        rdflib.Literal("Pathway", datatype=XSD.string)
    ))

## Serialize graph

In [20]:
g.serialize(graph_path + "\\obl_with_labels.ttl",format="turtle")

In [22]:
outfile = open(r"C:\Users\Simon\Desktop\SAFRANExplorer\labelgraph\Graphs\obl_with_labels.ttl", 'a')

def read_set(path, typ):
    content = None
    with open(path) as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    for line in content:
        head,rel,tail = line.split("\t")
        outfile.write(f"<<<http://identifiers.org/{head}> <http://identifiers.org/{rel}> <http://identifiers.org/{tail}>>> obl:split obl:{typ} . " + "\n")

read_set(train_path, 'train')
read_set(test_path, 'test')
read_set(valid_path, 'valid')

outfile.close()