In [1]:
import bz2
import csv
import gzip
import json
import os
import pickle
import shutil
import sys
import zipfile
from os.path import exists
from urllib.parse import quote

import rdflib
import wget
from rdflib import Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
from rdflib.term import URIRef

def bar_progress(current, total, width=80):
    """Progress callback handed to ``wget.download``; rewrites one status line in place.

    Args:
        current: Amount downloaded so far (reported as bytes in the message).
        total: Expected total size. When it is zero or negative no percentage
            can be computed, so only the running byte count is shown.
        width: Not used by this implementation; kept so the callback
            signature stays unchanged.
    """
    if total > 0:
        progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
    else:
        # Dividing by a zero total would raise ZeroDivisionError, and a negative
        # total would give a meaningless negative percentage.
        progress_message = "Downloading: %d bytes" % current
    # print() would start a new line on every call; "\r" returns to the start of
    # the current line so the message overwrites itself.
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()

# File-reading helpers
def read_nodes(lst):
    """Collect the distinct head and tail values found in tab-separated triple files.

    Args:
        lst: Iterable of paths to UTF-8 text files whose non-blank lines each
            hold three tab-separated fields (head, relation, tail).

    Returns:
        A set containing every head and tail string seen across all files.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            tab-separated fields.
    """
    nodes = set()
    for path in lst:
        with open(path, encoding="utf8") as f:
            # Stream the file rather than loading every line into memory first.
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    # A blank line (e.g. a trailing newline at end of file) would
                    # otherwise fail the three-way unpack below.
                    continue
                head, _rel, tail = line.split("\t")
                nodes.add(head)
                nodes.add(tail)
    return nodes

def read_line(path, skip_first):
    """Lazily yield the lines of a UTF-8 text file, optionally dropping the first one.

    The zero-based index of the line being read is printed every 100000 lines
    (including index 0) as a coarse progress indicator.

    Args:
        path: Path of the file to read.
        skip_first: When truthy, the first line of the file is not yielded.

    Yields:
        Each line of the file, including its trailing newline if present.
    """
    with open(path, encoding="utf8") as infile:
        for c, line in enumerate(infile):
            if c % 100000 == 0:
                print(c)
            # The index must be tested before it advances: incrementing first
            # means it is never 0 at the check, so nothing is ever skipped.
            if skip_first and c == 0:
                continue
            yield line

In [None]:
# Path setup
# Forward slashes are used so the paths resolve on every platform and match the
# 'cache/...' paths used by the download cell; a backslash is only a path
# separator on Windows.
train_path = "cache/train.txt"
test_path = "cache/test.txt"
valid_path = "cache/valid.txt"

In [2]:


# Make sure the local cache directory is present before anything is written to it.
if not exists('cache'):
    os.makedirs('cache')

# Each archive is fetched and unpacked into 'cache' only when its zip file is
# not already on disk, so re-running the cell does not download again.
for url, zip_path in (
    ("http://snap.stanford.edu/ogb/data/linkproppred/biokg.zip", 'cache/biokg.zip'),
    ("https://github.com/OpenBioLink/Utilities/raw/main/data/Pheknowlator/data.zip", 'cache/data.zip'),
):
    if exists(zip_path):
        continue
    wget.download(url, zip_path, bar=bar_progress)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('cache')
    


Downloading: 100% [963312546 / 963312546] bytes

In [None]:
# NOTE(review): `tqdm`, `requests`, `ncbigenes`, `labels_file` and `descr_file`
# are not defined in the cells shown here — confirm they are set up before this
# cell runs.
ontology = "gene"
step_size = 500  # maximum number of ids sent in one request


def fetch_docsums(id_batch):
    """POST one batch of ids to the E-utilities efetch endpoint and return its records.

    Args:
        id_batch: List of id strings; they are joined with commas (no trailing
            comma) into the ``id`` form field.

    Returns:
        dict mapping each key of the response's ``"result"`` object to its
        value, with the ``"uids"`` entry left out.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.post(
        f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={ontology}&rettype=docsum&retmode=json",
        data={'id': ",".join(id_batch)},
        headers={'content-type': 'application/x-www-form-urlencoded'},
    )
    # Fail with the HTTP error itself rather than a later JSON/KeyError.
    response.raise_for_status()
    return {key: value for key, value in response.json()["result"].items() if key != 'uids'}


results = dict()
batch = []
for x in tqdm(ncbigenes):
    batch.append(str(x))
    if len(batch) >= step_size:
        results.update(fetch_docsums(batch))
        batch = []

# Flush whatever is left over after the last full batch.
if batch:
    results.update(fetch_docsums(batch))

for x in results:
    labels_file.write("NCBIGENE:" + x + "\t" + results[x]["description"] + "\n")
    # Only records with a non-empty summary get a description line.
    if results[x]["summary"] != "":
        descr_file.write("NCBIGENE:" + x + "\t" + results[x]["summary"] + "\n")

In [None]:
# Project namespace shared by both graphs.
ai = Namespace("http://ai-strategies.org/ns/")

# Graph `out`: register the rdf, rdfs and ai prefixes (in that order).
out = rdflib.Graph()
for prefix, namespace in (("rdf", RDF), ("rdfs", RDFS), ("ai", ai)):
    out.bind(prefix, namespace)

# Graph `g`: same prefixes, registered as ai, rdf, rdfs.
g = rdflib.Graph()
for prefix, namespace in (("ai", ai), ("rdf", RDF), ("rdfs", RDFS)):
    g.bind(prefix, namespace)

In [None]:
def read_index_mapping(path):
    """Yield ``(index, name)`` pairs from a gzip-compressed two-column CSV file.

    The first row is skipped as a header and empty rows are ignored.

    Args:
        path: Path to the ``.csv.gz`` mapping file.

    Yields:
        Tuples of two whitespace-stripped strings, one per data row.

    Raises:
        ValueError: If a non-empty row does not have exactly two columns.
    """
    # Text mode is required: in 'rb' mode read() returns bytes, and iterating
    # over bytes yields integers rather than lines.
    with gzip.open(path, 'rt', encoding="utf8", newline="") as f:
        reader = csv.reader(f)  # copes with quoted fields that contain commas
        next(reader, None)  # header row
        for row in reader:
            if not row:
                continue
            idx, name = row
            yield idx.strip(), name.strip()


# Entities
for type_ in ["disease", "drug", "function", "protein", "sideeffect"]:
    for idx, name in read_index_mapping(f'cache/mapping/{type_}_entidx2name.csv.gz'):
        # Each type has its own mapping file, so the type is prefixed to the index.
        entity = ai.term(quote(f"{type_}:{idx}"))
        # NOTE(review): the mapping's second column is used as the label because
        # no other name source is available in this cell — confirm this is the
        # intended label.
        g.add((entity, RDFS.label, rdflib.Literal(name, datatype=XSD.string)))
        g.add((entity, RDF.type, rdflib.Literal(type_, datatype=XSD.string)))
        # TODO: add ai.wwwresource (url) and RDFS.comment (description) triples
        # once a source for those values is available.


# Relations
for idx, rel_label in read_index_mapping('cache/mapping/relidx2relname.csv.gz'):
    g.add((
        ai.term(quote(idx)),
        RDFS.label,
        rdflib.Literal(rel_label, datatype=XSD.string)
    ))