In [39]:
import shutil
import wget
import bz2
import json
import os
import sys
import rdflib
import zipfile
from rdflib import Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
from urllib.parse import quote

# Progress callback passed to wget.download(..., bar=bar_progress); wget invokes it
# automatically while a download is running.
def bar_progress(current, total, width=80):
    """Write a one-line download progress message to stdout.

    Args:
        current: Number of bytes received so far.
        total: Expected total size in bytes. A missing, zero or negative
            value is treated as "size unknown".
        width: Accepted so the function matches the callback signature;
            it is not used.
    """
    if total and total > 0:
        progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
    else:
        # Without a usable total a percentage cannot be computed
        # (total == 0 would raise ZeroDivisionError).
        progress_message = "Downloading: %d bytes" % current
    # Don't use print(): the carriage return rewrites the same line each time.
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()

def read_line(path, skip_first):
    """Lazily yield the lines of a UTF-8 text file.

    The zero-based index of every 100000th line (beginning with line 0) is
    printed as a progress indicator before that line is handled.

    Args:
        path: Path of the file to read.
        skip_first: When truthy, the first line (e.g. a header row) is not
            yielded.

    Yields:
        Each remaining line, including its trailing newline if present.
    """
    with open(path, encoding="utf8") as handle:
        for index, text in enumerate(handle):
            if index % 100000 == 0:
                print(index)
            if skip_first and index == 0:
                # Header row: counted for progress, but not handed to the caller.
                continue
            yield text

In [40]:
# Everything this notebook downloads or unpacks is kept under cache/.
os.makedirs('cache', exist_ok=True)

if not os.path.exists('cache/hetionet-v1.0.json'):
    # Fetch the compressed dump only when it is not already cached; without it
    # the bz2.open() call below fails on a fresh cache directory.
    if not os.path.exists('cache/hetionet-v1.0.json.bz2'):
        url = "https://github.com/hetio/hetionet/raw/master/hetnet/json/hetionet-v1.0.json.bz2"
        wget.download(url, 'cache/hetionet-v1.0.json.bz2', bar=bar_progress)
    # Decompress byte-for-byte so the result does not depend on the
    # platform's default text encoding or newline translation.
    with bz2.open("cache/hetionet-v1.0.json.bz2", "rb") as f_in:
        with open('cache/hetionet-v1.0.json', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

if not os.path.exists('cache/train.txt'):
    # Archive with the dataset split files; it is extracted directly into cache/.
    url = "https://github.com/OpenBioLink/Utilities/raw/main/data/Hetionet/data.zip"
    wget.download(url, 'cache/data.zip', bar=bar_progress)
    with zipfile.ZipFile('cache/data.zip', 'r') as zip_ref:
        zip_ref.extractall('cache')

if not os.path.exists('cache/metaedges.tsv'):
    # Table read later to label relation abbreviations.
    url = "https://raw.githubusercontent.com/hetio/hetionet/master/describe/edges/metaedges.tsv"
    wget.download(url, 'cache/metaedges.tsv', bar=bar_progress)

In [41]:
# Path setup: dataset split files inside the cache directory.
# os.path.join is used instead of hard-coded backslashes so the paths also
# resolve on platforms where "\" is not a path separator.
train_path = os.path.join("cache", "train.txt")
test_path = os.path.join("cache", "test.txt")
valid_path = os.path.join("cache", "valid.txt")

In [42]:
# Project vocabulary namespace shared by both graphs.
ai = Namespace("http://ai-strategies.org/ns/")

# Secondary graph; prefixes are bound in the order rdf, rdfs, ai.
out = rdflib.Graph()
for _prefix, _namespace in (("rdf", RDF), ("rdfs", RDFS), ("ai", ai)):
    out.bind(_prefix, _namespace)

# Graph that collects the node and relation metadata triples;
# prefixes are bound in the order ai, rdf, rdfs.
g = rdflib.Graph()
for _prefix, _namespace in (("ai", ai), ("rdf", RDF), ("rdfs", RDFS)):
    g.bind(_prefix, _namespace)

In [43]:

# Parse the Hetionet JSON dump. The context manager closes the handle once the
# document is loaded, and the encoding is explicit because JSON text is UTF-8.
with open('cache/hetionet-v1.0.json', encoding="utf8") as f:
    data = json.load(f)

# One subject per node: label and kind always, URL and description when present.
for node in data["nodes"]:
    kind = node['kind'].replace(" ", "")
    # Named node_id so the builtin id() is not shadowed.
    node_id = f"{kind}::{node['identifier']}"
    # Build the percent-encoded subject term once and reuse it for every triple.
    subject = ai.term(quote(node_id))
    attributes = node["data"]

    g.add((subject, RDFS.label, rdflib.Literal(node["name"], datatype=XSD.string)))
    # NOTE(review): the rdf:type object is a string literal, not a class IRI;
    # left unchanged so the generated triples stay the same.
    g.add((subject, RDF.type, rdflib.Literal(node["kind"], datatype=XSD.string)))
    if "url" in attributes:
        g.add((subject, ai.wwwresource, rdflib.Literal(attributes["url"], datatype=XSD.string)))
    if "description" in attributes:
        g.add((subject, RDFS.comment, rdflib.Literal(attributes["description"], datatype=XSD.string)))

# Label each relation abbreviation with its full name from the metaedge table
# (first column: relation name, second column: abbreviation).
for line in read_line('cache/metaedges.tsv', skip_first=True):
    if not line.strip():
        # A blank line (e.g. at the end of the file) carries no record.
        continue
    # Drop the line terminator before splitting; only the first two columns are used.
    relation, abb, *_ = line.rstrip("\r\n").split("\t")
    g.add((
        ai.term(quote(abb)),
        RDFS.label,
        rdflib.Literal(relation, datatype=XSD.string)
    ))



0


In [44]:
# Write the metadata graph as Turtle to hetionet.ttl, resolved to an absolute
# path from the current working directory.
g.serialize(destination=os.path.abspath("hetionet.ttl"), format="turtle")

In [45]:
def read_set(path, typ):
    """Append one RDF-star split annotation per triple of a tab-separated file.

    Each non-blank line of ``path`` must hold ``head<TAB>relation<TAB>tail``.
    For every such line a quoted triple annotated with ``ai:split ai:<typ>``
    is written to the module-level ``outfile`` handle, which must be open for
    writing when this function is called.

    Args:
        path: Path of the split file to read.
        typ: Split name used as the local part of the ``ai:`` object term.

    Raises:
        ValueError: If a non-blank line does not have exactly three
            tab-separated fields.
    """
    with open(path, encoding="utf8") as f:
        # Stream the file instead of loading every line into memory first.
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no triple and would break the unpacking below.
                continue
            head,rel,tail = line.split("\t")
            outfile.write(f"<<<http://ai-strategies.org/ns/{quote(head)}> <http://ai-strategies.org/ns/{quote(rel)}> <http://ai-strategies.org/ns/{quote(tail)}>>> ai:split ai:{typ} . " + "\n")


# Append to the already serialized Turtle file. The context manager guarantees
# the handle is flushed and closed even if one of the split files is malformed.
with open(r"hetionet.ttl", 'a', encoding="utf8") as outfile:
    read_set(train_path, 'train')
    read_set(test_path, 'test')
    read_set(valid_path, 'valid')

In [46]:
# Package the finished Turtle file into data.zip with DEFLATE compression.
# The context manager closes the archive (writing its central directory)
# even if adding the file fails.
with zipfile.ZipFile('data.zip', 'w') as zipObj:
    zipObj.write('hetionet.ttl', 'hetionet.ttl', zipfile.ZIP_DEFLATED)