In [7]:
import shutil
import wget
import bz2
import sys
import json
import os
import pickle
import rdflib
import zipfile
from rdflib import Namespace
from rdflib.term import URIRef
from os.path import exists
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
import urllib

def quote(id):
    """Percent-encode ``id`` so it can be embedded in an IRI.

    The characters ``;,/?:@&=+$-_.!~'()#`` are passed through unchanged;
    everything else that ``urllib.parse.quote`` considers unsafe is escaped.

    Args:
        id: The string to encode.

    Returns:
        The percent-encoded string.
    """
    # A bare ``import urllib`` does not load the ``parse`` submodule; import it
    # explicitly so this function does not depend on some other package having
    # imported it first.
    import urllib.parse

    return urllib.parse.quote(id, safe=';,/?:@&=+$-_.!~\'()#')

# Progress callback passed to wget.download through its ``bar`` argument.
def bar_progress(current, total, width=80):
    """Write a one-line download progress message to stdout.

    Args:
        current: Number of bytes received so far.
        total: Expected total number of bytes. A value of zero or below is
            treated as "size unknown" and no percentage is shown.
        width: Accepted for compatibility with the callback signature; unused.

    Returns:
        None. The message is written directly to ``sys.stdout``.
    """
    if total > 0:
        progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
    else:
        # Without a usable total the percentage would divide by zero (or come
        # out negative), so only the byte count is reported.
        progress_message = "Downloading: %d bytes (total size unknown)" % current
    # Use "\r" with write() rather than print() so the line is overwritten in place.
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()

  # Methods
def read_nodes(lst):
    """Collect every head and tail entity found in a list of triple files.

    Each file is read as UTF-8 and every line, after stripping surrounding
    whitespace, must consist of exactly three tab-separated fields
    (head, relation, tail); otherwise the unpacking raises ``ValueError``.

    Args:
        lst: Iterable of file paths.

    Returns:
        A set containing all head and tail values seen across the files.
    """
    seen = set()
    for triple_file in lst:
        with open(triple_file, encoding="utf8") as handle:
            stripped_rows = [raw.strip() for raw in handle.readlines()]
        for row in stripped_rows:
            subject, _relation, obj = row.split("\t")
            seen.update((subject, obj))
    return seen

def read_line(path, skip_first):
    """Lazily yield the lines of a UTF-8 text file, printing a progress counter.

    The zero-based index of the current line is printed every 100000 lines
    (including line 0) so long-running loops show some sign of life.

    Args:
        path: File to read.
        skip_first: When true, the first line of the file (e.g. a header row)
            is not yielded.

    Yields:
        Each line of the file, including its trailing newline if present.
    """
    with open(path, encoding="utf8") as infile:
        for index, line in enumerate(infile):
            if index % 100000 == 0:
                print(index)
            # Compare against the index of the line itself; checking a counter
            # that has already been incremented would never match the first line.
            if skip_first and index == 0:
                continue
            yield line

In [8]:
# Path setup
# Forward slashes resolve on Windows, Linux and macOS alike, and match the
# 'cache/...' spelling used by the download cell.
train_path = "cache/train.txt"
test_path = "cache/test.txt"
valid_path = "cache/valid.txt"

In [9]:
# Fetch the raw inputs once; later runs reuse whatever is already in cache/.
os.makedirs('cache', exist_ok=True)

if not exists('cache/labels.txt'):
    url = "https://storage.googleapis.com/pheknowlator/archived_builds/release_v2.0.0/build_11FEB2021/knowledge_graphs/instance_builds/relations_only/owlnets/PheKnowLator_v2.0.0_full_instance_relationsOnly_OWLNETS_NodeLabels.txt"
    wget.download(url, 'cache/labels.txt', bar=bar_progress)

if not exists('cache/train.txt'):
    url = "https://github.com/OpenBioLink/Utilities/raw/main/data/Pheknowlator/data.zip"
    # Open whatever filename wget reports it wrote to.
    # NOTE(review): wget is believed to pick a different name when
    # 'cache/data.zip' is left over from an earlier run -- confirm.
    archive_path = wget.download(url, 'cache/data.zip', bar=bar_progress)
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('cache')


In [11]:
# Writing the Turtle file by hand instead of going through RDFlib is much faster

# Write pkl.ttl: a Turtle prefix header followed by one statement block per
# row of cache/labels.txt. The context manager guarantees the file is closed
# even if a row fails to parse.
with open(os.path.abspath(r"pkl.ttl"), "w", encoding="utf8") as outfile:

    outfile.write("@prefix ai: <https://ai-strategies.org/kgc/> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n\n")

    for line in read_line('cache/labels.txt', skip_first = True):
        # Every row must have exactly six tab-separated columns, otherwise the
        # unpacking raises ValueError.
        entity_type, integer_id, entity_uri, label, description, synonym = line.split("\t")
        # Double quotes would terminate the Turtle string literal early.
        label = label.replace('\"', '\'')
        description = description.replace('\"', '\'')

        # Rows whose first column is neither NODES nor RELATIONS are ignored.
        if entity_type == "NODES":
            outfile.write(f"<https://ai-strategies.org/kgc/{quote(entity_uri)}> a \"Entity\"^^xsd:string ;\n")
            outfile.write(f"\trdfs:label \"{label}\"^^xsd:string ;\n")
            outfile.write(f"\trdfs:comment \"{description}\"^^xsd:string ;\n")
            # Drops the first and last character of entity_uri -- presumably
            # the enclosing angle brackets; confirm against the labels file.
            outfile.write(f"\tai:wwwresource \"{entity_uri[1:-1]}\"^^xsd:string .\n\n")
        elif entity_type == "RELATIONS":
            # Use the relation's own label; a fixed "part_of" literal here would
            # give every relation in the graph the same label.
            outfile.write(f"<https://ai-strategies.org/kgc/{quote(entity_uri)}> rdfs:label \"{label}\"^^xsd:string .\n\n")

0
100000
200000
300000
400000
500000
600000
700000


In [12]:
def read_set(path, typ, out=None):
    """Append one RDF-star ``ai:split`` statement per triple in a split file.

    Each non-empty line of ``path``, after stripping surrounding whitespace,
    must consist of exactly three tab-separated fields (head, relation, tail);
    otherwise the unpacking raises ``ValueError``.

    Args:
        path: Tab-separated triple file to read (decoded as UTF-8).
        typ: Name of the split, written as ``ai:<typ>``.
        out: Writable text file to append to. Defaults to the module-level
            ``outfile``.
    """
    target = outfile if out is None else out
    with open(path, encoding="utf8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            head,rel,tail = line.split("\t")
            target.write(f"<<<https://ai-strategies.org/kgc/{quote(head)}> <https://ai-strategies.org/kgc/{quote(rel)}> <https://ai-strategies.org/kgc/{quote(tail)}>>> ai:split ai:{typ} . " + "\n")

# Append with the same UTF-8 encoding the file was created with, and let the
# context manager close it even if one of the split files is malformed.
with open(r"pkl.ttl", 'a', encoding="utf8") as outfile:
    read_set(train_path, 'train')
    read_set(test_path, 'test')
    read_set(valid_path, 'valid')

In [6]:
# Package pkl.ttl into data.zip (deflate-compressed). The context manager
# closes the archive even if adding the file fails.
with zipfile.ZipFile('data.zip', 'w') as zipObj:
    zipObj.write('pkl.ttl', 'pkl.ttl', zipfile.ZIP_DEFLATED)