In [1]:
import shutil
import wget
import bz2
import sys
import json
import os
import pickle
import rdflib
import zipfile
from rdflib import Namespace
from rdflib.term import URIRef
from os.path import exists
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD

# Progress callback handed to wget.download() through its `bar` argument.
def bar_progress(current, total, width=80):
  """Write a single-line download progress message to stdout.

  Args:
    current: Number of bytes received so far.
    total: Expected total number of bytes. A value of zero or less is
      treated as "size unknown" and no percentage is shown.
    width: Unused; kept so the function keeps its three-argument
      signature.

  Returns:
    None. The message is written directly to ``sys.stdout``.
  """
  if total > 0:
    progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
  else:
    # Without a positive total the percentage is undefined (and a total of
    # exactly 0 would raise ZeroDivisionError), so report bytes only.
    progress_message = "Downloading: %d bytes (total size unknown)" % max(current, 0)
  # Use a carriage return instead of print() so the message is redrawn in
  # place rather than on a new line for every update.
  sys.stdout.write("\r" + progress_message)
  sys.stdout.flush()

# Helper functions for reading the triple and label files
def read_nodes(lst):
    """Collect every distinct head and tail entity from a list of triple files.

    Each file is read as UTF-8 and is expected to hold one tab-separated
    ``head<TAB>relation<TAB>tail`` triple per line. Lines are stripped of
    surrounding whitespace before being split.

    Args:
        lst: Iterable of file paths to read.

    Returns:
        A set containing all head and tail values found across the files.

    Raises:
        ValueError: If a stripped line does not split into exactly three
            tab-separated fields.
    """
    entities = set()
    for triple_file in lst:
        with open(triple_file, encoding="utf8") as handle:
            rows = [raw.strip() for raw in handle.readlines()]

        for row in rows:
            subject, _relation, obj = row.split("\t")
            entities.update((subject, obj))
    return entities

def read_line(path, skip_first):
    """Lazily yield the lines of a UTF-8 text file, printing progress.

    The zero-based index of the current line is printed every 100000 lines
    (including index 0) as a coarse progress indicator.

    Args:
        path: Path of the file to read.
        skip_first: When true, the first line (e.g. a header row) is not
            yielded.

    Yields:
        Each line of the file, including its line terminator.
    """
    with open(path, encoding="utf8") as infile:
        for c, line in enumerate(infile):
            if c % 100000 == 0:
                print(c)
            # The index must be tested before it advances; checking it after
            # an increment can never match 0 and the header would slip through.
            if skip_first and c == 0:
                continue
            yield line

In [2]:
# Path setup
# Build the split-file paths with os.path.join so they use the platform's
# separator; a hard-coded backslash only resolves on Windows, whereas the
# download step extracts the files into the "cache" directory.
train_path = os.path.join("cache", "train.txt")
test_path = os.path.join("cache", "test.txt")
valid_path = os.path.join("cache", "valid.txt")

In [5]:
# Create the cache directory if needed; exist_ok makes this idempotent
# without a separate existence check.
os.makedirs('cache', exist_ok=True)

# Node/relation labels are fetched only once and kept in the cache.
if not exists('cache/labels.txt'):
    url = "https://storage.googleapis.com/pheknowlator/archived_builds/release_v2.0.0/build_11FEB2021/knowledge_graphs/instance_builds/relations_only/owlnets/PheKnowLator_v2.0.0_full_instance_relationsOnly_OWLNETS_NodeLabels.txt"
    wget.download(url, 'cache/labels.txt', bar=bar_progress)

# train.txt serves as the marker that the split archive has been extracted.
if not exists('cache/train.txt'):
    # Reuse an archive left over from an earlier run (e.g. when extraction
    # was interrupted) instead of downloading it again.
    if not exists('cache/data.zip'):
        url = "https://github.com/OpenBioLink/Utilities/raw/main/data/Pheknowlator/data.zip"
        wget.download(url, 'cache/data.zip', bar=bar_progress)
    with zipfile.ZipFile('cache/data.zip', 'r') as zip_ref:
        zip_ref.extractall('cache')


Downloading: 100% [207682562 / 207682562] bytes

In [6]:

# Project namespace used for the custom predicates and split markers.
ai = Namespace("http://ai-strategies.org/ns/")

# Empty graph that collects the label/description triples; the prefixes
# are registered in the same order as before: ai, rdf, rdfs.
g = rdflib.Graph()
for prefix, namespace in (("ai", ai), ("rdf", RDF), ("rdfs", RDFS)):
    g.bind(prefix, namespace)

In [7]:
# Turn every row of the label file into triples on graph `g`.
# Rows are tab-separated with either 6 columns (normal) or 5 columns
# (synonym column missing); anything else is reported and skipped.
for line in read_line('cache/labels.txt', skip_first = True):
    try:
        # Remove only the line terminator so it does not end up inside the
        # last column (the description of 5-column rows). A full strip()
        # would also eat trailing tabs and change the column count.
        cols = line.rstrip("\r\n").split("\t")
        if len(cols) == 5:
            # some rows are erroneous
            entity_type, integer_id, entity_uri, label, description = cols
            print("ONLY 5 cols")
            print(line)
        elif len(cols) == 6:
            # normal row
            entity_type, integer_id, entity_uri, label, description, synonym = cols
        else:
            print("SKIPPED")
            print(line)
            continue

        # Drop the first and last character of the URI column
        # (presumably the enclosing "<" and ">" -- TODO confirm).
        entity_uri = entity_uri[1:-1]
        subject = URIRef(entity_uri)

        if entity_type == "NODES":
            g.add((
                subject,
                RDFS.label,
                rdflib.Literal(label, datatype=XSD.string)
            ))
            # NOTE(review): rdf:type is given a string literal rather than a
            # class URI; left unchanged because consumers may rely on it.
            g.add((
                subject,
                RDF.type,
                rdflib.Literal("Entity", datatype=XSD.string)
            ))
            g.add((
                subject,
                ai.wwwresource,
                rdflib.Literal(entity_uri, datatype=XSD.string)
            ))
            g.add((
                subject,
                RDFS.comment,
                rdflib.Literal(description, datatype=XSD.string)
            ))
        elif entity_type == "RELATIONS":
            g.add((
                subject,
                RDFS.label,
                rdflib.Literal(label, datatype=XSD.string)
            ))
    except ValueError as e:
        print(e)
        print(line)
        # Re-raise the original exception so its message and traceback are
        # kept; raising a fresh ValueError would discard both.
        raise

0
100000
200000
300000
400000
500000
600000
700000


In [8]:
# Write the graph as Turtle to an absolute path in the working directory.
g.serialize(
    destination=os.path.abspath(r"pheknowlator.ttl"),
    format="turtle",
)

In [9]:
def read_set(path, typ):
    """Append one split-membership statement per triple to ``outfile``.

    Each non-blank line of ``path`` is expected to hold a tab-separated
    ``head<TAB>relation<TAB>tail`` triple. For every triple a line of the
    form ``<<head rel tail>> ai:split ai:<typ> .`` is written to the
    module-level ``outfile`` handle, which must be open for writing.

    Args:
        path: Path of the split file to read (UTF-8, like the other readers
            in this notebook).
        typ: Split name appended after the ``ai:`` prefix (e.g. ``'train'``).

    Raises:
        ValueError: If a non-blank line does not contain exactly three
            tab-separated fields.
    """
    with open(path, encoding="utf8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                # Blank lines carry no triple; skip them instead of failing
                # on the three-way unpacking below.
                continue
            head, rel, tail = line.split("\t")
            outfile.write(f"<<{head} {rel} {tail}>> ai:split ai:{typ} . " + "\n")

# Append to the serialized graph. The context manager closes the file even
# if a split file is malformed, and the explicit encoding keeps the output
# independent of the platform default.
with open(r"pheknowlator.ttl", 'a', encoding="utf8") as outfile:
    read_set(train_path, 'train')
    read_set(test_path, 'test')
    read_set(valid_path, 'valid')

In [None]:
# Package the Turtle file into a deflate-compressed archive. The context
# manager guarantees the archive is finalized and closed even if adding
# the file fails.
with zipfile.ZipFile('data.zip', 'w') as zipObj:
    zipObj.write('pheknowlator.ttl', 'pheknowlator.ttl', zipfile.ZIP_DEFLATED)