In [1]:
import rdflib
from rdflib import Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
import os
import wget
import gzip
import shutil
import tarfile
from os.path import exists
import wget
import urllib
import py7zr

def quote(id):
    """Percent-encode ``id`` so it can be used as the local part of an IRI.

    Characters in ``;,/?:@&=+$-_.!~'()#`` (plus ASCII letters and digits) are
    left untouched; everything else is UTF-8 percent-encoded.

    Args:
        id: The raw identifier string (e.g. an entity or relation name).

    Returns:
        The percent-encoded identifier.
    """
    # ``import urllib`` alone does not load the ``urllib.parse`` submodule, so
    # import it explicitly instead of relying on another package having done so.
    import urllib.parse

    return urllib.parse.quote(id, safe=';,/?:@&=+$-_.!~\'()#')

# Methods
def read_nodes(lst):
    """Collect the distinct head and tail entities from tab-separated triple files.

    Each non-blank line must have exactly three tab-separated fields
    (head, relation, tail). The literal escape ``\\u0022`` in an entity name is
    decoded to a double quote.

    Args:
        lst: Iterable of paths to UTF-8 encoded triple files.

    Returns:
        A set with every head and tail entity found in the files.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    nodes = set()
    for path in lst:
        # Stream the file instead of materialising every line in memory.
        with open(path, encoding="utf8") as f:
            for raw in f:
                line = raw.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                head, _rel, tail = line.split("\t")
                nodes.add(head.replace("\\u0022", "\""))
                nodes.add(tail.replace("\\u0022", "\""))
    return nodes

def read_line(path):
    """Yield the lines of a UTF-8 text file one at a time, with progress output.

    Before yielding line number 0, 100000, 200000, ... (zero-based) the line
    number is printed, which gives a coarse progress indicator on large files.

    Args:
        path: Path of the file to read.

    Yields:
        Each line of the file, including its line terminator if present.
    """
    with open(path, encoding="utf8") as infile:
        for index, line in enumerate(infile):
            if index % 100000 == 0:
                print(index)
            yield line

In [2]:
# Path setup: locations of the three dataset split files inside the cache
# directory. Built with os.path.join so the paths work on every platform
# (a hard-coded backslash is only a separator on Windows).
train_path = os.path.join("cache", "train.txt")
test_path = os.path.join("cache", "test.txt")
valid_path = os.path.join("cache", "valid.txt")

In [3]:
# Everything this cell downloads or derives lives in the cache directory.
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)


def _cache_file(name):
    """Return the platform-correct path of ``name`` inside the cache directory."""
    return os.path.join(CACHE_DIR, name)


def _fetch_gzip(url, name):
    """Download a ``.gz`` file into the cache and decompress it next to itself.

    The download and the decompression are guarded separately, so a run that
    was interrupted after the download still produces the decompressed file.
    Decompression goes through a ``.part`` file that is renamed on success, so
    a half-written result is never mistaken for a complete one.

    Args:
        url: Address of the gzip-compressed file.
        name: File name (ending in ``.gz``) to use inside the cache directory.

    Returns:
        Path of the decompressed file.
    """
    gz_path = _cache_file(name)
    out_path = gz_path[:-len(".gz")]
    if not exists(gz_path):
        wget.download(url, gz_path)
    if not exists(out_path):
        part_path = out_path + ".part"
        with gzip.open(gz_path, 'rb') as f_in, open(part_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(part_path, out_path)
    return out_path


# --- YAGO3-10 benchmark splits ---------------------------------------------
archive_path = _cache_file('YAGO3-10.tar.gz')
if not exists(archive_path):
    url = "https://github.com/TimDettmers/ConvE/raw/master/YAGO3-10.tar.gz"
    wget.download(url, archive_path)
# Assumes the archive unpacks train/test/valid.txt directly into the cache
# directory (that is where the path-setup cell expects them) - TODO confirm.
if not all(exists(_cache_file(n)) for n in ("train.txt", "test.txt", "valid.txt")):
    # NOTE(security): extractall() trusts the member paths stored in the
    # archive; only use it on archives from a trusted source.
    with tarfile.open(archive_path) as archive:
        archive.extractall(CACHE_DIR)

# (An earlier, disabled variant of this cell pulled yagoLabels.ttl from the
# YAGO3 7z archive; the YAGO4 label and type dumps below are used instead.)

# --- YAGO4 labels and types --------------------------------------------------
labels_path = _fetch_gzip(
    "https://yago-knowledge.org/data/yago4/en/2020-02-24/yago-wd-labels.nt.gz",
    'yago-wd-labels.nt.gz',
)
types_path = _fetch_gzip(
    "https://yago-knowledge.org/data/yago4/en/2020-02-24/yago-wd-full-types.nt.gz",
    'yago-wd-full-types.nt.gz',
)

nodes = read_nodes([train_path, test_path, valid_path])

# --- Reduce the dumps to the benchmark entities -----------------------------
# Keep English rdfs:label / rdfs:comment triples and all type triples whose
# subject is one of the benchmark entities. Each dump line is split on tabs
# into exactly four fields (subject, predicate, object, terminator).
filtered_path = _cache_file("yago310.nt")
if not exists(filtered_path):
    namespace = "http://yago-knowledge.org/resource/"
    # Write to a temporary name first so an interrupted run is redone rather
    # than leaving a truncated yago310.nt that looks complete.
    part_path = filtered_path + ".part"
    with open(part_path, "w", encoding="utf8") as outfile:
        for line in read_line(labels_path):
            s, p, o, _dot = line.split("\t")
            # s looks like <namespace + name>; strip the namespace and the
            # surrounding angle brackets to get the bare entity name.
            # NOTE(review): names are compared un-decoded; if the dump
            # percent-encodes characters such as '"', those entities will not
            # match the decoded names in ``nodes`` - confirm against the data.
            if s.replace(namespace, "")[1:-1] in nodes and o.endswith("@en"):
                if "rdf-schema#label" in p or "rdf-schema#comment" in p:
                    outfile.write(line)

        for line in read_line(types_path):
            s, p, o, _dot = line.split("\t")
            if s.replace(namespace, "")[1:-1] in nodes:
                outfile.write(line)
    os.replace(part_path, filtered_path)

In [4]:
# Load the filtered N-Triples file into an in-memory graph and register the
# rdf / rdfs prefixes on it.
g = rdflib.Graph()
for _prefix, _namespace in (("rdf", RDF), ("rdfs", RDFS)):
    g.bind(_prefix, _namespace)
g.parse("cache/yago310.nt", format="nt")

<Graph identifier=Nd9e26c15098f47cb8e22c0d8c1ffedcc (<class 'rdflib.graph.Graph'>)>

In [22]:
# Spot check: look up the label and comments of a single entity in the graph.
node = "Kurgan_Airport"
entity_uri = "http://yago-knowledge.org/resource/" + quote(node)
uri_ref = rdflib.URIRef(entity_uri)

print(entity_uri)
print(RDFS.comment)

# Print every comment, then the last one once more (if there was any).
descriptions = list(g.objects(uri_ref, RDFS.comment))
description = None
for description in descriptions:
    print(description)
if description is not None:
    print(description)

label = g.value(uri_ref, RDFS.label, default=None)
print(label)

http://yago-knowledge.org/resource/Kurgan_Airport
http://www.w3.org/2000/01/rdf-schema#comment
airport in Russia
airport in Russia
Kurgan Airport


In [28]:
# Build the output graph: one resource per benchmark entity in the ``ai``
# namespace, carrying its label, source URI, description and types.
out = rdflib.Graph()
out.bind("rdf", RDF)
out.bind("rdfs", RDFS)
ai = Namespace("https://ai-strategies.org/kgc/")
out.bind("ai", ai)

for node in nodes:
    # URI used to look the entity up in the YAGO graph. Only double quotes are
    # percent-encoded here, whereas the output subject below is encoded with
    # quote().
    entity_uri = "http://yago-knowledge.org/resource/" + node.replace("\"", "%22")
    uri_ref = rdflib.URIRef(entity_uri)

    # Subject of every triple emitted for this entity; computed once.
    subject = ai.term(quote(node))

    label = g.value(uri_ref, RDFS.label, default=None)
    if label is not None:
        out.add((subject, RDFS.label, rdflib.Literal(label, datatype=XSD.string)))

    # Always recorded, even when the entity has no label/comment/type.
    out.add((subject, ai.wwwresource, rdflib.Literal(entity_uri, datatype=XSD.string)))

    # Keep only the first comment the graph yields for this entity.
    description = next(g.objects(uri_ref, RDFS.comment), None)
    if description is not None:
        out.add((subject, RDFS.comment, rdflib.Literal(description, datatype=XSD.string)))

    # Types are stored as plain string literals with the schema.org / YAGO
    # namespace prefix stripped.
    for type_ref in g.objects(uri_ref, RDF.type):
        entity_type = type_ref.replace("http://schema.org/", "").replace("http://yago-knowledge.org/resource/", "")
        out.add((subject, RDF.type, rdflib.Literal(entity_type, datatype=XSD.string)))


In [24]:
# Human-readable label for every relation identifier used in the dataset.
relations = {
    "isPoliticianOf": "is politician of",
    "hasNeighbor": "has neighbor",
    "playsFor": "plays for",
    "isInterestedIn": "is interested in",
    "hasCurrency": "has currency",
    "dealsWith": "deals with",
    "edited": "edited",
    "livesIn": "lives in",
    "owns": "owns",
    "created": "created",
    "hasChild": "has child",
    "influences": "influences",
    "hasWebsite": "has website",
    "hasCapital": "has capital",
    "hasOfficialLanguage": "has official language",
    "worksAt": "works at",
    "wroteMusicFor": "wrote music for",
    "isCitizenOf": "is citizen of",
    "hasWonPrize": "has won prize",
    "actedIn": "acted in",
    "exports": "exports",
    "hasAcademicAdvisor": "has academic advisor",
    "isKnownFor": "is known for",
    "graduatedFrom": "graduated from",
    "isLocatedIn": "is located in",
    "happenedIn": "happened in",
    "directed": "directed",
    "isMarriedTo": "is married to",
    "isConnectedTo": "is connected to",
    "diedIn": "died in",
    "hasGender": "has gender",
    "participatedIn": "participated in",
    "imports": "imports",
    "isAffiliatedTo": "is affiliated to",
    "isLeaderOf": "is leader of",
    "wasBornIn": "was born in",
    "hasMusicalRole": "has musical role",
}

# Attach each label to its relation resource in the output graph.
for relation, label in relations.items():
    label_triple = (
        ai.term(relation),
        RDFS.label,
        rdflib.Literal(label, datatype=XSD.string),
    )
    out.add(label_triple)

In [25]:
# Write the output graph as Turtle into the current working directory.
out.serialize(destination=os.path.abspath("yago310.ttl"), format="turtle")

In [16]:

def read_set(path, typ, out_file=None):
    """Append one ``ai:split`` statement per triple of a split file.

    Every non-blank line of ``path`` must hold three tab-separated fields
    (head, relation, tail). For each one a quoted-triple statement
    ``<<head rel tail>> ai:split ai:<typ> .`` is written, with all three terms
    placed in the ``https://ai-strategies.org/kgc/`` namespace.

    Args:
        path: Path of the UTF-8 encoded split file.
        typ: Name of the split (written as ``ai:<typ>``).
        out_file: Writable text file to append to. Defaults to the
            module-level ``outfile`` for backward compatibility.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    sink = outfile if out_file is None else out_file
    with open(path, encoding="utf8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            head, rel, tail = line.split("\t")
            # Decode the quote escape exactly like read_nodes() does, so the
            # IRIs written here match the entity IRIs in the serialized graph.
            head = head.replace("\\u0022", "\"")
            tail = tail.replace("\\u0022", "\"")
            sink.write(f"<<<https://ai-strategies.org/kgc/{quote(head)}> <https://ai-strategies.org/kgc/{quote(rel)}> <https://ai-strategies.org/kgc/{quote(tail)}>>> ai:split ai:{typ} . " + "\n")


# Append the split annotations to the serialized Turtle file. The file is
# opened in append mode, so re-running this cell without re-serializing the
# graph first adds the statements again.
with open(r"yago310.ttl", 'a', encoding="utf8") as outfile:
    read_set(train_path, 'train', outfile)
    read_set(test_path, 'test', outfile)
    read_set(valid_path, 'valid', outfile)

In [17]:
import zipfile

# Package the final Turtle file into data.zip (deflate-compressed). The
# context manager guarantees the archive is finalized and closed even if
# adding the file fails.
with zipfile.ZipFile('data.zip', 'w') as zipObj:
    zipObj.write('yago310.ttl', 'yago310.ttl', zipfile.ZIP_DEFLATED)