In [1]:
from __future__ import print_function

import itertools
import json
import os
import re
import urllib
import urllib.parse
import urllib.request
import uuid

import rdflib
from rdflib import Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
from tqdm import tqdm

In [3]:
def add_node(node):
    """Add ``node`` to the module-level ``nodes`` collection.

    Relies on a global named ``nodes`` that exposes ``.add``; the setup cell
    binds it to a ``set`` before the first call. Once that cell rebinds
    ``nodes`` to a list, calling this function fails until ``nodes`` is a set
    again.
    """
    nodes.add(node)

def read_nodes(path):
    """Register the head and tail entity of every triple stored in ``path``.

    Each non-blank line must hold exactly three tab-separated fields
    (head, relation, tail). Head and tail are passed to ``add_node``; the
    relation is ignored.

    Args:
        path: Location of the tab-separated triple file.

    Raises:
        ValueError: If a non-blank line does not split into three fields.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default encoding (load_id_tsv already reads its files as UTF-8).
    with open(path, encoding="utf-8") as f:
        # Stream the file instead of materialising every line first.
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank or trailing empty line carries no triple; skipping it
                # avoids an unpacking error on otherwise valid files.
                continue
            head, _rel, tail = line.split("\t")
            add_node(head)
            add_node(tail)
        
def divide_chunks(l, n):
    """Lazily split the sequence ``l`` into consecutive slices of length ``n``.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``, and
    an empty ``l`` produces no slices at all.
    """
    offsets = range(0, len(l), n)
    yield from (l[offset:offset + n] for offset in offsets)
        
def load_json_file(path):
    """Read the file at ``path`` and return its parsed JSON content."""
    with open(path) as handle:
        parsed = json.loads(handle.read())
    return parsed

def load_id_tsv(path):
    """Load a two-column, tab-separated UTF-8 file into an ``{id: value}`` dict.

    Every non-blank line is stripped of surrounding whitespace and split at
    its FIRST tab: the text before it is the key, everything after it is the
    value (so values may themselves contain tabs). When a key occurs more
    than once, the last occurrence wins.

    Args:
        path: Location of the tab-separated file.

    Returns:
        dict mapping each id string to its value string.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    id_value = {}
    with open(path, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank and trailing empty lines are not records.
                continue
            # partition() splits only once, unlike an unbounded split("\t"),
            # and avoids naming a local ``id`` that would shadow the builtin.
            key, sep, value = line.partition("\t")
            if not sep:
                raise ValueError(
                    "%s:%d: expected '<id>\\t<value>', got %r" % (path, line_no, line)
                )
            id_value[key] = value
    return id_value
        

In [4]:
# https://developers.google.com/knowledge-graph/

# Directory that holds the label files used by this notebook (fb.json is
# written here and the FB15k_mid2*.txt files are read from here).
root = r"workspace/labels"

# Build the dataset paths with os.path.join: the previous literals hard-coded
# backslashes, which only resolve on Windows.
train_path = os.path.join("workspace", "data", "FB15-237", "train.txt")
test_path = os.path.join("workspace", "data", "FB15-237", "test.txt")
valid_path = os.path.join("workspace", "data", "FB15-237", "valid.txt")

# read_nodes() fills this set through add_node(); resetting it here keeps the
# cell safe to re-run.
nodes = set()
read_nodes(train_path)
read_nodes(test_path)
read_nodes(valid_path)

# Freeze into a list so it can be sliced into request batches. Sorting makes
# the order (and therefore the batches) identical across kernels, which plain
# list(set) does not guarantee for strings.
nodes = sorted(nodes)

print(len(nodes))

14541


## Download files


In [9]:
# Read the API key through a context manager so the handle is closed, and
# strip it: a trailing newline in the file would otherwise be URL-encoded
# into the ``key`` parameter.
with open('.api_key') as key_file:
    api_key = key_file.read().strip()

service_url = 'https://kgsearch.googleapis.com/v1/entities:search'

# Number of entity ids requested per call (also passed as the API ``limit``).
limit = 500

result = {
    'result': []
}

# divide_chunks() is a generator, so tqdm cannot infer its length; pass the
# batch count explicitly to get a real progress bar.
batch_count = (len(nodes) + limit - 1) // limit

for node_list in tqdm(divide_chunks(nodes, limit), total=batch_count):
    params = [
        ('limit', limit),
        ('indent', True),
        ('key', api_key),
    ]
    # One repeated ``ids`` query parameter per requested entity.
    params.extend(('ids', node) for node in node_list)

    url = service_url + '?' + urllib.parse.urlencode(params)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as http_response:
        response = json.loads(http_response.read())

    result['result'].extend(response['itemListElement'])

with open(os.path.join(root, "fb.json"), 'w') as outfile:
    json.dump(result, outfile, indent=4, sort_keys=True)

30it [00:40,  1.33s/it]


### Retrieve missing entities from Wikidata

In [21]:
# Ids present in the downloaded Knowledge Graph results, with the "kg:"
# prefix removed so they compare equal to the entries of ``nodes``.
jsonfile = load_json_file(root + "//fb.json")
terms = {
    entry["result"]["@id"].replace("kg:","")
    for entry in tqdm(jsonfile["result"])
}

# Every node without a downloaded result, wrapped in double quotes so the ids
# can be pasted directly into a SPARQL VALUES clause.
missing = ["\"" + node + "\"" for node in nodes if node not in terms]
print("Missing: " + str(len(missing)))

100%|██████████| 13948/13948 [00:00<00:00, 1190810.78it/s]Missing: 593



In [22]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

import requests

endpoint_url = "https://query.wikidata.org/sparql"

# ``missing`` is expected to hold the double-quoted ids built by the previous
# cell; they are spliced into the VALUES clause as string literals.
query = """
SELECT ?item ?code ?itemLabel ?itemDescription WHERE {
  ?item wdt:P646 ?code.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  VALUES ?code {""" + " ".join(missing) + "}}"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    'Accept': "application/sparql-results+json"
}

# Pass the query as a form field so requests URL-encodes it. The body used to
# be the raw string "query=..." sent under a form-urlencoded content type,
# which is only valid as long as the query contains no reserved characters.
response = requests.post(
    endpoint_url,
    data={"query": query},
    headers=headers,
    timeout=120,  # never hang the kernel on an unresponsive endpoint
)
# Surface HTTP errors directly instead of as a confusing JSON decode failure
# on an error page.
response.raise_for_status()

sparql_result = response.json()
with open(os.path.join(root, "fb_missing_wikidata_sparql.json"), 'w') as outfile:
    json.dump(sparql_result, outfile, indent=4, sort_keys=True)

Retrieve the labels and descriptions that are still missing from the kg-bert FB15K data: https://github.com/yao8839836/kg-bert/blob/master/data/FB15K/

## Create graph

In [21]:
# Base namespace for entity IRIs. There is no trailing slash here, so the
# terms appended to it presumably bring their own leading "/" - confirm.
kg = Namespace("http://g.co/kg")

g = rdflib.Graph()

# Register the prefixes the serialised graph should use.
for prefix, namespace in (("kg", kg), ("rdf", RDF), ("rdfs", RDFS)):
    g.bind(prefix, namespace)

In [25]:

def _clean_description(raw):
    """Convert a raw description value from the TSV file into plain text.

    Assumes values look like ``"some text"@en`` - TODO confirm against the
    description file. Only a trailing ``@en`` tag and one pair of enclosing
    double quotes are removed, so an "@en" occurring inside the text is kept.
    Escaped newlines are dropped and escaped quotes are unescaped.
    """
    text = raw
    if text.endswith("@en"):
        text = text[:-len("@en")]
    if len(text) >= 2 and text[0] == "\"" and text[-1] == "\"":
        text = text[1:-1]
    return text.replace("\\n", "").replace("\\\"", "\"")


# --- Entities returned by the Knowledge Graph API ---------------------------
jsonfile = load_json_file(os.path.join(root, "fb.json"))

# Track the ids seen in a set: removing each id from a list copy of ``nodes``
# was quadratic and raised ValueError on a duplicate or unknown id.
found = set()
for item in tqdm(jsonfile["result"]):
    entity = item["result"]
    term = entity["@id"].replace("kg:","")
    found.add(term)
    subject = kg.term(term)

    g.add((
        subject,
        RDFS.label,
        rdflib.Literal(entity["name"], datatype=XSD.string)
    ))
    # NOTE(review): the types are stored as string literals, not IRIs -
    # confirm that consumers of the graph expect this.
    for type_ in entity.get("@type", []):
        g.add((
            subject,
            RDF.type,
            rdflib.Literal(type_, datatype=XSD.string)
        ))
    # Prefer the long article body; fall back to the short description.
    if "detailedDescription" in entity:
        g.add((
            subject,
            RDFS.comment,
            rdflib.Literal(entity["detailedDescription"]["articleBody"], datatype=XSD.string)
        ))
    elif "description" in entity:
        g.add((
            subject,
            RDFS.comment,
            rdflib.Literal(entity["description"], datatype=XSD.string)
        ))

# --- Entities the API did not return: fall back to the local TSV files ------
# Same content and order as before: ``nodes`` minus everything found above.
missing = [node for node in nodes if node not in found]

alt_labels = load_id_tsv(root + "/FB15k_mid2name.txt")
alt_descriptions = load_id_tsv(root + "/FB15k_mid2description.txt")
for entity in missing:
    label = alt_labels.get(entity, None)
    description = alt_descriptions.get(entity, "")
    # Every remaining entity must at least have a fallback label.
    assert label is not None, "Again missing " + entity

    g.add((
        kg.term(entity),
        RDFS.label,
        rdflib.Literal(label.replace("_", " "), datatype=XSD.string)
    ))
    # An entity without a fallback description still gets an (empty) comment.
    g.add((
        kg.term(entity),
        RDFS.comment,
        rdflib.Literal(_clean_description(description), datatype=XSD.string)
    ))

# NOTE: the Wikidata SPARQL results file (fb_missing_wikidata_sparql.json) is
# not merged into the graph; this cell previously carried that merge only as a
# disabled string block, which has been dropped as dead code.

graph_path = os.path.abspath(r"workspace/graphs/fb15k237.ttl")
# Make sure the target directory exists so serialisation cannot fail on a
# fresh checkout.
os.makedirs(os.path.dirname(graph_path), exist_ok=True)
g.serialize(graph_path,format="turtle")

100%|██████████| 13948/13948 [00:03<00:00, 4219.00it/s]


In [26]:

def read_set(path, typ):
    """Append one split annotation per triple in ``path`` to ``outfile``.

    Each non-blank line of ``path`` must hold three tab-separated fields
    (head, relation, tail). For every triple a quoted-triple statement
    ``<<h r t>> obl:split obl:<typ> .`` is written to the module-level
    ``outfile`` handle, which must be open for writing when this is called.

    Args:
        path: Tab-separated triple file to read.
        typ: Split name appended after ``obl:`` (e.g. 'train').

    Raises:
        ValueError: If a non-blank line does not split into three fields.
    """
    with open(path, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank or trailing empty lines carry no triple.
                continue
            head,rel,tail = line.split("\t")
            outfile.write(f"<<<http://g.co/kg{head}> <http://g.co/kg{rel}> <http://g.co/kg{tail}>>> obl:split obl:{typ} . " + "\n")


# NOTE(review): the ``obl:`` prefix is not declared by any cell visible in this
# notebook (only kg, rdf and rdfs are bound on the graph) - confirm that the
# consumers of this file define it, otherwise the appended lines will not parse.
# Appending is not idempotent: re-running this cell without regenerating the
# .ttl file first duplicates the annotations.
# The context manager guarantees the file is closed even if read_set() raises.
with open(r"workspace/graphs/fb15k237.ttl", 'a', encoding="utf-8") as outfile:
    read_set(train_path, 'train')
    read_set(test_path, 'test')
    read_set(valid_path, 'valid')