In [6]:
from __future__ import print_function

import itertools
import json
import urllib
import urllib.parse
import urllib.request
import uuid

import rdflib
from rdflib import Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
from tqdm import tqdm

In [2]:
# Entity data is fetched from the Google Knowledge Graph Search API:
# https://developers.google.com/knowledge-graph/

# Directory under which the downloaded API results are stored; the cells below
# write and read "fb.json" relative to this root.
root = r"C:\Users\Simon\Desktop\SAFRANExplorer\labelgraph\Labels"
# Triple files of the three dataset splits. Each line is read as three
# tab-separated fields (head, relation, tail) by the helpers in this notebook.
# NOTE(review): all four paths are machine-specific absolute paths and must be
# adjusted before running on another machine.
train_path = r"C:\Users\Simon\Desktop\SAFRANExplorer\workspace\data\FB15-237\train.txt"
test_path = r"C:\Users\Simon\Desktop\SAFRANExplorer\workspace\data\FB15-237\test.txt"
valid_path = r"C:\Users\Simon\Desktop\SAFRANExplorer\workspace\data\FB15-237\valid.txt"

In [3]:
def add_node(node):
    """Add ``node`` to the module-level ``nodes`` collection.

    ``nodes`` is not defined in this function: it must already exist as a
    global object offering an ``add`` method (a later cell initialises it with
    ``set()``) before this function is called.
    """
    nodes.add(node)

def read_nodes(path):
    """Register the head and tail entity of every triple stored in ``path``.

    Every non-empty line must consist of three tab-separated fields
    (head, relation, tail). Head and tail are handed to ``add_node``; the
    relation is ignored. Lines that are empty after stripping surrounding
    whitespace are skipped.

    Args:
        path: Path of the triple file to read.

    Raises:
        ValueError: If a non-empty line does not split into exactly three
            tab-separated fields.
    """
    with open(path) as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line (typically a trailing one at the end of the
                # file) carries no triple and would break the unpacking below.
                continue
            head, _rel, tail = line.split("\t")
            add_node(head)
            add_node(tail)
        
def divide_chunks(l, n):
    """Lazily yield consecutive slices of ``l``, each at most ``n`` items long.

    The slices are produced in order; the final one holds whatever remains and
    may therefore be shorter than ``n``. An empty ``l`` yields nothing.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)
        
def load_json_file(path):
    """Parse the JSON document stored at ``path`` and return the decoded object.

    The file is decoded as UTF-8 explicitly. Without an ``encoding`` argument
    ``open`` falls back to the platform's locale encoding, which would garble
    non-ASCII characters of a UTF-8 JSON file on systems whose locale encoding
    is not UTF-8.

    Args:
        path: Path of the JSON file.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)
        

## Download files


In [46]:
# Gather every entity that occurs as head or tail in any of the three splits.
# ``add_node`` (called by ``read_nodes``) fills this module-level set.
nodes = set()
read_nodes(train_path)
read_nodes(test_path)
read_nodes(valid_path)
# Freeze the set into a list so it can be sliced into request batches. Sorting
# makes the order (and therefore the batches) identical between kernel runs;
# the iteration order of a set of strings is not stable across runs.
nodes = sorted(nodes)

In [41]:

# Query the Knowledge Graph Search API for every collected entity id, in
# batches, and store all returned items in "fb.json" under ``root``.

# api_key = open('.api_key').read()
api_key = ""

# NOTE(review): ``ids`` and ``query`` are not referenced by any cell visible
# here; they are kept unchanged in case another cell relies on them.
ids = ["/m/08966", "/m/05lf_"]
query = 'Taylor Swift'
service_url = 'https://kgsearch.googleapis.com/v1/entities:search'

# Used both as the batch size (ids per request) and as the API's ``limit``
# parameter, so a single response can hold one item per requested id.
limit = 500

result = {
    'result': []
}

for node_list in divide_chunks(nodes, limit):
    params = [
        ('limit', limit),
        ('indent', True),
        ('key', api_key),
    ]
    # One repeated ``ids=<id>`` query parameter per entity in the batch.
    params.extend(zip(itertools.repeat('ids'), node_list))

    url = service_url + '?' + urllib.parse.urlencode(params)
    # The context manager closes the HTTP response even if reading or decoding
    # the payload fails.
    with urllib.request.urlopen(url) as http_response:
        response = json.loads(http_response.read())

    # Tolerate a payload without any items instead of raising KeyError.
    result['result'].extend(response.get('itemListElement', []))

with open(root + "//fb.json", 'w') as outfile:
    json.dump(result, outfile, indent=4, sort_keys=True)

## Create graph

In [7]:
# Namespace to which the Knowledge Graph entity ids are appended.
kg = Namespace("http://g.co/kg")

# Empty graph with the three prefixes registered on it.
g = rdflib.Graph()
for _prefix, _namespace in (("kg", kg), ("rdf", RDF), ("rdfs", RDFS)):
    g.bind(_prefix, _namespace)

In [10]:

# Turn every stored Knowledge Graph search hit into label / type / comment
# triples on the graph ``g`` and write the graph out as Turtle.
json_path = root + "//fb.json"
jsonfile = load_json_file(json_path)
for x in tqdm(jsonfile["result"]):
    entity = x["result"]
    # Ids carry a "kg:" prefix in the JSON; it is removed so that the id can be
    # appended to the ``kg`` namespace.
    term = entity["@id"].replace("kg:","")
    subject = kg.term(term)

    # Guard against records without a name so that one incomplete record does
    # not abort the whole run (the response schema is not enforced here).
    if "name" in entity:
        g.add((
            subject,
            RDFS.label,
            rdflib.Literal(entity["name"], datatype=XSD.string)
        ))

    # NOTE(review): the types are stored as xsd:string literals, not as IRIs;
    # confirm that consumers of the graph expect literal rdf:type objects.
    for type_ in entity.get("@type", []):
        g.add((
            subject,
            RDF.type,
            rdflib.Literal(type_, datatype=XSD.string)
        ))

    # Prefer the detailed article body; fall back to the short description when
    # the detailed description is absent or has no article body.
    detailed = entity.get("detailedDescription")
    if isinstance(detailed, dict) and "articleBody" in detailed:
        g.add((
            subject,
            RDFS.comment,
            rdflib.Literal(detailed["articleBody"], datatype=XSD.string)
        ))
    elif "description" in entity:
        g.add((
            subject,
            RDFS.comment,
            rdflib.Literal(entity["description"], datatype=XSD.string)
        ))

g.serialize(r"C:\Users\Simon\Desktop\SAFRANExplorer\labelgraph\Graphs\fb15k237_with_labels.ttl",format="turtle")

100%|██████████| 13954/13954 [00:03<00:00, 4129.43it/s]


In [11]:


def read_set(path, typ):
    """Append one split annotation per triple in ``path`` to ``outfile``.

    ``outfile`` is the module-level, writable text file opened by the ``with``
    statement below; it must be open when this function is called. Every
    non-empty line of ``path`` must consist of three tab-separated fields
    (head, relation, tail). For each of them one line is written that quotes
    the triple in ``<< ... >>`` and attaches ``obl:split obl:<typ>`` to it.

    NOTE(review): the ``obl:`` prefix is not among the prefixes bound on the
    graph in this notebook (kg, rdf, rdfs) - confirm that whatever consumes
    the file declares it.

    Args:
        path: Path of the triple file to read.
        typ: Name of the split, used as the local name after ``obl:``.

    Raises:
        ValueError: If a non-empty line does not split into exactly three
            tab-separated fields.
    """
    with open(path) as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no triple and would break the unpacking.
                continue
            head,rel,tail = line.split("\t")
            outfile.write(f"<<<http://g.co/kg{head}> <http://g.co/kg{rel}> <http://g.co/kg{tail}>>> obl:split obl:{typ} . " + "\n")


# Append the split annotations to the Turtle file written earlier. The context
# manager guarantees the file is closed even if one of the reads fails.
with open(r"C:\Users\Simon\Desktop\SAFRANExplorer\labelgraph\Graphs\fb15k237_with_labels.ttl", 'a') as outfile:
    read_set(train_path, 'train')
    read_set(test_path, 'test')
    read_set(valid_path, 'valid')

In [18]:
# Parse fb15k237.xml as RDF/XML and write the same graph back out as Turtle.
# This rebinds ``g``, replacing the graph object used by the earlier cells.
xml_source = r"C:\Users\ottsi\SAFRANExplorer\labelgraph\Graphs\fb15k237.xml"
turtle_target = r"C:\Users\ottsi\SAFRANExplorer\labelgraph\Graphs\fb15k237.ttl"

g = rdflib.Graph()
g.parse(xml_source, format="xml")
g.serialize(turtle_target, format="turtle")