In [102]:
from rdflib import Graph, URIRef, Literal
from elasticsearch import helpers
import itertools
from tqdm.auto import tqdm
import json

import sys
sys.path.append("..")

from heritageconnector import datastore
from heritageconnector.namespace import RDFS
from heritageconnector.utils.wikidata import is_qid, url_to_qid

## heritage connector graph from triples

In [37]:
dump_path = "../GITIGNORE_DATA/triples/hc_dump_latest.nt"

g = Graph()
g.parse(dump_path, format='nt')


<Graph identifier=N0224a797f7d94f70a0d13493a91f0d07 (<class 'rdflib.graph.Graph'>)>

In [69]:
wikidata_nodes = set([n for n in g.all_nodes() if "wikidata" in str(n)])

In [72]:
list(wikidata_nodes)[0]

rdflib.term.URIRef('http://www.wikidata.org/entity/Q501953')

## parse a fixed number of wikidata entities in the heritage connector to a separate graph

In [94]:
def parse_wikidump_doc(g: Graph, doc: dict):
    _id = "http://www.wikidata.org/entity/" + doc["_source"]["id"]
    src = doc["_source"]
    claims = doc["_source"]["claims"]
    
    if "labels" in src.keys():
        g.add((URIRef(_id), RDFS.label, Literal(src["labels"])))
        
    for prop, vals in claims.items():
        prop_uriref = URIRef("http://www.wikidata.org/prop/direct/"+prop)
        for v in vals:
            if is_qid(v):
                g.add((URIRef(_id), prop_uriref, URIRef("http://www.wikidata.org/entity/"+v)))
            else:
                g.add((URIRef(_id), prop_uriref, Literal(v)))

In [96]:
wiki_entities_limit = 1000
wg = Graph()

for ent in tqdm(list(wikidata_nodes)[0:wiki_entities_limit]):
    res = datastore.es.search(
            body = {"query": {"bool": {"must": [{"term": {"id.keyword": url_to_qid(str(ent))}}]}}}, 
            index="wikidump"
        )["hits"]["hits"]
    
    if res:
        parse_wikidump_doc(wg, res[0])

  0%|          | 0/1000 [00:00<?, ?it/s]

http://www.wikidata.org/entity/Boer War Q71976.jpg does not look like a valid URI, trying to serialize this will break.


In [97]:
len(wg)

11042

## add the graphs and check that the triples are as expected

In [98]:
G = g + wg
len(G)

3316054

In [99]:
test_wiki_id = list(wikidata_nodes)[0]

In [101]:
list(G.triples((test_wiki_id, None, None))),  list(G.triples((None, None, test_wiki_id)))

([(rdflib.term.URIRef('http://www.wikidata.org/entity/Q501953'),
   rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P856'),
   rdflib.term.Literal('http://www.ville-badonviller.fr/')),
  (rdflib.term.URIRef('http://www.wikidata.org/entity/Q501953'),
   rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P166'),
   rdflib.term.URIRef('http://www.wikidata.org/entity/Q2727598')),
  (rdflib.term.URIRef('http://www.wikidata.org/entity/Q501953'),
   rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P646'),
   rdflib.term.Literal('/m/0b9kt0')),
  (rdflib.term.URIRef('http://www.wikidata.org/entity/Q501953'),
   rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P17'),
   rdflib.term.URIRef('http://www.wikidata.org/entity/Q142')),
  (rdflib.term.URIRef('http://www.wikidata.org/entity/Q501953'),
   rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P31'),
   rdflib.term.URIRef('http://www.wikidata.org/entity/Q484170')),
  (rdflib.term.URIRef('http://www.wikidata.org/