In [1]:
%%capture 
!pip install wikidataintegrator
from rdflib import Graph, URIRef, Literal, Namespace, BNode
from rdflib.namespace import RDF, RDFS, SKOS, XSD, OWL, PROV
from wikidataintegrator import wdi_core, wdi_config
import uuid
import urllib.parse

In [2]:
# Map each Wikidata entity id to the list of URI patterns attached to it through
# wdt:P1921. The patterns contain a "$1" placeholder that is filled in later
# when normalized (psn:/wdtn:) triples are generated.
uriformat = {}
query = """
   SELECT DISTINCT ?prop ?format WHERE {
   ?prop wdt:P1921 ?format .
}
"""
df = wdi_core.WDFunctionsEngine.execute_sparql_query(query, as_dataframe=True)
for index, row in df.iterrows():
    # Reduce the full entity URI to its bare id (e.g. "P220").
    entity_id = row["prop"].replace("http://www.wikidata.org/entity/", "")
    uriformat.setdefault(entity_id, []).append(row["format"])

In [3]:
# Inspect the collected URI patterns, keyed by entity id.
uriformat

{'Q43649390': ['http://www.wikidata.org/entity/$1'],
 'Q2013': ['http://www.wikidata.org/entity/$1'],
 'P220': ['http://lexvo.org/id/iso639-3/$1'],
 'P665': ['https://www.kegg.jp/entry/$1'],
 'P950': ['http://datos.bne.es/resource/$1'],
 'P981': ['http://bag.basisregistraties.overheid.nl/bag/id/woonplaats/$1'],
 'P957': ['urn:ISBN:$1'],
 'P1006': ['http://data.bibliotheken.nl/id/thes/p$1'],
 'P1025': ['http://www.sudoc.fr/$1/id'],
 'P212': ['urn:ISBN:$1'],
 'P1422': ['http://ta.sandrart.net/-person-$1'],
 'P1566': ['http://sws.geonames.org/$1/'],
 'P2410': ['http://identifiers.org/wikipathways/$1'],
 'P2950': ['http://nomisma.org/id/$1'],
 'P3153': ['https://doi.org/10.13039/$1'],
 'P3224': ['https://bartoc-skosmos.unibas.ch/naics/en/page/$1'],
 'P3916': ['http://vocabularies.unesco.org/thesaurus/$1'],
 'P4104': ['https://data.carnegiehall.org/names/$1/about'],
 'P5587': ['https://libris.kb.se/$1'],
 'P5739': ['http://catalogo.pusc.it/auth/$1'],
 'P5813': ['http://musicbrainz.org/relea

In [4]:
# Graph that will hold the reconstructed item, plus a prefix -> Namespace lookup
# built from WikidataIntegrator's prefix table. Every prefix from that table is
# also bound on the graph so serializations use the familiar short names.
rdf_item = Graph()
ns = {}
for prefix, base_uri in wdi_config.prefix.items():
    ns[prefix] = Namespace(base_uri)
    rdf_item.namespace_manager.bind(prefix, ns[prefix])
# schema.org is added to the lookup by hand (it is not bound on the graph here);
# can be removed since it will be in WDI version 0.8.21
ns['schema'] = Namespace('http://schema.org/')

In [5]:
# Id of the Wikidata item that the rest of the notebook converts to RDF.
qid = "Q35869"
#qid = "Q38"
item = wdi_core.WDItemEngine(wd_item_id=qid)
# NOTE(review): `json_item` is reassigned in the next cell from a direct
# download of the entity's .json document, so this value is superseded there.
json_item = item.get_wd_json_representation()
# Type the item node itself with ns["wikibase"].Item.
rdf_item.add((ns["wd"][qid], RDF.type, ns["wikibase"].Item)) 

In [6]:
import requests
import json

# Download the entity document for `qid` and keep only that entity's record.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP failures directly instead of letting them
# surface later as a confusing JSON decoding error.
entity_response = requests.get("http://www.wikidata.org/entity/"+qid+".json", timeout=60)
entity_response.raise_for_status()
json_item = entity_response.json()["entities"][qid]


In [7]:
# Bookkeeping filled in while the claims are walked:
#   properties   - property id -> datatype taken from the first snak seen for it
#   linked_items - ids of items that occur as statement/qualifier/reference values
#   metadata     - counters reported on the dataset node
properties = {}
linked_items = list()

metadata = dict(identifiers=0, sitelinks=0, statements=0)

# Namespace prefixes of every predicate/entity variant derived from one property id.
_PROPERTY_VARIANT_PREFIXES = ("wd", "p", "wdtn", "wdt", "pq", "pqn", "pqv",
                              "pr", "prn", "prv", "ps", "psn", "psv")

# (wikibase ontology predicate, namespace prefix of the variant it points to)
_PROPERTY_VARIANT_LINKS = (
    ("directClaim", "wdt"),
    ("claim", "p"),
    ("statementProperty", "ps"),
    ("statementValue", "psv"),
    ("qualifier", "pq"),
    ("qualifierValue", "pqv"),
    ("reference", "pr"),
    ("referenceValue", "prv"),
    ("novalue", "wdno"),
    ("directClaimNormalized", "wdtn"),
    ("qualifierValueNormalized", "pqn"),
    ("referenceValueNormalized", "prn"),
    ("statementValueNormalized", "psn"),
)


def owlPropertyTypes(owlType, prop_id=None):
    """Type every variant of one property as ``owlType`` in ``rdf_item``.

    Args:
        owlType: the OWL class to assign (e.g. ``OWL.ObjectProperty``).
        prop_id: property id such as ``"P31"``. When omitted, the module-level
            ``pid`` is used, which is what this function always did before the
            parameter existed; new callers should pass the id explicitly.
    """
    if prop_id is None:
        # Backward-compatible fallback on notebook-global state.
        prop_id = pid
    for variant_prefix in _PROPERTY_VARIANT_PREFIXES:
        rdf_item.add((ns[variant_prefix][prop_id], RDF.type, owlType))


def propdefs(pid, item, datatype):
    """Add the ontology triples that describe property ``pid`` to ``rdf_item``.

    Args:
        pid: property id, e.g. ``"P31"``.
        item: JSON representation of the property; currently unused and kept
            only so existing calls keep working.
        datatype: Wikibase datatype name of the property. It must be a key of
            ``wdi_config.property_value_types`` (``KeyError`` otherwise).

    The property id is printed as a progress trace.
    """
    ## Properties and their derivatives
    # NOTE(review): most datatypes are listed in both tuples, so such properties
    # end up typed as both owl:ObjectProperty and owl:DatatypeProperty.
    object_properties = ("wikibase-item", 'external-id', 'string', 'commonsMedia', 'time', 'edtf', 'globe-coordinate', 'url', 'quantity', 'wikibase-property', 'monolingualtext', 'math', 'tabular-data', 'form', 'lexeme', 'geo-shape', 'musical-notation', 'sense')
    data_properties = ('external-id', 'string', 'time', 'edtf', 'globe-coordinate', 'quantity', 'monolingualtext', 'math', 'geo-shape', 'form', 'lexeme', 'musical-notation', 'sense')
    print(pid)
    # The id is passed explicitly so the typing triples are attached to *this*
    # property rather than to whatever a global `pid` happens to hold.
    if datatype in object_properties:
        owlPropertyTypes(OWL.ObjectProperty, pid)
    if datatype in data_properties:
        owlPropertyTypes(OWL.DatatypeProperty, pid)

    property_node = ns["wd"][pid]
    wikibase = ns["wikibase"]
    rdf_item.add((property_node, RDF.type, wikibase.Property))
    # Link the property entity to each of its predicate variants.
    for predicate_name, variant_prefix in _PROPERTY_VARIANT_LINKS:
        rdf_item.add((property_node, wikibase[predicate_name], ns[variant_prefix][pid]))
    rdf_item.add((property_node, wikibase.propertyType, URIRef(wdi_config.property_value_types[datatype])))

    # The "no value" class is declared as the complement of
    # "has some value for wdt:<pid>".
    novalue_class = ns["wdno"][pid]
    rdf_item.add((novalue_class, RDF.type, OWL.Class))
    owl_restriction = BNode()
    rdf_item.add((owl_restriction, RDF.type, OWL.Restriction))
    rdf_item.add((owl_restriction, OWL.onProperty, ns["wdt"][pid]))
    rdf_item.add((owl_restriction, OWL.someValuesFrom, OWL.Thing))
    rdf_item.add((novalue_class, OWL.complementOf, owl_restriction))
    
def parseSnak(statement):
    """Convert one Wikibase snak (main snak, qualifier or reference) to an RDF term.

    Args:
        statement: snak dictionary from the entity JSON. It carries a
            ``"datatype"`` and, for ordinary value snaks, a ``"datavalue"``.

    Returns:
        A ``URIRef``, ``Literal`` or, for "unknown value" snaks, a fresh ``BNode``.

    Raises:
        ValueError: for a "no value" snak (it has no value to convert; callers
            express it as class membership instead) or an unsupported datatype.

    Side effects:
        Ids of items used as values are appended to ``linked_items`` (once each).
    """
    snaktype = statement.get("snaktype", "value")
    if snaktype == "somevalue":
        # "Unknown value" snaks carry no datavalue; represent them as a blank node.
        return BNode()
    if snaktype == "novalue":
        raise ValueError('novalue snak has no value to convert')

    datatype = statement["datatype"]
    value = statement["datavalue"]["value"]
    if datatype == "commonsMedia":
        return URIRef("http://commons.wikimedia.org/wiki/Special:FilePath/"+value.replace(" ", "_"))
    elif datatype in ("string", "external-id"):
        return Literal(value)
    elif datatype == "wikibase-item":
        if value["id"] not in linked_items:
            linked_items.append(value["id"])
        return ns["wd"][value["id"]]
    elif datatype == "wikibase-property":
        # Properties live in the same entity namespace as items; they are not
        # recorded in linked_items, which is reserved for items.
        return ns["wd"][value["id"]]
    elif datatype == "monolingualtext":
        return Literal(value["text"], value["language"])
    elif datatype == "geo-shape":
        return URIRef("http://commons.wikimedia.org/data/main/"+value)
    elif datatype == "globe-coordinate":
        # WKT separates longitude and latitude with a space, not a comma.
        point = "Point("+str(value["longitude"])+" "+str(value["latitude"])+")"
        globe = value.get("globe")
        if globe and globe != "http://www.wikidata.org/entity/Q2":
            # Coordinates on a globe other than Earth are prefixed with the globe URI.
            point = "<"+globe+"> "+point
        return Literal(point, datatype=ns["geo"].wktLiteral)
    elif datatype == "quantity":
        # Only the amount is exported; the unit is not represented here.
        return Literal(value["amount"], datatype=XSD.decimal)
    elif datatype == "time":
        # Drop the leading "+" of the JSON timestamp and spell the UTC marker
        # as an explicit offset.
        return Literal(value["time"].replace("+", "").replace("Z", "+00:00"), datatype=XSD.dateTime)
    elif datatype == "url":
        return URIRef(value)
    else:
        raise ValueError('unknown snak datatype ' + datatype)

# Rank string used in the entity JSON -> rank resource of the wikibase ontology.
RANK_RESOURCES = {
    "normal": ns["wikibase"].NormalRank,
    "preferred": ns["wikibase"].PreferredRank,
    "deprecated": ns["wikibase"].DeprecatedRank,
}


def _normalized_uris(prop_id, value_node):
    """Return the URIs obtained by filling `value_node` into each URI pattern of `prop_id`."""
    if isinstance(value_node, BNode):
        # An unknown value cannot be turned into an identifier URI.
        return []
    return [URIRef(pattern.replace("$1", value_node)) for pattern in uriformat.get(prop_id, [])]


def _add_snak(subject, predicate_prefix, prop_id, snak):
    """Attach one qualifier/reference snak to `subject`, whatever its snak type."""
    snaktype = snak.get("snaktype", "value")
    if snaktype == "novalue":
        # "No value" is expressed as membership of the property's wdno: class.
        rdf_item.add((subject, RDF.type, ns["wdno"][prop_id]))
    elif snaktype == "somevalue":
        # "Unknown value" has no datavalue to parse; use a blank node.
        rdf_item.add((subject, ns[predicate_prefix][prop_id], BNode()))
    else:
        rdf_item.add((subject, ns[predicate_prefix][prop_id], parseSnak(snak)))


## Statements
for pid in json_item['claims'].keys():
    claims = json_item['claims'][pid]
    if pid not in properties:
        properties[pid] = claims[0]["mainsnak"]["datatype"]

    # Decided once per property: when any statement is preferred, only the
    # preferred ones are "best"; otherwise the normal-rank ones are.
    preferredSet = any(other["rank"] == "preferred" for other in claims)
    best_rank = "preferred" if preferredSet else "normal"

    for claim in claims:
        statement_uri = ns["s"][claim["id"].replace("$","-")]
        mainsnak = claim["mainsnak"]
        is_best = claim["rank"] == best_rank

        # rank
        if claim["rank"] in RANK_RESOURCES:
            rdf_item.add((statement_uri, ns["wikibase"].rank, RANK_RESOURCES[claim["rank"]]))

        # counters reported on the dataset node
        if mainsnak["datatype"] == "external-id":
            metadata["identifiers"] += 1
        else:
            metadata["statements"] += 1

        # values
        if mainsnak["snaktype"] == "novalue":
            rdf_item.add((statement_uri, RDF.type, ns["wdno"][pid]))
        else:
            if mainsnak["snaktype"] == "somevalue":
                # No datavalue is present for "unknown value" snaks.
                objectValue = BNode()
            else:
                objectValue = parseSnak(mainsnak)
            normalized = _normalized_uris(pid, objectValue)
            rdf_item.add((statement_uri, ns["ps"][pid], objectValue))
            for normalized_uri in normalized:
                rdf_item.add((statement_uri, ns["psn"][pid], normalized_uri))
            if is_best:
                # Direct ("truthy") triples exist only for best-rank statements.
                rdf_item.add((ns["wd"][qid], ns["wdt"][pid], objectValue))
                for normalized_uri in normalized:
                    rdf_item.add((ns["wd"][qid], ns["wdtn"][pid], normalized_uri))

        rdf_item.add((ns["wd"][qid], ns["p"][pid], statement_uri))
        rdf_item.add((statement_uri, RDF.type, ns["wikibase"].Statement))
        if is_best:
            rdf_item.add((statement_uri, RDF.type, ns["wikibase"].BestRank))

        # qualifiers
        for qualifier, qualifier_snaks in claim.get("qualifiers", {}).items():
            if qualifier not in properties:
                properties[qualifier] = qualifier_snaks[0]["datatype"]
            for qualifier_snak in qualifier_snaks:
                _add_snak(statement_uri, "pq", qualifier, qualifier_snak)

        # references
        for reference in claim.get("references", []):
            reference_uri = ns["ref"][reference["hash"]]
            rdf_item.add((reference_uri, RDF.type, ns["wikibase"].Reference))
            rdf_item.add((statement_uri, PROV.wasDerivedFrom, reference_uri))

            for ref_prop, reference_snaks in reference["snaks"].items():
                if ref_prop not in properties:
                    properties[ref_prop] = reference_snaks[0]["datatype"]
                for reference_snak in reference_snaks:
                    _add_snak(reference_uri, "pr", ref_prop, reference_snak)

In [8]:
# sitelinks
# Site ids whose language cannot be read off by stripping the project suffix.
SPECIAL_SITE_LANGUAGES = {
    "simplewiki": "en-simple",
    "commonswiki": "en",
    "zh_yuewiki": "yue",
    "zh_min_nanwiki": "nan",
    "nowiki": "nb",
}
# Sister-project suffixes. They have to be stripped as a whole: removing only
# "wiki" would turn e.g. "enwikisource" into the bogus language "ensource".
SISTER_PROJECT_SUFFIXES = ("wikiquote", "wikisource", "wikibooks", "wikinews",
                           "wikiversity", "wikivoyage", "wiktionary")

# Assigned rather than incremented so that re-running the cell does not inflate it.
metadata["sitelinks"] = len(json_item['sitelinks'])

for sitelink in json_item['sitelinks'].keys():
    site_record = json_item['sitelinks'][sitelink]
    wiki = URIRef(site_record["url"])
    # Everything before "wiki/" is the base URL of the hosting wiki.
    partof = URIRef(site_record["url"].split("wiki/")[0])
    if "commons" in str(partof):
        group = str(partof).split(".")[0].replace("https://", "")
        print(group)
    else:
        group = str(partof).split(".")[1]
    rdf_item.add((partof, ns["wikibase"].wikiGroup, Literal(group)))

    if sitelink in SPECIAL_SITE_LANGUAGES:
        language = SPECIAL_SITE_LANGUAGES[sitelink]
    else:
        for project_suffix in SISTER_PROJECT_SUFFIXES:
            if sitelink.endswith(project_suffix):
                language = sitelink[:-len(project_suffix)]
                break
        else:
            language = sitelink.replace("wiki", "")

    rdf_item.add((wiki, RDF.type, ns["schema"].Article))
    rdf_item.add((wiki, ns['schema'].about, ns["wd"][qid]))
    rdf_item.add((wiki, ns['schema'].isPartOf, partof))
    for badge in site_record["badges"]:
        rdf_item.add((wiki, ns["wikibase"].badge, ns["wd"][badge]))
    try:
        rdf_item.add((wiki, ns['schema'].name, Literal(site_record["title"], language)))
        rdf_item.add((wiki, ns['schema'].inLanguage, Literal(language)))
    except Exception:
        # Best effort: a language tag that the Literal constructor rejects is
        # reported and the two triples are skipped. Unlike a bare `except`,
        # this does not swallow KeyboardInterrupt/SystemExit.
        print(language)
    

commons


In [9]:
# Metadata
dataset_node = ns["data"][qid]
# The reported statement total also covers the external-id statements that were
# counted separately under "identifiers". It is computed into a local instead
# of being added onto metadata["statements"], so re-running this cell cannot
# inflate the count or leave a second, larger total in the graph.
total_statements = metadata["statements"] + metadata["identifiers"]
rdf_item.add((dataset_node, RDF.type, ns["schema"].Dataset))
rdf_item.add((dataset_node, ns["cc"].license, URIRef("http://creativecommons.org/publicdomain/zero/1.0/")))
rdf_item.add((dataset_node, ns["schema"].about, ns["wd"][qid]))
rdf_item.add((dataset_node, ns["schema"].softwareVersion, Literal("1.0.0")))
rdf_item.add((dataset_node, ns["wikibase"].identifiers, Literal(metadata["identifiers"])))
rdf_item.add((dataset_node, ns["wikibase"].sitelinks, Literal(metadata["sitelinks"])))
rdf_item.add((dataset_node, ns["wikibase"].statements, Literal(total_statements)))
rdf_item.add((dataset_node, ns["schema"].version, Literal(json_item["lastrevid"])))
# The "Z" suffix of the JSON timestamp is spelled out as an explicit UTC offset.
rdf_item.add((dataset_node, ns["schema"].dateModified, Literal(json_item["modified"].replace("Z", "+00:00"), datatype=XSD.dateTime)))


In [10]:
def transformLabels(qid, json_item):
    """Add the labels, descriptions and aliases of one entity to ``rdf_item``.

    Args:
        qid: id of the entity the terms belong to; used to build the subject
            node in the ``wd`` namespace.
        json_item: entity JSON containing the ``"labels"``, ``"descriptions"``
            and ``"aliases"`` sections, each keyed by language code.
    """
    # Each label is emitted under three predicates with the same language-tagged text.
    for language, label in json_item["labels"].items():
        text = label["value"]
        rdf_item.add((ns["wd"][qid], RDFS.label, Literal(text, language)))
        rdf_item.add((ns["wd"][qid], ns["schema"].name, Literal(text, language)))
        rdf_item.add((ns["wd"][qid], ns["skos"].prefLabel, Literal(text, language)))

    for language, description in json_item["descriptions"].items():
        rdf_item.add((ns["wd"][qid], ns["schema"].description, Literal(description["value"], language)))

    # A language may carry several aliases; each one becomes its own altLabel.
    for language, alias_entries in json_item["aliases"].items():
        for alias in alias_entries:
            rdf_item.add((ns["wd"][qid], SKOS.altLabel, Literal(alias["value"], language)))
                          
# Terms (labels, descriptions, aliases) of the item itself.
transformLabels(qid, json_item)     
    
# Then, for every property recorded while walking the claims, fetch its JSON,
# add its terms and emit its ontology definition.
# NOTE(review): keep the loop variable named `pid` - owlPropertyTypes may
# resolve `pid` from module scope rather than from an argument.
for pid in properties.keys():
    pid_item = wdi_core.WDItemEngine(wd_item_id=pid).get_wd_json_representation()
    transformLabels(pid, pid_item)
    propdefs(pid, pid_item, properties[pid])



P373
P508
P248
P604
P143
P557
P673
P910
P646
P577
P227
P244
P1461
P854
P1051
P1995
P279
P813
P699
P935
P667
P2176
P2115
P407
P580
P790
P2293
P459
P2888
P3841
P2892
P3417
P3569
P3827
P3471
P18
P2096
P4229
P3219
P2924
P4254
P1417
P492
P1692
P672
P5642
P1542
P698
P5395
P1296
P7033
P1245
P5806
P7818
P7829
P7827
P5082
P4390
P3222
P5019
P7982
P7995
P2347
P5008
P1889
P4656
P5131
P486
P1810
P8349
P780
P920
P8408
P7329
P7807
P665
P8785
P4527


In [11]:
# Every item that occurred as a value gets typed with ns["wikibase"].Item and
# its own labels, descriptions and aliases. This performs one fetch per linked
# item, so the cell can take a while for items with many links.
for linked_qid in linked_items:
    rdf_item.add((ns["wd"][linked_qid], RDF.type, ns["wikibase"].Item))
    linked_qid_item = wdi_core.WDItemEngine(wd_item_id=linked_qid).get_wd_json_representation()
    transformLabels(linked_qid, linked_qid_item)
    

In [12]:
# Reference graph: the Turtle document for the same item, loaded from the
# entity's .ttl URL, to compare against the graph built above.
compareRDF = Graph()
compareRDF.parse("http://www.wikidata.org/entity/"+qid+".ttl")

<Graph identifier=N60858d72e9404e2aa9908d70e6b9adfa (<class 'rdflib.graph.Graph'>)>

In [13]:
# Triples that the reference graph contains but the reconstructed graph lacks.
diffRdf = compareRDF - rdf_item
# Bind the same prefixes on the difference graph so its Turtle output is readable.
for prefix, base_uri in wdi_config.prefix.items():
    ns[prefix] = Namespace(base_uri)
    diffRdf.namespace_manager.bind(prefix, ns[prefix])
# 1.ttl: the missing triples; 2.ttl: everything that was generated.
diffRdf.serialize(format="turtle", destination="1.ttl")
print(len(diffRdf))
print(len(rdf_item))
rdf_item.serialize(format="turtle", destination="2.ttl")

485
42353
