In [1]:
%%capture 
!pip install wikidataintegrator
!pip install phpserialize
from rdflib import Graph, URIRef, Literal, Namespace, BNode
from rdflib.namespace import RDF, RDFS, SKOS, XSD, OWL, PROV
from wikidataintegrator import wdi_core, wdi_config
import urllib.parse
import hashlib
import uuid
from phpserialize import *

In [2]:
# Get all properties where the values are converted into IRI based on
# URI formats (P1921)

uriformat = dict()
query = """
   SELECT DISTINCT ?prop ?format WHERE {
   ?prop wdt:P1921 ?format .
}
"""
df = wdi_core.WDFunctionsEngine.execute_sparql_query(query, as_dataframe=True)
for index, row in df.iterrows():
    if row["prop"].replace("http://www.wikidata.org/entity/", "") not in uriformat.keys():
        uriformat[row["prop"].replace("http://www.wikidata.org/entity/", "")] = []
    uriformat[row["prop"].replace("http://www.wikidata.org/entity/", "")].append(row["format"])

In [3]:
rdf_item = Graph()
ns = dict()
for prefix in wdi_config.prefix.keys():
    ns[prefix] = Namespace( wdi_config.prefix[prefix])
    rdf_item.namespace_manager.bind(prefix, ns[prefix])
ns['schema'] = Namespace('http://schema.org/') # can be removed since it will be in WDI version 0.8.21

In [4]:
#qid = "Q35869"
qid = "Q38"
item = wdi_core.WDItemEngine(wd_item_id=qid)
json_item = item.get_wd_json_representation()
rdf_item.add((ns["wd"][qid], RDF.type, ns["wikibase"].Item)) 

In [5]:
import requests
import json

json_item = json.loads(requests.get("http://www.wikidata.org/entity/"+qid+".json").text)["entities"][qid]


with open('unitConversionConfig.json') as json_file:
    siconversion = json.load(json_file)

siconversion

{'Q199': {'factor': '1', 'unit': 'Q199', 'label': '1', 'siLabel': '1'},
 'Q531': {'factor': '9460800000000000',
  'unit': 'Q11573',
  'label': 'light-year',
  'siLabel': 'metre'},
 'Q573': {'factor': '86400',
  'unit': 'Q11574',
  'label': 'day',
  'siLabel': 'second'},
 'Q577': {'factor': '31536000',
  'unit': 'Q11574',
  'label': 'year',
  'siLabel': 'second'},
 'Q1811': {'factor': '149597870700',
  'unit': 'Q11573',
  'label': 'astronomical unit',
  'siLabel': 'metre'},
 'Q2101': {'factor': '0.0000000000000000001602176634',
  'unit': 'Q25406',
  'label': 'elementary charge',
  'siLabel': 'coulomb'},
 'Q3710': {'factor': '0.3048006',
  'unit': 'Q11573',
  'label': 'foot',
  'siLabel': 'metre'},
 'Q7727': {'factor': '60',
  'unit': 'Q11574',
  'label': 'minute',
  'siLabel': 'second'},
 'Q11229': {'factor': '0.01',
  'unit': 'Q199',
  'label': 'percent',
  'siLabel': '1'},
 'Q11570': {'factor': '1',
  'unit': 'Q11570',
  'label': 'kilogram',
  'siLabel': 'kilogram'},
 'Q11573': {'fact

In [6]:
properties = dict()
linked_items = []

metadata = {"identifiers": 0, "sitelinks": 0, "statements": 0}

def owlPropertyTypes(owlType):
    rdf_item.add((ns["wd"][pid], RDF.type, owlType))
    rdf_item.add((ns["p"][pid], RDF.type, owlType))
    rdf_item.add((ns["wdtn"][pid], RDF.type, owlType))
    rdf_item.add((ns["wdt"][pid], RDF.type, owlType))
    rdf_item.add((ns["pq"][pid], RDF.type, owlType))
    rdf_item.add((ns["pqn"][pid], RDF.type, owlType))
    rdf_item.add((ns["pqv"][pid], RDF.type, owlType))
    rdf_item.add((ns["pr"][pid], RDF.type, owlType))
    rdf_item.add((ns["prn"][pid], RDF.type, owlType))
    rdf_item.add((ns["prv"][pid], RDF.type,owlType))
    rdf_item.add((ns["ps"][pid], RDF.type, owlType))
    rdf_item.add((ns["psn"][pid], RDF.type, owlType))
    rdf_item.add((ns["psv"][pid], RDF.type, owlType))

def propdefs(pid, item, datatype):
    ## Properties and their derivatives
    object_properties = ["wikibase-item", 'external-id', 'string', 'commonsMedia',  'time', 'edtf', 'globe-coordinate', 'url', 'quantity', 'wikibase-property', 'monolingualtext', 'math', 'tabular-data', 'form', 'lexeme', 'geo-shape', 'musical-notation', 'sense']
    data_properties = ['external-id', 'string', 'time', 'edtf', 'globe-coordinate', 'quantity', 'monolingualtext', 'math', 'geo-shape', 'form', 'lexeme', 'musical-notation', 'sense' ]
    # ObjectProperty
    if datatype in object_properties:
        owlPropertyTypes(OWL.ObjectProperty)
    # Data Properties
    if datatype in data_properties:
        owlPropertyTypes(OWL.DatatypeProperty)  
    rdf_item.add((ns["wd"][pid], RDF.type, ns["wikibase"].Property))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].directClaim, ns["wdt"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].claim, ns["p"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].statementProperty, ns["ps"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].statementValue, ns["psv"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].qualifier, ns["pq"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].qualifierValue, ns["pqv"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].reference, ns["pr"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].referenceValue, ns["prv"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].novalue, ns["wdno"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].directClaimNormalized, ns["wdtn"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].qualifierValueNormalized, ns["pqn"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].referenceValueNormalized, ns["prn"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].statementValueNormalized, ns["psn"][pid]))
    rdf_item.add((ns["wd"][pid], ns["wikibase"].propertyType, URIRef(wdi_config.property_value_types[datatype])))
    rdf_item.add((ns["wdno"][pid], RDF.type, OWL.Class))
    owl_restriction = BNode()
    rdf_item.add((owl_restriction, RDF.type, OWL.Restriction))
    rdf_item.add((owl_restriction, OWL.onProperty, ns["wdt"][pid]))
    rdf_item.add((owl_restriction, OWL.someValuesFrom, OWL.Thing))    
    rdf_item.add((ns["wdno"][pid], OWL.complementOf, owl_restriction)) 
    
def parseSnak(statement):
    value = statement["datavalue"]["value"]
    if statement["datatype"] == "commonsMedia":
        return URIRef("http://commons.wikimedia.org/wiki/Special:FilePath/"+value.replace(" ", "_"))
    elif statement["datatype"] == "string": 
        return Literal(value)   
    elif statement["datatype"] == "external-id": 
        return Literal(value)
    elif statement["datatype"] == "wikibase-item":
        if value["id"] not in linked_items:
            linked_items.append(value["id"])
        return ns["wd"][value["id"]]
    elif statement["datatype"] == "monolingualtext": 
        return Literal(value["text"], value["language"])
    elif statement["datatype"] == "geo-shape": 
        return URIRef("http://commons.wikimedia.org/data/main/"+value)
    elif statement["datatype"] == "globe-coordinate":
        latitude = value["latitude"]
        longitude = value["longitude"]
        # altitude = claim["mainsnak"]["datavalue"]["value"]["altitude"] # not used
        precision = value["precision"] # not used
        globe = value["globe"]   # not used
        return Literal("Point("+str(longitude)+","+str(latitude)+")", datatype=ns["geo"].wktLiteral)
    elif statement["datatype"] == "quantity":
        amount = value["amount"]
        unit =  value["unit"] 
        return Literal(value["amount"], datatype=XSD.decimal)
    elif statement["datatype"] == "time": 
        return Literal(value["time"].replace("+", "").replace("Z", "+00:00"), datatype=XSD.dateTime)   
    elif statement["datatype"] == "url": 
        return URIRef(value) 
    else: raise ValueError('unknown snak datatype ' + statement["datatype"])

        
def normalized_values(snakuri, snak, prop, value, snaktype):
    if snaktype == "qualifier":
        normprop = ns["pqv"]
    elif snaktype == "reference":
        normprop = ns["prv"]
    elif snaktype == "statement":
        normprop = ns["pv"]
    else:
        raise ValueError('unknown snak type (statement, qualifier, reference).')

    if snak["datatype"] == "time":
        uri = ns["v"][uuid.uuid4()] # TODO: fix to wikidata hash once I figured out how the hashes are composed
        rdf_item.add((snakuri, normprop[prop], uri))
        rdf_item.add((uri, RDF.type, ns["wikibase"].TimeValue))
        rdf_item.add((uri, ns["wikibase"].timeValue, value))
        rdf_item.add((uri, ns["wikibase"].timePrecision, Literal(snak["datavalue"]["value"]["precision"])))
        rdf_item.add((uri, ns["wikibase"].timeTimezone, Literal(snak["datavalue"]["value"]["timezone"])))
        rdf_item.add((uri, ns["wikibase"].timeCalendarModel, URIRef(snak["datavalue"]["value"]["calendarmodel"])))
    if snak["datatype"] == "quantity":
        uri = ns["v"][uuid.uuid4()] # TODO: fix to wikidata hash once I figured out how the hashes are composed
        if prop in siconversion.keys():
            if siconversion[prop]["factor"] == "1":
                rdf_item.add((uri, ns["wikibase"].quantityNormalized, uri))
            else:
                normalized_uri = ns["v"][uuid.uuid4()] # TODO: fix to wikidata hash once I figured out how the hashes are composed
                rdf_item.add((normalized_uri, normprop[prop], normalized_uri)) 
                rdf_item.add((normalized_uri, RDF.type, ns["wikibase"].QuantityValue))
                rdf_item.add((normalized_uri, ns["wikibase"].quantityAmount, Literal(snak["datavalue"]["value"]["amount"]*siconversion[prop]["factor"])))       
                if snak["datavalue"]["value"]["unit"] == "1":
                    rdf_item.add((uri, ns["wikibase"].quantityUnit, ns["wd"].Q199))
                else:
                    rdf_item.add((uri, ns["wikibase"].quantityUnit, ns["wd"][siconversion[prop]["unit"]]))
            
        
        rdf_item.add((snakuri, normprop[prop], uri))
        rdf_item.add((uri, RDF.type, ns["wikibase"].QuantityValue))
        rdf_item.add((uri, ns["wikibase"].quantityAmount, Literal(snak["datavalue"]["value"]["amount"])))       
        if snak["datavalue"]["value"]["unit"] == "1":
            rdf_item.add((uri, ns["wikibase"].quantityUnit, ns["wd"].Q199))
        else:
            rdf_item.add((uri, ns["wikibase"].quantityUnit, URIRef(snak["datavalue"]["value"]["unit"])))
    if snak["datatype"] == "globe-coordinate":
        uri = ns["v"][uuid.uuid4()] # TODO: fix to wikidata hash once I figured out how the hashes are composed
        rdf_item.add((snakuri, normprop[prop], uri))
        rdf_item.add((uri, RDF.type, ns["wikibase"].GlobecoordinateValue))
        rdf_item.add((uri, ns["wikibase"].geoLatitude, Literal(snak["datavalue"]["value"]["latitude"]))) 
        rdf_item.add((uri, ns["wikibase"].geoLongitude, Literal(snak["datavalue"]["value"]["longitude"]))) 
        rdf_item.add((uri, ns["wikibase"].geoGlobe, ns["wd"][snak["datavalue"]["value"]["globe"]]))
        
        

for pid in json_item['claims'].keys():
    if pid not in properties.keys():
        properties[pid] = json_item['claims'][pid][0]["mainsnak"]["datatype"]
    ## Ststements 
    for claim in json_item['claims'][pid]: 
        statement_uri = ns["s"][claim["id"].replace("$","-")]
        #rank
        if claim["rank"] == "normal": 
            rdf_item.add((statement_uri, ns["wikibase"].rank, ns["wikibase"].NormalRank))
        if claim["rank"] == "preferred":
            rdf_item.add((statement_uri, ns["wikibase"].rank, ns["wikibase"].PreferredRank))
        if claim["rank"] == "deprecated":
            rdf_item.add((statement_uri, ns["wikibase"].rank, ns["wikibase"].DeprecatedRank))
        
        # values   
        for claim2 in json_item['claims'][pid]:
            if claim2["rank"] == "preferred":
                preferredSet = True
                break
        else:
            preferredSet = False
        
        ## first no value
        if claim["mainsnak"]["datatype"] == "external-id": 
            metadata["identifiers"] += 1
        else:
            metadata["statements"] += 1
        if claim["mainsnak"]["snaktype"] == "novalue":
            rdf_item.add((statement_uri, RDF.type, ns["wdno"][pid]))
        else: 
            objectValue = parseSnak(claim["mainsnak"])
            rdf_item.add((statement_uri, ns["ps"][pid], objectValue))
            if pid in uriformat.keys():
                for normProp in uriformat[pid]:
                    rdf_item.add((statement_uri, ns["psn"][pid], URIRef(normProp.replace("$1", objectValue))))
            if preferredSet:
                if claim["rank"] == "preferred":
                    rdf_item.add((ns["wd"][qid], ns["wdt"][pid], objectValue))
                    if pid in uriformat.keys():
                        for normProp in uriformat[pid]:
                            rdf_item.add((ns["wd"][qid], ns["wdtn"][pid], URIRef(normProp.replace("$1", objectValue))))
            else:
                if claim["rank"] == "normal":
                    rdf_item.add((ns["wd"][qid], ns["wdt"][pid], objectValue))
                    if pid in uriformat.keys():
                        for normProp in uriformat[pid]:
                            rdf_item.add((ns["wd"][qid], ns["wdtn"][pid], URIRef(normProp.replace("$1", objectValue))))
        
        rdf_item.add((ns["wd"][qid],ns["p"][pid], statement_uri))
        rdf_item.add((statement_uri,RDF.type, ns["wikibase"].Statement))
        if preferredSet:
            if claim["rank"] == "preferred":
                  rdf_item.add((statement_uri,RDF.type, ns["wikibase"].BestRank))
        else:
            if claim["rank"] == "normal":
                  rdf_item.add((statement_uri,RDF.type, ns["wikibase"].BestRank))
                    
        #qualifiers
        if "qualifiers" in claim.keys():
            for qualifier in claim["qualifiers"].keys():
                if qualifier not in properties.keys():
                    properties[qualifier] = claim["qualifiers"][qualifier][0]["datatype"]
                for qualifier_prop in claim["qualifiers"][qualifier]:
                    object = parseSnak(qualifier_prop)
                    
                    h = hashlib.sha1(dumps(qualifier_prop["property"]))
                    rdf_item.add((statement_uri, ns["pq"][qualifier], object))
                    normalized_values(statement_uri, qualifier_prop, qualifier, object, "qualifier")
                
        #references
        if "references" in claim.keys():
            for reference in claim["references"]:
                reference_uri = ns["ref"][reference["hash"]]
                rdf_item.add((reference_uri, RDF.type, ns["wikibase"].Reference))
                rdf_item.add((statement_uri, PROV.wasDerivedFrom, reference_uri))

                for ref_prop in reference["snaks"].keys():
                    if ref_prop not in properties.keys():
                        properties[ref_prop] = reference["snaks"][ref_prop][0]["datatype"]
                    for ref_prop_statement in reference["snaks"][ref_prop]:
                        object = parseSnak(ref_prop_statement)
                        rdf_item.add((reference_uri, ns["pr"][ref_prop], object))
                        normalized_values(reference_uri, ref_prop_statement, ref_prop, object, "reference")
                        

                            

In [7]:
merged_items = json.loads(requests.get("https://www.wikidata.org/w/api.php?action=query&prop=redirects&format=json&titles="+qid).text)
for page in merged_items["query"]["pages"].keys():
    for redirect in merged_items["query"]["pages"][page]["redirects"]:
        rdf_item.add((ns["wd"][redirect["title"]], OWL.sameAs, ns["wd"][qid]))



In [8]:
# sitelinks
for sitelink in json_item['sitelinks'].keys():
    metadata["sitelinks"] += 1
    wiki = URIRef(json_item['sitelinks'][sitelink]["url"])
    #print(json_item['sitelinks'][sitelink]["url"])
    partof = URIRef(json_item['sitelinks'][sitelink]["url"].split("wiki/")[0])
    if "commons" in str(partof):
        group = str(partof).split(".")[0].replace("https://", "")
        print(group)
    else:
        group = str(partof).split(".")[1]
    rdf_item.add((partof, ns["wikibase"].wikiGroup, Literal(group)))
    if "quote" in sitelink:
        language = sitelink.replace("wikiquote", "")
    elif sitelink == "simplewiki":
        language = "en-simple"
    elif sitelink == "commonswiki":
        language = "en"
    elif sitelink == "zh_yuewiki":
        language = "yue"
    elif sitelink == "zh_min_nanwiki":
        language = "nan"
    elif sitelink == "nowiki":
        language = "nb"
    else:
        language = sitelink.replace("wiki", "")
    rdf_item.add((wiki, RDF.type, ns["schema"].Article))
    rdf_item.add((wiki, ns['schema'].about, ns["wd"][qid]))
    rdf_item.add((wiki, ns['schema'].isPartOf, URIRef(partof)))
    for badge in json_item['sitelinks'][sitelink]["badges"]:
        ns["wikibase"].badge
        rdf_item.add((wiki, ns["wikibase"].badge, ns["wd"][badge]))
    try:
        rdf_item.add((wiki, ns['schema'].name, Literal(json_item['sitelinks'][sitelink]["title"], language)))
        rdf_item.add((wiki, ns['schema'].inLanguage, Literal(language)))
    except:
        print(language)
    

bat_smg
be_x_old
cbk_zam
commons
fiu_vro
map_bms
nds_nl
roa_rup
roa_tara
zh_classical


In [9]:
# Metadata
metadata["statements"] += metadata["identifiers"]
rdf_item.add((ns["data"][qid], RDF.type, ns["schema"].Dataset))
rdf_item.add((ns["data"][qid], ns["cc"].license, URIRef("http://creativecommons.org/publicdomain/zero/1.0/")))
rdf_item.add((ns["data"][qid], ns["schema"].about, ns["wd"][qid]))
rdf_item.add((ns["data"][qid], ns["schema"].softwareVersion, Literal("1.0.0")))
rdf_item.add((ns["data"][qid], ns["wikibase"].identifiers, Literal(metadata["identifiers"])))
rdf_item.add((ns["data"][qid], ns["wikibase"].sitelinks, Literal(metadata["sitelinks"])))
rdf_item.add((ns["data"][qid], ns["wikibase"].statements, Literal(metadata["statements"])))
rdf_item.add((ns["data"][qid], ns["schema"].version, Literal(json_item["lastrevid"]))) 
rdf_item.add((ns["data"][qid], ns["schema"].dateModified, Literal(json_item["modified"].replace("Z", "+00:00"), datatype=XSD.dateTime))) 


In [10]:
def transformLabels(qid, json_item):
    # Heading
    for language in json_item["labels"].keys():
        rdf_item.add((ns["wd"][qid], RDFS.label, Literal(json_item["labels"][language]["value"], language)))
        rdf_item.add((ns["wd"][qid], ns["schema"].name, Literal(json_item["labels"][language]["value"], language)))
        rdf_item.add((ns["wd"][qid], ns["skos"].prefLabel, Literal(json_item["labels"][language]["value"], language)))

    for language in json_item["descriptions"].keys():
        rdf_item.add((ns["wd"][qid], ns["schema"].description, Literal(json_item["descriptions"][language]["value"], language)))

    for language in json_item["aliases"].keys():
        for label in json_item["aliases"][language]:
            rdf_item.add((ns["wd"][qid], SKOS.altLabel, Literal(label["value"], language))) 
                          
transformLabels(qid, json_item)     
    
for pid in properties.keys():
    pid_item = wdi_core.WDItemEngine(wd_item_id=pid).get_wd_json_representation()
    transformLabels(pid, pid_item)
    propdefs(pid, pid_item, properties[pid])



In [11]:
for linked_qid in linked_items:
    rdf_item.add((ns["wd"][linked_qid], RDF.type, ns["wikibase"].Item))
    linked_qid_item = wdi_core.WDItemEngine(wd_item_id=linked_qid).get_wd_json_representation()
    transformLabels(linked_qid, linked_qid_item)

In [12]:
compareRDF = Graph()
compareRDF.parse("http://www.wikidata.org/entity/"+qid+".ttl", )

<Graph identifier=Nb6897a12bc424a64b3b75b51e34a3f0c (<class 'rdflib.graph.Graph'>)>

In [13]:
diffRdf = compareRDF-rdf_item
for prefix in wdi_config.prefix.keys():
    ns[prefix] = Namespace( wdi_config.prefix[prefix])
    diffRdf.namespace_manager.bind(prefix, ns[prefix])
diffRdf.serialize(format="turtle", destination="1.ttl")
print(len(diffRdf))
print(len(rdf_item))
rdf_item.serialize(format="turtle", destination="2.ttl")

3748
282744


In [14]:
hash(5)

5

In [15]:
wdi_config.property_value_types

{'commonsMedia': 'http://wikiba.se/ontology#CommonsMedia',
 'external-id': 'http://wikiba.se/ontology#ExternalId',
 'geo-shape': 'http://wikiba.se/ontology#GeoShape',
 'globe-coordinate': 'http://wikiba.se/ontology#GlobeCoordinate',
 'math': 'http://wikiba.se/ontology#Math',
 'monolingualtext': 'http://wikiba.se/ontology#Monolingualtext',
 'quantity': 'http://wikiba.se/ontology#Quantity',
 'string': 'http://wikiba.se/ontology#String',
 'tabular-data': 'http://wikiba.se/ontology#TabularData',
 'time': 'http://wikiba.se/ontology#Time',
 'edtf': '<http://wikiba.se/ontology#Edtf>',
 'url': 'http://wikiba.se/ontology#Url',
 'wikibase-item': 'http://wikiba.se/ontology#WikibaseItem',
 'wikibase-property': 'http://wikiba.se/ontology#WikibaseProperty',
 'lexeme': 'http://wikiba.se/ontology#WikibaseLexeme',
 'form': 'http://wikiba.se/ontology#WikibaseForm',
 'sense': 'http://wikiba.se/ontology#WikibaseSense',
 'musical-notation': 'http://wikiba.se/ontology#MusicalNotation'}

In [16]:
def serialize(v):
    ''' Serialize Python data into a PHP serialized string.
    
    Acccepted types: None, str, unicode, int, float, long, decimal (if available), list, typle, dict.
    '''
    if v is None:
        return 'N;'
    elif isinstance(v, str):
        # TODO: utf8 casting and encoding
        return 's:%d:"%s";' % (len(v), v)
    elif isinstance(v, bool):
        return 'b:%d;' % (int(v),)
    elif isinstance(v, int):
        return 'i:%s;' % (v,)
    elif isinstance(v, float):
        return 'd:%s;' % (v,)
    elif isinstance(v, (dict, list, tuple)):
        return 'a:%d:{%s};' % (len(v), ''.join(serialize(k) + serialize(v) for (k, v) in (isinstance(v, dict) and v.iteritems() or enumerate(iter(v)))))
    # TODO: Objects?
    else:
        raise TypeError('Cannot serialize type %r.' % (type(v),))