In [77]:
import re
import sys
import csv
import datetime
import requests
from tqdm.notebook import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef, Literal, BNode
from rdflib import Namespace
from rdflib.namespace import SDO, RDF, XSD
from random import randint
from bs4 import BeautifulSoup

TODO: add documentation

In [85]:
# Language codes of the article editions to keep; compared against the
# schema:inLanguage value returned for every article in create_graph.
searched_languages = [
    "en", "de", "ru",
    "fr", "hy", "be",
    "lt", "uk", "ba",
]

# Wikidata entity that find_articles substitutes as the object of its wdt:P17 triple.
# NOTE(review): the name says "type", but the query matches on P17 - confirm the intent.
searched_wd_type = "wd:Q183"

# Value used for the SPARQL LIMIT clause in find_articles.
limit = 4000

# SPARQL endpoint every query in this notebook is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

In [86]:
def get_results(endpoint_url, query):
    """Send a SPARQL query to an endpoint and return the converted JSON result.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper`` produces when converting the JSON response.
    """
    major, minor = sys.version_info[0], sys.version_info[1]
    # The user agent carries the running Python major.minor version.
    client = SPARQLWrapper(
        endpoint_url,
        agent="WDQS-example Python/%s.%s" % (major, minor),
    )
    client.setReturnFormat(JSON)
    client.setQuery(query)
    response = client.query()
    return response.convert()

In [87]:
def find_articles(searched_wd_type, limit, offset=1000):
    """Query the endpoint for entities that are the subject of at least one article.

    The query selects distinct ``?cid`` values that have a ``wdt:P17`` statement
    pointing at ``searched_wd_type`` and that some article is ``schema:about``.

    Args:
        searched_wd_type: Prefixed Wikidata entity (e.g. ``"wd:Q183"``) used as
            the object of the ``wdt:P17`` triple.
        limit: Value for the SPARQL ``LIMIT`` clause.
        offset: Value for the SPARQL ``OFFSET`` clause. Defaults to 1000, the
            value that used to be hard-coded, so existing callers see the same
            result window.

    Returns:
        The list found under ``results["results"]["bindings"]``; every entry
        holds the entity URI under ``entry["cid"]["value"]``.

    Note:
        The query has no ``ORDER BY``, so the window selected by
        ``OFFSET``/``LIMIT`` is not guaranteed to be stable between runs.
    """
    query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX schema: <http://schema.org/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT DISTINCT ?cid WHERE {{
        ?cid wdt:P17 {} . #/wdt:P279* entfenen falls die anfrage zu lange dauert
        ?article  schema:about ?cid .
    }}
    OFFSET {}
    LIMIT {}""".format(searched_wd_type, offset, limit)

    # endpoint_url is the notebook-level configuration value.
    results = get_results(endpoint_url, query)
    return results["results"]["bindings"]

In [88]:
def _fetch_current_revision(article_url):
    """Download an article page and return ``(revision_id, text)``, or ``None``.

    ``None`` is returned, after printing a notice, when the request fails or
    the page has no ``wgCurRevisionId`` entry or no element with id ``content``.
    """
    try:
        # A timeout keeps one stalled connection from blocking the whole crawl.
        page = requests.get(article_url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as error:
        print("Skipping {}: {}".format(article_url, error))
        return None

    # Search the whole page source instead of only the first <script> tag, and
    # require at least one digit so an empty id is never accepted.
    match = re.search(r"\"wgCurRevisionId\":([0-9]+)", page.text)
    content_element = BeautifulSoup(page.content, "html.parser").find(id='content')
    if match is None or content_element is None:
        print("Skipping {}: revision id or content element not found".format(article_url))
        return None

    return match.group(1), content_element.get_text()


def _query_entity_types(entity_uri):
    """Return the result bindings of the ``wdt:P31`` query for one entity URI."""
    query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX schema: <http://schema.org/>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>

            SELECT ?resource ?types WHERE {{
                ?resource wdt:P31 ?types .
                BIND(<{}> as ?resource)
            }}""".format(entity_uri)

    return get_results(endpoint_url, query)["results"]["bindings"]


def create_graph(articles, searched_languages):
    """Build an RDF graph with per-language article statistics for the given entities.

    For every entity in ``articles`` the endpoint is asked for its articles and
    their languages. For each article whose language is in
    ``searched_languages`` the page is downloaded and these triples are added:

    * ``<article> schema:about <entity>`` and ``<article> schema:inLanguage "<lang>"``
    * ``<article?oldid=N> prov:wasRevisionOf <article>``
    * a blank node typed ``urn:stats:WikipediaPageStatistics`` that is
      ``prov:wasDerivedFrom`` the revision URI and carries the character
      count, the character count without whitespace and the page text
    * ``<entity> wdt:P31 <type>`` for every type returned for the entity

    Pages that cannot be fetched or parsed are skipped with a printed notice
    instead of aborting the crawl.

    Args:
        articles: Bindings as returned by ``find_articles``; each entry must
            provide ``entry['cid']['value']``.
        searched_languages: Collection of language codes to keep.

    Returns:
        The populated ``rdflib.Graph``.
    """
    was_revision_of = URIRef("http://www.w3.org/ns/prov#wasRevisionOf")
    was_derived_from = URIRef("http://www.w3.org/ns/prov#wasDerivedFrom")
    instance_of = URIRef("http://www.wikidata.org/prop/direct/P31")

    g = Graph()

    for article in tqdm(articles):
        entity_uri = article['cid']['value']

        query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX schema: <http://schema.org/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?resource ?label ?language ?article WHERE {{
        ?resource rdfs:label ?label .
        ?article schema:about ?resource .
        ?article schema:inLanguage ?language .
        FILTER (lang(?label) = "en") .
        FILTER (SUBSTR(STR(?article ), 1, 30) != "https://commons.wikimedia.org/") # filtert übersichts seiten
        BIND(<{}> as ?resource)
    }}""".format(entity_uri)

        sitelinks = get_results(endpoint_url, query)["results"]["bindings"]
        wanted = [link for link in sitelinks if link["language"]["value"] in searched_languages]
        if not wanted:
            continue

        for link in wanted:
            article_url = link["article"]["value"]
            language_code = link["language"]["value"]
            article_ref = URIRef(article_url)

            # The article is about the Wikidata entity. The entity is stored as a
            # URI reference (not a string literal) so it joins with the P31 subject.
            g.add((article_ref, SDO.about, URIRef(link["resource"]["value"])))
            # Each article belongs to exactly one language edition.
            g.add((article_ref, SDO.inLanguage, Literal(language_code)))

            revision = _fetch_current_revision(article_url)
            if revision is None:
                continue
            old_id, content = revision

            # The revision URI (article URL plus oldid) is a revision of the article.
            revision_ref = URIRef("{}?oldid={}".format(article_url, old_id))
            g.add((revision_ref, was_revision_of, article_ref))

            # A real blank node holds the statistics derived from that revision.
            # A fresh node is used per page because revision ids are only unique
            # within one wiki and could otherwise merge unrelated pages.
            stats = BNode()
            g.add((stats, was_derived_from, revision_ref))
            g.add((stats, RDF.type, URIRef("urn:stats:WikipediaPageStatistics")))
            g.add((stats, URIRef("urn:stats:characters_count"), Literal(len(content), datatype=XSD.integer)))
            g.add((stats, URIRef("urn:stats:characters_count_without_whitspaces"), Literal(len("".join(content.split())), datatype=XSD.integer)))
            g.add((stats, URIRef("urn:stats:content_text"), Literal(content, lang=language_code)))

        # The types depend only on the entity, so they are queried once per
        # entity instead of once per matching language.
        for wd_type in _query_entity_types(entity_uri):
            # Link the entity to each of its types (the ?types binding).
            g.add((URIRef(wd_type["resource"]["value"]), instance_of, URIRef(wd_type["types"]["value"])))

    return g
    
def save_graph_to_file(graph, path, rdf_format="turtle"):
    """Serialize ``graph`` to ``path`` in an explicitly chosen RDF format.

    The format is passed explicitly instead of relying on the library default,
    so the written syntax does not depend on the installed rdflib version.

    Args:
        graph: Object exposing ``serialize(destination=..., format=...)``,
            e.g. an ``rdflib.Graph``.
        path: Destination file path.
        rdf_format: Serialization format name; defaults to ``"turtle"``, which
            matches the ``.ttl`` files this notebook writes.
    """
    graph.serialize(destination=path, format=rdf_format)
    


In [89]:
def result_to_csv(results, path):
    """Write query result rows to a CSV file, preceded by a fixed header row.

    Args:
        results: Iterable of rows; each row is an iterable of cell values.
        path: Destination file path (overwritten if it exists).
    """
    header = [
        "wikidata",
        "wikipedia",
        "language",
        "characters_count",
        "characters_count_without_whitspaces",
    ]

    with open(path, 'w', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(results)

In [90]:
# Fetch the entity bindings for the configured entity and result limit.
articles = find_articles(searched_wd_type=searched_wd_type, limit=limit)

# Show how many bindings came back.
print(len(articles))

4000


In [91]:
# Crawl the selected language editions of every article and build the RDF graph.
graph = create_graph(articles, searched_languages)
# Timestamp-based base name; the CSV export cell below reuses it.
filename = "./{}".format(datetime.datetime.now().timestamp())

# The prefixed entity id contains ":" (e.g. "wd:Q183"), which is not a portable
# file-name character, so it is replaced before building the output path.
save_graph_to_file(graph, "{}.ttl".format(searched_wd_type.replace(":", "_")))

HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Flatten the graph into one row per article: follow article -> revision URI ->
# statistics node and project the two character counts. The selected columns are
# in the same order as the header row written by result_to_csv.
# Note: the `org:` prefix below is bound to the PROV namespace
# (http://www.w3.org/ns/prov#).
query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>
PREFIX org: <http://www.w3.org/ns/prov#>
SELECT DISTINCT ?wikidata ?wikipedia ?language ?characters_count ?characters_count_without_whitspaces WHERE {
    ?wikipedia schema:about ?wikidata .
    ?wikipedia schema:inLanguage ?language .
    ?wikpediaWithOldId org:wasRevisionOf ?wikipedia .
    ?oldId org:wasDerivedFrom ?wikpediaWithOldId .
    ?oldId <urn:stats:characters_count> ?characters_count .
    ?oldId <urn:stats:characters_count_without_whitspaces> ?characters_count_without_whitspaces .
}"""

# Run the query against the in-memory graph built by an earlier cell and write
# the rows to "<filename>.csv" (filename is the timestamp-based name set earlier).
results = graph.query(query)
result_to_csv(results, "{}.csv".format(filename))

In [None]:
# human (Q5), city (Q515),  public company (Q891723), historical event (Q13418847), body of water (Q15324)