In [1]:
import json
import base64
import requests
import pandas as pd
from time import sleep
from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLWrapper
from tqdm.notebook import tqdm

# Public SPARQL endpoints that can be passed as ``endpoint`` / ``endpoint_url``
# to the helpers defined in this notebook.
dbpedia_endpoint = "https://dbpedia.org/sparql"
wdt_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

In [20]:
# Wikidata type-lookup template. Fill it with ``sparql_wdt.format(uri=...)``:
# ``{uri}`` is the only placeholder, the doubled braces ``{{ }}`` become the
# literal braces of the SPARQL group pattern after formatting.
# The query returns ``?types``: the wdt:P279 (subclass of) values of every class
# the entity is linked to via wdt:P31 (instance of).
# NOTE(review): this template is not referenced by any cell visible here.
sparql_wdt = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
    
    SELECT ?types 
    WHERE {{
        <{uri}> wdt:P31 ?instanceOf .
        ?instanceOf wdt:P279 ?types .
    }}
"""

# DBpedia type-lookup template. Fill it with ``sparql_dbpedia.format(uri=...)``:
# ``{uri}`` is the only placeholder, the doubled braces ``{{ }}`` become literal
# SPARQL braces after formatting.
#
# How the query works:
#   * Inner sub-select: among the rdf:type values of the entity whose IRI
#     contains "dbpedia.org/ontology", count the rdfs:subClassOf* ancestors of
#     each and keep only the one with the highest count (ORDER BY DESC + LIMIT 1),
#     i.e. the class sitting deepest in the hierarchy.
#   * Outer select: return every rdfs:subClassOf* ancestor of that class as
#     ``?aType`` (the ``*`` path is zero-or-more, so the class itself is
#     included), again restricted to IRIs containing "dbpedia.org/ontology".
#
# Beware of the naming: inside the query ``?uri`` is the *class* variable, while
# ``{uri}`` is the Python placeholder for the entity IRI.
sparql_dbpedia = """
    PREFIX dbr: <http://dbpedia.org/resource/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    SELECT ?aType
    WHERE {{
      ?uri rdfs:subClassOf* ?aType .
      {{
        SELECT DISTINCT ?uri (COUNT(?subClass) as ?level)
        WHERE {{
          <{uri}> rdf:type ?uri .
          ?uri rdfs:subClassOf* ?subClass .
          FILTER(CONTAINS(STR(?uri), "dbpedia.org/ontology"))
        }}
        GROUP BY ?uri
        ORDER BY DESC(?level)
        LIMIT 1
      }}
      FILTER(CONTAINS(STR(?aType), "dbpedia.org/ontology"))
    }}
"""

In [42]:
def read_json(filename):
    """
    Load and return the JSON document stored in ``filename``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding (which is not UTF-8 on every system).

    Raises whatever ``open`` / ``json.load`` raise for a missing file or
    malformed JSON.
    """
    with open(filename, encoding='utf-8') as f:
        return json.load(f)
    
def write_json(data, filename):
    """
    Serialise ``data`` as JSON into ``filename``, overwriting any existing file.

    The output is indented by 4 spaces and non-ASCII characters are written
    verbatim (``ensure_ascii=False``). Because of that the file is opened as
    UTF-8 explicitly: with the platform default encoding such characters could
    raise ``UnicodeEncodeError`` or produce a file that is not valid UTF-8 JSON.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def execute(query: str, endpoint_url: str):
    """
    Run ``query`` against the SPARQL endpoint at ``endpoint_url`` with retries.

    Known endpoints:
        https://dbpedia.org/sparql
        https://query.wikidata.org/bigdata/namespace/wdq/sparql

    Every attempt is preceded by a pause of ``timeout`` seconds (1 s before the
    first attempt), and the pause grows by one second after each failure. The
    function gives up once the pause would reach 15 seconds, i.e. after 14
    failed attempts.

    Returns the converted JSON response of the first successful attempt, or
    ``{'error': <message of the last exception>}`` when every attempt failed.
    No exception from the query itself is propagated to the caller.
    """
    timeout = 1
    # Kept in a separate name on purpose: the ``as`` target of an except
    # clause is unbound when the handler exits, so it cannot be read later.
    last_error = ''
    while timeout < 15:
        try:
            # Pause before every request (including the first one) to stay
            # polite towards the public endpoints.
            sleep(timeout)
            sparql = SPARQLWrapper(endpoint_url)
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            return sparql.query().convert()
        except Exception as exc:
            # Best effort: any failure (network, HTTP, parsing) triggers a
            # retry with a longer pause.
            last_error = str(exc)
            timeout += 1

    return {'error': last_error}

def get_values(sparql_result):
    """
    Collect the ``'value'`` of the first bound variable of every result row.

    ``sparql_result`` is a SPARQL JSON response; the rows are read from
    ``sparql_result['results']['bindings']``. Returns a list with one entry per
    row, in row order.
    """
    values = []
    for row in sparql_result['results']['bindings']:
        first_term = tuple(row.values())[0]
        values.append(first_term['value'])
    return values

def run(dataset, name, endpoint):
    """
    Derive the answer category and answer type(s) for every question in ``dataset``.

    Each item of ``dataset`` must provide ``'_id'`` and ``'sparql_query'``. The
    query is executed against ``endpoint`` and the first value of the first
    result row decides the category:

    * ``'resource'`` - the value is a URI; its types are looked up with the
      ``sparql_dbpedia`` template against the same ``endpoint``.
    * ``'literal'``  - the value is a (typed) literal; the type is its
      ``'datatype'``.
    * ``'boolean'``  - the response carries a ``'boolean'`` key.

    Items for which no type could be determined (query error, empty result,
    literal without a datatype, failed type lookup) are skipped. The collected
    records ``{'uid', 'category', 'type'}`` are written to
    ``../../data/at/{name}.json`` as a checkpoint every 50 items and once more
    after the last item, so the file always ends up with the complete result.

    Returns None; the JSON file is the output.
    """
    # NOTE(review): the type lookup always uses the DBpedia template, even if
    # ``endpoint`` is a different service - confirm before using another endpoint.
    output_path = f"../../data/at/{name}.json"
    data_new = []

    for cnt, q in enumerate(tqdm(dataset)):
        uid = q['_id']
        category, types = '', []
        try:
            q_result = execute(q['sparql_query'], endpoint)
            if 'error' in q_result:
                pass  # query failed after all retries: nothing to classify
            elif 'boolean' in q_result:
                category, types = 'boolean', ['boolean']
            elif len(q_result['results']['bindings']) > 0:
                # Only the first variable of the first row is inspected.
                first = list(q_result['results']['bindings'][0].values())[0]
                if first['type'] == 'uri':
                    category = 'resource'
                    # Keep the raw response in its own name so that a failed
                    # lookup can never end up stored as the record's types.
                    type_response = execute(sparql_dbpedia.format(uri=first['value']), endpoint)
                    types = get_values(type_response)
                elif first['type'] in ('typed-literal', 'literal'):
                    category = 'literal'
                    datatype = first.get('datatype')
                    # Literals without a datatype carry no type information
                    # and are skipped.
                    if datatype is not None:
                        types = [datatype]
        except Exception:
            # Best effort: an item that cannot be classified is skipped
            # instead of aborting the whole (long-running) job.
            types = []

        if len(types) > 0:
            data_new.append({'uid': uid, 'category': category, 'type': types})

        # Periodic checkpoint so that an interrupted run keeps partial results.
        if cnt % 50 == 0:
            write_json(data_new, output_path)

    # Final write: without it the records collected after the last checkpoint
    # would never reach the file.
    write_json(data_new, output_path)

In [43]:
# Load the LC-QuAD train and test splits from the local data directory.
# Judging by the sample record printed in the next cell, each split is a list
# of question records that include '_id' and 'sparql_query'.
lcquad_train_results = read_json("../../data/lcquad/train-data.json")
lcquad_test_results = read_json("../../data/lcquad/test-data.json")

In [44]:
# Peek at a single training record to see which fields are available.
lcquad_train_results[5]

{'_id': '106',
 'corrected_question': 'What is the incumbent of the Al Gore presidential campaign, 2000 and also the president of the Ann Lewis ?',
 'intermediary_question': 'What is the <incumbent> of the <Al Gore presidential campaign, 2000> and <president> of the <Ann Lewis>',
 'sparql_query': ' SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Al_Gore_presidential_campaign,_2000> <http://dbpedia.org/ontology/incumbent> ?uri. <http://dbpedia.org/resource/Ann_Lewis> <http://dbpedia.org/ontology/president> ?uri} ',
 'sparql_template_id': 16}

In [45]:
# Long-running: every question triggers at least one rate-limited SPARQL request.
# Output goes to ../../data/at/dbpedia/lcquad-train-types.json.
run(lcquad_train_results, 'dbpedia/lcquad-train-types', dbpedia_endpoint)

HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))




In [46]:
# Same processing for the test split.
# Output goes to ../../data/at/dbpedia/lcquad-test-types.json.
run(lcquad_test_results, 'dbpedia/lcquad-test-types', dbpedia_endpoint)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


