In [1]:
import json
import re
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON


In [2]:
def query_wikidata(sparql_query):
    ENDPOINT_URL = "https://query.wikidata.org/sparql"
    
    sparql = SPARQLWrapper(ENDPOINT_URL)
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(JSON)
    
    try:
        response = sparql.query().convert()
        answers = [item["answer"]["value"] for item in response["results"]["bindings"] if "answer" in item]
        return answers
    except Exception as e:
        print(f"SPARQL query failed: {e}")
        return []

In [3]:
def is_invalid_sparql(query):
    # Kiểm tra nếu truy vấn chứa "ASK WHERE" hoặc "COUNT"
    if "ASK WHERE" in query or "COUNT" in query:
        return True
    
    # Kiểm tra nếu SELECT chứa từ hai biến "?" trở lên trước WHERE
    select_match = re.search(r"SELECT[^{}]*?WHERE", query, re.IGNORECASE)
    if select_match:
        variables = re.findall(r"\?\w+", select_match.group())
        if len(variables) >= 2:
            return True
    
    return False

In [4]:

def process_file(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    filtered_data = []
    for item in tqdm(data, desc=f"Processing {input_file}"):
        sparql_query = item.get("sparql_wikidata", "")
        
        # Bỏ qua nếu truy vấn không hợp lệ
        if is_invalid_sparql(sparql_query) or "FILTER" in sparql_query:
            continue
        
        # Gửi truy vấn để lấy kết quả
        answers = query_wikidata(sparql_query)
        
        # Bỏ qua nếu kết quả truy vấn rỗng
        if not answers:
            continue
        
        item["answer"] = answers
        filtered_data.append(item)
    
    # Lưu dữ liệu đã xử lý
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=4)
    
    print(f"Processed {len(filtered_data)} items. Saved to {output_file}")


In [5]:
process_file("LC-QuAD2.0/translate/test.json", "LC-QuAD2.0/filter/test.json")

Processing LC-QuAD2.0/translate/test.json:  35%|███▍      | 2107/6036 [28:28<1:08:19,  1.04s/it]

SPARQL query failed: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'SPARQL-QUERY: queryStr=SELECT ?obj WHERE { wd:Q25267 p:P144 ?s . ?s ps:P144 ?obj . ?s pq:P2534 ?x filter(contains(?x,\'^{\\circ}\\text{C} = \\text{K} - 273.15\')) }\njava.util.concurrent.ExecutionException: org.openrdf.query.MalformedQueryException: Lexical error at line 1, column 99.  Encountered: "c" (99), after : "\\\'^{\\\\"\n\tat java.util.concurrent.FutureTask.report(FutureTask.java:122)\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:206)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet

Processing LC-QuAD2.0/translate/test.json:  87%|████████▋ | 5233/6036 [1:10:35<2:40:04, 11.96s/it]

SPARQL query failed: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=select ?ent where { ?ent wdt:P31 wd:Q523 . ?ent wdt:P2214 ?obj } ORDER BY DESC(?obj)LIMIT 5 \njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:865)\n\tat org.eclipse.jetty.servlet.ServletHandler$Cached

Processing LC-QuAD2.0/translate/test.json: 100%|██████████| 6036/6036 [1:21:42<00:00,  1.23it/s]  


Processed 1681 items. Saved to LC-QuAD2.0/filter/test.json
