In [1]:

import json
import re
import time
import concurrent.futures
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm


In [None]:

# URL of the SPARQL endpoint that query_sparql sends every query to.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# Module-level memo mapping an entity ID to the label get_label_vi resolved
# for it; get_label_vi consults it before issuing any query.
label_cache = {}

def query_sparql(query, max_attempts=3, backoff_seconds=1.0):
    """Run a SPARQL query against the Wikidata endpoint and return its bindings.

    Failed attempts (network errors, throttling, malformed responses, ...) are
    retried with exponential backoff instead of giving up on the first error,
    because a single transient failure would otherwise make the caller fall
    back to a raw entity ID.

    Args:
        query: Complete SPARQL query text.
        max_attempts: Total number of attempts before giving up. Values below
            1 are treated as 1.
        backoff_seconds: Delay before the second attempt; the delay doubles
            after every further failure. Pass 0 to retry immediately.

    Returns:
        The list found under ``results -> bindings`` in the JSON response, or
        ``None`` when every attempt raised an exception.
    """
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            sparql = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT)
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            return results["results"]["bindings"]
        except Exception as e:
            # Best-effort by design: callers treat None as "lookup failed",
            # so the error is reported rather than propagated.
            print(f"Error querying {WIKIDATA_SPARQL_ENDPOINT}: {e}")
            if attempt < attempts:
                # Exponential backoff: backoff, 2*backoff, 4*backoff, ...
                time.sleep(backoff_seconds * 2 ** (attempt - 1))
    return None

def get_label_vi(entity):
    """Return a human-readable label for a Wikidata entity or property ID.

    The Vietnamese (``vi``) label is preferred. When that lookup yields
    nothing the English (``en``) label is tried, and when neither produces a
    label the raw ``entity`` ID is returned unchanged.

    Results are memoised in the module-level ``label_cache``, but only when no
    query failed: ``query_sparql`` returns ``None`` on error, and caching the
    fallback in that case would pin a wrong label for the rest of the session
    even after the endpoint recovers.

    Args:
        entity: Wikidata ID such as ``"Q42"`` or ``"P31"``.

    Returns:
        The resolved label string, or ``entity`` itself when no label was
        found.
    """
    if entity in label_cache:
        return label_cache[entity]

    any_query_failed = False
    label = None
    # Languages in order of preference; stop at the first one with a label.
    for lang in ("vi", "en"):
        query = f"""
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?label WHERE {{
          wd:{entity} rdfs:label ?label .
          FILTER (lang(?label) = "{lang}")
        }} LIMIT 1
        """
        results = query_sparql(query)
        if results is None:
            # None means the query errored, which is different from an empty
            # result list ("this entity has no label in that language").
            any_query_failed = True
            continue
        if results:
            label = results[0]["label"]["value"]
            break

    if label is None:
        label = entity
    if not any_query_failed:
        label_cache[entity] = label
    return label

def process_dataset(file_path, output_path):
    """Add a label-normalised s-expression to every entry of a dataset.

    Reads a JSON array from ``file_path``. For each entry, every Wikidata ID
    (``Q<digits>`` / ``P<digits>``) found in its ``"s_expr"`` string is
    replaced by ``"[ <label> ]"``, where the label comes from
    ``get_label_vi``. The result is stored under ``"nor_s_expr_vi"`` and the
    entry is appended to ``output_path``.

    The output is written as JSON Lines (one JSON object per line), regardless
    of the extension of ``output_path``, and flushed after every entry so that
    partial progress is kept if the run is interrupted.

    Args:
        file_path: Path of the input JSON file (a list of dict entries).
        output_path: Path of the JSON Lines file to create or overwrite.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    entity_pattern = re.compile(r"\b(Q\d+|P\d+)\b")

    # One thread pool for the whole run instead of a new pool per entry.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        with open(output_path, "w", encoding="utf-8") as f_out:
            for entry in tqdm(data, desc=f"Processing {file_path}"):
                # A missing or null s_expr is treated as an empty expression.
                s_expr = entry.get("s_expr") or ""
                entities = sorted(set(entity_pattern.findall(s_expr)))

                # Resolve all labels of this entry concurrently.
                entity_labels = dict(
                    zip(entities, executor.map(get_label_vi, entities))
                )

                # Substitute whole tokens in a single pass. Chained
                # str.replace calls would let "Q1" rewrite the prefix of
                # "Q12" and could re-substitute IDs inside inserted labels.
                nor_s_expr = entity_pattern.sub(
                    lambda match: f"[ {entity_labels[match.group(1)]} ]",
                    s_expr,
                )

                entry["nor_s_expr_vi"] = nor_s_expr

                # Write each entry as one line of the JSONL output.
                f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")
                f_out.flush()  # Make sure the line reaches the file right away.



In [4]:
# Run the normalisation on train.json. process_dataset writes one JSON object
# per line, so the output is JSON Lines even though the path ends in ".json".
process_dataset("LC-QuAD2.0/nor_sexpr/train.json", "LC-QuAD2.0/nor_sexpr/train_nor_vi.json")