In [1]:
import json
import requests
import time
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


In [2]:
def create_session(user_agent=None):
    """Create a ``requests`` session that retries transient HTTPS failures.

    Requests to ``https://`` URLs are retried up to 5 times with a backoff
    factor of 0.5 when the server answers with 429, 500, 502, 503 or 504.

    Args:
        user_agent: Optional value for the ``User-Agent`` header. When omitted
            the ``requests`` default is kept. Wikimedia asks API clients to
            send a descriptive User-Agent, so passing one is recommended for
            bulk lookups.

    Returns:
        The configured ``requests.Session``.
    """
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=0.5,
        # 429 (Too Many Requests) is included so that rate-limit responses
        # are retried with backoff instead of failing immediately.
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session.mount("https://", HTTPAdapter(max_retries=retries))
    if user_agent:
        session.headers["User-Agent"] = user_agent
    return session

# Shared module-level session: get_label_from_wikidata sends every request
# through it, so all lookups use the retry policy set up in create_session.
session = create_session()

# (entity_id, lang) -> label found in a successful API response.
# Fallback results (the raw ID) are never stored, so they are retried later.
_LABEL_CACHE = {}


def get_label_from_wikidata(entity_id, lang="en"):
    """Query the Wikidata API for the label (name) of an entity or property ID.

    The label is requested in ``lang``; when ``lang`` is not English, the
    English label is requested in the same call and used as a fallback.
    Labels that were found are cached per ``(entity_id, lang)`` so repeated
    IDs do not trigger further HTTP requests.

    Args:
        entity_id: Wikidata ID without prefix, e.g. ``"Q42"`` or ``"P31"``.
        lang: Language code of the preferred label (default ``"en"``).

    Returns:
        The label in ``lang``, else the English label, else ``entity_id``
        itself when no label exists or every attempt failed.
    """
    cache_key = (entity_id, lang)
    if cache_key in _LABEL_CACHE:
        return _LABEL_CACHE[cache_key]

    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "format": "json",
        # Only labels are read below; without this the full entity
        # (claims, sitelinks, ...) is transferred for every lookup.
        "props": "labels",
        # Fetch the English fallback in the same request.
        "languages": lang if lang == "en" else lang + "|en",
    }

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            response = session.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            # ValueError covers invalid JSON bodies on requests versions where
            # the decode error is not a RequestException.
            if attempt < max_attempts - 1:
                time.sleep(1)  # no point in waiting after the last attempt
            continue

        labels = data.get("entities", {}).get(entity_id, {}).get("labels", {})
        if not isinstance(labels, dict):
            # Defensive: treat a non-mapping "labels" value as "no labels".
            labels = {}

        label = labels.get(lang, {}).get("value") or labels.get("en", {}).get("value")
        if label:
            _LABEL_CACHE[cache_key] = label
            return label
        return entity_id

    return entity_id

In [3]:
def process_triples(triples):
    """Build English and Vietnamese label maps for the IDs used in ``triples``.

    Items starting with ``wd:`` are treated as entities and items starting
    with ``wdt:`` as relations; everything else (including non-string items)
    is ignored. Each distinct ID is looked up once per language, unless an
    earlier lookup only returned the raw ID, in which case it is retried.

    Args:
        triples: Iterable of triples, each an iterable of items such as
            ``"wd:Q42"`` or ``"wdt:P31"``.

    Returns:
        A tuple ``(gold_entity_map, gold_relation_map, gold_entity_vi_map,
        gold_relation_vi_map)`` of dicts mapping the bare ID to its label.
    """
    gold_entity_map = {}
    gold_relation_map = {}
    gold_entity_vi_map = {}
    gold_relation_vi_map = {}

    def resolve(label_map, wikidata_id, lang):
        # get_label_from_wikidata returns the raw ID when it cannot resolve a
        # label, so only IDs that are missing or still unresolved are queried.
        if label_map.get(wikidata_id, wikidata_id) == wikidata_id:
            label_map[wikidata_id] = get_label_from_wikidata(wikidata_id, lang)

    # leave=False removes the bar once it completes, so calling this function
    # inside another progress loop does not pile up finished bars.
    for triple in tqdm(triples, desc="Processing triples", leave=False):
        for item in triple:
            if not isinstance(item, str):
                continue
            if item.startswith("wd:"):
                # Slice off the prefix only; str.replace would also remove
                # any later occurrence of "wd:" inside the item.
                entity_id = item[len("wd:"):]
                resolve(gold_entity_map, entity_id, "en")
                resolve(gold_entity_vi_map, entity_id, "vi")
            elif item.startswith("wdt:"):
                relation_id = item[len("wdt:"):]
                resolve(gold_relation_map, relation_id, "en")
                resolve(gold_relation_vi_map, relation_id, "vi")

    return gold_entity_map, gold_relation_map, gold_entity_vi_map, gold_relation_vi_map


In [4]:
def process_lc_quad(input_path, output_path):
    """Read a JSON question file, add label maps to each item and save it.

    For every item the ``"triples"`` value (missing or null is treated as
    empty) is passed to ``process_triples`` and the four returned maps are
    stored under ``gold_entity_map``, ``gold_relation_map``,
    ``gold_entity_vi_map`` and ``gold_relation_vi_map``. The enriched data is
    written to ``output_path`` as indented UTF-8 JSON.

    Args:
        input_path: Path of the JSON file to read; the code iterates over it
            and calls ``.get`` on each item, so a list of objects is expected.
        output_path: Path of the JSON file to write. Missing parent
            directories are created.
    """
    import os

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Create the output directory before the slow lookup loop, so a missing
    # directory cannot discard all results at the final write.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for item in tqdm(data, desc="Processing questions"):
        # "or []" also covers an explicit null in the input file.
        triples = item.get("triples") or []
        gold_entity_map, gold_relation_map, gold_entity_vi_map, gold_relation_vi_map = process_triples(triples)

        item["gold_entity_map"] = gold_entity_map
        item["gold_relation_map"] = gold_relation_map
        item["gold_entity_vi_map"] = gold_entity_vi_map
        item["gold_relation_vi_map"] = gold_relation_vi_map

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print("Đã xử lý xong và lưu vào", output_path)

In [None]:

# Run the label-enrichment step: read the normalized test split (test_nor.json)
# and write the result with label maps to label_map/LC-QuAD2.0_test.json.
process_lc_quad("LC-QuAD2.0/nor_sexpr/test_nor.json","LC-QuAD2.0/label_map/LC-QuAD2.0_test.json")