In [None]:
!pip install evaluate

import os
import json
import re
import random
import math
import unicodedata
from typing import List, Dict, Any

import numpy as np
import torch
import networkx as nx
from collections import Counter
from tqdm import tqdm

from datasets import Dataset, DatasetDict
import evaluate
import community.community_louvain as community_louvain
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

import gradio as gr



In [None]:
# Seed the three RNGs used in this notebook (Python `random`, NumPy, PyTorch)
# so that shuffling and training start from the same state on every run.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


<torch._C.Generator at 0x7cb8684957d0>

In [None]:
# Path to the article dump; the file is parsed as JSON and its "articles" entry is used.
duong_articles = "all_articles.json"
with open(duong_articles, "r", encoding="utf-8") as f:
    # A missing "articles" key falls back to an empty list.
    articles = json.load(f).get("articles", [])

print("Số bài báo:", len(articles))

Số bài báo: 256


In [None]:
def tach_cau(text: str) -> List[str]:
    """Split a text into sentences.

    Every whitespace run is first collapsed to a single space and the result is
    stripped. The text is then cut at whitespace that follows '.', '!' or '?'.

    Args:
        text: Raw text to split.

    Returns:
        The non-empty, stripped sentences in order; an empty list when the
        text is empty or whitespace-only.
    """
    collapsed = re.sub(r"\s+", " ", text).strip()
    if collapsed == "":
        return []
    sentences = []
    for piece in re.split(r"(?<=[\.!\?])\s+", collapsed):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences

In [None]:
# Collect every sentence of every article, shuffle them, and keep the first
# 1500 as the pool that will be auto-labelled for NER.
print("Đang trích câu cho NER...")
tat_ca_cau = []
for art in articles:
    # `or ""` also covers articles whose "content" is present but None/empty.
    c = art.get("content", "") or ""
    tat_ca_cau.extend(tach_cau(c))

# Uses the global `random` state, so the sample depends on the seed set earlier.
random.shuffle(tat_ca_cau)
cau_ner = tat_ca_cau[:1500]
print("Số câu dùng cho NER auto-label:", len(cau_ner))

Đang trích câu cho NER...
Số câu dùng cho NER auto-label: 1500


In [None]:
# Checkpoint used both as the automatic labeller and as the starting point for fine-tuning.
model_name_ner_pre = "dslim/bert-large-NER"
print("Đang tải mô hình NER gốc:", model_name_ner_pre)
tokenizer_ner_pre = AutoTokenizer.from_pretrained(model_name_ner_pre)
model_ner_pre = AutoModelForTokenClassification.from_pretrained(model_name_ner_pre)


Đang tải mô hình NER gốc: dslim/bert-large-NER


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# `hf_pipeline` is also used further down to wrap the fine-tuned models.
from transformers import pipeline as hf_pipeline

# Token-classification pipeline around the pretrained NER model.
# aggregation_strategy="simple" merges word pieces into entity spans that carry
# an "entity_group" plus character offsets ("start"/"end").
# (A dead placeholder assignment to `ner_pipe_pre` that was immediately
# overwritten by this call has been removed.)
ner_pipe_pre = hf_pipeline(
    "ner",
    model=model_ner_pre,
    tokenizer=tokenizer_ner_pre,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)

Device set to use cuda:0


In [None]:
def gan_nhan_cau_ner(sentence: str):
    """Run the pretrained NER pipeline (`ner_pipe_pre`) on one sentence and return its raw output."""
    entities = ner_pipe_pre(sentence)
    return entities

def tao_example_ner(sentence: str, entities=None):
    """Turn one sentence into a word-level BIO-tagged NER example.

    The sentence is split on whitespace and every entity span (character
    offsets) is projected onto the words it overlaps: the first overlapped
    word gets a "B-" tag, the following ones "I-".

    Word offsets are taken from the actual positions in `sentence`, so leading
    whitespace or repeated spaces no longer shift labels onto the wrong word.

    Args:
        sentence: Text to label.
        entities: Optional pre-computed entity dicts with "start", "end" and
            "entity_group" keys (e.g. from a batched pipeline call). When
            omitted, `gan_nhan_cau_ner(sentence)` is called.

    Returns:
        {"tokens": [...], "ner_tags_str": [...]} or None when the sentence
        contains no words.
    """
    # Real (start, end) character span of every whitespace-delimited word.
    word_spans = [(m.start(), m.end()) for m in re.finditer(r"\S+", sentence)]
    if not word_spans:
        return None
    words = [sentence[s:e] for s, e in word_spans]

    if entities is None:
        entities = gan_nhan_cau_ner(sentence)

    char_to_word = {}
    for i, (s, e) in enumerate(word_spans):
        for c in range(s, e):
            char_to_word[c] = i

    labels = ["O"] * len(words)
    for ent in entities:
        start = ent.get("start")
        end = ent.get("end")
        label = ent.get("entity_group", "O")
        # Entities without offsets cannot be projected onto words.
        if label == "O" or start is None or end is None:
            continue
        has_prefix = label.startswith(("B-", "I-"))
        base = label[2:] if has_prefix else label
        mapped_indices = sorted(
            {char_to_word[c] for c in range(start, end) if c in char_to_word}
        )
        for j, wi in enumerate(mapped_indices):
            if j == 0:
                # An already-prefixed label is kept as-is on the first word.
                labels[wi] = label if has_prefix else "B-" + label
            else:
                labels[wi] = "I-" + base
    return {"tokens": words, "ner_tags_str": labels}

# Auto-label every sampled sentence with the pretrained model. Sentences for
# which `tao_example_ner` returns None are dropped.
print("Đang gán nhãn NER tự động...")
examples_ner = []
# NOTE(review): the index `i` from enumerate is not used in the loop body.
for i, s in enumerate(cau_ner):
    ex = tao_example_ner(s)
    if ex is not None:
        examples_ner.append(ex)
print("Tổng số câu có nhãn NER:", len(examples_ner))

Đang gán nhãn NER tự động...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Tổng số câu có nhãn NER: 1500


In [None]:
# Collect the distinct tag strings that occur in the auto-labelled examples.
tat_ca_nhan = set()
for ex in examples_ner:
    for tag in ex["ner_tags_str"]:
        tat_ca_nhan.add(tag)
# Sort for a deterministic id assignment, then force "O" to id 0.
tat_ca_nhan = sorted(list(tat_ca_nhan))
if "O" in tat_ca_nhan:
    tat_ca_nhan.remove("O")
tat_ca_nhan = ["O"] + tat_ca_nhan

label2id_ner = {l: i for i, l in enumerate(tat_ca_nhan)}
id2label_ner = {i: l for l, i in label2id_ner.items()}

print("Danh sách nhãn NER:")
for i, l in enumerate(tat_ca_nhan):
    print(i, l)

# Adds an integer "ner_tags" list to each example dict in place.
for ex in examples_ner:
    ex["ner_tags"] = [label2id_ner[t] for t in ex["ner_tags_str"]]

dataset_ner = Dataset.from_list(
    [{"tokens": e["tokens"], "ner_tags": e["ner_tags"]} for e in examples_ner]
)
# 90/10 split with a fixed seed; the "test" part is used as the validation set.
dataset_ner = dataset_ner.train_test_split(test_size=0.1, seed=42)
ds_ner = DatasetDict(train=dataset_ner["train"], validation=dataset_ner["test"])
print("Tập train NER:", len(ds_ner["train"]))
print("Tập val NER:", len(ds_ner["validation"]))

Danh sách nhãn NER:
0 O
1 B-LOC
2 B-MISC
3 B-ORG
4 B-PER
5 I-LOC
6 I-MISC
7 I-ORG
8 I-PER
Tập train NER: 1350
Tập val NER: 150


In [None]:
# seqeval is installed here because the "seqeval" metric is loaded later in this cell.
# NOTE(review): the install is unpinned and sits mid-notebook; consider moving it
# next to the other install at the top.
!pip install seqeval
# Tokenizer for fine-tuning, loaded from the same checkpoint as the labelling model.
tokenizer_ner = AutoTokenizer.from_pretrained(model_name_ner_pre)

def tokenize_and_align_labels_ner(example):
    """Tokenize a pre-split example and align word-level tags to sub-tokens.

    Special tokens (word id None) are labelled -100. The first sub-token of a
    word gets the word's tag. Continuation sub-tokens also get the word's tag,
    except that a "B-X" tag is turned into "I-X" so a single word split into
    several pieces is not presented as several separate entities.

    Args:
        example: Mapping with "tokens" (list of words) and "ner_tags"
            (list of label ids, one per word).

    Returns:
        The tokenizer output with an added "labels" list.
    """
    tokenized = tokenizer_ner(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=256,
    )
    labels = []
    previous_word_idx = None
    for word_idx in tokenized.word_ids():
        if word_idx is None:
            labels.append(-100)
        else:
            tag_id = example["ner_tags"][word_idx]
            if word_idx == previous_word_idx:
                # Continuation piece of the same word: B-X -> I-X.
                tag = id2label_ner[tag_id]
                if tag.startswith("B-"):
                    # Keep the original id if the label set has no matching I- tag.
                    tag_id = label2id_ner.get("I-" + tag[2:], tag_id)
            labels.append(tag_id)
        previous_word_idx = word_idx
    tokenized["labels"] = labels
    return tokenized

print("Đang mã hóa dữ liệu NER...")
# batched=False: the alignment function receives one example at a time.
encoded_ner = ds_ner.map(tokenize_and_align_labels_ner, batched=False)
data_collator_ner = DataCollatorForTokenClassification(tokenizer_ner)
metric_ner = evaluate.load("seqeval")

def compute_metrics_ner(p):
    """Compute seqeval metrics for a (predictions, labels) pair.

    Predictions are arg-maxed over the last axis, positions whose gold label is
    -100 are dropped, ids are mapped to tag strings through `id2label_ner`, and
    the result is scored with `metric_ner`.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy" taken from the
        metric's "overall_*" entries.
    """
    logits, gold = p
    pred_ids = np.argmax(logits, axis=-1)

    ref_tags = []
    pred_tags = []
    for pred_row, gold_row in zip(pred_ids, gold):
        # Keep only the positions that carry a real label.
        kept = [(p_i, l_i) for p_i, l_i in zip(pred_row, gold_row) if l_i != -100]
        ref_tags.append([id2label_ner[l_i] for _, l_i in kept])
        pred_tags.append([id2label_ner[p_i] for p_i, _ in kept])

    results = metric_ner.compute(predictions=pred_tags, references=ref_tags)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=4b98fc3948a8d6d494b42619d1131890d7759935c65eeada8a620b76ff6bd9f9
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Đang mã hóa dữ liệu NER...


Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
# Directory that receives the checkpoints and the final fine-tuned NER model.
output_dir_ner = "models/bert-large-ner-epl"

# Evaluate and checkpoint once per epoch; the best checkpoint is restored at the end.
training_args_ner = TrainingArguments(
    output_dir=output_dir_ner,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    # Disable external loggers: the default W&B integration stops training
    # with an interactive prompt, which breaks "Restart & Run All".
    report_to="none",
)

print("Đang khởi tạo mô hình NER để fine-tune...")
# Start from the pretrained NER checkpoint, with the label maps derived from
# the auto-labelled data.
model_ner_ft = AutoModelForTokenClassification.from_pretrained(
    model_name_ner_pre,
    num_labels=len(tat_ca_nhan),
    id2label=id2label_ner,
    label2id=label2id_ner,
)

trainer_ner = Trainer(
    model=model_ner_ft,
    args=training_args_ner,
    train_dataset=encoded_ner["train"],
    eval_dataset=encoded_ner["validation"],
    tokenizer=tokenizer_ner,
    data_collator=data_collator_ner,
    compute_metrics=compute_metrics_ner,
)

print("Bắt đầu fine-tune NER...")
trainer_ner.train()
print("Hoàn thành fine-tune NER.")

# Persist model and tokenizer together so the directory can be reloaded later.
trainer_ner.save_model(output_dir_ner)
tokenizer_ner.save_pretrained(output_dir_ner)
print("Đã lưu mô hình NER tại:", output_dir_ner)

Đang khởi tạo mô hình NER để fine-tune...


Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer_ner = Trainer(


Bắt đầu fine-tune NER...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


[34m[1mwandb[0m: Detected [huggingface_hub.inference, mcp] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0589,0.045765,0.938053,0.905983,0.921739,0.988639
2,0.0279,0.032175,0.953744,0.925214,0.939262,0.990447
3,0.0117,0.028441,0.958425,0.935897,0.947027,0.991738


Hoàn thành fine-tune NER.
Đã lưu mô hình NER tại: models/bert-large-ner-epl


In [None]:
# Same sentence extraction as for NER, but a fresh shuffle and a 2500-sentence
# sample used for relation classification.
print("Đang chuẩn bị dữ liệu RE...")
tat_ca_cau_re = []
for art in articles:
    c = art.get("content", "") or ""
    tat_ca_cau_re.extend(tach_cau(c))

# Continues from the current global `random` state (not re-seeded here).
random.shuffle(tat_ca_cau_re)
cau_re = tat_ca_cau_re[:2500]
print("Số câu dùng cho RE:", len(cau_re))

Đang chuẩn bị dữ liệu RE...
Số câu dùng cho RE: 2500


In [None]:
# Relation label set; the list order defines the integer class ids.
cac_quan_he = [
    "player_club",
    "manager_club",
    "player_score",
    "transfer",
    "injury",
    "other",
]

label2id_re = {l: i for i, l in enumerate(cac_quan_he)}
id2label_re = {i: l for l, i in label2id_re.items()}

def gan_nhan_re_tu_dong(cau: str):
    """Weakly label a sentence with a relation using keyword rules.

    The sentence is lower-cased and tested against keyword groups in a fixed
    priority order; the first group with a keyword occurring anywhere in the
    text (plain substring match) decides the label.

    Returns:
        One of "transfer", "manager_club", "player_score", "injury",
        "player_club", or "other" when no keyword matches.
    """
    text_lower = cau.lower()
    # (relation, keywords) in priority order.
    rules = (
        ("transfer", ("joins", "signed", "transfer")),
        ("manager_club", ("manager", "appointed")),
        ("player_score", ("scored", "goal")),
        ("injury", ("injury", "injured")),
        ("player_club", ("club", "fc ")),
    )
    for relation, keywords in rules:
        if any(kw in text_lower for kw in keywords):
            return relation
    return "other"

# Build the RE dataset: one record per sampled sentence, labelled by the keyword rules.
du_lieu_re = []
for c in cau_re:
    qh = gan_nhan_re_tu_dong(c)
    du_lieu_re.append({"text": c, "label": label2id_re[qh]})

print("Tổng số mẫu RE:", len(du_lieu_re))

Tổng số mẫu RE: 2500


In [None]:
# This cell was a verbatim copy of the previous one (same relation list, label
# maps, labelling function and labelling loop). The duplicate definitions were
# removed; only the summary print is kept so the cell still reports the size of
# the dataset built by the previous cell.
print("Tổng số mẫu RE:", len(du_lieu_re))

Tổng số mẫu RE: 2500


In [None]:
ds_re = Dataset.from_list(du_lieu_re)
# 90/10 split with a fixed seed; the "test" part is used as the validation set.
ds_re = ds_re.train_test_split(test_size=0.1, seed=42)
dataset_re = DatasetDict(train=ds_re["train"], validation=ds_re["test"])
print("Tập train RE:", len(dataset_re["train"]))
print("Tập val RE:", len(dataset_re["validation"]))

Tập train RE: 2250
Tập val RE: 250


In [None]:
# Base checkpoint for the relation classifier and its tokenizer.
model_name_re = "bert-base-uncased"
tokenizer_re = AutoTokenizer.from_pretrained(model_name_re)

def tokenize_re(record):
    """Tokenize the "text" field of a record (or batch) with `tokenizer_re`.

    Inputs are truncated and padded to a fixed length of 128.
    """
    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer_re(record["text"], **tokenizer_kwargs)

print("Đang mã hóa dữ liệu RE...")
encoded_re = dataset_re.map(tokenize_re, batched=True)
# Drop the raw text and rename "label" -> "labels" before handing the data to the Trainer.
encoded_re = encoded_re.remove_columns(["text"])
encoded_re = encoded_re.rename_column("label", "labels")
encoded_re.set_format(type="torch")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Đang mã hóa dữ liệu RE...


Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# Sequence classifier on top of the base checkpoint, one output per relation label.
model_re_ft = AutoModelForSequenceClassification.from_pretrained(
    model_name_re,
    num_labels=len(cac_quan_he),
    id2label=id2label_re,
    label2id=label2id_re,
)

metric_re = evaluate.load("accuracy")

def compute_metrics_re(p):
    """Score an evaluation prediction object with `metric_re`.

    The predicted class is the arg-max of `p.predictions` along axis 1; it is
    compared against `p.label_ids`.
    """
    logits = p.predictions
    gold = p.label_ids
    preds = np.argmax(logits, axis=1)
    return metric_re.compute(predictions=preds, references=gold)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Directory that receives the checkpoints and the final fine-tuned RE model.
output_dir_re = "models/bert-base-re-epl"

# Evaluate and checkpoint once per epoch; the best checkpoint is restored at the end.
training_args_re = TrainingArguments(
    output_dir=output_dir_re,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    # Disable external loggers so training never waits on an interactive
    # W&B prompt when the notebook is run top to bottom.
    report_to="none",
)

trainer_re = Trainer(
    model=model_re_ft,
    args=training_args_re,
    train_dataset=encoded_re["train"],
    eval_dataset=encoded_re["validation"],
    tokenizer=tokenizer_re,
    compute_metrics=compute_metrics_re,
)

print("Bắt đầu fine-tune RE...")
trainer_re.train()
print("Hoàn thành fine-tune RE.")

# Persist model and tokenizer together so the directory can be reloaded later.
trainer_re.save_model(output_dir_re)
tokenizer_re.save_pretrained(output_dir_re)
print("Đã lưu mô hình RE tại:", output_dir_re)

  trainer_re = Trainer(


Bắt đầu fine-tune RE...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6371,0.341574,0.948
2,0.148,0.204071,0.968
3,0.1232,0.139745,0.968


Hoàn thành fine-tune RE.
Đã lưu mô hình RE tại: models/bert-base-re-epl


In [None]:
# Reload both fine-tuned models from disk and wrap them in inference pipelines.
ner_model_dir = output_dir_ner
re_model_dir = output_dir_re

print("Đang nạp lại mô hình NER đã fine-tune:", ner_model_dir)
# Note: this rebinds `tokenizer_ner` to the tokenizer saved with the fine-tuned model.
tokenizer_ner = AutoTokenizer.from_pretrained(ner_model_dir)
model_ner = AutoModelForTokenClassification.from_pretrained(ner_model_dir)

ner_pipe_ft = hf_pipeline(
    "ner",
    model=model_ner,
    tokenizer=tokenizer_ner,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)

print("Đang nạp lại mô hình RE đã fine-tune:", re_model_dir)
# Note: this rebinds `tokenizer_re` as well.
tokenizer_re = AutoTokenizer.from_pretrained(re_model_dir)
model_re = AutoModelForSequenceClassification.from_pretrained(re_model_dir)

re_pipe = hf_pipeline(
    "text-classification",
    model=model_re,
    tokenizer=tokenizer_re,
    device=0 if torch.cuda.is_available() else -1,
)

Device set to use cuda:0


Đang nạp lại mô hình NER đã fine-tune: models/bert-large-ner-epl


Device set to use cuda:0


Đang nạp lại mô hình RE đã fine-tune: models/bert-base-re-epl


In [None]:
def chay_ner(sentence: str):
    """Run the fine-tuned NER pipeline and return simplified entity dicts.

    Each returned dict has "text", "type", "start", "end" and a float "score",
    copied from the pipeline's "word", "entity_group", "start", "end" and
    "score" fields.
    """
    results = []
    for ent in ner_pipe_ft(sentence):
        results.append({
            "text": ent["word"],
            "type": ent["entity_group"],
            "start": ent["start"],
            "end": ent["end"],
            "score": float(ent["score"]),
        })
    return results

def chay_re(sentence: str):
    """Classify the relation expressed by a sentence with `re_pipe`.

    Returns:
        {"label": <predicted label>, "score": <float>} taken from the first
        element of the pipeline output.
    """
    top = re_pipe(sentence)[0]
    label = top["label"]
    score = float(top["score"])
    return {"label": label, "score": score}

def loc_thuc_the(ents):
    """Keep only entities whose "type" is PER, ORG, LOC or MISC (order preserved)."""
    allowed_types = {"PER", "ORG", "LOC", "MISC"}
    kept = []
    for ent in ents:
        if ent["type"] in allowed_types:
            kept.append(ent)
    return kept

def tao_cap_thuc_the(ents):
    """Return every unordered pair of entities as {"head": earlier, "tail": later}.

    Pairs are produced in list order: the head always precedes the tail in `ents`.
    """
    return [
        {"head": head, "tail": tail}
        for pos, head in enumerate(ents)
        for tail in ents[pos + 1:]
    ]

def trich_quan_he_tu_cau(sentence: str):
    """Extract relation triples from one sentence.

    Entities are detected and filtered first; with fewer than two entities, or
    when the sentence-level relation is "other", nothing is returned. Otherwise
    every entity pair receives the same sentence-level relation and score.

    Returns:
        List of triple dicts with "sentence", "relation", "relation_score",
        "head", "head_type", "tail" and "tail_type".
    """
    ents = loc_thuc_the(chay_ner(sentence))
    if len(ents) < 2:
        return []

    pred = chay_re(sentence)
    if pred["label"] == "other":
        return []

    return [
        {
            "sentence": sentence,
            "relation": pred["label"],
            "relation_score": pred["score"],
            "head": pair["head"]["text"],
            "head_type": pair["head"]["type"],
            "tail": pair["tail"]["text"],
            "tail_type": pair["tail"]["type"],
        }
        for pair in tao_cap_thuc_the(ents)
    ]

def trich_quan_he_tu_bai_bao(article):
    """Extract triples from every sentence of an article and attach its metadata.

    Args:
        article: Mapping with an optional "content" text plus "url", "title",
            "source" and "published_date" fields (missing ones become None).

    Returns:
        List of triples; each carries a "metadata" entry. All triples of one
        article share the same metadata dict object. Empty list when the
        article has no content.
    """
    content = article.get("content", "") or ""
    if content.strip() == "":
        return []

    triples = [
        triple
        for sentence in tach_cau(content)
        for triple in trich_quan_he_tu_cau(sentence)
    ]
    meta = {
        "url": article.get("url"),
        "title": article.get("title"),
        "source": article.get("source"),
        "published_date": article.get("published_date"),
    }
    for triple in triples:
        triple["metadata"] = meta
    return triples

def trich_quan_he_tu_ds_bai_bao(articles):
    """Extract triples from a list of articles, showing a progress bar.

    Returns:
        One flat list with the triples of all articles, in article order.
    """
    progress = tqdm(articles, desc="Đang trích NER+RE", ncols=100)
    return [triple for art in progress for triple in trich_quan_he_tu_bai_bao(art)]

In [None]:
# Output file for the unlinked (raw) triples.
duong_triples_raw = "triples_raw.json"

# Runs NER + RE over every article.
triples_raw = trich_quan_he_tu_ds_bai_bao(articles)

# Create the parent directory only when the path actually has one.
thu_muc = os.path.dirname(duong_triples_raw)
if thu_muc not in ["", None]:
    os.makedirs(thu_muc, exist_ok=True)

with open(duong_triples_raw, "w", encoding="utf-8") as f:
    json.dump(triples_raw, f, ensure_ascii=False, indent=2)

print("Tổng triple thô:", len(triples_raw))
print("Đã lưu triple thô tại:", duong_triples_raw)


Đang trích NER+RE: 100%|██████████████████████████████████████████| 256/256 [01:27<00:00,  2.92it/s]

Tổng triple thô: 5454
Đã lưu triple thô tại: triples_raw.json





In [None]:
# Reference entities used for entity linking.
duong_wikidata = "wikidata_epl_entities_clean.json"
with open(duong_wikidata, "r", encoding="utf-8") as f:
    wikidata = json.load(f)

# Each category defaults to an empty list when absent from the file.
clubs = wikidata.get("clubs", [])
players = wikidata.get("players", [])
managers = wikidata.get("managers", [])
stadiums = wikidata.get("stadiums", [])

print("Số CLB:", len(clubs))
print("Số cầu thủ:", len(players))
print("Số huấn luyện viên:", len(managers))
print("Số sân vận động:", len(stadiums))

Số CLB: 20
Số cầu thủ: 4013


In [None]:
def bo_dau(s):
    """Remove combining diacritical marks (Unicode category Mn) after NFD decomposition."""
    s = unicodedata.normalize("NFD", s)
    return "".join(c for c in s if unicodedata.category(c) != "Mn")

def chuan_ten(s):
    """Normalize a name for matching.

    Lower-cases and strips the text, removes diacritics, unifies the
    apostrophe variants "’" and "`" to "'", and collapses whitespace runs.
    """
    s = s.lower().strip()
    s = bo_dau(s)
    s = s.replace("’", "'").replace("`", "'")
    return " ".join(s.split())

def chuan_ten_club(s):
    """Normalize a club name and strip generic club affixes.

    On top of `chuan_ten`, trailing " football club", " fc", " afc", " f.c",
    " f.c.", " a.f.c" and " a.f.c." are removed, as well as leading
    "a.f.c. ", "afc ", "f.c. " and "fc ". The dotted "A.F.C." forms were
    previously left in place, so e.g. "Sunderland A.F.C." did not normalize
    to the same key as "Sunderland".
    """
    s = chuan_ten(s)
    # Checked one after another, so stacked suffixes are removed in list order.
    for suf in [" football club", " fc", " afc", " f.c", " f.c.", " a.f.c", " a.f.c."]:
        if s.endswith(suf):
            s = s[: -len(suf)]
    for pre in ["a.f.c. ", "afc ", "f.c. ", "fc "]:
        if s.startswith(pre):
            s = s[len(pre):]
    return s

In [None]:
# Normalized player name -> list of player records sharing that name.
player_idx = {}
for p in players:
    key = chuan_ten(p["label"])
    player_idx.setdefault(key, []).append(p)

# Two club indexes: one keyed by the normalized full label, one by the
# normalized "short_label" (falling back to the full label when absent).
club_idx = {}
club_idx_short = {}
for c in clubs:
    key = chuan_ten_club(c["label"])
    club_idx.setdefault(key, []).append(c)
    key2 = chuan_ten_club(c.get("short_label", c["label"]))
    club_idx_short.setdefault(key2, []).append(c)

def link_player(name):
    """Link a mention to a player record.

    An exact match on the normalized name is tried first. The fallback accepts
    a mention that appears as whole word(s) inside an indexed name (e.g. a
    surname). It previously matched raw character substrings, so "son" linked
    to "Anderson" and an empty mention matched the first indexed player.

    Returns:
        The first matching record, or {} when nothing matches. When several
        players match, the first one in index order wins.
    """
    key = chuan_ten(name)
    if not key:
        return {}
    if key in player_idx:
        return player_idx[key][0]
    # Pad with spaces so the match must start and end on word boundaries.
    padded = f" {key} "
    for k, v in player_idx.items():
        if padded in f" {k} ":
            return v[0]
    return {}

def link_club(name):
    """Link a mention to a club record.

    Lookup order: exact match on the normalized full-label index, exact match
    on the short-label index, then a whole-word containment match against the
    full-label index (e.g. "Villa" -> "aston villa"). The fallback previously
    matched raw character substrings, so fragments such as "a" or an empty
    mention linked to the first indexed club.

    Returns:
        The first matching record, or {} when nothing matches.
    """
    key = chuan_ten_club(name)
    if not key:
        return {}
    if key in club_idx:
        return club_idx[key][0]
    if key in club_idx_short:
        return club_idx_short[key][0]
    # Pad with spaces so the match must start and end on word boundaries.
    padded = f" {key} "
    for k, v in club_idx.items():
        if padded in f" {k} ":
            return v[0]
    return {}

In [None]:
def link_triple(t):
    """Attach "head_link" and "tail_link" to a triple (in place) and return it.

    PER mentions are linked against players, ORG mentions against clubs; any
    other type tries players first and falls back to clubs.
    """
    head_name, head_type = t["head"], t["head_type"]
    tail_name, tail_type = t["tail"], t["tail_type"]

    def _resolve(mention, ent_type):
        # Choose the linker from the entity type.
        if ent_type == "PER":
            return link_player(mention)
        if ent_type == "ORG":
            return link_club(mention)
        return link_player(mention) or link_club(mention)

    t["head_link"] = _resolve(head_name, head_type)
    t["tail_link"] = _resolve(tail_name, tail_type)
    return t

duong_triples_linked = "triples_linked.json"
# link_triple mutates each triple in place, so `triples_linked` holds the same
# dict objects as `triples_raw`, now with link fields added.
triples_linked = [link_triple(t) for t in tqdm(triples_raw, desc="Entity Linking", ncols=100)]

with open(duong_triples_linked, "w", encoding="utf-8") as f:
    json.dump(triples_linked, f, ensure_ascii=False, indent=2)

print("Đã lưu triple sau EL tại:", duong_triples_linked)

Entity Linking: 100%|█████████████████████████████████████████| 5454/5454 [00:01<00:00, 5406.60it/s]


Đã lưu triple sau EL tại: triples_linked.json


In [None]:
# Build the knowledge graph: one node per linked entity id, one edge per triple
# whose head and tail were both linked.
nodes_dict = {}
edges_kg = []

for t in triples_linked:
    head_link = t.get("head_link") or {}
    tail_link = t.get("tail_link") or {}
    h = head_link.get("id")
    ta = tail_link.get("id")

    # Skip triples with an unlinked end, and triples whose two mentions
    # resolved to the same entity: a self-loop carries no relation and
    # distorts the graph statistics computed later.
    if not h or not ta or h == ta:
        continue

    nodes_dict[h] = {"id": h, "label": head_link.get("label"), "type": head_link.get("type")}
    nodes_dict[ta] = {"id": ta, "label": tail_link.get("label"), "type": tail_link.get("type")}

    # Tolerate triples without metadata instead of raising KeyError.
    meta = t.get("metadata") or {}
    edges_kg.append({
        "head": h,
        "tail": ta,
        "relation": t["relation"],
        "sentence": t["sentence"],
        "source": meta.get("source"),
        "date": meta.get("published_date"),
    })

duong_nodes = "kg_nodes.json"
duong_edges = "kg_edges.json"

with open(duong_nodes, "w", encoding="utf-8") as f:
    json.dump(list(nodes_dict.values()), f, ensure_ascii=False, indent=2)

with open(duong_edges, "w", encoding="utf-8") as f:
    json.dump(edges_kg, f, ensure_ascii=False, indent=2)

print("Số node KG:", len(nodes_dict))
print("Số edge KG:", len(edges_kg))
print("Đã lưu KG tại:", duong_nodes, "và", duong_edges)

Số node KG: 134
Số edge KG: 972
Đã lưu KG tại: kg_nodes.json và kg_edges.json


In [None]:
# Reload the KG from disk so this part can run from the saved files.
with open(duong_nodes, "r", encoding="utf-8") as f:
    nodes = json.load(f)

with open(duong_edges, "r", encoding="utf-8") as f:
    edges = json.load(f)

# G: undirected view (components, clustering, communities).
# DG: directed view (head -> tail), used for PageRank.
G = nx.Graph()
DG = nx.DiGraph()
nodes_by_id = {}

# Node attributes are only attached to G; DG nodes are created implicitly by add_edge.
for n in nodes:
    nid = n["id"]
    nodes_by_id[nid] = n
    G.add_node(nid, label=n.get("label"), type=n.get("type"))

# NOTE(review): neither graph is a multigraph, so several KG edges between the same
# pair presumably collapse into one edge holding the attributes of the last one — confirm.
for e in edges:
    h = e["head"]
    t = e["tail"]
    rel = e.get("relation")
    G.add_edge(h, t, relation=rel, sentence=e.get("sentence"), source=e.get("source"), date=e.get("date"))
    DG.add_edge(h, t, relation=rel, sentence=e.get("sentence"), source=e.get("source"), date=e.get("date"))

# Connected components sorted by size; the largest one is copied into G_gc.
cac_tp = sorted(nx.connected_components(G), key=len, reverse=True)
tp_lon_nhat = cac_tp[0]
G_gc = G.subgraph(tp_lon_nhat).copy()

print("Số thành phần liên thông:", len(cac_tp))
print("Thành phần lớn nhất:", G_gc.number_of_nodes(), "node,", G_gc.number_of_edges(), "edge")

Số thành phần liên thông: 6
Thành phần lớn nhất: 123 node, 455 edge


In [None]:
# Small-world coefficient sigma = (C / C_random) / (L / L_random), comparing the
# giant component with a random graph of the same node and edge count.
print("Đang tính small-world...")
avg_spl = nx.average_shortest_path_length(G_gc)
clust_gc = nx.average_clustering(G_gc)

# Seeded so the reference graph (and therefore sigma) is reproducible.
G_rand = nx.gnm_random_graph(G_gc.number_of_nodes(), G_gc.number_of_edges(), seed=42)
# A G(n, m) graph can be disconnected, in which case the average shortest path
# length is undefined and networkx raises; fall back to its largest component.
if not nx.is_connected(G_rand):
    G_rand = G_rand.subgraph(max(nx.connected_components(G_rand), key=len)).copy()
avg_spl_rand = nx.average_shortest_path_length(G_rand)
clust_rand = nx.average_clustering(G_rand)

# Guard the ratios: a random graph without triangles has zero clustering.
if clust_rand > 0 and avg_spl > 0 and avg_spl_rand > 0:
    sigma = (clust_gc / clust_rand) / (avg_spl / avg_spl_rand)
else:
    sigma = float("nan")

print("L:", avg_spl)
print("L_random:", avg_spl_rand)
print("C:", clust_gc)
print("C_random:", clust_rand)
print("Small-world sigma:", sigma)

Đang tính small-world...
L: 2.472477675596428
L_random: 2.602558976409436
C: 0.6478798022798674
C_random: 0.05146553988017402
Small-world sigma: 13.250922345321122


In [None]:
print("Đang tính PageRank...")
# PageRank is computed on the directed graph (edges point head -> tail).
pr = nx.pagerank(DG, alpha=0.85)
top_pr = sorted(pr.items(), key=lambda x: x[1], reverse=True)[:20]
print("Top 20 node PageRank cao nhất:")
for nid, score in top_pr:
    # Labels/types are read from G because only G carries node attributes.
    data = G.nodes[nid]
    print(data.get("label"), "| loại=", data.get("type"), "| PageRank=", score)

Đang tính PageRank...
Top 20 node PageRank cao nhất:
Nahim Khadi, Jr. | loại= player | PageRank= 0.09513224569751529
Manchester City F.C. | loại= club | PageRank= 0.04644726077103638
Jackie McGugan | loại= player | PageRank= 0.045363335638140094
Archie Needham | loại= player | PageRank= 0.0416592458815942
Aston Villa F.C. | loại= club | PageRank= 0.040847114559303024
Duncan Hutchison | loại= player | PageRank= 0.03839040526237938
Nico O'Reilly | loại= player | PageRank= 0.03592304862831878
Arsenal F.C. | loại= club | PageRank= 0.031277230617997004
Liverpool F.C. | loại= club | PageRank= 0.02999464461311581
Wally Halsall | loại= player | PageRank= 0.018862707015453898
Leeds United F.C. | loại= club | PageRank= 0.018094901117283124
Sunderland A.F.C. | loại= club | PageRank= 0.016912144198993638
Harry Kinghorn | loại= player | PageRank= 0.014426626350775023
West Ham United F.C. | loại= club | PageRank= 0.01387926544804716
Alex Mowatt | loại= player | PageRank= 0.01363290092157284
Hakeeb A

In [None]:
print("Đang chạy Louvain community detection...")
# Louvain is randomized; a fixed random_state keeps the partition (and every
# community id used below) stable across runs.
partition = community_louvain.best_partition(G_gc, random_state=42)
# Invert node -> community id into community id -> member list.
com_dict = {}
for node, cid in partition.items():
    com_dict.setdefault(cid, []).append(node)
# Largest communities first.
com_list = sorted(com_dict.items(), key=lambda x: len(x[1]), reverse=True)
print("Số cộng đồng:", len(com_list))

Đang chạy Louvain community detection...
Số cộng đồng: 8


In [None]:
# Short names of the six clubs used to flag the "Big Six" community.
# NOTE(review): club node labels printed by the PageRank cell look like
# "Arsenal F.C.", so an exact comparison against these short names would not
# match — confirm how this set is compared with node labels.
big_six_labels = {
    "Arsenal",
    "Chelsea",
    "Liverpool",
    "Manchester City",
    "Manchester United",
    "Tottenham Hotspur",
}

def thong_tin_cong_dong(cid, members):
    """Summarize one community of the giant component.

    Args:
        cid: Community id.
        members: Node ids belonging to the community.

    Returns:
        Dict with "cid", "size" (nodes), "edges", "type_count" (node type ->
        count), "big6_count" (club nodes that are Big Six clubs),
        "transfer_edges" (edges whose relation is "transfer") and
        "avg_pagerank" (mean PageRank of the members, 0.0 when empty).
    """
    sub = G_gc.subgraph(members)
    so_node = sub.number_of_nodes()
    so_edge = sub.number_of_edges()
    dem_type = Counter(sub.nodes[n].get("type") for n in sub.nodes())

    # Compare normalized club names: node labels carry suffixes such as "F.C.",
    # so an exact comparison with the short names in `big_six_labels` never
    # matched and the count was always 0.
    big6_norm = {chuan_ten_club(name) for name in big_six_labels}
    big6_count = 0
    for n in sub.nodes():
        if sub.nodes[n].get("type") != "club":
            continue
        label = sub.nodes[n].get("label")
        if label and chuan_ten_club(label) in big6_norm:
            big6_count += 1

    transfer_edges = sum(
        1 for _, _, d in sub.edges(data=True) if d.get("relation") == "transfer"
    )
    avg_pr = float(np.mean([pr.get(n, 0.0) for n in sub.nodes()])) if so_node > 0 else 0.0
    return {
        "cid": cid,
        "size": so_node,
        "edges": so_edge,
        "type_count": dict(dem_type),
        "big6_count": big6_count,
        "transfer_edges": transfer_edges,
        "avg_pagerank": avg_pr,
    }

com_stats = []
for cid, members in com_list:
    com_stats.append(thong_tin_cong_dong(cid, members))

com_by_cid = {c["cid"]: c for c in com_stats}

# "Big Six": the community containing the most Big Six clubs.
cid_big_six = max(com_stats, key=lambda x: x["big6_count"])["cid"]

# "Transfer cluster": chosen among the remaining communities so it can never be
# the Big Six community again (with tied counts both used to resolve to the
# same community and the transfer label was never assigned).
ung_vien_transfer = [c for c in com_stats if c["cid"] != cid_big_six]
cid_transfer = (
    max(ung_vien_transfer, key=lambda x: x["transfer_edges"])["cid"]
    if ung_vien_transfer else None
)

# The other communities are ranked by mean PageRank: highest -> "Mid-table",
# lowest -> "Relegation/low-influence". Guarded for graphs with few communities.
cong_dong_khac = [c for c in com_stats if c["cid"] not in [cid_big_six, cid_transfer]]
cong_dong_khac_sorted = sorted(cong_dong_khac, key=lambda x: x["avg_pagerank"], reverse=True)
cid_mid_table = cong_dong_khac_sorted[0]["cid"] if cong_dong_khac_sorted else None
cid_relegation = cong_dong_khac_sorted[-1]["cid"] if len(cong_dong_khac_sorted) > 1 else None

community_labels = {}
for c in com_stats:
    if c["cid"] == cid_big_six:
        community_labels[c["cid"]] = "Big Six"
    elif c["cid"] == cid_mid_table:
        community_labels[c["cid"]] = "Mid-table"
    elif c["cid"] == cid_relegation:
        community_labels[c["cid"]] = "Relegation/low-influence"
    elif c["cid"] == cid_transfer:
        community_labels[c["cid"]] = "Transfer cluster"
    else:
        community_labels[c["cid"]] = "Other"

print("Thống kê cộng đồng chính:")
for cid in [cid_big_six, cid_mid_table, cid_relegation, cid_transfer]:
    # A role stays None when there are too few communities to fill it.
    if cid is None:
        continue
    c = com_by_cid[cid]
    print("CID:", cid, "| Nhãn:", community_labels[cid], "| Kích thước:", c["size"], "| BigSix:", c["big6_count"], "| Transfer edges:", c["transfer_edges"])

Thống kê cộng đồng chính:
CID: 2 | Nhãn: Big Six | Kích thước: 23 | BigSix: 0 | Transfer edges: 0
CID: 7 | Nhãn: Mid-table | Kích thước: 11 | BigSix: 0 | Transfer edges: 0
CID: 6 | Nhãn: Relegation/low-influence | Kích thước: 17 | BigSix: 0 | Transfer edges: 0
CID: 2 | Nhãn: Big Six | Kích thước: 23 | BigSix: 0 | Transfer edges: 0


In [None]:
def thong_ke_cong_dong_theo_nhan():
    """Print how many communities carry each label in `community_labels`."""
    label_counts = Counter(community_labels.values())
    for nhan in label_counts:
        print(nhan, ":", label_counts[nhan])

# Report the number of communities per assigned label.
print("Thống kê số cộng đồng theo nhãn:")
thong_ke_cong_dong_theo_nhan()

Thống kê số cộng đồng theo nhãn:
Big Six : 1
Other : 5
Relegation/low-influence : 1
Mid-table : 1


In [None]:
def tao_text_fact(head_id, tail_id, rel, sentence, source, date):
    """Render one knowledge-graph edge as a single line of Vietnamese text.

    Labels and types are looked up in ``nodes_by_id``; a missing node falls back
    to ``str(id)`` for the label and an empty string for the type. ``date``,
    ``source`` and ``sentence`` are appended only when truthy.

    Returns:
        The parts joined with ". ".
    """
    head_node = nodes_by_id.get(head_id, {})
    tail_node = nodes_by_id.get(tail_id, {})
    head_label = head_node.get("label", str(head_id))
    tail_label = tail_node.get("label", str(tail_id))
    head_type = head_node.get("type", "")
    tail_type = tail_node.get("type", "")

    segments = [f"{head_label} ({head_type}) có quan hệ {rel} với {tail_label} ({tail_type})"]
    # (value, rendered text) pairs; only pairs with a truthy value are kept.
    optional_segments = (
        (date, f"vào thời điểm {date}"),
        (source, f"nguồn: {source}"),
        (sentence, f"câu gốc: {sentence}"),
    )
    segments.extend(rendered for value, rendered in optional_segments if value)
    return ". ".join(segments)

In [None]:
print("Đang xây dựng fact từ KG...")
# One fact record per edge; "id" is the edge's position in `edges`.
graph_facts = [
    {
        "id": fact_id,
        "head": edge["head"],
        "tail": edge["tail"],
        "relation": edge["relation"],
        "text": tao_text_fact(
            edge["head"],
            edge["tail"],
            edge["relation"],
            edge.get("sentence"),
            edge.get("source"),
            edge.get("date"),
        ),
        "source": edge.get("source"),
        "date": edge.get("date"),
        "sentence": edge.get("sentence"),
    }
    for fact_id, edge in enumerate(edges)
]

print("Tổng số fact:", len(graph_facts))

Đang xây dựng fact từ KG...
Tổng số fact: 972


In [None]:
print("Đang tải mô hình embedding GraphRAG...")
embed_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embed_model = SentenceTransformer(embed_model_name)

texts = [fact["text"] for fact in graph_facts]
print("Đang sinh embedding cho fact...")
# normalize_embeddings=True is requested so that the dot products used later
# for ranking operate on normalised vectors.
embeddings = embed_model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

emb_path_np, facts_path_json = "graph_facts_emb.npy", "graph_facts.json"

# Persist both artefacts so they can be reloaded without re-encoding.
np.save(emb_path_np, embeddings)
with open(facts_path_json, "w", encoding="utf-8") as facts_file:
    json.dump(graph_facts, facts_file, ensure_ascii=False, indent=2)

print("Đã lưu embedding và fact.")

Đang tải mô hình embedding GraphRAG...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Đang sinh embedding cho fact...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Đã lưu embedding và fact.


In [None]:
# Reload the fact records and their embedding matrix from the files written above.
with open(facts_path_json, "r", encoding="utf-8") as facts_file:
    graph_facts = json.load(facts_file)
embeddings = np.load(emb_path_np)

print("Đã nạp lại fact và embedding.")

Đã nạp lại fact và embedding.


In [None]:
def lay_anchors_tu_cau(question: str):
    """Run ``ner_pipe_ft`` on the question and keep only PER / ORG entities.

    Returns:
        A list of ``{"text": <entity word>, "type": <entity group>}`` dicts.
    """
    return [
        {"text": ent["word"], "type": ent["entity_group"]}
        for ent in ner_pipe_ft(question)
        if ent["entity_group"] in ("PER", "ORG")
    ]

def link_anchor(ent):
    """Resolve a recognised entity to a node id.

    The linker matching the entity type is tried first (``link_club`` for ORG,
    ``link_player`` otherwise), then the other one as a fallback. Each linker is
    called at most once per entity; the previous version repeated the same
    lookup on the fallback path.

    Args:
        ent: dict with "text" (surface form) and "type" (e.g. "PER" / "ORG").

    Returns:
        The ``"id"`` of the first truthy linker result, or None if neither
        linker returns a match.
    """
    ten = ent["text"]
    if ent["type"] == "ORG":
        linkers = (link_club, link_player)
    else:
        linkers = (link_player, link_club)
    for linker in linkers:
        hit = linker(ten)
        if hit:
            return hit.get("id")
    return None

def lay_node_anchor(question: str):
    """Return the unique node ids linked from the entities found in the question.

    Ids are de-duplicated in first-seen order. The previous ``list(set(...))``
    produced an arbitrary order, which made logs and any order-sensitive
    consumer (e.g. path search with a path cap) non-reproducible.
    """
    ids = []
    for ent in lay_anchors_tu_cau(question):
        nid = link_anchor(ent)
        if nid is not None:
            ids.append(nid)
    # dict.fromkeys keeps insertion order while dropping duplicates.
    return list(dict.fromkeys(ids))

def lay_lan_can_k_hop(anchors, k_hop=2):
    """Collect every node within ``k_hop`` hops of any anchor in graph ``G``.

    Each anchor is expanded with its own visited set. Previously the visited
    set was shared across anchors, so a neighbour already reached from an
    earlier anchor was never put on a later anchor's frontier and that
    anchor's neighbourhood was silently truncated.

    Args:
        anchors: iterable of node ids; ids not present in ``G`` are ignored.
        k_hop: maximum number of hops to expand from each anchor.

    Returns:
        A set containing the valid anchors and all nodes reached.
    """
    tap = set()
    for a in anchors:
        if a not in G:
            continue
        da_tham = {a}
        frontier = {a}
        for _ in range(k_hop):
            ke_tiep = {v for u in frontier for v in G.neighbors(u)} - da_tham
            if not ke_tiep:
                break  # nothing new to expand from this anchor
            da_tham |= ke_tiep
            frontier = ke_tiep
        tap |= da_tham
    return tap

def chon_fact_ket_hop_node(nodes_tap: set):
    """Return the indices of ``graph_facts`` whose head or tail is in ``nodes_tap``."""
    return [
        idx
        for idx, fact in enumerate(graph_facts)
        if fact["head"] in nodes_tap or fact["tail"] in nodes_tap
    ]

In [None]:
def _xep_hang_fact_theo_cau_hoi(question, top_k, idx_ung_vien=None):
    """Rank facts by dot product with the question embedding and build the context.

    Args:
        question: the user question to embed.
        top_k: maximum number of facts to keep.
        idx_ung_vien: optional list of indices into ``graph_facts`` restricting
            the ranking; None ranks every fact.

    Returns:
        ``(context, facts)`` where ``context`` is the chosen facts' "text"
        fields joined by newlines.
    """
    q_emb = embed_model.encode([question], convert_to_numpy=True, normalize_embeddings=True)[0]
    if idx_ung_vien is None:
        scores = embeddings @ q_emb
        chon_idx = np.argsort(-scores)[:top_k]
    else:
        scores = embeddings[idx_ung_vien] @ q_emb
        # argsort positions refer to the candidate subset; map back to global indices.
        chon_idx = [idx_ung_vien[i] for i in np.argsort(-scores)[:top_k]]
    facts_chon = [graph_facts[i] for i in chon_idx]
    context = "\n".join([f["text"] for f in facts_chon])
    return context, facts_chon


def lay_context_tu_graph(question: str, top_k: int = 30, k_hop: int = 2):
    """Retrieve the facts most relevant to a question.

    Anchor entities are linked from the question, expanded to their k-hop
    neighbourhood, and the facts touching that neighbourhood are ranked by
    embedding similarity. When there are no anchors, or no candidate facts,
    the ranking falls back to all facts.

    Returns:
        ``(context, facts)``: the newline-joined fact texts and the fact dicts.
    """
    anchors = lay_node_anchor(question)
    print("Anchor entities (node id):", anchors)
    if not anchors:
        return _xep_hang_fact_theo_cau_hoi(question, top_k)

    nodes_k = lay_lan_can_k_hop(anchors, k_hop=k_hop)
    print("Số node trong k-hop:", len(nodes_k))
    idx_ung_vien = chon_fact_ket_hop_node(nodes_k)
    print("Số fact ứng viên liên quan anchor:", len(idx_ung_vien))

    # An empty candidate list means "rank everything" (same fallback as no anchors).
    return _xep_hang_fact_theo_cau_hoi(question, top_k, idx_ung_vien or None)

In [None]:
def tao_prompt(question: str, context: str) -> str:
    """Assemble the GraphRAG prompt: role line, retrieved context, rules, question.

    Args:
        question: the user's question.
        context: retrieved knowledge text placed between the intro and the rules.

    Returns:
        The full prompt string ending with "Câu trả lời:".
    """
    sections = [
        "Bạn là trợ lý bóng đá Ngoại hạng Anh (EPL).\n",
        "Dưới đây là các thông tin tri thức được trích xuất từ đồ thị tri thức (Knowledge Graph) và bài báo:\n\n",
        context,
        "\n\nYêu cầu:\n",
        "- Chỉ dựa trên thông tin ở trên để trả lời.\n",
        "- Nếu không đủ thông tin, hãy nói rõ là không chắc chắn hoặc thiếu dữ liệu.\n",
        "- Trả lời bằng tiếng Việt, mạch lạc, dễ hiểu.\n",
        "- Có thể tóm tắt, tổng hợp, nhưng không được bịa ra chi tiết không có trong context.\n",
        f"\nCâu hỏi của người dùng: {question}\n\nCâu trả lời:",
    ]
    return "".join(sections)

In [None]:
llm_model_name = "Qwen/Qwen2-0.5B-Instruct"
print("Đang tải mô hình LLM:", llm_model_name)
tokenizer_llm = AutoTokenizer.from_pretrained(llm_model_name)
# Half-precision weights are requested; device placement is delegated to `device_map="auto"`.
llm_load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
model_llm = AutoModelForCausalLM.from_pretrained(llm_model_name, **llm_load_kwargs)
print("Đã tải xong LLM.")

Đang tải mô hình LLM: Qwen/Qwen2-0.5B-Instruct


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Đã tải xong LLM.


In [None]:
def answer_question_graph(question: str, top_k: int = 20, k_hop: int = 2, max_new_tokens: int = 256, top_p: float = 0.9, temperature: float = 0.7):
    """Answer a question with the LLM, grounded on facts retrieved from the graph.

    Args:
        question: the user question.
        top_k: number of facts to put in the prompt.
        k_hop: neighbourhood radius used during retrieval.
        max_new_tokens, top_p, temperature: sampling settings passed to ``generate``.

    Returns:
        dict with keys "question", "answer", "context", "facts", "anchors".
    """
    context, facts = lay_context_tu_graph(question, top_k=top_k, k_hop=k_hop)
    prompt = tao_prompt(question, context)
    inputs = tokenizer_llm(prompt, return_tensors="pt").to(model_llm.device)
    with torch.no_grad():
        output_ids = model_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            pad_token_id=tokenizer_llm.eos_token_id,
        )
    # The causal LM returns prompt tokens followed by the continuation. Slice off
    # the prompt by token count instead of comparing decoded text: the decoded
    # prompt is not guaranteed to round-trip byte-for-byte, in which case the old
    # `startswith` check returned the whole prompt as the "answer".
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer_llm.decode(output_ids[0][prompt_len:], skip_special_tokens=True).strip()
    anchors = lay_node_anchor(question)
    return {
        "question": question,
        "answer": answer,
        "context": context,
        "facts": facts,
        "anchors": anchors,
    }

In [None]:
def tim_paths_multi_hop(start_nodes, max_hops=3, max_paths=20):
    """Enumerate simple paths of 2..max_hops edges between pairs of start nodes in ``G_gc``.

    Args:
        start_nodes: candidate endpoints; nodes that are not in ``G_gc`` are
            skipped, because ``nx.all_simple_paths`` raises ``NodeNotFound`` for
            them (the previous ``except nx.NetworkXNoPath`` never caught that,
            so one unknown anchor aborted the whole search).
        max_hops: maximum path length in edges (also used as the search cutoff).
        max_paths: stop as soon as this many paths have been collected.

    Returns:
        A list of paths, each a list of node ids. Direct (1-edge) connections
        are excluded on purpose.
    """
    paths = []
    seen_pairs = set()
    nodes_hop_le = [n for n in start_nodes if n in G_gc]
    for s in nodes_hop_le:
        for t in nodes_hop_le:
            if s == t:
                continue
            # Each unordered pair is searched once.
            key = tuple(sorted([s, t]))
            if key in seen_pairs:
                continue
            try:
                for p in nx.all_simple_paths(G_gc, source=s, target=t, cutoff=max_hops):
                    if 2 <= len(p) - 1 <= max_hops:
                        paths.append(p)
                        if len(paths) >= max_paths:
                            return paths
                seen_pairs.add(key)
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                continue
    return paths

def text_cho_path(path):
    """Render a node path as "A (type) -[rel]-> B (type)" segments joined by " ; ".

    Node labels/types come from ``G_gc.nodes``; the relation comes from the edge
    data, defaulting to "related_to".
    """
    def mo_ta_node(node):
        # Label falls back to the node id, type to an empty string.
        data = G_gc.nodes[node]
        label = data.get("label", str(node))
        loai = data.get("type", "")
        return f"{label} ({loai})"

    segments = []
    for u, v in zip(path, path[1:]):
        rel = G_gc[u][v].get("relation", "related_to")
        segments.append(f"{mo_ta_node(u)} -[{rel}]-> {mo_ta_node(v)}")
    return " ; ".join(segments)

def build_multi_hop_context(question, max_hops=3, max_paths=10):
    """Build a text block listing multi-hop paths between the question's anchor nodes.

    Returns:
        A header line followed by one rendered path per line, or "" when the
        question has no anchors or no path is found.
    """
    anchors = lay_node_anchor(question)
    paths = tim_paths_multi_hop(anchors, max_hops=max_hops, max_paths=max_paths) if anchors else []
    if not paths:
        return ""
    header = "Các chuỗi quan hệ multi-hop liên quan đến câu hỏi:"
    return "\n".join([header] + [text_cho_path(p) for p in paths])

def tao_prompt_multi_hop(question: str, context: str, context_multi: str) -> str:
    """Assemble the multi-hop prompt: intro, fact context (+ path context), rules, question.

    Args:
        question: the user's question.
        context: fact context.
        context_multi: multi-hop path context; appended after a blank line only
            when non-empty.

    Returns:
        The full prompt string ending with "Câu trả lời:".
    """
    if context_multi:
        phan_context = context + "\n\n" + context_multi
    else:
        phan_context = context
    sections = [
        "Bạn là trợ lý bóng đá Ngoại hạng Anh (EPL).\n",
        "Dưới đây là các thông tin tri thức được trích xuất từ đồ thị tri thức (Knowledge Graph), bao gồm cả ngữ cảnh đơn-hop và multi-hop:\n\n",
        phan_context,
        "\n\nYêu cầu:\n",
        "- Dựa trên các thông tin ở trên để suy luận nhiều bước nếu cần.\n",
        "- Nếu không đủ thông tin, hãy nói rõ là không chắc chắn hoặc thiếu dữ liệu.\n",
        "- Trả lời bằng tiếng Việt, mạch lạc, theo kiểu giải thích từng bước suy luận nếu câu hỏi phức tạp.\n",
        f"\nCâu hỏi của người dùng: {question}\n\nCâu trả lời:",
    ]
    return "".join(sections)

def answer_question_multi_hop(question: str, top_k: int = 20, k_hop: int = 2, max_new_tokens: int = 256, top_p: float = 0.9, temperature: float = 0.7):
    """Answer a question with the LLM using both fact context and multi-hop path context.

    Args:
        question: the user question.
        top_k: number of facts to put in the prompt.
        k_hop: neighbourhood radius used during fact retrieval.
        max_new_tokens, top_p, temperature: sampling settings passed to ``generate``.

    Returns:
        dict with keys "question", "answer", "context", "context_multi",
        "facts", "anchors".
    """
    context, facts = lay_context_tu_graph(question, top_k=top_k, k_hop=k_hop)
    context_multi = build_multi_hop_context(question, max_hops=3, max_paths=10)
    prompt = tao_prompt_multi_hop(question, context, context_multi)
    inputs = tokenizer_llm(prompt, return_tensors="pt").to(model_llm.device)
    with torch.no_grad():
        output_ids = model_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            pad_token_id=tokenizer_llm.eos_token_id,
        )
    # Keep only the generated continuation by slicing off the prompt tokens.
    # Comparing decoded text with `startswith(prompt)` is fragile: when decoding
    # does not reproduce the prompt exactly, the prompt leaks into the answer.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer_llm.decode(output_ids[0][prompt_len:], skip_special_tokens=True).strip()
    anchors = lay_node_anchor(question)
    return {
        "question": question,
        "answer": answer,
        "context": context,
        "context_multi": context_multi,
        "facts": facts,
        "anchors": anchors,
    }

In [None]:
def tao_menh_de_edge(e):
    """Turn an edge dict into a Vietnamese statement using a per-relation template.

    Labels come from ``nodes_by_id`` (falling back to ``str(id)``); relations
    without a dedicated template use a generic "has relation" sentence.
    """
    rel = e["relation"]
    h_label = nodes_by_id.get(e["head"], {}).get("label", str(e["head"]))
    t_label = nodes_by_id.get(e["tail"], {}).get("label", str(e["tail"]))
    theo_quan_he = {
        "player_club": f"{h_label} hiện đang thi đấu cho {t_label}",
        "manager_club": f"{h_label} là huấn luyện viên của {t_label}",
        "player_score": f"{h_label} đã ghi bàn cho {t_label}",
        "transfer": f"{h_label} đã chuyển nhượng liên quan tới câu lạc bộ {t_label}",
        "injury": f"{h_label} đang gặp chấn thương trong giai đoạn thi đấu cho {t_label}",
    }
    return theo_quan_he.get(rel, f"{h_label} có quan hệ {rel} với {t_label}")

def tao_menh_de_path(path):
    """Turn a node path into Vietnamese statements, one per edge, joined by " ; ".

    Labels come from ``nodes_by_id``; relations come from ``G_gc`` edge data
    (default "liên quan"). Unknown relations use a generic sentence.
    """
    cac_doan = []
    for u, v in zip(path, path[1:]):
        rel = G_gc[u][v].get("relation", "liên quan")
        u_label = nodes_by_id.get(u, {}).get("label", str(u))
        v_label = nodes_by_id.get(v, {}).get("label", str(v))
        theo_quan_he = {
            "player_club": f"{u_label} thi đấu cho {v_label}",
            "manager_club": f"{u_label} dẫn dắt {v_label}",
            "transfer": f"{u_label} từng liên quan chuyển nhượng với {v_label}",
            "player_score": f"{u_label} ghi bàn cho {v_label}",
        }
        cac_doan.append(theo_quan_he.get(rel, f"{u_label} có quan hệ {rel} với {v_label}"))
    return " ; ".join(cac_doan)

def tao_cau_hoi_true_false_tu_edge(e, hops=1):
    """Build a true/false question from an edge; returns ``(question, True, hops)``."""
    return f"Câu sau đây đúng hay sai: {tao_menh_de_edge(e)}?", True, hops

def tao_cau_hoi_true_false_tu_path(path, hops=2):
    """Build a true/false question from a path; returns ``(question, True, hops)``."""
    return f"Câu sau đây đúng hay sai: {tao_menh_de_path(path)}?", True, hops

def tao_cau_hoi_false_tu_edge(e, nodes_list, hops=1):
    """Build a false statement by swapping the edge's tail for another node of the same type.

    The real tail (and the head) are excluded from the candidates: previously
    the random pick could return the true tail, producing a correct statement
    labelled False. The statement text is produced by ``tao_menh_de_edge`` so
    true and false questions share one set of templates.

    Args:
        e: edge dict with "head", "tail", "relation".
        nodes_list: node ids to sample the fake tail from (must be keys of
            ``nodes_by_id``).
        hops: hop count recorded with the question.

    Returns:
        ``(question, False, hops)``, or None when no replacement node exists.
    """
    # NOTE(review): a candidate could still be linked to the head by another
    # edge with the same relation; ruling that out needs the full edge list.
    loai_tail = nodes_by_id.get(e["tail"], {}).get("type")
    ung_vien = [
        n for n in nodes_list
        if n != e["tail"] and n != e["head"] and nodes_by_id[n].get("type") == loai_tail
    ]
    if not ung_vien:
        return None
    # Index-based pick keeps the id as a plain Python object (np.random.choice
    # would convert it to a NumPy scalar).
    t_fake_id = ung_vien[np.random.randint(len(ung_vien))]
    menh_de = tao_menh_de_edge({**e, "tail": t_fake_id})
    q = f"Câu sau đây đúng hay sai: {menh_de}?"
    return q, False, hops

In [None]:
def sinh_tap_cau_hoi_2000(so_cau=2000):
    """Generate up to ``so_cau`` true/false evaluation questions from the graph.

    For each shuffled edge a true question and (when possible) a false one are
    added; if that is not enough, true questions built from multi-hop paths
    between 20 shuffled nodes fill the remainder. The result is shuffled.

    Returns:
        A list of dicts with keys "question", "answer", "hops", "source".
    """
    cau_hoi = []
    ds_node = list(nodes_by_id)
    np.random.shuffle(ds_node)
    ds_canh = edges.copy()
    np.random.shuffle(ds_canh)

    def da_du():
        return len(cau_hoi) >= so_cau

    def them(bo_ba, nguon):
        q, dap_an, so_hop = bo_ba
        cau_hoi.append({"question": q, "answer": dap_an, "hops": so_hop, "source": nguon})

    for canh in ds_canh:
        if da_du():
            break
        them(tao_cau_hoi_true_false_tu_edge(canh, hops=1), "edge_true")
        if da_du():
            break
        cau_sai = tao_cau_hoi_false_tu_edge(canh, ds_node, hops=1)
        if cau_sai is not None:
            them(cau_sai, "edge_false")

    if not da_du():
        # Top up with multi-hop questions (all labelled True).
        for p in tim_paths_multi_hop(ds_node[:20], max_hops=3, max_paths=200):
            if da_du():
                break
            them(tao_cau_hoi_true_false_tu_path(p, hops=len(p) - 1), "path_true")

    np.random.shuffle(cau_hoi)
    return cau_hoi[:so_cau]

eval_questions = sinh_tap_cau_hoi_2000(so_cau=2000)
duong_eval = "graph_eval_qa_2000.jsonl"
# One JSON object per line (JSONL).
with open(duong_eval, "w", encoding="utf-8") as f_out:
    f_out.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in eval_questions)

print("Đã sinh", len(eval_questions), "câu hỏi đánh giá tại:", duong_eval)

Đã sinh 2000 câu hỏi đánh giá tại: graph_eval_qa_2000.jsonl


In [None]:
def tao_prompt_baseline(question: str) -> str:
    """Build the no-retrieval prompt: role line, instruction, then the question."""
    vai_tro = "Bạn là trợ lý bóng đá Ngoại hạng Anh (EPL).\n"
    chi_dan = "Hãy trả lời câu hỏi sau dựa trên hiểu biết chung của bạn, trả lời ngắn gọn bằng tiếng Việt.\n\n"
    phan_hoi = f"Câu hỏi: {question}\n\nCâu trả lời:"
    return vai_tro + chi_dan + phan_hoi

def baseline_answer(question: str, max_new_tokens: int = 128, top_p: float = 0.9, temperature: float = 0.7) -> str:
    """Answer a question with the LLM alone (no retrieved context).

    Args:
        question: the user question.
        max_new_tokens, top_p, temperature: sampling settings passed to ``generate``.

    Returns:
        The generated continuation, stripped of surrounding whitespace.
    """
    prompt = tao_prompt_baseline(question)
    inputs = tokenizer_llm(prompt, return_tensors="pt").to(model_llm.device)
    with torch.no_grad():
        output_ids = model_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            pad_token_id=tokenizer_llm.eos_token_id,
        )
    # Slice off the prompt tokens rather than string-matching the decoded text;
    # the old `startswith(prompt)` check returned prompt + answer whenever the
    # decoded prompt differed from the original string.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer_llm.decode(output_ids[0][prompt_len:], skip_special_tokens=True).strip()

def parse_bool_from_answer(text: str):
    """Extract a True/False verdict from the first non-empty line of a model answer.

    Improvements over plain keyword search:
      * negated forms are checked first, so "không đúng" is False (it used to
        match "đúng" and return True) and "không sai" is True;
      * the text is NFC-normalised so decomposed Vietnamese diacritics still match;
      * leading blank lines are skipped.

    Returns:
        True, False, or None when no verdict keyword is found.
    """
    normalized = unicodedata.normalize("NFC", text).lower()
    first_line = next((ln for ln in normalized.split("\n") if ln.strip()), "")

    if any(neg in first_line for neg in ("không đúng", "chưa đúng", "not true")):
        return False
    if any(neg in first_line for neg in ("không sai", "not false")):
        return True
    if "đúng" in first_line or "true" in first_line:
        return True
    if "sai" in first_line or "false" in first_line:
        return False
    return None

In [None]:
def danh_gia_chatbot(duong_eval_file: str, so_mau: int = 200):
    """Compare GraphRAG and baseline accuracy on the first ``so_mau`` true/false questions.

    Args:
        duong_eval_file: path to a JSONL file whose records have "question" and
            "answer" (bool) fields.
        so_mau: number of records to evaluate, taken from the start of the file.

    Returns:
        ``(acc_graph, acc_base)``. An answer that cannot be parsed into a
        boolean counts as wrong. Returns ``(0.0, 0.0)`` when there is nothing
        to evaluate (previously a ZeroDivisionError).
    """
    ds = []
    with open(duong_eval_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            if line.strip():
                ds.append(json.loads(line))
    ds = ds[:so_mau]

    tong = len(ds)
    if tong == 0:
        print("Không có mẫu đánh giá nào.")
        return 0.0, 0.0

    dung_graph = 0
    dung_base = 0

    for i, rec in enumerate(ds):
        q = rec["question"]
        gold = rec["answer"]

        ans_graph = answer_question_graph(q)
        pred_graph = parse_bool_from_answer(ans_graph["answer"])

        ans_base = baseline_answer(q)
        pred_base = parse_bool_from_answer(ans_base)

        if pred_graph is not None and pred_graph == gold:
            dung_graph += 1
        if pred_base is not None and pred_base == gold:
            dung_base += 1

        print(f"Mẫu {i+1}/{tong}")
        print("Câu hỏi:", q)
        print("Đáp án đúng:", gold)
        print("GraphRAG:", ans_graph["answer"][:120].replace("\n", " "))
        print("Baseline:", ans_base[:120].replace("\n", " "))
        print("Kết quả GraphRAG:", pred_graph, "| Baseline:", pred_base)
        print("----")

    acc_graph = dung_graph / tong
    acc_base = dung_base / tong
    print("Độ chính xác GraphRAG:", acc_graph)
    print("Độ chính xác Baseline:", acc_base)
    return acc_graph, acc_base

In [None]:
def chat_graphrag(message, history):
    """Chat callback: answer with single-hop GraphRAG; ``history`` is ignored."""
    return answer_question_graph(message)["answer"]

def chat_graphrag_multi(message, history):
    """Chat callback: answer with GraphRAG plus multi-hop context; ``history`` is ignored."""
    return answer_question_multi_hop(message)["answer"]

def chat_baseline_fn(message, history):
    """Chat callback: answer with the plain LLM baseline; ``history`` is ignored."""
    tra_loi = baseline_answer(message)
    return tra_loi

In [None]:
def chat_router(message, history, mode):
    """Dispatch a chat message to the handler for ``mode``.

    Unknown modes fall back to plain GraphRAG.
    """
    handlers = {
        "GraphRAG": chat_graphrag,
        "GraphRAG + Multi-hop": chat_graphrag_multi,
        "Baseline (LLM thuần)": chat_baseline_fn,
    }
    handler = handlers.get(mode, chat_graphrag)
    return handler(message, history)

In [None]:
with gr.Blocks() as demo:
    gr.Markdown("# Chatbot EPL - GraphRAG và Multi-hop")
    mode_radio = gr.Radio(
        choices=["GraphRAG", "GraphRAG + Multi-hop", "Baseline (LLM thuần)"],
        value="GraphRAG + Multi-hop",
        label="Chọn chế độ",
    )
    # The radio is passed as an additional input so the handler receives the
    # value currently selected in the UI. Reading `mode_radio.value` inside a
    # lambda only ever returned the component's initial value, so changing the
    # mode had no effect.
    chat = gr.ChatInterface(
        fn=chat_router,
        additional_inputs=[mode_radio],
        title="Chat EPL",
        description="Đặt câu hỏi về Ngoại hạng Anh, cầu thủ, CLB, HLV...",
    )

demo.launch()

  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7afda961760a57ae32.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Quick evaluation on the first 20 generated questions.
acc_graph, acc_base = danh_gia_chatbot(duong_eval_file="graph_eval_qa_2000.jsonl", so_mau=20)

Anchor entities (node id): ['Q16007411', 'Q50602']
Số node trong k-hop: 118
Số fact ứng viên liên quan anchor: 966
Mẫu 1/20
Câu hỏi: Câu sau đây đúng hay sai: Thierno Barry đã ghi bàn cho Gordon Brice?
Đáp án đúng: False
GraphRAG: Không.   Giới thiệu về Thierno Barry:  - Thầy trò của Moyes, Arsenal, đã nhận thêm một cầu thủ mới vào mùa giải này khi 
Baseline: Đúng. Thierno Barry đã ghi bàn cho Gordon Brice trong trận đấu giữa hai đội của EFL Premier League.
Kết quả GraphRAG: None | Baseline: True
----
Anchor entities (node id): ['Q18739', 'Q104737230']
Số node trong k-hop: 94
Số fact ứng viên liên quan anchor: 956
Mẫu 2/20
Câu hỏi: Câu sau đây đúng hay sai: Liverpool F.C. hiện đang thi đấu cho Mamadou Sylla?
Đáp án đúng: False
GraphRAG: Không chắc chắn, vì Liverpool F.C. hiện tại vẫn còn ở vòng bảng Ngoại hạng Anh, và họ cần phải đánh bại một đội trước mớ
Baseline: Không chính xác. Liverpool F.C. không phải là một đội bóng thuộc EPL và cũng không thể thi đấu trong mùa giải hiện tại. 
K

In [None]:
import json

with open("triples_raw.json","r",encoding="utf-8") as f:
    raw = json.load(f)

# Keep a triple only if it has a real relation, a relation score that is not
# below 0.55 (a missing score counts as 1), non-empty head/tail, and both
# entity types among the four accepted tags.
kieu_hop_le = ["PER","ORG","LOC","MISC"]
out = [
    t for t in raw
    if t.get("relation") not in ["other", None, ""]
    and not t.get("relation_score",1) < 0.55
    and t.get("head") not in ["", None]
    and t.get("tail") not in ["", None]
    and t.get("head_type") in kieu_hop_le
    and t.get("tail_type") in kieu_hop_le
]

with open("triples_filtered.json","w",encoding="utf-8") as f:
    json.dump(out,f,ensure_ascii=False,indent=2)

print("Tổng triple sau lọc:", len(out))
print("Đã tạo: triples_filtered.json")

Tổng triple sau lọc: 5223
Đã tạo: triples_filtered.json


In [None]:
def norm(s):
    """Normalise an entity name for matching.

    Steps: lowercase, strip diacritics, collapse dotted initialisms
    ("f.c." -> "fc"), replace punctuation with spaces, then drop the generic
    club words as whole tokens.

    The stop words used to be removed by substring replacement, which damaged
    unrelated words ("afc bournemouth" -> "a bournemouth",
    "james scott" -> "james ott") and never removed dotted forms such as "F.C.".

    Returns:
        The normalised string; "" for empty or None input.
    """
    if not s:
        return ""
    s = s.lower()
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    # Join runs of two or more single-letter abbreviations so "f.c." becomes "fc".
    s = re.sub(r"(?:\b\w\.){2,}", lambda m: m.group(0).replace(".", ""), s)
    s = re.sub(r"[^\w ]"," ",s)
    rep = {"fc","afc","cf","sc","the","club"}
    return " ".join(tok for tok in s.split() if tok not in rep)


In [None]:
# Load the Wikidata entity dump first.
with open("wikidata_epl_entities_clean.json","r",encoding="utf-8") as wiki_file:
    wiki = json.load(wiki_file)

# "players" and "clubs" are required keys; "managers" and "stadiums" are optional.
players, clubs = wiki["players"], wiki["clubs"]
managers = wiki.get("managers", [])
stadiums = wiki.get("stadiums", [])

def to_dic(lst, etype):
    """Group entity records by their normalised label.

    ``etype`` is accepted for call-site symmetry but is not used.

    Returns:
        dict mapping ``norm(label)`` to the list of records sharing it.
    """
    theo_nhan = {}
    for ent in lst:
        khoa = norm(ent["label"])
        if khoa not in theo_nhan:
            theo_nhan[khoa] = []
        theo_nhan[khoa].append(ent)
    return theo_nhan

# Normalised-label index for every entity type.
dic = {
    etype: to_dic(entities, etype)
    for etype, entities in (
        ("player", players),
        ("club", clubs),
        ("manager", managers),
        ("stadium", stadiums),
    )
}

In [None]:
# Raw candidate labels per entity type, in the same order as the source lists.
cand = {
    etype: [ent["label"] for ent in entities]
    for etype, entities in (
        ("player", players),
        ("club", clubs),
        ("manager", managers),
        ("stadium", stadiums),
    )
}

# Normalised labels, index-aligned with `cand`.
cand_norm = {etype: [norm(label) for label in labels] for etype, labels in cand.items()}

In [None]:
def fuzzy_three_way(q, lst):
    """Return the best ``process.extractOne`` hit across three scorers.

    The scorers are ``ratio``, ``partial_ratio`` and ``token_sort_ratio``; the
    hit with the highest score wins (first scorer wins ties). Returns None
    when no scorer yields a hit.
    """
    hits = []
    for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio):
        hit = process.extractOne(q, lst, scorer=scorer)
        if hit:
            hits.append(hit)
    if not hits:
        return None
    return max(hits, key=lambda x: x[1])


In [None]:
def match_type(text, etype):
    """Fuzzy-match ``text`` against the candidates of one entity type.

    Returns:
        The first record stored under the best candidate's normalised label,
        or None when there is no hit or its score is below 55.
    """
    best = fuzzy_three_way(norm(text), cand_norm[etype])
    if not best:
        return None
    _matched, score, idx = best
    if score < 55:
        return None
    # `idx` indexes cand_norm, which is index-aligned with cand.
    matched_label = cand[etype][idx]
    return dic[etype][norm(matched_label)][0]


In [None]:
def link_entity(text, etype):
    """Link a mention to an entity record of the given type.

    Tries, in order: a fuzzy match on the whole mention, a fuzzy match on each
    token (multi-token mentions only), then a containment check between the
    normalised mention and the normalised labels.

    Returns:
        The matched record, or None.
    """
    r = match_type(text, etype)
    if r:
        return r
    toks = text.split()
    if len(toks)>1:
        for t in toks:
            r = match_type(t, etype)
            if r:
                return r
    q = norm(text)
    # An empty string is contained in every string, so an empty query (or an
    # empty key) would "match" an arbitrary entity; rule both out.
    if not q:
        return None
    for k, lst in dic[etype].items():
        if not k:
            continue
        if q == k or q in k or k in q:
            return lst[0]
    return None

In [None]:
def map_type(t):
    """Map a NER tag to the entity types to try when linking.

    Accepts both BIO-prefixed tags ("B-PER", "I-ORG") and bare tags ("PER",
    "ORG", "LOC"). The triple filter keeps bare tags, which the previous
    prefix-only checks mapped to an empty list.

    Returns:
        A list of entity-type names; empty for any other tag.
    """
    tag = t[2:] if t.startswith(("B-", "I-")) else t
    if tag.startswith("PER"):
        return ["player","manager"]
    if tag.startswith("ORG"):
        return ["club"]
    if tag.startswith("LOC"):
        return ["stadium"]
    return []


In [None]:
def el_triple(t):
    """Link the head and tail of a raw triple to entity records.

    Returns:
        A dict with linked ids/labels/types plus relation, sentence, source
        and date, or None when either end cannot be linked.
    """
    def lien_ket_dau_tien(mention, etypes):
        # First successful link over the candidate entity types, else None.
        for tp in etypes:
            ent = link_entity(mention, tp)
            if ent:
                return ent
        return None

    hh = lien_ket_dau_tien(t["head"], map_type(t["head_type"]))
    if not hh:
        return None

    th = lien_ket_dau_tien(t["tail"], map_type(t["tail_type"]))
    if not th:
        return None

    meta = t["metadata"]
    return {
        "head":hh["id"],
        "head_label":hh["label"],
        "head_type":hh.get("type",""),
        "tail":th["id"],
        "tail_label":th["label"],
        "tail_type":th.get("type",""),
        "relation":t["relation"],
        "sentence":t["sentence"],
        "source":meta.get("source",""),
        "date":meta.get("published_date","")
    }

In [None]:
!pip install rapidfuzz

import json, re, unicodedata
from rapidfuzz import fuzz, process

with open("wikidata_epl_entities_clean.json","r",encoding="utf-8") as wiki_file:
    wiki = json.load(wiki_file)

# Every entity list is optional and defaults to an empty list.
players, clubs, managers, stadiums = (
    wiki.get(key, []) for key in ("players", "clubs", "managers", "stadiums")
)



In [None]:
def norm(s):
    """Normalise an entity name for matching.

    Steps: lowercase, strip diacritics, collapse dotted initialisms
    ("f.c." -> "fc"), replace punctuation with spaces, then drop the generic
    club words as whole tokens.

    The stop words used to be removed by substring replacement, which damaged
    unrelated words ("afc bournemouth" -> "a bournemouth",
    "james scott" -> "james ott") and never removed dotted forms such as "F.C.".

    Returns:
        The normalised string; "" for empty or None input.
    """
    if not s:
        return ""
    s = s.lower()
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    # Join runs of two or more single-letter abbreviations so "f.c." becomes "fc".
    s = re.sub(r"(?:\b\w\.){2,}", lambda m: m.group(0).replace(".", ""), s)
    s = re.sub(r"[^\w ]"," ",s)
    rep = {"fc","afc","cf","sc","the","club"}
    return " ".join(tok for tok in s.split() if tok not in rep)

In [None]:
def to_dic(lst):
    """Group entity records by their normalised label.

    Returns:
        dict mapping ``norm(label)`` to the list of records sharing it.
    """
    theo_nhan = {}
    for ent in lst:
        khoa = norm(ent["label"])
        if khoa in theo_nhan:
            theo_nhan[khoa].append(ent)
        else:
            theo_nhan[khoa] = [ent]
    return theo_nhan

# Entity lists keyed by the type name used throughout the linker.
_entity_lists = {
    "player": players,
    "club": clubs,
    "manager": managers,
    "stadium": stadiums,
}

# Normalised-label index per entity type.
dic = {etype: to_dic(entities) for etype, entities in _entity_lists.items()}

# Raw labels per type, and their normalised forms (index-aligned with `cand`).
cand = {etype: [ent["label"] for ent in entities] for etype, entities in _entity_lists.items()}

cand_norm = {etype: [norm(label) for label in labels] for etype, labels in cand.items()}

In [None]:
def fuzzy_three_way(q, lst):
    """Return the highest-scoring ``process.extractOne`` hit over three scorers.

    Scorers: ``ratio``, ``partial_ratio``, ``token_sort_ratio`` (earlier scorer
    wins ties). Returns None when none of them yields a hit.
    """
    scorers = (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio)
    candidates = [process.extractOne(q, lst, scorer=scorer) for scorer in scorers]
    hits = [hit for hit in candidates if hit]
    return max(hits, key=lambda hit: hit[1]) if hits else None

def match_type(text, etype, min_score=55):
    """Fuzzy-match ``text`` against the candidates of one entity type.

    Args:
        text: the mention to link.
        etype: key into ``cand`` / ``cand_norm`` / ``dic``.
        min_score: minimum fuzzy score to accept a hit. It defaults to the
            previously hard-coded 55. Since the best of three scorers
            (including ``partial_ratio``) is compared against it, callers that
            need stricter linking can raise it.

    Returns:
        The first record stored under the best candidate's normalised label,
        or None when there are no candidates, no hit, or the score is too low.
    """
    q = norm(text)
    lst = cand_norm[etype]
    if not lst:
        return None
    kq = fuzzy_three_way(q, lst)
    if not kq:
        return None
    _label_norm, score, idx = kq
    if score < min_score:
        return None
    # `idx` indexes cand_norm, which is index-aligned with cand.
    objs = dic[etype].get(norm(cand[etype][idx]), [])
    return objs[0] if objs else None

In [None]:
def link_entity(text, etype):
    """Link a mention to an entity record of the given type.

    Tries, in order: a fuzzy match on the whole mention, a fuzzy match on each
    token (multi-token mentions only), then a containment check between the
    normalised mention and the normalised labels.

    Returns:
        The matched record, or None.
    """
    r = match_type(text, etype)
    if r:
        return r
    toks = text.split()
    if len(toks) > 1:
        for t in toks:
            r = match_type(t, etype)
            if r:
                return r
    q = norm(text)
    # "" is a substring of every string: without these guards an empty query
    # (or an empty key) links to whichever entity happens to come first.
    if not q:
        return None
    for k,lst in dic[etype].items():
        if not k:
            continue
        if q == k or q in k or k in q:
            return lst[0]
    return None

In [None]:
def map_type(tag):
    """Map a bare NER tag to the entity types to try when linking, in priority order.

    Unknown tags map to an empty list. A new list is returned on every call.
    """
    bang_anh_xa = {
        "PER": ("player", "manager"),
        "ORG": ("club",),
        "LOC": ("stadium",),
        "MISC": ("club", "player", "manager", "stadium"),
    }
    for ten_tag, etypes in bang_anh_xa.items():
        if tag == ten_tag:
            return list(etypes)
    return []

In [None]:
def el_triple(t):
    """Link the head and tail of a filtered triple to entity records.

    Args:
        t: triple dict with "head", "tail", "relation" and optionally
            "head_type", "tail_type", "sentence", "metadata".

    Returns:
        A dict with linked ids/labels/types plus relation, sentence, source
        and date; None when a type has no mapping or an end cannot be linked.
        A missing "sentence" or "metadata" yields empty strings rather than a
        KeyError, in line with the ``.get`` access already used for the types.
    """
    heads = map_type(t.get("head_type",""))
    tails = map_type(t.get("tail_type",""))
    if not heads or not tails:
        return None

    def lien_ket_dau_tien(mention, etypes):
        # First successful link over the candidate entity types, else None.
        for tp in etypes:
            ent = link_entity(mention, tp)
            if ent:
                return ent
        return None

    h_ent = lien_ket_dau_tien(t["head"], heads)
    if not h_ent:
        return None

    t_ent = lien_ket_dau_tien(t["tail"], tails)
    if not t_ent:
        return None

    metadata = t.get("metadata") or {}
    return {
        "head":h_ent["id"],
        "head_label":h_ent["label"],
        "head_type":h_ent.get("type",""),
        "tail":t_ent["id"],
        "tail_label":t_ent["label"],
        "tail_type":t_ent.get("type",""),
        "relation":t["relation"],
        "sentence":t.get("sentence",""),
        "source":metadata.get("source",""),
        "date":metadata.get("published_date","")
    }


In [None]:
with open("triples_filtered.json","r",encoding="utf-8") as f:
    raw = json.load(f)

# Keep only the triples whose head and tail could both be linked.
ket_qua_lien_ket = (el_triple(t) for t in raw)
out = [r for r in ket_qua_lien_ket if r]

with open("triples_linked.json","w",encoding="utf-8") as f:
    json.dump(out,f,ensure_ascii=False,indent=2)

print("Số triple filtered:", len(raw))
print("Triple đã liên kết:", len(out))

Số triple filtered: 5223
Triple đã liên kết: 4825


In [None]:
!pip install torch transformers datasets sentence-transformers networkx faiss-cpu -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json, re, torch, faiss, numpy as np, networkx as nx
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

In [None]:
# Linked triples written by the entity-linking step.
with open("triples_linked.json","r",encoding="utf-8") as triples_file:
    triples = json.load(triples_file)

In [None]:
# Relation vocabulary, sorted so the label ids are stable for a given triple set.
label_set = sorted(set(t["relation"] for t in triples))
label2id = {label: idx for idx, label in enumerate(label_set)}
id2label = {idx: label for label, idx in label2id.items()}

# One (sentence, relation id) training pair per triple.
texts = [t["sentence"] for t in triples]
labels = [label2id[t["relation"]] for t in triples]

In [None]:
# 80/20 train/test split with a fixed seed.
ds = Dataset.from_dict({"text":texts,"label":labels}).train_test_split(test_size=0.2,seed=42)

In [None]:
tok_re = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

def enc_re(ex):
    """Tokenise one example to a fixed length of 128 and copy its label to "labels"."""
    encoded = tok_re(
        ex["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    encoded["labels"] = ex["label"]
    return encoded

ds = ds.map(enc_re)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/3860 [00:00<?, ? examples/s]

Map:   0%|          | 0/965 [00:00<?, ? examples/s]

In [None]:
# Sequence-classification head sized to the relation vocabulary; the label maps
# are stored in the model config so they are saved with the checkpoint.
mdl_re = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=len(label_set),
    id2label=id2label,
    label2id=label2id
)

# Evaluate and checkpoint once per epoch (the two strategies must match for
# load_best_model_at_end). No metric_for_best_model is given, so "best" is
# presumably judged by evaluation loss -- confirm against the transformers docs.
args = TrainingArguments(
    "re_model_advanced",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    eval_strategy="epoch", # Corrected parameter name
    save_strategy="epoch",
    learning_rate=2e-5,
    load_best_model_at_end=True
)

# NOTE(review): the captured output shows a warning raised on this Trainer(...)
# call; newer transformers releases appear to prefer `processing_class` over
# `tokenizer` -- verify for the installed version before changing it.
trainer = Trainer(
    model=mdl_re,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=tok_re
)

# Train, then write model and tokenizer to the same directory so the next cell
# can reload both from "re_model_advanced".
trainer.train()
trainer.save_model("re_model_advanced")
tok_re.save_pretrained("re_model_advanced")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss
1,No log,0.000582
2,No log,0.00023
3,0.118000,0.000143
4,0.118000,0.000125


('re_model_advanced/tokenizer_config.json',
 're_model_advanced/special_tokens_map.json',
 're_model_advanced/spm.model',
 're_model_advanced/added_tokens.json',
 're_model_advanced/tokenizer.json')

In [None]:
# Reload the fine-tuned relation classifier and its tokenizer from the
# "re_model_advanced" directory written by the training cell.
model_re = AutoModelForSequenceClassification.from_pretrained("re_model_advanced")
tokenizer_re = AutoTokenizer.from_pretrained("re_model_advanced")

The tokenizer you are loading from 're_model_advanced' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


In [None]:
def score_relation(sentence):
    """Predict the relation expressed by ``sentence``.

    Returns a ``(label, probability)`` pair for the class to which ``model_re``
    assigns the highest softmax probability; the label is looked up in the
    notebook-level ``id2label`` mapping.
    """
    encoded = tokenizer_re(sentence, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = model_re(**encoded).logits[0]
    probs = logits.softmax(dim=0)
    best = int(probs.argmax())
    return id2label[best], float(probs[best])

In [None]:
# Keep only triples whose relation the classifier predicts with enough confidence.
# Every kept triple is a *copy* carrying the predicted relation and its score, so
# the dicts in `triples` are left untouched and re-running this cell is idempotent.
# NOTE(review): the classifier was trained on sentences/relations taken from
# these same triples, so this score is not an independent check -- confirm intent.
nguong_tin_cay = 0.65  # minimum softmax probability required to keep a triple

filtered = []
for t in triples:
    rel, score = score_relation(t["sentence"])
    if score >= nguong_tin_cay:
        # "relation" keeps its original key position; "relation_score" is appended.
        filtered.append({**t, "relation": rel, "relation_score": score})

with open("triples_final.json","w",encoding="utf-8") as f:
    json.dump(filtered,f,ensure_ascii=False,indent=2)

print("Triple cuối:",len(filtered))

Triple cuối: 4825


In [None]:
import pickle  # the graph is serialised with pickle rather than nx.write_gpickle

# Build the directed knowledge graph: entity ids are nodes (with label/type
# attributes) and each triple contributes a head -> tail edge.
# NOTE: a DiGraph stores a single edge per (head, tail) pair, so a later triple
# for the same pair replaces the relation/sentence of an earlier one.
G = nx.DiGraph()
for t in filtered:
    G.add_node(t["head"],label=t["head_label"],type=t["head_type"])
    G.add_node(t["tail"],label=t["tail_label"],type=t["tail_type"])
    G.add_edge(t["head"],t["tail"],relation=t["relation"],sentence=t["sentence"])

# Use a context manager so the file is flushed and closed before later cells
# read it back (the bare open() used previously was never closed explicitly).
with open("kg_epl.gpickle", "wb") as f:
    pickle.dump(G, f)
print("Đã build KG")

Đã build KG


In [None]:
embedder = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Materialise the edge list once so `facts` and `fact_ids` share one ordering.
canh = list(G.edges(data=True))

# Each edge is verbalised as "<head label> <relation> <tail label>".
facts = [
    f'{G.nodes[u]["label"]} {d["relation"]} {G.nodes[v]["label"]}'
    for u, v, d in canh
]
# (head id, tail id) of the edge behind facts[i].
fact_ids = [(u, v) for u, v, _ in canh]

In [None]:
# Embed every fact with L2-normalised vectors and index them with an
# inner-product index; on normalised vectors inner product equals cosine similarity.
emb = embedder.encode(facts,convert_to_numpy=True,normalize_embeddings=True)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

In [None]:
def graphrag_retrieve(question, top_k=30):
    """Return the facts most similar to ``question``, one per line.

    The question is embedded with ``embedder`` and searched against ``index``.
    At most ``top_k`` facts are returned, best match first; an empty string is
    returned when there are no facts or ``top_k`` is not positive.
    """
    if not facts or top_k <= 0:
        return ""
    qemb = embedder.encode([question],convert_to_numpy=True,normalize_embeddings=True)
    # Never ask for more neighbours than exist: FAISS pads missing results with
    # the label -1, which as a Python index would silently select facts[-1].
    _, I = index.search(qemb, min(top_k, len(facts)))
    return "\n".join(facts[i] for i in I[0] if 0 <= i < len(facts))

In [None]:
# Small instruction-tuned LLM used as the answer generator for both the
# GraphRAG pipeline and the no-retrieval baseline.
llm_name = "Qwen/Qwen2-0.5B-Instruct"
tok_llm = AutoTokenizer.from_pretrained(llm_name)
# Half precision; device_map="auto" lets the library choose device placement.
mdl_llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

In [None]:
def tao_prompt(question, context):
    """Build the Vietnamese true/false prompt for the GraphRAG evaluator.

    The prompt holds the instructions, then ``context`` under "Thông tin:",
    then ``question`` under "Câu hỏi:", and ends with the "Trả lời:" cue.
    """
    huong_dan = (
        "Bạn là trợ lý bóng đá Ngoại hạng Anh.\n"
        "Chỉ dựa trên thông tin sau để trả lời.\n"
        "Nếu thông tin không đủ, trả lời Sai.\n"
        "Chỉ trả lời ĐÚNG hoặc SAI.\n\n"
    )
    phan_thong_tin = "Thông tin:\n" + f"{context}\n\n"
    phan_cau_hoi = "Câu hỏi:\n" + f"{question}\n\n"
    return huong_dan + phan_thong_tin + phan_cau_hoi + "Trả lời:"

In [None]:
def answer_graphrag(question):
    """Answer ``question`` with the LLM, grounded on retrieved graph facts.

    Retrieves up to 40 facts, builds the true/false prompt and greedily
    generates at most 8 new tokens. Returns only the generated continuation,
    stripped of surrounding whitespace.
    """
    ctx = graphrag_retrieve(question,top_k=40)
    prompt = tao_prompt(question,ctx)
    ip = tok_llm(prompt,return_tensors="pt").to(mdl_llm.device)
    with torch.no_grad():
        out = mdl_llm.generate(
            **ip,
            max_new_tokens=8,
            do_sample=False,
            pad_token_id=tok_llm.eos_token_id
        )
    # generate() returns prompt tokens followed by the continuation. Slicing by
    # token count is exact, whereas stripping the decoded prompt by string
    # prefix fails whenever decoding does not reproduce the prompt verbatim.
    so_token_prompt = ip["input_ids"].shape[1]
    return tok_llm.decode(out[0][so_token_prompt:],skip_special_tokens=True).strip()

In [None]:
# Smoke test: one true/false question through the full GraphRAG pipeline.
print(answer_graphrag("Câu sau đây đúng hay sai: Erling Haaland hiện đang thi đấu cho Manchester City?"))

Đúng


In [None]:
import json, re, torch
from tqdm.auto import tqdm

In [None]:
# Path of the JSON Lines evaluation set (one QA record per line).
duong_eval = "graph_eval_qa_2000.jsonl"


In [None]:
def doc_eval(path):
    """Read a JSON Lines file and return its records as a list.

    Each line is stripped; blank lines are skipped and every remaining line is
    parsed with ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped = (raw_line.strip() for raw_line in f)
        return [json.loads(item) for item in stripped if item]

In [None]:
def tach_bool(s):
    """Parse a model answer into True / False / None.

    Only the first line of the stripped, lower-cased answer is inspected.
    A line containing "đúng" or consisting solely of "true" gives True (checked
    first); a line containing "sai" or consisting solely of "false" gives False;
    anything else gives None.
    """
    first_line = (s or "").strip().lower().split("\n")[0]

    says_true = "đúng" in first_line or re.fullmatch(r"\s*true\s*", first_line) is not None
    if says_true:
        return True

    says_false = "sai" in first_line or re.fullmatch(r"\s*false\s*", first_line) is not None
    return False if says_false else None

In [None]:
def danh_gia_tap(ds, fn_answer):
    """Score ``fn_answer`` on the QA records in ``ds``.

    Each record's ``"question"`` is passed to ``fn_answer`` and the reply is
    parsed with ``tach_bool``. Replies that cannot be parsed are counted as
    skipped and excluded from the accuracy denominator.

    Returns ``(accuracy, n_scored, n_skipped)``; accuracy is 0.0 when nothing
    could be scored.
    """
    dung = tong = bo_qua = 0
    for rec in tqdm(ds, desc="Đang đánh giá"):
        cau_hoi, gold = rec["question"], rec["answer"]
        pred = tach_bool(fn_answer(cau_hoi))
        if pred is None:
            bo_qua += 1
            continue
        tong += 1
        if pred == gold:
            dung += 1
    acc = dung / tong if tong>0 else 0.0
    return acc, tong, bo_qua

In [None]:
def tao_prompt_baseline(question: str) -> str:
    """Build the no-retrieval baseline prompt: instructions, question, answer cue."""
    mo_dau = "Bạn là trợ lý bóng đá Ngoại hạng Anh (EPL).\n"
    yeu_cau = "Hãy trả lời câu hỏi sau dựa trên hiểu biết chung của bạn, trả lời ngắn gọn bằng tiếng Việt.\n\n"
    noi_dung = f"Câu hỏi: {question}\n\nCâu trả lời:"
    return mo_dau + yeu_cau + noi_dung

def baseline_llm_thuan(question: str, max_new_tokens: int = 128, do_sample: bool = False) -> str:
    """Answer ``question`` with the bare LLM (no retrieved context).

    Args:
        question: the question text inserted into the baseline prompt.
        max_new_tokens: generation budget passed to ``generate``.
        do_sample: whether to sample instead of decoding greedily.

    Returns:
        The generated continuation only, stripped of surrounding whitespace.
    """
    prompt = tao_prompt_baseline(question)
    inputs = tok_llm(prompt, return_tensors="pt").to(mdl_llm.device)
    with torch.no_grad():
        output_ids = mdl_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            pad_token_id=tok_llm.eos_token_id,
        )
    # The output starts with the prompt tokens; drop them by count rather than
    # by comparing decoded text, which breaks if decoding alters the prompt.
    so_token_prompt = inputs["input_ids"].shape[1]
    return tok_llm.decode(output_ids[0][so_token_prompt:], skip_special_tokens=True).strip()

def so_sanh_1_3_hop(so_mau_moi_hop=300):
    """Compare GraphRAG with the bare-LLM baseline on 1-, 2- and 3-hop questions.

    For each hop count the first ``so_mau_moi_hop`` matching records of the
    evaluation file are scored with both answer functions (GraphRAG first).

    Returns a dict ``{hop: {"GraphRAG": {...}, "Baseline": {...}}}`` whose inner
    dicts hold ``accuracy``, ``count`` (scored) and ``skip`` (unparseable).
    """
    du_lieu = doc_eval(duong_eval)
    ket_qua = {}
    for hop in (1, 2, 3):
        mau_hop = [rec for rec in du_lieu if int(rec.get("hops",1)) == hop][:so_mau_moi_hop]
        print("-----", hop, "hop | số mẫu:", len(mau_hop), "-----")

        thong_ke_hop = {}
        for ten_he, ham_tra_loi in (("GraphRAG", answer_graphrag), ("Baseline", baseline_llm_thuan)):
            do_chinh_xac, da_tinh, da_bo = danh_gia_tap(mau_hop, ham_tra_loi)
            thong_ke_hop[ten_he] = {"accuracy":do_chinh_xac,"count":da_tinh,"skip":da_bo}
        ket_qua[hop] = thong_ke_hop

        print("GraphRAG:", ket_qua[hop]["GraphRAG"])
        print("Baseline:", ket_qua[hop]["Baseline"])
    return ket_qua

In [None]:
# Run the comparison with up to 300 questions per hop count; the bare name on
# the last line displays the nested result dict as the cell output.
ket_qua_1_3 = so_sanh_1_3_hop(so_mau_moi_hop=300)
ket_qua_1_3

----- 1 hop | số mẫu: 300 -----


Đang đánh giá:   0%|          | 0/300 [00:00<?, ?it/s]

Đang đánh giá:   0%|          | 0/300 [00:00<?, ?it/s]

GraphRAG: {'accuracy': 0.5, 'count': 300, 'skip': 0}
Baseline: {'accuracy': 0.5344827586206896, 'count': 174, 'skip': 126}
----- 2 hop | số mẫu: 4 -----


Đang đánh giá:   0%|          | 0/4 [00:00<?, ?it/s]

Đang đánh giá:   0%|          | 0/4 [00:00<?, ?it/s]

GraphRAG: {'accuracy': 1.0, 'count': 4, 'skip': 0}
Baseline: {'accuracy': 1.0, 'count': 2, 'skip': 2}
----- 3 hop | số mẫu: 52 -----


Đang đánh giá:   0%|          | 0/52 [00:00<?, ?it/s]

Đang đánh giá:   0%|          | 0/52 [00:00<?, ?it/s]

GraphRAG: {'accuracy': 1.0, 'count': 52, 'skip': 0}
Baseline: {'accuracy': 1.0, 'count': 46, 'skip': 6}


{1: {'GraphRAG': {'accuracy': 0.5, 'count': 300, 'skip': 0},
  'Baseline': {'accuracy': 0.5344827586206896, 'count': 174, 'skip': 126}},
 2: {'GraphRAG': {'accuracy': 1.0, 'count': 4, 'skip': 0},
  'Baseline': {'accuracy': 1.0, 'count': 2, 'skip': 2}},
 3: {'GraphRAG': {'accuracy': 1.0, 'count': 52, 'skip': 0},
  'Baseline': {'accuracy': 1.0, 'count': 46, 'skip': 6}}}

In [None]:
import torch, json, faiss, networkx as nx
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import gradio as gr

In [None]:
import pickle
import networkx as nx # Ensure networkx is imported here as well if not imported globally earlier

# NOTE: unpickling can execute arbitrary code; only load graph files produced
# by this notebook. The context manager guarantees the handle is closed.
with open("kg_epl.gpickle", "rb") as f:
    G = pickle.load(f)

with open("triples_final.json","r",encoding="utf-8") as f:
    triples = json.load(f)

# One textual fact per edge: "<head label> <relation> <tail label>".
facts=[]
for u,v,d in G.edges(data=True):
    facts.append(f'{G.nodes[u]["label"]} {d["relation"]} {G.nodes[v]["label"]}')

In [None]:
# Embed the facts with L2-normalised vectors and build an inner-product index
# (inner product on normalised vectors is cosine similarity).
embedder = SentenceTransformer("BAAI/bge-base-en-v1.5")
emb = embedder.encode(facts,convert_to_numpy=True,normalize_embeddings=True)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

In [None]:
def graphrag_retrieve(question, top_k=30):
    """Return up to ``top_k`` facts most similar to ``question``, one per line.

    Results are ordered best match first. An empty string is returned when no
    facts are indexed or ``top_k`` is not positive.
    """
    if not facts or top_k <= 0:
        return ""
    qemb = embedder.encode([question],convert_to_numpy=True,normalize_embeddings=True)
    # Cap k at the number of facts and reject negative labels: FAISS reports
    # missing neighbours as -1, which would otherwise index facts[-1].
    _, I = index.search(qemb, min(top_k, len(facts)))
    return "\n".join(facts[i] for i in I[0] if 0 <= i < len(facts))

In [None]:
# Load the chat LLM for the demo section under the names `tok` / `mdl`.
# NOTE(review): this is the same checkpoint the evaluation section loads as
# `tok_llm` / `mdl_llm`; a later cell in this notebook notes that loading it
# again ran out of memory, so reuse the existing objects if they are alive.
llm_name = "Qwen/Qwen2-0.5B-Instruct"
tok = AutoTokenizer.from_pretrained(llm_name)
mdl = AutoModelForCausalLM.from_pretrained(
    llm_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

In [None]:
def tao_prompt(question, context):
    """Build the chatbot prompt: instructions, retrieved facts, question, answer cue.

    The instructions tell the model to rely only on ``context`` and to reply
    "Tôi không biết." when the information is insufficient.
    """
    cac_phan = (
        "Bạn là chatbot bóng đá Ngoại hạng Anh.\n",
        "Chỉ sử dụng thông tin bên dưới để trả lời.\n",
        "Nếu không đủ thông tin, trả lời: Tôi không biết.\n\n",
        "Thông tin:\n",
        f"{context}\n\n",
        "Câu hỏi:\n",
        f"{question}\n\n",
        "Trả lời:",
    )
    return "".join(cac_phan)

In [None]:
def answer_graphrag(question):
    """Generate a free-form chatbot answer grounded on retrieved graph facts.

    Retrieves up to 40 facts, builds the chatbot prompt and greedily generates
    at most 128 new tokens. Returns only the generated continuation, stripped.
    """
    ctx = graphrag_retrieve(question, top_k=40)
    prompt = tao_prompt(question, ctx)
    ip = tok(prompt,return_tensors="pt").to(mdl.device)
    with torch.no_grad():
        out = mdl.generate(
            **ip,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tok.eos_token_id
        )
    # Drop the echoed prompt by token count; comparing decoded text against the
    # prompt string is unreliable when decoding does not round-trip exactly.
    so_token_prompt = ip["input_ids"].shape[1]
    return tok.decode(out[0][so_token_prompt:],skip_special_tokens=True).strip()

In [None]:
def chat_fn(message, history):
    """Chat callback: answer ``message`` via GraphRAG.

    ``history`` is accepted to match the callback signature but is not used,
    so every turn is answered independently of previous ones.
    """
    return answer_graphrag(message)

In [None]:
# Minimal chat UI: a heading plus a ChatInterface wired to chat_fn.
with gr.Blocks() as demo:
    gr.Markdown("# Chatbot EPL – GraphRAG")
    gr.ChatInterface(
        fn=chat_fn,
        title="Chat EPL",
        description="Hỏi đáp về cầu thủ, CLB, HLV, lịch sử và quan hệ trong Ngoại hạng Anh"
    )

# Starts the server; per the captured output, a hosted notebook turns on a
# public share link automatically unless share=False is passed.
demo.launch()

  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0a192b463ae488c66f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
from openai import OpenAI
import json, re, random
from tqdm.auto import tqdm

# No credentials are passed here; presumably the client reads the API key from
# the environment (e.g. OPENAI_API_KEY) -- confirm before running.
client = OpenAI()

duong_eval = "graph_eval_qa_2000.jsonl"  # JSON Lines evaluation set
seed = 42                                # seed for sampling the evaluation subset
so_mau_moi_hop = 80                      # requested number of questions per hop count

def doc_eval(path):
    """Load a JSON Lines file into a list of records, ignoring blank lines."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(text) for text in map(str.strip, f) if text]

def tach_bool(s):
    """Parse a model answer into True / False / None.

    Only the first line of the stripped, lower-cased answer is considered.
    Priority order:
      1. line starts with "đúng" or equals "true"  -> True
      2. line starts with "sai" or equals "false"  -> False
      3. line contains "đúng"                      -> True
      4. line contains "sai"                       -> False
    Anything else yields None.
    """
    first_line = (s or "").strip().lower().split("\n")[0].strip()

    # Strong signals first: the verdict opens the line or is the whole line.
    for viet_prefix, english_word, verdict in (("đúng", "true", True), ("sai", "false", False)):
        if first_line.startswith(viet_prefix) or first_line == english_word:
            return verdict

    # Weak signals: the Vietnamese verdict appears anywhere in the line.
    for keyword, verdict in (("đúng", True), ("sai", False)):
        if keyword in first_line:
            return verdict
    return None

def chuan_hoa_cau_hoi(q):
    """Trim the question and make sure it ends with a question mark.

    ``None`` or an empty string becomes ``"?"``.
    """
    cleaned = (q or "").strip()
    return cleaned if cleaned.endswith("?") else cleaned + "?"

def chatgpt_tra_loi_bool(question, model="gpt-4.1-mini"):
    """Ask an OpenAI model a true/false question and return its raw text reply.

    Args:
        question: the question text, placed after the instructions.
        model: model name passed to the Responses API.

    Returns:
        The reply text, stripped of surrounding whitespace. The instructions ask
        for a single word ("Đúng" or "Sai"), but the reply is not validated here.
    """
    prompt = (
        "Bạn là trợ lý bóng đá Ngoại hạng Anh.\n"
        "Chỉ trả lời đúng một từ: Đúng hoặc Sai.\n"
        "Nếu không chắc chắn, trả lời Sai.\n\n"
        f"{question}\n"
        "Trả lời:"
    )
    r = client.responses.create(model=model, input=prompt)
    # output_text aggregates the text of the response; indexing
    # r.output[0].content[0] assumed the first output item is a text message and
    # raises if it is not (e.g. when another item type comes first).
    return (r.output_text or "").strip()

def chay_deu_1_3_hop(ds, n_moi_hop=80, seed=42):
    """Draw a hop-balanced random sample of evaluation records.

    Records are bucketed by ``int(r.get("hops", 1))`` (only hops 1, 2 and 3 are
    kept) and must contain both ``"question"`` and ``"answer"``. The same number
    of records is taken from every bucket, so the smallest bucket caps the
    per-hop sample size.

    Args:
        ds: iterable of record dicts.
        n_moi_hop: requested number of records per hop count.
        seed: seed of the private random generator used for shuffling.

    Returns:
        ``(sample, n_used_per_hop, {hop: n_available})`` where ``sample`` is the
        shuffled concatenation of the per-hop picks.
    """
    # A private generator produces the same shuffles as random.seed(seed) would,
    # without resetting the global random state used elsewhere in the notebook.
    rng = random.Random(seed)

    buckets = {1:[], 2:[], 3:[]}
    for r in ds:
        h = int(r.get("hops", 1))
        if h in buckets and "question" in r and "answer" in r:
            buckets[h].append(r)

    # Balanced sampling: limited by the smallest bucket; never negative.
    n = max(0, min(n_moi_hop, *(len(b) for b in buckets.values())))

    tap = []
    for h in (1, 2, 3):
        rng.shuffle(buckets[h])
        tap.extend(buckets[h][:n])
    rng.shuffle(tap)
    return tap, n, {h: len(b) for h, b in buckets.items()}

def danh_gia(ds, fn_tra_loi):
    """Score ``fn_tra_loi`` on ``ds`` overall and per hop count.

    Each question is normalised with ``chuan_hoa_cau_hoi`` before being asked,
    and the reply is parsed with ``tach_bool``. Unparseable replies are counted
    as skipped and excluded from the accuracy denominator. The gold label is
    ``bool(r["answer"])``.
    NOTE(review): if "answer" is stored as a non-empty string, bool() is always
    True -- verify the dataset stores real booleans.

    Returns ``(accuracy, n_scored, n_skipped, per_hop)`` where ``per_hop`` maps
    hops 1-3 to ``{"dung", "tong", "bo_qua"}`` counters.
    """
    theo_hop = {h: {"dung":0,"tong":0,"bo_qua":0} for h in (1, 2, 3)}
    dung = tong = bo_qua = 0

    for r in tqdm(ds, desc="Đang đánh giá"):
        # Records with a hop count outside 1-3 still count in the overall totals.
        muc_hop = theo_hop.get(int(r.get("hops",1)))
        cau_hoi = chuan_hoa_cau_hoi(r["question"])
        gold = bool(r["answer"])
        pred = tach_bool(fn_tra_loi(cau_hoi))

        if pred is None:
            bo_qua += 1
            if muc_hop is not None:
                muc_hop["bo_qua"] += 1
            continue

        tong += 1
        if muc_hop is not None:
            muc_hop["tong"] += 1
        if pred == gold:
            dung += 1
            if muc_hop is not None:
                muc_hop["dung"] += 1

    acc = dung/tong if tong>0 else 0.0
    return acc, tong, bo_qua, theo_hop

# Load the evaluation file and draw the hop-balanced sample. The per-hop size
# actually used can be smaller than requested: chay_deu_1_3_hop caps it at the
# size of the smallest hop bucket.
ds = doc_eval(duong_eval)
tap, n_moi_hop_thuc_te, thong_ke = chay_deu_1_3_hop(ds, n_moi_hop=so_mau_moi_hop, seed=seed)

print("Số câu khả dụng theo hop:", thong_ke)
print("Đang dùng số mẫu mỗi hop:", n_moi_hop_thuc_te)
print("Tổng số câu đánh giá:", len(tap))

Số câu khả dụng theo hop: {1: 1944, 2: 4, 3: 52}
Đang dùng số mẫu mỗi hop: 4
Tổng số câu đánh giá: 12


In [None]:
def fn_graphrag(q):
    """Adapter with the one-argument signature danh_gia expects; answers via GraphRAG."""
    return answer_graphrag(q)

# Evaluate GraphRAG on the balanced sample and report overall and per-hop results.
acc_g, tong_g, bo_g, theo_hop_g = danh_gia(tap, fn_graphrag)
print("GraphRAG | accuracy:", acc_g, "| số mẫu tính:", tong_g, "| bỏ qua:", bo_g)
print("GraphRAG | theo hop:", theo_hop_g)

Đang đánh giá:   0%|          | 0/12 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/gradio/queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 2191, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 1696, in call_function
    prediction = await fn(*processed_input)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/utils.py", line 882, in async_wrapper
    response = await f(*args, **kwargs)
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-

KeyboardInterrupt: 

In [None]:
def fn_chatgpt(q):
    """Adapter with the one-argument signature danh_gia expects; answers via gpt-4.1-mini."""
    return chatgpt_tra_loi_bool(q, model="gpt-4.1-mini")

# Evaluate the ChatGPT baseline on the same sample used for GraphRAG.
acc_c, tong_c, bo_c, theo_hop_c = danh_gia(tap, fn_chatgpt)
print("ChatGPT | accuracy:", acc_c, "| số mẫu tính:", tong_c, "| bỏ qua:", bo_c)
print("ChatGPT | theo hop:", theo_hop_c)


In [None]:
!pip -q install -U gradio faiss-cpu sentence-transformers transformers accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.0/23.0 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, json, traceback, numpy as np, torch, faiss, networkx as nx
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

In [None]:
import os, json, traceback, numpy as np, torch, faiss, networkx as nx
import pickle # Import pickle for loading the graph
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

duong_kg = "kg_epl.gpickle"          # pickled networkx graph
duong_facts = "graph_facts.json"      # cache of verbalised facts
duong_emb = "graph_facts_emb.npy"     # cache of fact embeddings

# NOTE: unpickling can execute arbitrary code; only load graph files produced by
# this notebook. The context manager closes the handle deterministically.
with open(duong_kg, "rb") as f:
    G = pickle.load(f)

# Facts are cached on disk; when the cache is absent they are rebuilt from the
# graph edges as "<head label> <relation> <tail label>" and written back.
# NOTE(review): an existing cache is trusted as-is -- delete the file after
# rebuilding the graph so stale facts are not reused.
if os.path.exists(duong_facts):
    with open(duong_facts, "r", encoding="utf-8") as f:
        facts = json.load(f)
else:
    facts = []
    for u, v, d in G.edges(data=True):
        hu = G.nodes[u].get("label", str(u))
        hv = G.nodes[v].get("label", str(v))
        rel = d.get("relation", "related_to")
        facts.append(f"{hu} {rel} {hv}")
    with open(duong_facts, "w", encoding="utf-8") as f:
        json.dump(facts, f, ensure_ascii=False, indent=2)

print("Số fact:", len(facts))

Số fact: 972


In [None]:
embedder_name = "BAAI/bge-base-en-v1.5"
embedder = SentenceTransformer(embedder_name, device='cpu') # Explicitly load embedder to CPU

# A cached embedding matrix is reusable only if it has one row per fact and the
# same width as the current encoder. A cache written by a different encoder
# would yield an index whose dimension does not match the query embeddings, so
# any mismatch triggers a recompute that overwrites the stale file.
kich_thuoc_mong_doi = (len(facts), embedder.get_sentence_embedding_dimension())
emb = np.load(duong_emb) if os.path.exists(duong_emb) else None

if emb is None or emb.shape != kich_thuoc_mong_doi:
    if emb is not None:
        print("Cache embedding không khớp", emb.shape, "!=", kich_thuoc_mong_doi, "- đang tính lại")
    emb = embedder.encode(facts, convert_to_numpy=True, normalize_embeddings=True, batch_size=128, show_progress_bar=True)
    np.save(duong_emb, emb)

# FAISS expects float32; inner product on normalised vectors is cosine similarity.
emb = emb.astype("float32")
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

print("FAISS đã nạp:", index.ntotal)

FAISS đã nạp: 972


In [None]:
# Reuse the Qwen/Qwen2-0.5B-Instruct tokenizer and model that the evaluation
# section already loaded as `tok_llm` / `mdl_llm`; loading the model a second
# time caused an out-of-memory error.
# NOTE: this cell therefore depends on that earlier cell having been run.
llm_name = "Qwen/Qwen2-0.5B-Instruct"
tok = tok_llm # alias of the tokenizer already in memory
mdl = mdl_llm # alias of the model already in memory

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token


In [None]:
def graphrag_retrieve(question, top_k=40):
    """Return up to ``top_k`` facts most similar to ``question``, one per line.

    Returns an empty string when there are no facts. Indices outside the valid
    range (including negative labels reported by the index) are ignored.
    """
    so_fact = len(facts)
    if so_fact == 0:
        return ""
    query_vec = embedder.encode(
        [question], convert_to_numpy=True, normalize_embeddings=True
    ).astype("float32")
    _, hang = index.search(query_vec, min(top_k, so_fact))
    return "\n".join(facts[vi_tri] for vi_tri in hang[0] if 0 <= vi_tri < so_fact)


In [None]:
def tao_prompt(question, context):
    """Build the chatbot prompt from retrieved facts and the user question.

    Layout: instructions, "Thông tin:" followed by ``context``, "Câu hỏi:"
    followed by ``question``, and the trailing "Trả lời:" cue.
    """
    khuon_mau = (
        "Bạn là chatbot bóng đá Ngoại hạng Anh.\n"
        "Chỉ dựa trên thông tin bên dưới.\n"
        "Nếu không đủ thông tin, trả lời: Tôi không biết.\n\n"
        "Thông tin:\n"
        "{context}\n\n"
        "Câu hỏi:\n"
        "{question}\n\n"
        "Trả lời:"
    )
    return khuon_mau.format(context=context, question=question)


In [None]:
def answer_graphrag(question):
    """Generate a chatbot answer grounded on retrieved graph facts.

    Retrieves up to 50 facts, builds the prompt (truncated to 2048 tokens) and
    greedily generates at most 128 new tokens.

    Returns the generated continuation, or "Tôi không biết." when it is empty.
    """
    ctx = graphrag_retrieve(question, top_k=50)
    prompt = tao_prompt(question, ctx)
    # NOTE(review): truncation presumably drops the *end* of an over-long prompt
    # (the question and answer cue) -- confirm the tokenizer's truncation side.
    ip = tok(prompt, return_tensors="pt", truncation=True, max_length=2048).to(mdl.device)
    with torch.inference_mode():
        out = mdl.generate(
            **ip,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id
        )
    # Keep only the newly generated tokens. Matching the decoded text against
    # the prompt string fails whenever the prompt was truncated or decoding does
    # not round-trip exactly, and the whole prompt would then be returned.
    so_token_prompt = ip["input_ids"].shape[1]
    ans = tok.decode(out[0][so_token_prompt:], skip_special_tokens=True).strip()
    return ans if ans else "Tôi không biết."


In [None]:
def chat_fn(message, history):
    """Chat callback that never raises.

    Answers ``message`` via GraphRAG (``history`` is unused). On any exception
    the traceback is printed and a generic Vietnamese error message is returned.
    """
    try:
        reply = answer_graphrag(message)
    except Exception:
        print(traceback.format_exc())
        reply = "Lỗi khi suy luận, vui lòng thử lại."
    return reply


In [None]:
import numpy as np, faiss

In [None]:
# Diagnostics: the embedding width must equal the FAISS dimension, and both
# must match the width of the query embeddings produced by `embedder`.
print("Số fact:", len(facts))
print("Kích thước embedding facts:", emb.shape)
print("FAISS dimension:", index.d)

Số fact: 972
Kích thước embedding facts: (972, 384)
FAISS dimension: 384


In [None]:
def rebuild_faiss(embedder, facts, path_emb="graph_facts_emb.npy"):
    """Re-encode ``facts`` and build a fresh inner-product FAISS index.

    The normalised float32 embeddings are saved to ``path_emb`` (overwriting any
    existing file) before the index is built.

    Returns ``(embeddings, index)``.
    """
    vectors = embedder.encode(
        facts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        batch_size=128,
        show_progress_bar=True,
    ).astype("float32")
    np.save(path_emb, vectors)

    new_index = faiss.IndexFlatIP(vectors.shape[1])
    new_index.add(vectors)
    return vectors, new_index

In [None]:
# Probe the width of a query embedding to compare it with the index dimension.
qtest = "Erling Haaland hiện đang thi đấu cho Manchester City đúng hay sai?"
qemb = embedder.encode([qtest], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
print("Kích thước embedding query:", qemb.shape)

Kích thước embedding query: (1, 768)


In [None]:
# If the query embedding width differs from the index dimension, the cached
# fact embeddings do not belong to the current encoder: re-encode the facts and
# replace both `emb` and `index` (this also overwrites the cached .npy file).
qtest = "Erling Haaland hiện đang thi đấu cho Manchester City đúng hay sai?"
qemb = embedder.encode([qtest], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

if qemb.shape[1] != index.d:
    print("Phát hiện lệch chiều embedding, đang rebuild FAISS...")
    emb, index = rebuild_faiss(embedder, facts, path_emb="graph_facts_emb.npy")
    print("Đã rebuild xong | emb:", emb.shape, "| index.d:", index.d)
else:
    print("Chiều embedding khớp, không cần rebuild")

NameError: name 'qemb' is not defined

In [None]:
def graphrag_retrieve(question, top_k=40):
    """Return up to ``top_k`` facts most similar to ``question``, one per line.

    If the query embedding width does not match the index dimension, the index
    is rebuilt from ``facts`` first; the notebook-level ``emb`` and ``index``
    are replaced and the cached embedding file is overwritten.
    Returns an empty string when there are no facts.
    """
    global emb, index

    if not facts:
        return ""

    query_vec = embedder.encode(
        [question], convert_to_numpy=True, normalize_embeddings=True
    ).astype("float32")

    # Self-heal a stale index built with a different encoder.
    if query_vec.shape[1] != index.d:
        emb, index = rebuild_faiss(embedder, facts, path_emb="graph_facts_emb.npy")
        print("Đã tự rebuild FAISS do lệch chiều")

    _, hang = index.search(query_vec, min(top_k, len(facts)))
    return "\n".join(facts[vi_tri] for vi_tri in hang[0] if 0 <= vi_tri < len(facts))

In [None]:
# Smoke test of the chatbot pipeline; requires the cell defining answer_graphrag
# to have been run in the current kernel.
print(answer_graphrag("Erling Haaland hiện đang thi đấu cho Manchester City đúng hay sai?"))

NameError: name 'answer_graphrag' is not defined

In [None]:
# Chat UI wired to the exception-safe chat_fn callback.
with gr.Blocks() as demo:
    gr.Markdown("# Chatbot EPL – GraphRAG")
    gr.ChatInterface(
        fn=chat_fn,
        title="Chat EPL",
        description="Hỏi đáp về cầu thủ, CLB, HLV và quan hệ trong Ngoại hạng Anh"
    )

# Enable request queueing, then launch with a public share link; debug=True
# surfaces errors in the notebook output.
demo.queue()
demo.launch(share=True, debug=True)


NameError: name 'gr' is not defined