In [1]:
!pip -q install spacy sentence-transformers rapidfuzz evaluate datasets networkx python-louvain tqdm faiss-cpu gradio transformers accelerate
!python -m spacy download en_core_web_sm -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.2 MB[0m [31m103.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencie

In [2]:
import os, json, re, random, unicodedata, traceback
from typing import List, Dict, Any
from collections import Counter

import numpy as np
import torch
import networkx as nx
from tqdm.auto import tqdm

import spacy
from spacy.pipeline import EntityRuler

from rapidfuzz import fuzz, process
import faiss
import gradio as gr

from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7ed3870cd710>

In [3]:
DUONG_ARTICLES = "all_articles.json"
DUONG_WIKIDATA = "wikidata_epl_entities.json"

assert os.path.exists(DUONG_ARTICLES), f"Không tìm thấy {DUONG_ARTICLES}"
assert os.path.exists(DUONG_WIKIDATA), f"Không tìm thấy {DUONG_WIKIDATA}"

with open(DUONG_ARTICLES, "r", encoding="utf-8") as f:
    articles = json.load(f).get("articles", [])
print("Số bài báo:", len(articles))

with open(DUONG_WIKIDATA, "r", encoding="utf-8") as f:
    wikidata = json.load(f)

clubs = wikidata.get("clubs", [])
players = wikidata.get("players", [])
managers = wikidata.get("managers", [])
stadiums = wikidata.get("stadiums", [])

print("Số CLB:", len(clubs))
print("Số cầu thủ:", len(players))
print("Số HLV:", len(managers))
print("Số sân:", len(stadiums))


Số bài báo: 256
Số CLB: 21
Số cầu thủ: 3980
Số HLV: 20
Số sân: 21


In [4]:
def tach_cau(text: str) -> List[str]:
    text = re.sub(r"\s+", " ", (text or "")).strip()
    if not text:
        return []
    return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+", text) if s.strip()]

In [5]:
nlp = spacy.load("en_core_web_sm")

ruler = nlp.add_pipe("entity_ruler", before="ner")

patterns = []
for p in players:
    if p.get("label"):
        patterns.append({"label": "PERSON", "pattern": p["label"]})

for c in clubs:
    if c.get("label"):
        patterns.append({"label": "ORG", "pattern": c["label"]})
    if c.get("short_label"):
        patterns.append({"label": "ORG", "pattern": c["short_label"]})

for s in stadiums:
    if s.get("label"):
        patterns.append({"label": "LOC", "pattern": s["label"]})

ruler.add_patterns(patterns)
print("Đã thêm EntityRuler | patterns:", len(patterns))

Đã thêm EntityRuler | patterns: 4022


In [6]:
SPACY2TAG = {
    "PERSON": "PER",
    "ORG": "ORG",
    "GPE": "LOC",
    "LOC": "LOC",
    "FAC": "LOC",
}

def chay_ner(sentence: str) -> List[Dict[str, Any]]:
    doc = nlp(sentence)
    out = []
    for ent in doc.ents:
        tag = SPACY2TAG.get(ent.label_)
        if not tag:
            continue
        out.append({
            "text": ent.text,
            "type": tag,
            "start": ent.start_char,
            "end": ent.end_char,
            "score": 1.0
        })
    return out

def loc_thuc_the(ents: List[Dict[str, Any]]):
    uu_tien = {"PER", "ORG", "LOC"}
    return [e for e in ents if e["type"] in uu_tien]

def tao_cap_thuc_the(ents: List[Dict[str, Any]]):
    out = []
    for i in range(len(ents)):
        for j in range(i + 1, len(ents)):
            out.append({"head": ents[i], "tail": ents[j]})
    return out

In [7]:
re_ce = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device="cpu")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device="cpu")

RELATIONS = ["player_club", "manager_club", "player_score", "transfer", "injury", "other"]

HYPOTHESIS = {
    "transfer": "The sentence is about a transfer: a player joined, signed, moved, or transferred.",
    "manager_club": "The sentence says a manager was appointed, sacked, or is managing a club.",
    "player_score": "The sentence says a player scored a goal or goals.",
    "injury": "The sentence says a player is injured or has an injury issue.",
    "player_club": "The sentence states a player is playing for or belongs to a club.",
    "other": "The sentence is not about transfers, managers, scoring, injuries, or player-club relation.",
}

def chay_re(sentence: str) -> Dict[str, Any]:
    pairs = [(sentence, HYPOTHESIS[r]) for r in RELATIONS]
    scores = re_ce.predict(pairs)
    best_i = int(np.argmax(scores))
    best_rel = RELATIONS[best_i]
    best_score = float(scores[best_i])

    s = sentence.lower()
    gate = {
        "transfer": any(k in s for k in ["join", "joined", "sign", "signed", "transfer", "moved", "deal", "loan"]),
        "manager_club": any(k in s for k in ["manager", "head coach", "appointed", "sacked", "dismissed"]),
        "player_score": any(k in s for k in ["scored", "score", "goal", "brace", "hat-trick", "penalty"]),
        "injury": any(k in s for k in ["injury", "injured", "out for", "ruled out", "hamstring", "ankle", "fitness"]),
        "player_club": any(k in s for k in ["plays for", "playing for", "for", "at", "with", "club", "fc"]),
        "other": True,
    }

    if best_rel != "other" and (not gate.get(best_rel, True)) and best_score < 1.0:
        best_rel = "other"

    return {"label": best_rel, "score": best_score}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [8]:
def trich_quan_he_tu_cau(sentence: str):
    ents = loc_thuc_the(chay_ner(sentence))
    if len(ents) < 2:
        return []
    pred = chay_re(sentence)
    if pred["label"] == "other":
        return []
    pairs = tao_cap_thuc_the(ents)
    triples = []
    for p in pairs:
        triples.append({
            "sentence": sentence,
            "relation": pred["label"],
            "relation_score": pred["score"],
            "head": p["head"]["text"],
            "head_type": p["head"]["type"],
            "tail": p["tail"]["text"],
            "tail_type": p["tail"]["type"],
        })
    return triples

def trich_quan_he_tu_bai_bao(article):
    content = article.get("content", "") or ""
    if not content.strip():
        return []
    triples = []
    for s in tach_cau(content):
        triples.extend(trich_quan_he_tu_cau(s))
    meta = {
        "url": article.get("url"),
        "title": article.get("title"),
        "source": article.get("source"),
        "published_date": article.get("published_date"),
    }
    for t in triples:
        t["metadata"] = meta
    return triples

def trich_quan_he_tu_ds_bai_bao(articles):
    out = []
    for art in tqdm(articles, desc="Đang trích NER+RE", ncols=100):
        out.extend(trich_quan_he_tu_bai_bao(art))
    return out

In [9]:
triples_raw = trich_quan_he_tu_ds_bai_bao(articles)
with open("triples_raw.json", "w", encoding="utf-8") as f:
    json.dump(triples_raw, f, ensure_ascii=False, indent=2)

print("Tổng triple thô:", len(triples_raw))
print("Đã lưu: triples_raw.json")

Đang trích NER+RE:   0%|                                                    | 0/256 [00:00<?, ?it/s]

Tổng triple thô: 881
Đã lưu: triples_raw.json


In [10]:
def bo_dau(s):
    s = unicodedata.normalize("NFD", s)
    return "".join(c for c in s if unicodedata.category(c) != "Mn")

def norm(s):
    if not s:
        return ""
    s = s.lower().strip()
    s = bo_dau(s)
    s = s.replace("’", "'").replace("`", "'")
    s = re.sub(r"[^\w ]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    rep = ["fc", "afc", "cf", "sc", "the", "club", "football"]
    for r in rep:
        s = s.replace(f" {r} ", " ")
        if s.endswith(f" {r}"):
            s = s[:-(len(r)+1)]
        if s.startswith(f"{r} "):
            s = s[(len(r)+1):]
    return re.sub(r"\s+", " ", s).strip()

def to_dic(lst):
    d = {}
    for x in lst:
        k = norm(x.get("label",""))
        if k:
            d.setdefault(k, []).append(x)
    return d

dic = {
    "player": to_dic(players),
    "club": to_dic(clubs),
    "manager": to_dic(managers),
    "stadium": to_dic(stadiums),
}

cand = {
    "player": [p["label"] for p in players if p.get("label")],
    "club": [c["label"] for c in clubs if c.get("label")],
    "manager": [m["label"] for m in managers if m.get("label")],
    "stadium": [s["label"] for s in stadiums if s.get("label")],
}
cand_norm = {k: [norm(x) for x in v] for k, v in cand.items()}

def fuzzy_three_way(q, lst):
    r1 = process.extractOne(q, lst, scorer=fuzz.ratio)
    r2 = process.extractOne(q, lst, scorer=fuzz.partial_ratio)
    r3 = process.extractOne(q, lst, scorer=fuzz.token_sort_ratio)
    rs = [x for x in [r1, r2, r3] if x]
    if not rs:
        return None
    return max(rs, key=lambda x: x[1])

def match_type(text, etype):
    q = norm(text)
    lst = cand_norm.get(etype, [])
    if not lst:
        return None
    kq = fuzzy_three_way(q, lst)
    if not kq:
        return None
    _, score, idx = kq
    if len(q) <= 4 and score < 85:
        return None
    if len(q) > 4 and score < 55:
        return None
    lab = cand[etype][idx]
    k = norm(lab)
    objs = dic[etype].get(k, [])
    return objs[0] if objs else None

def link_entity(text, etype):
    r = match_type(text, etype)
    if r:
        return r
    toks = text.split()
    if len(toks) > 1:
        for t in toks:
            r = match_type(t, etype)
            if r:
                return r
    q = norm(text)
    for k, lst in dic[etype].items():
        if q == k or q in k or k in q:
            return lst[0]
    return None

def map_type(tag):
    if tag == "PER":
        return ["player", "manager"]
    if tag == "ORG":
        return ["club"]
    if tag == "LOC":
        return ["stadium"]
    return ["club", "player", "manager", "stadium"]

def link_triple(t):
    heads = map_type(t.get("head_type",""))
    tails = map_type(t.get("tail_type",""))

    h_ent = None
    for tp in heads:
        h_ent = link_entity(t["head"], tp)
        if h_ent:
            break
    if not h_ent:
        return None

    t_ent = None
    for tp in tails:
        t_ent = link_entity(t["tail"], tp)
        if t_ent:
            break
    if not t_ent:
        return None

    return {
        "head": h_ent["id"],
        "head_label": h_ent["label"],
        "head_type": h_ent.get("type",""),
        "tail": t_ent["id"],
        "tail_label": t_ent["label"],
        "tail_type": t_ent.get("type",""),
        "relation": t["relation"],
        "relation_score": float(t.get("relation_score", 0.0)),
        "sentence": t["sentence"],
        "source": t["metadata"].get("source",""),
        "date": t["metadata"].get("published_date",""),
    }

In [11]:
triples_linked = []
for t in tqdm(triples_raw, desc="Entity Linking", ncols=100):
    r = link_triple(t)
    if r:
        triples_linked.append(r)

with open("triples_linked.json", "w", encoding="utf-8") as f:
    json.dump(triples_linked, f, ensure_ascii=False, indent=2)

print("Triple thô:", len(triples_raw))
print("Triple đã liên kết:", len(triples_linked))
print("Đã lưu: triples_linked.json")

Entity Linking:   0%|                                                       | 0/881 [00:00<?, ?it/s]

Triple thô: 881
Triple đã liên kết: 692
Đã lưu: triples_linked.json


In [12]:
nodes_dict = {}
edges_kg = []

for t in triples_linked:
    h = t.get("head")
    ta = t.get("tail")
    if not h or not ta:
        continue

    nodes_dict[h] = {"id": h, "label": t.get("head_label"), "type": t.get("head_type")}
    nodes_dict[ta] = {"id": ta, "label": t.get("tail_label"), "type": t.get("tail_type")}

    edges_kg.append({
        "id": len(edges_kg),
        "head": h,
        "tail": ta,
        "relation": t["relation"],
        "relation_score": t.get("relation_score", 0.0),
        "sentence": t.get("sentence"),
        "source": t.get("source"),
        "date": t.get("date"),
    })

with open("kg_nodes.json", "w", encoding="utf-8") as f:
    json.dump(list(nodes_dict.values()), f, ensure_ascii=False, indent=2)
with open("kg_edges.json", "w", encoding="utf-8") as f:
    json.dump(edges_kg, f, ensure_ascii=False, indent=2)

print("Số node KG:", len(nodes_dict))
print("Số edge KG:", len(edges_kg))
print("Đã lưu: kg_nodes.json, kg_edges.json")

Số node KG: 239
Số edge KG: 692
Đã lưu: kg_nodes.json, kg_edges.json


In [13]:
with open("kg_nodes.json", "r", encoding="utf-8") as f:
    nodes = json.load(f)
with open("kg_edges.json", "r", encoding="utf-8") as f:
    edges = json.load(f)

nodes_by_id = {n["id"]: n for n in nodes}

G = nx.Graph()
DG = nx.DiGraph()

for n in nodes:
    G.add_node(n["id"], label=n.get("label"), type=n.get("type"))
    DG.add_node(n["id"], label=n.get("label"), type=n.get("type"))

for e in edges:
    G.add_edge(e["head"], e["tail"], relation=e.get("relation"), sentence=e.get("sentence"), source=e.get("source"), date=e.get("date"))
    DG.add_edge(e["head"], e["tail"], relation=e.get("relation"), sentence=e.get("sentence"), source=e.get("source"), date=e.get("date"))

cac_tp = sorted(nx.connected_components(G), key=len, reverse=True)
G_gc = G.subgraph(cac_tp[0]).copy() if cac_tp else G.copy()

print("Số thành phần liên thông:", len(cac_tp))
print("Thành phần lớn nhất:", G_gc.number_of_nodes(), "node,", G_gc.number_of_edges(), "edge")

Số thành phần liên thông: 18
Thành phần lớn nhất: 200 node, 466 edge


In [14]:
def tao_text_fact(eid, head_id, tail_id, rel, sentence, source, date):
    h = nodes_by_id.get(head_id, {})
    t = nodes_by_id.get(tail_id, {})
    h_label = h.get("label", str(head_id))
    t_label = t.get("label", str(tail_id))
    h_type = h.get("type", "")
    t_type = t.get("type", "")
    parts = [f"{h_label} ({h_type}) có quan hệ {rel} với {t_label} ({t_type})"]
    if date:
        parts.append(f"thời điểm {date}")
    if source:
        parts.append(f"nguồn {source}")
    if sentence:
        parts.append(f"câu gốc: {sentence}")
    return f"[F{eid}] " + ". ".join(parts)

graph_facts = []
for e in edges:
    graph_facts.append({
        "id": e["id"],
        "head": e["head"],
        "tail": e["tail"],
        "relation": e["relation"],
        "relation_score": float(e.get("relation_score", 0.0)),
        "text": tao_text_fact(e["id"], e["head"], e["tail"], e["relation"], e.get("sentence"), e.get("source"), e.get("date")),
        "source": e.get("source"),
        "date": e.get("date"),
        "sentence": e.get("sentence"),
    })

with open("graph_facts.json", "w", encoding="utf-8") as f:
    json.dump(graph_facts, f, ensure_ascii=False, indent=2)

print("Tổng số fact:", len(graph_facts))
print("Đã lưu: graph_facts.json")

Tổng số fact: 692
Đã lưu: graph_facts.json


In [15]:
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

fact_texts = [f["text"] for f in graph_facts]
embeddings = embed_model.encode(
    fact_texts,
    batch_size=256,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
).astype("float32")

np.save("graph_facts_emb.npy", embeddings)

index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

print("FAISS:", index.ntotal, "| dim:", index.d)
print("Đã lưu: graph_facts_emb.npy")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

FAISS: 692 | dim: 384
Đã lưu: graph_facts_emb.npy


In [16]:
def lay_anchors_tu_cau(question: str):
    ents = loc_thuc_the(chay_ner(question))
    out = []
    for e in ents:
        if e["type"] in ["PER", "ORG", "LOC"]:
            out.append({"text": e["text"], "type": e["type"]})
    return out

def link_anchor(ent):
    ten = ent["text"]
    loai = ent["type"]
    if loai == "PER":
        x = link_entity(ten, "player") or link_entity(ten, "manager")
        return x.get("id") if x else None
    if loai == "ORG":
        x = link_entity(ten, "club")
        return x.get("id") if x else None
    if loai == "LOC":
        x = link_entity(ten, "stadium")
        return x.get("id") if x else None
    x = link_entity(ten, "player") or link_entity(ten, "club") or link_entity(ten, "manager") or link_entity(ten, "stadium")
    return x.get("id") if x else None

def lay_node_anchor(question: str):
    ents = lay_anchors_tu_cau(question)
    ids = []
    for e in ents:
        nid = link_anchor(e)
        if nid is not None:
            ids.append(nid)
    return list(set(ids))

def lay_lan_can_k_hop(anchors, k_hop=2):
    tap = set()
    for a in anchors:
        if a not in G:
            continue
        tap.add(a)
        frontier = {a}
        for _ in range(k_hop):
            ke_tiep = set()
            for u in frontier:
                for v in G.neighbors(u):
                    if v not in tap:
                        ke_tiep.add(v)
            tap.update(ke_tiep)
            frontier = ke_tiep
    return tap

def chon_fact_ket_hop_node(nodes_tap: set):
    chi_so = []
    for i, f in enumerate(graph_facts):
        if f["head"] in nodes_tap or f["tail"] in nodes_tap:
            chi_so.append(i)
    return chi_so

def rerank_texts(question, texts, top_k=15):
    pairs = [(question, t) for t in texts]
    scores = reranker.predict(pairs)
    order = np.argsort(-scores)[:min(top_k, len(texts))]
    return [texts[i] for i in order], [float(scores[i]) for i in order]

In [17]:
def lay_context_tu_graph(question: str, top_k_faiss: int = 50, top_k_final: int = 15, k_hop: int = 2):
    anchors = lay_node_anchor(question)

    if anchors:
        nodes_k = lay_lan_can_k_hop(anchors, k_hop=k_hop)
        idx_ung_vien = chon_fact_ket_hop_node(nodes_k)
        if not idx_ung_vien:
            idx_ung_vien = list(range(len(graph_facts)))
    else:
        idx_ung_vien = list(range(len(graph_facts)))

    q_emb = embed_model.encode([question], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    cand_emb = embeddings[idx_ung_vien]
    scores = cand_emb @ q_emb[0]
    order = np.argsort(-scores)[:min(top_k_faiss, len(scores))]

    cand_facts = [graph_facts[idx_ung_vien[i]] for i in order]
    cand_texts = [f["text"] for f in cand_facts]

    reranked_texts, _ = rerank_texts(question, cand_texts, top_k=top_k_final)

    context = "\n".join(reranked_texts)

    id_map = {f["text"]: f for f in cand_facts}
    facts_final = [id_map[t] for t in reranked_texts if t in id_map]

    return context, facts_final

In [18]:
def tao_prompt(question: str, context: str) -> str:
    return (
        "Bạn là trợ lý bóng đá Ngoại hạng Anh (EPL).\n"
        "Dưới đây là các FACT từ Knowledge Graph. Mỗi fact có ID dạng [Fxx].\n\n"
        "FACTS:\n"
        f"{context}\n\n"
        "YÊU CẦU BẮT BUỘC:\n"
        "- Chỉ sử dụng FACTS ở trên.\n"
        "- Khi trả lời, PHẢI trích dẫn ít nhất 1 ID fact [Fxx].\n"
        "- Nếu không có fact phù hợp, trả lời đúng một câu: KHÔNG ĐỦ DỮ LIỆU.\n"
        "- Không suy đoán, không bịa.\n"
        "- Trả lời tiếng Việt.\n\n"
        f"Câu hỏi: {question}\n\n"
        "Trả lời (kèm trích dẫn):"
    )

def hau_kiem(answer: str, facts: List[Dict[str, Any]]):
    if not answer:
        return "KHÔNG ĐỦ DỮ LIỆU."
    a = answer.strip()
    if "KHÔNG ĐỦ DỮ LIỆU" in a.upper():
        return "KHÔNG ĐỦ DỮ LIỆU."
    ids = {f"[F{f['id']}]" for f in facts}
    if not any(i in a for i in ids):
        return "KHÔNG ĐỦ DỮ LIỆU."
    return a

In [19]:
LLM_NAME = "Qwen/Qwen2-0.5B-Instruct"
tokenizer_llm = AutoTokenizer.from_pretrained(LLM_NAME)

use_cuda = torch.cuda.is_available()
model_llm = AutoModelForCausalLM.from_pretrained(
    LLM_NAME,
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    device_map="auto" if use_cuda else None,
    low_cpu_mem_usage=True
)

if tokenizer_llm.pad_token_id is None:
    tokenizer_llm.pad_token = tokenizer_llm.eos_token

model_llm.eval()
print("Đã tải LLM | CUDA:", use_cuda)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Đã tải LLM | CUDA: True


In [20]:
def answer_question_graph(
    question: str,
    k_hop: int = 2,
    top_k_faiss: int = 50,
    top_k_final: int = 15,
    max_new_tokens: int = 220,
):
    context, facts = lay_context_tu_graph(
        question,
        top_k_faiss=top_k_faiss,
        top_k_final=top_k_final,
        k_hop=k_hop
    )

    prompt = tao_prompt(question, context)
    inputs = tokenizer_llm(prompt, return_tensors="pt", truncation=True, max_length=2048)

    if use_cuda:
        inputs = {k: v.to(model_llm.device) for k, v in inputs.items()}

    with torch.no_grad():
        out = model_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer_llm.eos_token_id,
            eos_token_id=tokenizer_llm.eos_token_id
        )

    full_text = tokenizer_llm.decode(out[0], skip_special_tokens=True)
    raw_answer = full_text[len(prompt):].strip() if full_text.startswith(prompt) else full_text.strip()
    answer = hau_kiem(raw_answer, facts)

    return {
        "question": question,
        "answer": answer,
        "context": context,
        "facts": facts,
        "anchors": lay_node_anchor(question),
    }

In [21]:
G_und = G_gc.copy()
DG_dir = DG.copy()

print("Số node (toàn đồ thị):", G.number_of_nodes())
print("Số edge (toàn đồ thị):", G.number_of_edges())
print("Số node (thành phần lớn nhất):", G_und.number_of_nodes())
print("Số edge (thành phần lớn nhất):", G_und.number_of_edges())

Số node (toàn đồ thị): 239
Số edge (toàn đồ thị): 492
Số node (thành phần lớn nhất): 200
Số edge (thành phần lớn nhất): 466


In [22]:
print("Đang tính thế giới nhỏ (small-world)...")

if G_und.number_of_nodes() < 3 or G_und.number_of_edges() < 2:
    print("Không đủ dữ liệu để tính small-world.")
else:
    L = nx.average_shortest_path_length(G_und)
    C = nx.average_clustering(G_und)

    G_rand = nx.gnm_random_graph(G_und.number_of_nodes(), G_und.number_of_edges(), seed=42)
    if nx.is_connected(G_rand):
        Lr = nx.average_shortest_path_length(G_rand)
    else:
        comp = max(nx.connected_components(G_rand), key=len)
        Lr = nx.average_shortest_path_length(G_rand.subgraph(comp).copy())
    Cr = nx.average_clustering(G_rand)

    sigma = (C / (Cr + 1e-9)) / (L / (Lr + 1e-9))

    print("L (độ dài đường đi ngắn nhất TB):", float(L))
    print("C (hệ số gom cụm TB):", float(C))
    print("L_random:", float(Lr))
    print("C_random:", float(Cr))
    print("Sigma (small-world):", float(sigma))

Đang tính thế giới nhỏ (small-world)...
L (độ dài đường đi ngắn nhất TB): 3.6898492462311556
C (hệ số gom cụm TB): 0.5680790087675287
L_random: 3.6327089995431705
C_random: 0.026948051948051947
Sigma (small-world): 20.754072855772986


In [23]:
print("Đang chạy PageRank...")

pr = nx.pagerank(DG_dir, alpha=0.85)
top_pr = sorted(pr.items(), key=lambda x: x[1], reverse=True)[:30]

print("Top 30 node PageRank cao nhất:")
for nid, score in top_pr:
    data = G.nodes[nid]
    print(data.get("label"), "| loại:", data.get("type"), "| PageRank:", float(score))

Đang chạy PageRank...
Top 30 node PageRank cao nhất:
Leeds United F.C. | loại: club | PageRank: 0.03503090554021312
Mohammed Kudus | loại: player | PageRank: 0.03301391568620841
Arsenal F.C. | loại: club | PageRank: 0.02588608731925086
Wolverhampton Wanderers F.C. | loại: club | PageRank: 0.020958327625877352
Liverpool F.C. | loại: club | PageRank: 0.0205934935348743
Anfield | loại: stadium | PageRank: 0.018543997199119106
Randal Kolo Muani | loại: player | PageRank: 0.01636883964257979
Aston Villa F.C. | loại: club | PageRank: 0.016293711422033764
AFC Bournemouth | loại: club | PageRank: 0.012899970164590938
Stamford Bridge | loại: stadium | PageRank: 0.012201903506733405
Nottingham Forest F.C. | loại: club | PageRank: 0.012063155766124057
Chris Swan | loại: player | PageRank: 0.012016348951146607
Bill Caesar | loại: player | PageRank: 0.011490102219108705
City Ground | loại: stadium | PageRank: 0.011219613454951238
Albert Chester | loại: player | PageRank: 0.010565580174461717
Villa 

In [25]:
import community.community_louvain as community_louvain

In [26]:
print("Đang phát hiện cộng đồng (Louvain)...")

partition = community_louvain.best_partition(G_und, random_state=42)
com_dict = {}
for node, cid in partition.items():
    com_dict.setdefault(cid, []).append(node)

com_list = sorted(com_dict.items(), key=lambda x: len(x[1]), reverse=True)
print("Số cộng đồng:", len(com_list))

def thong_ke_cong_dong(cid, members):
    sub = G_und.subgraph(members)
    types = [sub.nodes[n].get("type") for n in sub.nodes()]
    dem_type = Counter(types)
    avg_pr = float(np.mean([pr.get(n, 0.0) for n in sub.nodes()])) if sub.number_of_nodes() else 0.0
    return {
        "cid": cid,
        "size": sub.number_of_nodes(),
        "edges": sub.number_of_edges(),
        "type_count": dict(dem_type),
        "avg_pagerank": avg_pr,
    }

com_stats = [thong_ke_cong_dong(cid, members) for cid, members in com_list]

print("Top 10 cộng đồng lớn nhất:")
for c in com_stats[:10]:
    print("CID:", c["cid"], "| size:", c["size"], "| edges:", c["edges"], "| avg_pagerank:", c["avg_pagerank"])

Đang phát hiện cộng đồng (Louvain)...
Số cộng đồng: 11
Top 10 cộng đồng lớn nhất:
CID: 8 | size: 30 | edges: 98 | avg_pagerank: 0.006577804406232149
CID: 5 | size: 27 | edges: 46 | avg_pagerank: 0.004740488623821478
CID: 1 | size: 23 | edges: 30 | avg_pagerank: 0.004113817253319422
CID: 3 | size: 22 | edges: 32 | avg_pagerank: 0.004237075505996749
CID: 2 | size: 20 | edges: 32 | avg_pagerank: 0.004187742761871058
CID: 4 | size: 17 | edges: 23 | avg_pagerank: 0.003876309656391523
CID: 7 | size: 17 | edges: 24 | avg_pagerank: 0.004656535424247358
CID: 10 | size: 16 | edges: 20 | avg_pagerank: 0.004109349760291777
CID: 0 | size: 10 | edges: 14 | avg_pagerank: 0.003575657986513997
CID: 6 | size: 10 | edges: 17 | avg_pagerank: 0.0038018242993086504


In [27]:
print("Đang xây dựng suy luận Multi-hop...")

def tim_paths_multi_hop(anchors, max_hops=3, max_paths=25):
    paths = []
    anchors = [a for a in anchors if a in G_und]
    if len(anchors) < 2:
        return []
    for i in range(len(anchors)):
        for j in range(i + 1, len(anchors)):
            s = anchors[i]
            t = anchors[j]
            try:
                for p in nx.all_simple_paths(G_und, source=s, target=t, cutoff=max_hops):
                    if 2 <= (len(p) - 1) <= max_hops:
                        paths.append(p)
                        if len(paths) >= max_paths:
                            return paths
            except:
                pass
    return paths

def text_cho_path(path):
    lines = []
    for i in range(len(path) - 1):
        u = path[i]
        v = path[i + 1]
        rel = G_und[u][v].get("relation", "related_to")
        ul = G_und.nodes[u].get("label", str(u))
        vl = G_und.nodes[v].get("label", str(v))
        ut = G_und.nodes[u].get("type", "")
        vt = G_und.nodes[v].get("type", "")
        lines.append(f"{ul} ({ut}) -[{rel}]-> {vl} ({vt})")
    return " ; ".join(lines)

def build_multi_hop_context(question, max_hops=3, max_paths=10):
    anchors = lay_node_anchor(question)
    if not anchors:
        return "", []
    paths = tim_paths_multi_hop(anchors, max_hops=max_hops, max_paths=max_paths)
    if not paths:
        return "", []
    ctx_lines = ["Multi-hop paths liên quan:"]
    for p in paths:
        ctx_lines.append(text_cho_path(p))
    return "\n".join(ctx_lines), paths

def tao_prompt_multi_hop(question: str, context_1hop: str, context_multi: str) -> str:
    return (
        "Bạn là trợ lý bóng đá Ngoại hạng Anh (EPL).\n"
        "Bạn chỉ được dùng dữ liệu sau để trả lời.\n"
        "FACTS có ID dạng [Fxx]. Multi-hop paths là chuỗi quan hệ.\n\n"
        "FACTS:\n"
        f"{context_1hop}\n\n"
        f"{context_multi}\n\n"
        "YÊU CẦU BẮT BUỘC:\n"
        "- Chỉ sử dụng thông tin ở trên.\n"
        "- Khi trả lời, PHẢI trích dẫn ít nhất 1 ID fact [Fxx] hoặc nêu rõ chuỗi multi-hop.\n"
        "- Nếu không đủ thông tin, trả lời đúng một câu: KHÔNG ĐỦ DỮ LIỆU.\n"
        "- Không suy đoán, không bịa.\n"
        "- Trả lời tiếng Việt.\n\n"
        f"Câu hỏi: {question}\n\n"
        "Trả lời:"
    )

def answer_question_multi_hop(
    question: str,
    k_hop: int = 2,
    top_k_faiss: int = 50,
    top_k_final: int = 15,
    max_hops: int = 3,
    max_paths: int = 10,
    max_new_tokens: int = 240,
):
    context_1hop, facts = lay_context_tu_graph(question, top_k_faiss=top_k_faiss, top_k_final=top_k_final, k_hop=k_hop)
    context_multi, paths = build_multi_hop_context(question, max_hops=max_hops, max_paths=max_paths)

    prompt = tao_prompt_multi_hop(question, context_1hop, context_multi)
    inputs = tokenizer_llm(prompt, return_tensors="pt", truncation=True, max_length=2048)
    if use_cuda:
        inputs = {k: v.to(model_llm.device) for k, v in inputs.items()}

    with torch.no_grad():
        out = model_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer_llm.eos_token_id,
            eos_token_id=tokenizer_llm.eos_token_id
        )

    full_text = tokenizer_llm.decode(out[0], skip_special_tokens=True)
    raw_answer = full_text[len(prompt):].strip() if full_text.startswith(prompt) else full_text.strip()
    answer = hau_kiem(raw_answer, facts)

    return {
        "question": question,
        "answer": answer,
        "context_1hop": context_1hop,
        "context_multi": context_multi,
        "facts": facts,
        "paths": paths,
        "anchors": lay_node_anchor(question),
    }

print("Đã sẵn sàng suy luận Multi-hop.")

Đang xây dựng suy luận Multi-hop...
Đã sẵn sàng suy luận Multi-hop.


In [28]:
print("Đang tạo tập dữ liệu đánh giá 2000 câu (Đúng/Sai)...")

def tao_menh_de_edge(e):
    h = nodes_by_id.get(e["head"], {})
    t = nodes_by_id.get(e["tail"], {})
    rel = e["relation"]
    hl = h.get("label", str(e["head"]))
    tl = t.get("label", str(e["tail"]))
    if rel == "player_club":
        return f"{hl} hiện đang thi đấu cho {tl}"
    if rel == "manager_club":
        return f"{hl} là huấn luyện viên của {tl}"
    if rel == "player_score":
        return f"{hl} đã ghi bàn cho {tl}"
    if rel == "transfer":
        return f"{hl} đã chuyển nhượng liên quan tới {tl}"
    if rel == "injury":
        return f"{hl} đang gặp chấn thương trong giai đoạn liên quan tới {tl}"
    return f"{hl} có quan hệ {rel} với {tl}"

def tao_cau_hoi_true_false_tu_edge(e, hops=1):
    m = tao_menh_de_edge(e)
    return f"Câu sau đây đúng hay sai: {m}?", True, hops

def tao_cau_hoi_false_tu_edge(e, nodes_list, hops=1):
    h = nodes_by_id.get(e["head"], {})
    t = nodes_by_id.get(e["tail"], {})
    rel = e["relation"]
    tl_type = t.get("type", "")
    same_type_nodes = [n for n in nodes_list if nodes_by_id.get(n, {}).get("type","") == tl_type and n != e["tail"]]
    if not same_type_nodes:
        return None
    t_fake_id = str(np.random.choice(same_type_nodes))
    t_fake = nodes_by_id.get(t_fake_id, {})
    hl = h.get("label", str(e["head"]))
    tl = t_fake.get("label", str(t_fake_id))
    if rel == "player_club":
        m = f"{hl} hiện đang thi đấu cho {tl}"
    elif rel == "manager_club":
        m = f"{hl} là huấn luyện viên của {tl}"
    elif rel == "player_score":
        m = f"{hl} đã ghi bàn cho {tl}"
    elif rel == "transfer":
        m = f"{hl} đã chuyển nhượng liên quan tới {tl}"
    elif rel == "injury":
        m = f"{hl} đang gặp chấn thương trong giai đoạn liên quan tới {tl}"
    else:
        m = f"{hl} có quan hệ {rel} với {tl}"
    return f"Câu sau đây đúng hay sai: {m}?", False, hops

def tao_cau_hoi_true_false_tu_path(path, hops=2):
    return f"Câu sau đây đúng hay sai: {text_cho_path(path)}?", True, hops

def sinh_tap_cau_hoi(so_cau=2000, max_hops=3):
    ds = []
    nodes_ids = list(nodes_by_id.keys())
    np.random.shuffle(nodes_ids)

    edges_shuffled = edges_kg.copy()
    np.random.shuffle(edges_shuffled)

    for e in edges_shuffled:
        if len(ds) >= so_cau:
            break
        q_true, ans_true, hops_true = tao_cau_hoi_true_false_tu_edge(e, hops=1)
        ds.append({"question": q_true, "answer": ans_true, "hops": hops_true, "source": "edge_true"})
        if len(ds) >= so_cau:
            break
        qf = tao_cau_hoi_false_tu_edge(e, nodes_ids, hops=1)
        if qf is not None:
            q_false, ans_false, hops_false = qf
            ds.append({"question": q_false, "answer": ans_false, "hops": hops_false, "source": "edge_false"})

    if len(ds) < so_cau:
        anchors_sample = nodes_ids[:40]
        paths = tim_paths_multi_hop(anchors_sample, max_hops=max_hops, max_paths=1000)
        np.random.shuffle(paths)
        for p in paths:
            if len(ds) >= so_cau:
                break
            q_true, ans_true, hops_true = tao_cau_hoi_true_false_tu_path(p, hops=len(p)-1)
            ds.append({"question": q_true, "answer": ans_true, "hops": hops_true, "source": "path_true"})

    np.random.shuffle(ds)
    return ds[:so_cau]

eval_questions = sinh_tap_cau_hoi(so_cau=2000, max_hops=3)
DUONG_EVAL = "graph_eval_qa_2000.jsonl"
with open(DUONG_EVAL, "w", encoding="utf-8") as f:
    for rec in eval_questions:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Đã tạo:", len(eval_questions), "câu hỏi tại:", DUONG_EVAL)

Đang tạo tập dữ liệu đánh giá 2000 câu (Đúng/Sai)...
Đã tạo: 2000 câu hỏi tại: graph_eval_qa_2000.jsonl


In [29]:
def tach_bool(text: str):
    t = (text or "").strip().lower()
    dong = t.split("\n")[0].strip()
    if dong.startswith("đúng") or dong == "true" or dong.startswith("true"):
        return True
    if dong.startswith("sai") or dong == "false" or dong.startswith("false"):
        return False
    if "đúng" in dong:
        return True
    if "sai" in dong:
        return False
    return None


def tao_prompt_baseline(question: str) -> str:
    return (
        "Bạn là trợ lý bóng đá Ngoại hạng Anh.\n"
        "Chỉ trả lời đúng một từ: Đúng hoặc Sai.\n"
        "Nếu không chắc chắn, trả lời Sai.\n\n"
        f"{question}\n"
        "Trả lời:"
    )


def baseline_answer_batch(questions, max_new_tokens=2):
    prompts = [tao_prompt_baseline(q) for q in questions]
    inputs = tokenizer_llm(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    if use_cuda:
        inputs = {k: v.to(model_llm.device) for k, v in inputs.items()}

    with torch.no_grad():
        out = model_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer_llm.eos_token_id,
            eos_token_id=tokenizer_llm.eos_token_id
        )

    texts = tokenizer_llm.batch_decode(out, skip_special_tokens=True)
    ans = []
    for p, t in zip(prompts, texts):
        a = t[len(p):].strip() if t.startswith(p) else t.strip()
        ans.append(a)
    return ans


def graphrag_prompt_bool(question):
    ctx, _ = lay_context_tu_graph(question, top_k_faiss=40, top_k_final=10, k_hop=2)
    prompt = (
        "Bạn là trợ lý bóng đá Ngoại hạng Anh.\n"
        "Chỉ dựa trên FACTS bên dưới để trả lời.\n"
        "Nếu không đủ dữ liệu, trả lời: Sai.\n"
        "Chỉ trả lời đúng một từ: Đúng hoặc Sai.\n\n"
        f"FACTS:\n{ctx}\n\n"
        f"Câu hỏi:\n{question}\n\n"
        "Trả lời:"
    )
    return prompt


def graphrag_answer_batch(questions, max_new_tokens=2):
    prompts = [graphrag_prompt_bool(q) for q in questions]
    inputs = tokenizer_llm(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    if use_cuda:
        inputs = {k: v.to(model_llm.device) for k, v in inputs.items()}

    with torch.no_grad():
        out = model_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer_llm.eos_token_id,
            eos_token_id=tokenizer_llm.eos_token_id
        )

    texts = tokenizer_llm.batch_decode(out, skip_special_tokens=True)
    ans = []
    for p, t in zip(prompts, texts):
        a = t[len(p):].strip() if t.startswith(p) else t.strip()
        ans.append(a)
    return ans


def graphrag_multihop_prompt_bool(question):
    ctx1, _ = lay_context_tu_graph(question, top_k_faiss=40, top_k_final=10, k_hop=2)
    ctxm, _ = build_multi_hop_context(question, max_hops=3, max_paths=8)
    prompt = (
        "Bạn là trợ lý bóng đá Ngoại hạng Anh.\n"
        "Chỉ dựa trên FACTS và Multi-hop bên dưới để trả lời.\n"
        "Nếu không đủ dữ liệu, trả lời: Sai.\n"
        "Chỉ trả lời đúng một từ: Đúng hoặc Sai.\n\n"
        f"FACTS:\n{ctx1}\n\n"
        f"{ctxm}\n\n"
        f"Câu hỏi:\n{question}\n\n"
        "Trả lời:"
    )
    return prompt


def graphrag_multihop_answer_batch(questions, max_new_tokens=2):
    prompts = [graphrag_multihop_prompt_bool(q) for q in questions]
    inputs = tokenizer_llm(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    if use_cuda:
        inputs = {k: v.to(model_llm.device) for k, v in inputs.items()}

    with torch.no_grad():
        out = model_llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer_llm.eos_token_id,
            eos_token_id=tokenizer_llm.eos_token_id
        )

    texts = tokenizer_llm.batch_decode(out, skip_special_tokens=True)
    ans = []
    for p, t in zip(prompts, texts):
        a = t[len(p):].strip() if t.startswith(p) else t.strip()
        ans.append(a)
    return ans


def doc_eval(path):
    ds = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                ds.append(json.loads(line))
    return ds


def danh_gia_batch(ds, fn_batch, gioi_han=600, batch_size=24):
    ds = ds[:gioi_han]
    dung = 0
    tong = 0
    bo_qua = 0
    theo_hop = {1: {"dung":0,"tong":0,"bo_qua":0}, 2: {"dung":0,"tong":0,"bo_qua":0}, 3: {"dung":0,"tong":0,"bo_qua":0}}

    for i in tqdm(range(0, len(ds), batch_size), desc="Đang đánh giá (batch)", ncols=100):
        chunk = ds[i:i+batch_size]
        qs = [r["question"] for r in chunk]
        golds = [bool(r["answer"]) for r in chunk]
        hops = [int(r.get("hops", 1)) for r in chunk]

        answers = fn_batch(qs)

        for ans, gold, hop in zip(answers, golds, hops):
            pred = tach_bool(ans)
            if pred is None:
                bo_qua += 1
                if hop in theo_hop:
                    theo_hop[hop]["bo_qua"] += 1
                continue
            tong += 1
            if hop in theo_hop:
                theo_hop[hop]["tong"] += 1
            if pred == gold:
                dung += 1
                if hop in theo_hop:
                    theo_hop[hop]["dung"] += 1

    acc = dung / tong if tong else 0.0
    return acc, tong, bo_qua, theo_hop


ds_eval = doc_eval(DUONG_EVAL)
print("Tổng câu:", len(ds_eval))

acc_g, tong_g, bo_g, hop_g = danh_gia_batch(ds_eval, lambda qs: graphrag_answer_batch(qs, max_new_tokens=2), gioi_han=600, batch_size=8)
print("GraphRAG | accuracy:", acc_g, "| số mẫu tính:", tong_g, "| bỏ qua:", bo_g)
print("GraphRAG | theo hop:", hop_g)

acc_m, tong_m, bo_m, hop_m = danh_gia_batch(ds_eval, lambda qs: graphrag_multihop_answer_batch(qs, max_new_tokens=2), gioi_han=600, batch_size=6)
print("GraphRAG Multi-hop | accuracy:", acc_m, "| số mẫu tính:", tong_m, "| bỏ qua:", bo_m)
print("GraphRAG Multi-hop | theo hop:", hop_m)

acc_b, tong_b, bo_b, hop_b = danh_gia_batch(ds_eval, lambda qs: baseline_answer_batch(qs, max_new_tokens=2), gioi_han=600, batch_size=10)
print("Baseline | accuracy:", acc_b, "| số mẫu tính:", tong_b, "| bỏ qua:", bo_b)
print("Baseline | theo hop:", hop_b)

Tổng câu: 2000


Đang đánh giá (batch):   0%|                                                 | 0/75 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


GraphRAG | accuracy: 0.2857142857142857 | số mẫu tính: 7 | bỏ qua: 593
GraphRAG | theo hop: {1: {'dung': 2, 'tong': 6, 'bo_qua': 424}, 2: {'dung': 0, 'tong': 0, 'bo_qua': 18}, 3: {'dung': 0, 'tong': 1, 'bo_qua': 151}}


Đang đánh giá (batch):   0%|                                                | 0/100 [00:00<?, ?it/s]

GraphRAG Multi-hop | accuracy: 0.0 | số mẫu tính: 2 | bỏ qua: 598
GraphRAG Multi-hop | theo hop: {1: {'dung': 0, 'tong': 2, 'bo_qua': 428}, 2: {'dung': 0, 'tong': 0, 'bo_qua': 18}, 3: {'dung': 0, 'tong': 0, 'bo_qua': 152}}


Đang đánh giá (batch):   0%|                                                 | 0/60 [00:00<?, ?it/s]

Baseline | accuracy: 0.014705882352941176 | số mẫu tính: 68 | bỏ qua: 532
Baseline | theo hop: {1: {'dung': 1, 'tong': 3, 'bo_qua': 427}, 2: {'dung': 0, 'tong': 2, 'bo_qua': 16}, 3: {'dung': 0, 'tong': 63, 'bo_qua': 89}}


In [30]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
import numpy as np

def danh_gia_ner(articles, n_cau=800, batch_size=64):
    cau = []
    for art in articles:
        cau.extend(tach_cau(art.get("content","") or ""))
    cau = [c for c in cau if c and len(c.split()) >= 4]
    random.shuffle(cau)
    cau = cau[:n_cau]
    if not cau:
        print("Không có câu để đánh giá NER")
        return

    y_true = []
    y_pred = []

    for i in tqdm(range(0, len(cau), batch_size), desc="Đang đánh giá NER", ncols=100):
        chunk = cau[i:i+batch_size]
        docs = list(nlp.pipe(chunk, batch_size=batch_size))
        for sent, doc in zip(chunk, docs):
            true_ents = chay_ner(sent)
            pred_ents = []
            for ent in doc.ents:
                tag = SPACY2TAG.get(ent.label_)
                if tag:
                    pred_ents.append({
                        "text": ent.text,
                        "type": tag,
                        "start": ent.start_char,
                        "end": ent.end_char
                    })

            def to_set(ents):
                return set((e["start"], e["end"], e["type"]) for e in ents)

            tset = to_set(true_ents)
            pset = to_set(pred_ents)

            for x in tset:
                y_true.append(1)
                y_pred.append(1 if x in pset else 0)

            for x in pset:
                if x not in tset:
                    y_true.append(0)
                    y_pred.append(1)

    if not y_true:
        print("Không đủ dữ liệu NER để tính metric")
        return

    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    acc = accuracy_score(y_true, y_pred)

    print("NER | Precision:", float(p))
    print("NER | Recall:", float(r))
    print("NER | F1-score:", float(f1))
    print("NER | Accuracy:", float(acc))


def danh_gia_re(triples_linked, batch_size=64):
    if not triples_linked:
        print("Không có triple để đánh giá RE")
        return

    texts = [t["sentence"] for t in triples_linked if t.get("sentence")]
    golds = [t["relation"] for t in triples_linked if t.get("sentence")]

    if not texts:
        print("Không có câu để đánh giá RE")
        return

    preds = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Đang đánh giá RE", ncols=100):
        chunk = texts[i:i+batch_size]
        out = [chay_re(s)["label"] for s in chunk]
        preds.extend(out)

    labels = sorted(list(set(golds) | set(preds)))
    p, r, f1, _ = precision_recall_fscore_support(golds, preds, average="macro", zero_division=0)
    acc = accuracy_score(golds, preds)

    print("RE | Precision(macro):", float(p))
    print("RE | Recall(macro):", float(r))
    print("RE | F1-score(macro):", float(f1))
    print("RE | Accuracy:", float(acc))
    print("RE | Báo cáo chi tiết:")
    print(classification_report(golds, preds, labels=labels, zero_division=0))


danh_gia_ner(articles, n_cau=800, batch_size=64)
danh_gia_re(triples_linked, batch_size=64)


Đang đánh giá NER:   0%|                                                     | 0/13 [00:00<?, ?it/s]

NER | Precision: 1.0
NER | Recall: 1.0
NER | F1-score: 1.0
NER | Accuracy: 1.0


Đang đánh giá RE:   0%|                                                      | 0/11 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [31]:
def chat_fn(message, history):
    try:
        return answer_question_graph(message)["answer"]
    except Exception:
        print(traceback.format_exc())
        return "Lỗi khi suy luận. Vui lòng thử lại."

with gr.Blocks() as demo:
    gr.Markdown("# Chatbot EPL – GraphRAG (spaCy NER + EntityRuler + FAISS + Rerank + Cite)")
    gr.ChatInterface(fn=chat_fn, title="Chat EPL", description="Hỏi về cầu thủ, CLB, HLV, chuyển nhượng, bàn thắng...")

demo.queue()
demo.launch(share=False, debug=False)


  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

