# Benchmark (Machine Learning) — NER / RE

Notebook này chỉ đánh giá bằng:
- `classification_report`,
- ma trận nhầm lẫn (confusion matrix).

In [None]:
import os
import json
from pathlib import Path
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
import sys
sys.path.append(os.path.abspath(os.path.join('..')))

In [None]:
# Resolve the project folders relative to the parent of the current working directory.
ROOT = Path('..').resolve()
VECTOR_DIR = ROOT / 'data' / 'vectors'
DATA_DIR = ROOT / 'data' / '04_model_input'
MODELS_DIR = ROOT / 'models'

for caption, folder in (('VECTOR_DIR:', VECTOR_DIR), ('DATA_DIR  :', DATA_DIR)):
    print(caption, folder)


def _open_vector(file_name):
    """Open one saved .npy array from VECTOR_DIR as a read-only memory map."""
    return np.load(VECTOR_DIR / file_name, mmap_mode='r')


# ===== Load NER vectors =====
# Shapes as noted by the original author: X is (N, T, H), y is (N, T).
X_train_ner = _open_vector('X_train_ner_phobert.npy')
y_train_ner = _open_vector('y_train_ner_phobert.npy')
X_test_ner = _open_vector('X_test_ner_phobert.npy')
y_test_ner = _open_vector('y_test_ner_phobert.npy')

# ===== Load RE vectors =====
# Shapes as noted by the original author: X is (N, H), y is (N,).
X_train_re = _open_vector('X_train_re_phobert.npy')
y_train_re = _open_vector('y_train_re_phobert.npy')
X_test_re = _open_vector('X_test_re_phobert.npy')
y_test_re = _open_vector('y_test_re_phobert.npy')

# Report the shape of every features/labels pair that was just opened.
for tag, feats, labs in (
    ('NER train:', X_train_ner, y_train_ner),
    ('NER test :', X_test_ner, y_test_ner),
    ('RE train :', X_train_re, y_train_re),
    ('RE test  :', X_test_re, y_test_re),
):
    print(tag, feats.shape, labs.shape)

# ===== Load label maps =====
NER_ID2LABEL_PATH = MODELS_DIR / 'ner' / 'ner_id2label.json'
RE_ID2LABEL_PATH = MODELS_DIR / 're' / 're_id2label.json'

def load_id2label(path: Path):
    """Read an id-to-label mapping from a UTF-8 JSON file.

    Args:
        path: Location of the JSON file whose top-level value is an object.

    Returns:
        ``None`` when ``path`` does not exist; otherwise a dict holding the
        same values as the JSON object, with every key converted to ``int``.
    """
    if path.exists():
        with path.open(encoding='utf-8') as handle:
            mapping = json.load(handle)
        # JSON object keys are always strings; turn them back into integer ids.
        return {int(idx): name for idx, name in mapping.items()}
    return None

ner_id2label = load_id2label(NER_ID2LABEL_PATH)
re_id2label = load_id2label(RE_ID2LABEL_PATH)

# Fallback: rebuild whichever map is missing from train_dataset.json.
# NOTE(review): the rebuilt ids are simply the positions of the sorted label
# names; confirm this matches the ids stored in the y_*.npy arrays.
if not (ner_id2label is not None and re_id2label is not None):
    from src.data_loader.dataset import (
        convert_label_studio_to_ner_data,
        prepare_re_data_from_json,
    )

    with open(DATA_DIR / 'train_dataset.json', 'r', encoding='utf-8') as f:
        train_json = json.load(f)

    if ner_id2label is None:
        ner_data_raw = convert_label_studio_to_ner_data(train_json)
        # Each sentence is iterated as pairs; the second item of a pair is the tag.
        ner_tags = set()
        for sent in ner_data_raw:
            ner_tags.update(lab for _, lab in sent)
        labels = sorted(ner_tags)
        ner_id2label = dict(enumerate(labels))

    if re_id2label is None:
        re_data_raw = prepare_re_data_from_json(train_json)
        labels = sorted(set(item['label'] for item in re_data_raw))
        re_id2label = dict(enumerate(labels))

print('NER labels:', len(ner_id2label))
print('RE labels :', len(re_id2label))

## NER (ML on PhoBERT vectors) — classification report & confusion matrix

In [None]:
# y_*_ner may contain -100 for padding/subword positions => filter them out
IGNORE_ID = -100


def flatten_ner(X_3d, y_2d):
    """Flatten sequence-level NER arrays into token-level samples.

    Every position whose label (truncated to an integer) equals ``IGNORE_ID``
    is dropped. The remaining positions are returned in row-major order:
    sequence by sequence, then position by position within a sequence.

    Args:
        X_3d: Array indexable as ``X_3d[i, t]`` to obtain the feature vector
            of position ``t`` in sequence ``i`` (noted as ``(N, T, H)`` where
            the vectors are loaded).
        y_2d: 2-D array of label ids aligned with the first two axes of
            ``X_3d`` (noted as ``(N, T)``).

    Returns:
        A tuple ``(X, y)``: ``X`` is a ``float32`` array with one row per kept
        position and ``y`` is the matching ``int64`` label vector. When no
        position is kept, both arrays are empty (``X`` keeps its trailing
        feature axis) instead of an error being raised.
    """
    # Materialise the labels as int64 once; float labels are truncated toward
    # zero, the same way int() would treat them one by one.
    label_ids = np.asarray(y_2d).astype(np.int64)

    # Coordinates of the kept positions, already in row-major order.
    rows, cols = np.nonzero(label_ids != IGNORE_ID)

    # One gather replaces the per-token Python loop; np.asarray also strips a
    # memmap subclass so callers receive a plain in-memory ndarray.
    X_flat = np.asarray(X_3d[rows, cols], dtype=np.float32)
    y_flat = label_ids[rows, cols]
    return X_flat, y_flat

# Turn the (sequence, position) arrays into one sample per labelled token.
Xtr_tok, ytr_tok = flatten_ner(X_train_ner, y_train_ner)
Xte_tok, yte_tok = flatten_ner(X_test_ner, y_test_ner)

print('Flattened NER train:', Xtr_tok.shape, ytr_tok.shape)
print('Flattened NER test :', Xte_tok.shape, yte_tok.shape)

# ===== Load saved NER model (no training) =====
import joblib

# Candidate model files under models/ner/, listed in lookup order.
NER_MODEL_CANDIDATES = [
    MODELS_DIR / 'ner' / file_name
    for file_name in (
        'ner_crf.pkl',
        'ner_svm.pkl',
        'ner_maxent.pkl',
        'ner_randomforest.pkl',
        'ner_model.pkl',
    )
]

def find_existing(paths):
    """Return the first entry of ``paths`` that exists, or ``None`` if none do.

    Args:
        paths: Iterable of objects exposing ``exists()`` (e.g. ``Path``),
            checked in iteration order.
    """
    return next((candidate for candidate in paths if candidate.exists()), None)

# Use the first candidate file that exists; stop with an explicit error
# listing every expected location when none is present.
ner_model_path = find_existing(NER_MODEL_CANDIDATES)
if ner_model_path is None:
    raise FileNotFoundError(
        'Không tìm thấy NER model đã lưu trong models/ner/.\n'
        'Hãy copy một trong các file sau vào đúng vị trí rồi chạy lại:\n'
        + '\n'.join([f' - {p}' for p in NER_MODEL_CANDIDATES])
    )

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
ner_model = joblib.load(ner_model_path)
print('Loaded NER model:', ner_model_path)

# Predict: the model may be an sklearn pipeline => it takes X of shape (n_samples, hidden)
# NOTE(review): 'ner_crf.pkl' is the first candidate; a sequence model may not
# accept a flat token matrix like Xte_tok — confirm before relying on it.
y_pred = ner_model.predict(Xte_tok)

# Fix the label order to the sorted ids of ner_id2label so that the report
# rows and the confusion-matrix axes line up with target_names.
labels_sorted = sorted(ner_id2label.keys())
target_names = [ner_id2label[i] for i in labels_sorted]
print(classification_report(yte_tok, y_pred, labels=labels_sorted, target_names=target_names, zero_division=0))

# Token-level confusion matrix drawn as a heatmap; cell annotations are
# switched off here (annot=False).
cm = confusion_matrix(yte_tok, y_pred, labels=labels_sorted)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=False, cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('NER Confusion Matrix (token-level)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## RE (ML on PhoBERT vectors) — classification report & confusion matrix

In [None]:
# ===== Load saved RE model (no training) =====
import joblib

# Candidate model files under models/re/, listed in lookup order.
RE_MODEL_CANDIDATES = [
    MODELS_DIR / 're' / 're_svm.pkl',
    MODELS_DIR / 're' / 're_maxent.pkl',
    MODELS_DIR / 're' / 're_randomforest.pkl',
    MODELS_DIR / 're' / 're_model.pkl',
]

# find_existing is defined in the NER cell, so that cell must have been run.
# Stop with an explicit error listing every expected location when no
# candidate file is present.
re_model_path = find_existing(RE_MODEL_CANDIDATES)
if re_model_path is None:
    raise FileNotFoundError(
        'Không tìm thấy RE model đã lưu trong models/re/.\n'
        'Hãy copy một trong các file sau vào đúng vị trí rồi chạy lại:\n'
        + '\n'.join([f' - {p}' for p in RE_MODEL_CANDIDATES])
    )

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
re_model = joblib.load(re_model_path)
print('Loaded RE model:', re_model_path)

# The RE vectors are passed to the model as loaded (one row per example,
# per the shape note where they are loaded). This reuses the name y_pred
# from the NER cell.
y_pred = re_model.predict(X_test_re)

# Fix the label order to the sorted ids of re_id2label so that the report
# rows and the confusion-matrix axes line up with target_names.
labels_sorted = sorted(re_id2label.keys())
target_names = [re_id2label[i] for i in labels_sorted]
print(classification_report(y_test_re, y_pred, labels=labels_sorted, target_names=target_names, zero_division=0))

# Confusion matrix drawn as a heatmap with integer counts written in each cell.
cm = confusion_matrix(y_test_re, y_pred, labels=labels_sorted)
plt.figure(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('RE Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()