# Benchmark (Deep Learning) — NER / RE

Notebook này chỉ đánh giá bằng:
- `classification_report`,
- ma trận nhầm lẫn (confusion matrix).

Ghi chú:
- NER được đánh giá ở mức **token** (bỏ qua các token có nhãn `-100`, tức `ignore_index`).
- RE đánh giá ở mức **sample**.
- Tiền xử lý bám theo `src/data_loader/dataset.py` (PhoBERT tokenizer, `is_split_into_words=True` cho NER).

In [None]:
# Standard library
import json
import os
import sys
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import DataLoader
from transformers import RobertaTokenizerFast

# Local imports: put the parent of the current working directory on sys.path
# first, so that the `src` package can be resolved from there.
sys.path.append(os.path.abspath(os.path.join('..')))
# NOTE: no trailing comma after the closing parenthesis -- `from x import (...),`
# is a SyntaxError.
from src.data_loader.dataset import (
    convert_label_studio_to_ner_data,
    prepare_re_data_from_json,
    NERDataset,
    REDataset,
)
from src.models.deep_learning import PhoBertForNER, PhoBertForRE

In [None]:
# ===== Paths =====
# Every location is derived from the parent of the current working directory.
ROOT = Path('..').resolve()
MODELS_DIR = ROOT.joinpath('models')
DATA_DIR = ROOT.joinpath('data', '04_model_input')

# Raw dataset exports (the train split is only used to rebuild label maps).
TRAIN_JSON_PATH = DATA_DIR.joinpath('train_dataset.json')
TEST_JSON_PATH = DATA_DIR.joinpath('test_dataset.json')

# Saved index -> label mappings, one file per task.
RE_ID2LABEL_PATH = MODELS_DIR.joinpath('re', 're_id2label.json')
NER_ID2LABEL_PATH = MODELS_DIR.joinpath('ner', 'ner_id2label.json')

# Candidate checkpoint files for each model, tried in list order.
# Both lists are empty here, so require_weights() raises FileNotFoundError
# until real paths are added.
NER_WEIGHTS_CANDIDATES = []
RE_WEIGHTS_CANDIDATES = []

# Quick sanity output so a wrong working directory is spotted immediately.
print('ROOT:', ROOT)
print('TEST_JSON_PATH:', TEST_JSON_PATH)
print('TEST_JSON_PATH exists:', TEST_JSON_PATH.exists())

In [None]:
# ===== Load raw JSON =====
# The test split is mandatory: a missing file raises right here.
test_json = json.loads(TEST_JSON_PATH.read_text(encoding='utf-8'))

# The train split is optional and stays None when the file is absent.
train_json = (
    json.loads(TRAIN_JSON_PATH.read_text(encoding='utf-8'))
    if TRAIN_JSON_PATH.exists()
    else None
)

print('Test docs :', len(test_json))
print('Train docs:', len(train_json) if train_json is not None else 0)

# ===== Tokenizer =====
tokenizer = RobertaTokenizerFast.from_pretrained(
    'vinai/phobert-base-v2',
    add_prefix_space=True,
)
print('Tokenizer loaded')

In [None]:
def load_id2label(path: Path, fallback: str):
    """Return the ``{class index: label name}`` mapping for one task.

    When *path* exists, the JSON object stored there is returned with its
    keys converted to ``int``. Otherwise the mapping is rebuilt from the
    module-level ``train_json`` by enumerating the sorted set of labels
    found in it.

    Args:
        path: JSON file holding a previously saved mapping.
        fallback: ``'ner'`` or ``'re'``; selects how labels are collected
            from ``train_json``. Only consulted when *path* does not exist.

    Returns:
        A dict mapping integer class index to label name.

    Raises:
        FileNotFoundError: *path* is missing and ``train_json`` is ``None``.
        ValueError: *path* is missing and *fallback* is not ``'ner'``/``'re'``.
    """
    if path.exists():
        with path.open('r', encoding='utf-8') as handle:
            stored = json.load(handle)
        # JSON object keys are always strings; restore integer indices.
        mapping = {}
        for key, name in stored.items():
            mapping[int(key)] = name
        return mapping

    if train_json is None:
        raise FileNotFoundError(f'Missing {path} and no train json to recreate mapping')

    # NOTE(review): a rebuilt mapping only matches the trained model if
    # training assigned indices the same way (sorted labels) -- confirm.
    if fallback == 'ner':
        names = set()
        for sentence in convert_label_studio_to_ner_data(train_json):
            # Each sentence element is unpacked as a 2-item pair; the second
            # item is used as the label.
            for _, tag in sentence:
                names.add(tag)
    elif fallback == 're':
        names = {sample['label'] for sample in prepare_re_data_from_json(train_json)}
    else:
        raise ValueError('fallback must be ner|re')

    return dict(enumerate(sorted(names)))

def id2label_to_label2id(id2label):
    """Invert an index -> label mapping into a label -> index mapping.

    If the same label appears under several indices, the index that is
    iterated last is the one kept.
    """
    inverted = {}
    for index, label in id2label.items():
        inverted[label] = index
    return inverted

# ===== Label maps =====
# index -> label is loaded (or rebuilt) first, then inverted for the datasets.
ner_id2label = load_id2label(NER_ID2LABEL_PATH, fallback='ner')
re_id2label = load_id2label(RE_ID2LABEL_PATH, fallback='re')
ner_label2id, re_label2id = (
    id2label_to_label2id(mapping) for mapping in (ner_id2label, re_id2label)
)

print('NER labels:', len(ner_id2label))
print('RE labels :', len(re_id2label))

# ===== Prepare datasets =====
# Convert the raw test export into the per-task example lists.
ner_test_data = convert_label_studio_to_ner_data(test_json)
re_test_data = prepare_re_data_from_json(test_json)

ner_test_ds = NERDataset(
    ner_test_data,
    tokenizer=tokenizer,
    label2id=ner_label2id,
    max_len=256,
)
re_test_ds = REDataset(
    re_test_data,
    tokenizer=tokenizer,
    label2id=re_label2id,
    max_len=256,
)

# shuffle=False keeps the evaluation order fixed across runs.
ner_test_loader = DataLoader(ner_test_ds, batch_size=8, shuffle=False)
re_test_loader = DataLoader(re_test_ds, batch_size=16, shuffle=False)

print('NER test sentences:', len(ner_test_ds))
print('RE test pairs     :', len(re_test_ds))

def find_first_existing(candidates):
    """Return the first entry of *candidates* whose ``exists()`` is true.

    Args:
        candidates: iterable of path-like objects exposing ``exists()``
            (e.g. ``pathlib.Path``).

    Returns:
        The first existing candidate, or ``None`` when there is none.
    """
    return next((path for path in candidates if path.exists()), None)


def require_weights(model, candidates, model_name: str):
    """Load saved weights into *model* from the first existing candidate.

    The checkpoint may be a bare state dict or a dict that wraps it under
    the ``'state_dict'`` key. Loading is non-strict, but any parameter names
    that are missing from, or unexpected in, the checkpoint are printed so
    that a partially initialised model is not benchmarked unnoticed.

    Args:
        model: object exposing ``load_state_dict(state, strict=False)``.
        candidates: iterable of checkpoint paths, tried in order.
        model_name: name used in the printed and raised messages.

    Returns:
        The path the weights were loaded from.

    Raises:
        FileNotFoundError: none of the candidates exists (including the case
            where the candidate list is empty).
    """
    # Materialise once so the candidates can be both searched and listed.
    candidates = list(candidates)
    weights_path = find_first_existing(candidates)
    if weights_path is None:
        if candidates:
            listing = '\n'.join(f' - {c}' for c in candidates)
        else:
            # An empty list would otherwise produce a message with no paths.
            listing = ' (danh sách candidates đang rỗng — hãy khai báo path tới file weight)'
        raise FileNotFoundError(
            f'Không tìm thấy weight đã lưu cho {model_name}.\n'
            'Hãy copy file weight vào một trong các path sau rồi chạy lại:\n'
            + listing
        )

    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source (consider weights_only=True if the installed torch
    # version supports it).
    state = torch.load(weights_path, map_location='cpu')
    if isinstance(state, dict) and 'state_dict' in state:
        state = state['state_dict']

    # strict=False tolerates key mismatches; surface them instead of hiding them.
    outcome = model.load_state_dict(state, strict=False)
    missing = list(getattr(outcome, 'missing_keys', ()))
    unexpected = list(getattr(outcome, 'unexpected_keys', ()))
    if missing:
        print(f'WARNING: {model_name} checkpoint lacks {len(missing)} key(s), '
              'these parameters keep their initial values:', missing)
    if unexpected:
        print(f'WARNING: {model_name} checkpoint has {len(unexpected)} unused key(s):',
              unexpected)

    print(f'Loaded {model_name} weights:', weights_path)
    return weights_path

# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print('DEVICE:', DEVICE)

## NER (PhoBERT) — classification report & confusion matrix

In [None]:
# Build the NER model, load the saved weights and switch to inference mode.
ner_model = PhoBertForNER('vinai/phobert-base-v2', num_labels=len(ner_id2label))
_ = require_weights(ner_model, NER_WEIGHTS_CANDIDATES, model_name='NER')
ner_model.to(DEVICE)
ner_model.eval()

# Flat lists of gold / predicted label names, one entry per evaluated token.
all_true = []
all_pred = []

with torch.no_grad():
    for batch in ner_test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].cpu().numpy()

        # The model output is treated as logits; argmax over dim 2 yields one
        # label id per (sample, position).
        logits = ner_model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(logits, dim=2).cpu().numpy()

        # Drop positions whose gold label is the ignore index (-100).
        # Boolean indexing flattens in row-major order, so the gold and
        # predicted entries stay aligned (same order as a nested i/j loop).
        # Assumes preds and labels share the same 2-D shape.
        keep = labels != -100
        all_true.extend(ner_id2label[int(t)] for t in labels[keep])
        all_pred.extend(ner_id2label[int(p)] for p in preds[keep])

# Fix the label order by class index so report rows and matrix axes agree.
labels_sorted = [ner_id2label[i] for i in sorted(ner_id2label.keys())]
print(classification_report(all_true, all_pred, labels=labels_sorted, zero_division=0))

cm = confusion_matrix(all_true, all_pred, labels=labels_sorted)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=False, cmap='Blues', xticklabels=labels_sorted, yticklabels=labels_sorted)
plt.title('NER Confusion Matrix (token-level)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## RE (PhoBERT) — classification report & confusion matrix

In [None]:
# Build the RE model, load the saved weights and switch to inference mode.
re_model = PhoBertForRE('vinai/phobert-base-v2', num_labels=len(re_id2label))
_ = require_weights(re_model, RE_WEIGHTS_CANDIDATES, model_name='RE')
re_model.to(DEVICE)
re_model.eval()

# Gold / predicted label names, one entry per test sample.
all_true, all_pred = [], []

with torch.no_grad():
    for batch in re_test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].cpu().numpy()

        # The model output is treated as logits; argmax over dim 1 yields
        # one label id per sample.
        logits = re_model(input_ids=input_ids, attention_mask=attention_mask)
        preds = logits.argmax(dim=1).cpu().numpy()

        # Translate ids to label names before accumulating.
        all_true += [re_id2label[int(gold_id)] for gold_id in labels]
        all_pred += [re_id2label[int(pred_id)] for pred_id in preds]

# Fix the label order by class index so report rows and matrix axes agree.
labels_sorted = [re_id2label[key] for key in sorted(re_id2label)]
print(classification_report(all_true, all_pred, labels=labels_sorted, zero_division=0))

cm = confusion_matrix(all_true, all_pred, labels=labels_sorted)
plt.figure(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels_sorted,
    yticklabels=labels_sorted,
)
plt.title('RE Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()