In [None]:

!pip -q install -U "transformers>=4.45.2" "datasets>=2.20.0" "evaluate>=0.4.2" "accelerate>=0.34.2" sentencepiece packaging
import transformers, datasets, evaluate, torch, packaging
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("evaluate:", evaluate.__version__)
print("torch:", torch.__version__)


transformers: 4.56.2
datasets: 4.1.1
evaluate: 0.4.6
torch: 2.8.0+cu126


In [None]:

# Mount Google Drive; the dataset CSV read below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os, pandas as pd, numpy as np, torch, json, inspect
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate


# Turn off Weights & Biases reporting and tokenizers parallelism via environment variables.
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# SEED is shared by both train/test splits and by TrainingArguments below.
SEED = 42
# Dataset location on the mounted Drive; fail fast if the file is missing.
CSV_PATH = "/content/drive/MyDrive/PBL6_DataSet/10000_Trieu_Chung_Benh_Pho_Bien_FINAL.csv"
assert os.path.exists(CSV_PATH), f"Kh√¥ng t√¨m th·∫•y CSV t·∫°i: {CSV_PATH}"

# Keep only the text and intent columns, then encode each intent as an integer label.
df = pd.read_csv(CSV_PATH)[["text","intent"]]
le = LabelEncoder(); df["label"] = le.fit_transform(df["intent"])

# Stratified splits: 10% held out for test, then 10% of the remainder for validation.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED, stratify=df["label"])
train_df, val_df  = train_test_split(train_df, test_size=0.1, random_state=SEED, stratify=train_df["label"])

# Wrap the three pandas frames as Hugging Face datasets (the pandas index is not kept).
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False)
})

# Base checkpoint to fine-tune; use_fast=False requests the slow (Python) tokenizer.
# NOTE(review): PhoBERT's documentation states that input text should be word-segmented
# beforehand; no segmentation step is visible here — confirm the CSV "text" column
# is already segmented.
model_name = "vinai/phobert-base"
tokenizer  = AutoTokenizer.from_pretrained(model_name, use_fast=False)

def tok(batch):
    """Tokenize the "text" field of a batch, truncated to 128 tokens and left unpadded."""
    texts = batch["text"]
    return tokenizer(texts, max_length=128, truncation=True, padding=False)

# Tokenize every split in batches and drop the raw columns afterwards
# (only columns that actually exist in the train split are listed for removal).
ds = ds.map(tok, batched=True, remove_columns=[c for c in ["text","intent","__index_level_0__"] if c in ds["train"].column_names])
ds.set_format(type="torch")

# Label <-> id mappings, in LabelEncoder class order, passed to the model config.
id2label = {i:c for i,c in enumerate(le.classes_)}
label2id = {c:i for i,c in id2label.items()}

# Classification model with one output per encoded intent.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(le.classes_), id2label=id2label, label2id=label2id
)

# Batches are padded by the collator, since tok() itself applies no padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

    The predicted class is the arg-max of the logits along axis 1.
    Returns a dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    acc_result = metric_acc.compute(predictions=predicted, references=labels)
    f1_result = metric_f1.compute(predictions=predicted, references=labels, average="macro")
    return {"accuracy": acc_result["accuracy"], "f1_macro": f1_result["f1"]}

# Mixed precision: bf16 when CUDA is available and the device's compute-capability
# major version is >= 8, fp16 on any other CUDA device, neither on CPU.
cap_major = torch.cuda.get_device_capability(0)[0] if torch.cuda.is_available() else 0
use_bf16  = torch.cuda.is_available() and cap_major >= 8
use_fp16  = torch.cuda.is_available() and not use_bf16

# Parameter names accepted by the installed TrainingArguments; used below to add
# optional arguments only when they exist in this transformers version.
ta_params = set(inspect.signature(TrainingArguments.__init__).parameters.keys())
# Arguments passed unconditionally: save and log every epoch, and keep the
# checkpoint with the highest validation f1_macro as the final model.
kwargs = dict(
    output_dir="./phobert_intent_vi",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    seed=SEED
)
if "report_to" in ta_params: kwargs["report_to"] = "none"
if "bf16" in ta_params: kwargs["bf16"] = use_bf16
if "fp16" in ta_params: kwargs["fp16"] = use_fp16
# The per-epoch evaluation setting is passed under whichever of its two names is accepted.
if "eval_strategy" in ta_params:
    kwargs["eval_strategy"] = "epoch"
elif "evaluation_strategy" in ta_params:
    kwargs["evaluation_strategy"] = "epoch"

args = TrainingArguments(**kwargs)

# Build the Trainer arguments; the tokenizer is passed as `processing_class` when
# the installed Trainer accepts that parameter, otherwise as `tokenizer`.
trainer_init_params = set(inspect.signature(Trainer.__init__).parameters.keys())
trainer_kwargs = dict(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics
)
if "processing_class" in trainer_init_params:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

trainer = Trainer(**trainer_kwargs)

# Fine-tune, then evaluate on the held-out test split. load_best_model_at_end=True
# is set above, so the evaluated model is the one selected by validation f1_macro.
trainer.train()
test_metrics = trainer.evaluate(ds["test"])
print(test_metrics)

# Save the in-memory model and the tokenizer to the output directory root, and
# write the label order (LabelEncoder classes) to label_encoder.json in the working directory.
trainer.model.save_pretrained("./phobert_intent_vi", safe_serialization=True)
tokenizer.save_pretrained("./phobert_intent_vi")
with open("label_encoder.json","w",encoding="utf-8") as f:
    json.dump({"classes": le.classes_.tolist()}, f, ensure_ascii=False)


Map:   0%|          | 0/8100 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.8668,0.069504,0.984444,0.984451
2,0.0388,0.028956,0.994444,0.994483
3,0.0187,0.02858,0.994444,0.994466
4,0.0103,0.017928,0.996667,0.996683
5,0.0063,0.019301,0.995556,0.995577


{'eval_loss': 0.030212629586458206, 'eval_accuracy': 0.996, 'eval_f1_macro': 0.9960119462775184, 'eval_runtime': 0.9364, 'eval_samples_per_second': 1067.972, 'eval_steps_per_second': 34.175, 'epoch': 5.0}


In [None]:
# === EXPORT ONLY THE LATEST CHECKPOINT (OLDER CHECKPOINTS ARE NOT INCLUDED) ===
import os, re, glob, shutil, zipfile, json

OUTPUT_DIR = "./phobert_intent_vi"          # Output directory of this notebook
EXPORT_DIR = "./export_phobert_latest"      # Temporary staging directory for packaging
ZIP_PATH   = "phobert_latest_win.zip"       # Resulting zip file

# 1) Clear and recreate the staging directory
shutil.rmtree(EXPORT_DIR, ignore_errors=True)
os.makedirs(EXPORT_DIR, exist_ok=True)

# 2) Find the newest checkpoint-*; if there is none, fall back to the root directory (which already holds the save_pretrained best model)
def _ckpt_num(name: str) -> int:
    """Return the step number found in a ``checkpoint-<digits>`` name, or -1 if there is none."""
    match = re.search(r"checkpoint-(\d+)", name)
    if match is None:
        return -1
    return int(match.group(1))

# Collect the checkpoint-* sub-directories of OUTPUT_DIR and pick the one with the
# highest step number.
# NOTE(review): training sets load_best_model_at_end=True and saves that model to the
# OUTPUT_DIR root, so the newest checkpoint-* is not necessarily the best-scoring
# one — confirm that exporting the newest checkpoint (not the best model) is intended.
ckpt_dirs = [d for d in os.listdir(OUTPUT_DIR)
             if d.startswith("checkpoint-") and os.path.isdir(os.path.join(OUTPUT_DIR, d))]
if ckpt_dirs:
    latest_ckpt_name = max(ckpt_dirs, key=_ckpt_num)
    latest_dir = os.path.join(OUTPUT_DIR, latest_ckpt_name)
else:
    latest_ckpt_name = "(no-checkpoint; using root saved model)"
    latest_dir = OUTPUT_DIR  # use the model already save_pretrained at the root

# 3) Copy MODEL + CONFIG files from the latest checkpoint
# The names contain no wildcards, so each glob matches at most the file of that name;
# files that do not exist are simply skipped.
need_patterns = [
    "model.safetensors", "pytorch_model.bin", "pytorch_model.bin.index.json",
    "config.json", "generation_config.json"
]
# Base names of everything copied into EXPORT_DIR; checked by the sanity asserts later.
copied = []
for pat in need_patterns:
    for src in glob.glob(os.path.join(latest_dir, pat)):
        shutil.copy2(src, os.path.join(EXPORT_DIR, os.path.basename(src)))
        copied.append(os.path.basename(src))

# 4) Copy the TOKENIZER from the OUTPUT_DIR root (checkpoints usually do not contain the tokenizer)
# "bpe.codes" is PhoBERT's BPE merges file: the slow PhoBERT tokenizer needs it next to
# vocab.txt, so without it the exported folder cannot be loaded as a tokenizer.
tokenizer_files = [
    "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json",
    "vocab.txt", "bpe.codes", "vocab.json", "merges.txt",
    "sentencepiece.bpe.model", "spm.model", "added_tokens.json"
]
# Only files that actually exist are copied, so the list can safely name files
# belonging to other tokenizer types.
for name in tokenizer_files:
    src = os.path.join(OUTPUT_DIR, name)
    if os.path.exists(src):
        shutil.copy2(src, os.path.join(EXPORT_DIR, name))
        copied.append(name)

# 5) Add label files if present (optional, since id2label/label2id are already in the config)
# Each name is looked up in the working directory first, then in OUTPUT_DIR;
# the first hit is copied and the search for that name stops.
for name in ["label_encoder.json", "labels.json", "id2label.json", "label2id.json"]:
    for base in [".", OUTPUT_DIR]:
        src = os.path.join(base, name)
        if os.path.exists(src):
            shutil.copy2(src, os.path.join(EXPORT_DIR, os.path.basename(src)))
            copied.append(os.path.basename(src))
            break

# 6) Sanity check: at least one weights file and config.json must have been copied
assert any(n in copied for n in ["model.safetensors","pytorch_model.bin","pytorch_model.bin.index.json"]), "Kh√¥ng th·∫•y file tr·ªçng s·ªë model ·ªü checkpoint m·ªõi nh·∫•t!"
assert "config.json" in copied, "Thi·∫øu config.json!"

# 7) Package into a .zip
# Any previous archive is removed first; entries are stored relative to EXPORT_DIR.
if os.path.exists(ZIP_PATH): os.remove(ZIP_PATH)
with zipfile.ZipFile(ZIP_PATH, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for root, _, files in os.walk(EXPORT_DIR):
        for f in files:
            abspath = os.path.join(root, f)
            rel = os.path.relpath(abspath, EXPORT_DIR)
            z.write(abspath, arcname=rel)

print(f"‚úÖ ƒê√£ gom {len(copied)} file v√†o: {ZIP_PATH}")
print(f"‚û°Ô∏è  Latest checkpoint: {latest_ckpt_name}  (src: {latest_dir})")
# Best-effort browser download through Colab; on any failure (for example when
# google.colab cannot be imported) a manual-download hint is printed instead.
try:
    from google.colab import files
    files.download(ZIP_PATH)
except Exception:
    print("N·∫øu ch·∫°y ngo√†i Colab: t·∫£i file .zip th·ªß c√¥ng trong th∆∞ m·ª•c l√†m vi·ªác.")


‚úÖ ƒê√£ gom 7 file v√†o: phobert_latest_win.zip
‚û°Ô∏è  Latest checkpoint: checkpoint-2535  (src: ./phobert_intent_vi/checkpoint-2535)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# import os, torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# THRESHOLD = 0.50

# def _latest_checkpoint(base_dir):
#     if not base_dir or not os.path.isdir(base_dir):
#         return None
#     cks = [d for d in os.listdir(base_dir) if d.startswith("checkpoint-")]
#     if not cks:
#         return base_dir
#     cks_sorted = sorted(cks, key=lambda x: int(x.split("-")[-1]))
#     return os.path.join(base_dir, cks_sorted[-1])

# def _auto_load():
#     tr = globals().get("trainer", None)
#     tok = globals().get("tokenizer", None)
#     if tr is not None and getattr(tr, "model", None) is not None and tok is not None:
#         return tr.model.to(DEVICE).eval(), tok
#     base_dir = None
#     if "OUTPUT_DIR" in globals() and isinstance(globals()["OUTPUT_DIR"], str):
#         base_dir = globals()["OUTPUT_DIR"]
#     if "MODEL_DIR" in globals() and isinstance(globals()["MODEL_DIR"], str):
#         base_dir = globals()["MODEL_DIR"] or base_dir
#     ckpt = _latest_checkpoint(base_dir) if base_dir else None
#     if ckpt is None:
#         raise ValueError("Kh√¥ng t√¨m th·∫•y model.")
#     return AutoModelForSequenceClassification.from_pretrained(ckpt).to(DEVICE).eval(), AutoTokenizer.from_pretrained(ckpt)

# def _id2label(model):
#     le = globals().get("le", None)
#     if le is not None and hasattr(le, "classes_"):
#         classes = list(le.classes_)
#         return {i: classes[i] for i in range(len(classes))}
#     cfg = getattr(model, "config", None)
#     m = getattr(cfg, "id2label", None)
#     if m:
#         return {int(k): v for k, v in m.items()}
#     return {i: f"LABEL_{i}" for i in range(model.config.num_labels)}

# model, tok = _auto_load()
# id2label = _id2label(model)
# text = input("Nh·∫≠p m√¥ t·∫£ tri·ªáu ch·ª©ng: ").strip()
# enc = tok([text], padding=True, truncation=True, max_length=256, return_tensors="pt")
# enc = {k: v.to(DEVICE) for k, v in enc.items()}
# with torch.no_grad():
#     probs = torch.softmax(model(**enc).logits, dim=-1)[0].cpu()
#     j = int(torch.argmax(probs).item())
#     p = float(probs[j].item())
# label = id2label.get(j, str(j))
# if p >= THRESHOLD and text:
#     print(f"{label} ({p:.2%})")
# else:
#     print("Tri·ªáu ch·ª©ng c·ªßa lo·∫°i b·ªánh n√†y l·∫° qu√°, n√™n t√¥i kh√¥ng bi·∫øt ·∫°")
# Placeholder output: the interactive inference demo above is entirely commented out.
print("Hello")


Hello


In [None]:
# === SAVE THE LATEST CHECKPOINT INTO A checkpoint_HHMM_dd-mm-YYYY FOLDER UNDER /content/drive/MyDrive/PBL6_DataSet/PBL6_Dataset ===
import os, re, glob, json, shutil
from datetime import datetime, timezone
from typing import Optional

# Vietnam time zone (Python 3.9+)
# If zoneinfo or the zone data cannot be loaded, _VN_TZ stays None and the code
# below falls back to the machine's local time.
try:
    from zoneinfo import ZoneInfo
    _VN_TZ = ZoneInfo("Asia/Ho_Chi_Minh")
except Exception:
    _VN_TZ = None

# 0) Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 1) Root directory on Drive (created if it does not exist yet)
DEST_PARENT = "/content/drive/MyDrive/PBL6_DataSet"
DEST_ROOT   = os.path.join(DEST_PARENT, "PBL6_Dataset")
os.makedirs(DEST_ROOT, exist_ok=True)

# 2) Guess the output directory of the training run
DEFAULT_OUTPUT_DIRS = [
    "./phobert_intent_vi", "./output", "./models", "./runs", "."
]
def find_output_dir():
    """Return the first candidate directory that looks like a training output.

    Candidates are tried in ``DEFAULT_OUTPUT_DIRS`` order. A candidate qualifies
    when it is an existing directory containing either a ``checkpoint-*`` entry
    or a ``config.json``. Returns ``"."`` when no candidate qualifies.
    """
    def _looks_like_output(path):
        if glob.glob(os.path.join(path, "checkpoint-*")):
            return True
        return os.path.exists(os.path.join(path, "config.json"))

    existing = (cand for cand in DEFAULT_OUTPUT_DIRS if os.path.isdir(cand))
    return next((cand for cand in existing if _looks_like_output(cand)), ".")

OUTPUT_DIR = find_output_dir()

# 3) Always pick the NEWEST checkpoint-* by step (if there is no checkpoint -> use base_dir)
def pick_latest_ckpt(base_dir: str) -> str:
    """Resolve ``base_dir`` to its ``checkpoint-*`` sub-directory with the highest step.

    Returns ``base_dir`` unchanged when its own name starts with ``checkpoint-``
    or when it contains no ``checkpoint-*`` sub-directories. Names without a
    numeric step rank as -1.
    """
    if os.path.basename(os.path.normpath(base_dir)).startswith("checkpoint-"):
        return base_dir

    def _step(path):
        found = re.search(r"checkpoint-(\d+)", os.path.basename(path))
        return int(found.group(1)) if found else -1

    candidates = []
    for entry in os.listdir(base_dir):
        full_path = os.path.join(base_dir, entry)
        if entry.startswith("checkpoint-") and os.path.isdir(full_path):
            candidates.append(full_path)

    if not candidates:
        return base_dir
    # Equal step numbers resolve to the first such entry in listing order.
    return max(candidates, key=_step)

# Directory the model is loaded from: the newest checkpoint under OUTPUT_DIR,
# or OUTPUT_DIR itself when it has no checkpoints.
SRC_DIR = pick_latest_ckpt(OUTPUT_DIR)

# 4) Build the folder name from the SAVE time (hour-minute-day-month-year, Vietnam time if available)
# NOTE(review): the stamp has minute resolution and the folder is created with
# exist_ok=True, so two saves within the same minute write into the same folder.
save_time_utc = datetime.now(timezone.utc)
if _VN_TZ is not None:
    save_time_vn = save_time_utc.astimezone(_VN_TZ)
else:
    save_time_vn = datetime.now()  # fallback: the machine's local time
STAMP = save_time_vn.strftime("%H%M_%d-%m-%Y")  # HHMM_dd-mm-YYYY (no ':' characters)
DEST_DIR = os.path.join(DEST_ROOT, f"checkpoint_{STAMP}")
os.makedirs(DEST_DIR, exist_ok=True)

# 5) Load model/tokenizer from SRC_DIR (fall back to the base model's tokenizer if needed)
from transformers import AutoModelForSequenceClassification, AutoTokenizer
BASE_MODEL_FALLBACK = "vinai/phobert-base"

def load_tokenizer():
    """Load the slow tokenizer from the first location that holds tokenizer files.

    ``SRC_DIR`` is checked first, then ``OUTPUT_DIR``; a directory counts when at
    least one of the known tokenizer file names exists in it. When neither does,
    the tokenizer named by ``BASE_MODEL_FALLBACK`` is loaded instead.
    """
    marker_files = (
        "tokenizer.json", "tokenizer_config.json", "vocab.txt",
        "merges.txt", "sentencepiece.bpe.model", "spm.model", "special_tokens_map.json",
    )
    source = BASE_MODEL_FALLBACK
    for candidate in (SRC_DIR, OUTPUT_DIR):
        if any(os.path.exists(os.path.join(candidate, marker)) for marker in marker_files):
            source = candidate
            break
    return AutoTokenizer.from_pretrained(source, use_fast=False)

# NOTE(review): these assignments rebind notebook-level names — `tok` is the
# tokenization function in the training cell and `model` is the model built there.
# Confirm nothing run afterwards relies on the earlier objects.
tok = load_tokenizer()
model = AutoModelForSequenceClassification.from_pretrained(SRC_DIR)

# 6) Save model + tokenizer into the checkpoint_{STAMP} folder
model.save_pretrained(DEST_DIR)
tok.save_pretrained(DEST_DIR)

# 7) (Optional) also copy label files if present
# Each name is searched in SRC_DIR, then OUTPUT_DIR, then the working directory;
# only the first match is copied.
extra_files = ["labels.json", "id2label.json", "label2id.json", "label_encoder.json"]
for name in extra_files:
    for d in [SRC_DIR, OUTPUT_DIR, "."]:
        src = os.path.join(d, name)
        if os.path.exists(src):
            shutil.copy2(src, os.path.join(DEST_DIR, name))
            break

# 8) "Checkpoint finished time" = the newest mtime found in SRC_DIR
def _latest_mtime(path: str) -> Optional[float]:
    """Return the newest file modification time (epoch seconds) under ``path``.

    The tree rooted at ``path`` is walked recursively. Files whose mtime cannot
    be read because of a filesystem error (for example removed during the walk,
    or permission denied) are skipped.

    Returns:
        The largest mtime found, or ``None`` when no readable file exists,
        including when ``path`` itself does not exist.
    """
    def _readable_mtimes():
        for root, _, files in os.walk(path):
            for fname in files:
                try:
                    yield os.path.getmtime(os.path.join(root, fname))
                except OSError:
                    # Only filesystem errors are skipped; any other exception
                    # indicates a real bug and is allowed to propagate.
                    continue

    return max(_readable_mtimes(), default=None)

# Convert the newest mtime in SRC_DIR to ISO-8601 strings: one in UTC and one in
# Vietnam time (or naive local time when the time zone is unavailable).
# Both stay None when SRC_DIR contains no readable file.
ts = _latest_mtime(SRC_DIR)
utc_iso = local_iso = None
if ts is not None:
    dt_utc = datetime.fromtimestamp(ts, tz=timezone.utc)
    utc_iso = dt_utc.isoformat()
    if _VN_TZ is not None:
        dt_vn = dt_utc.astimezone(_VN_TZ)
        local_iso = dt_vn.isoformat()
    else:
        local_iso = datetime.fromtimestamp(ts).isoformat()

# 9) Write metadata
# Records where the export came from, where it was written, and both the save
# time and the checkpoint's newest file time, in checkpoint_info.json inside DEST_DIR.
meta = {
    "source_checkpoint_basename": os.path.basename(os.path.normpath(SRC_DIR)),
    "source_checkpoint_path": os.path.abspath(SRC_DIR),
    "export_root": os.path.abspath(DEST_ROOT),
    "export_path": os.path.abspath(DEST_DIR),
    "saved_time_vietnam": save_time_vn.isoformat(),
    "saved_time_utc": save_time_utc.isoformat(),
    "checkpoint_finished_time_utc": utc_iso,
    "checkpoint_finished_time_vietnam": local_iso,
}
with open(os.path.join(DEST_DIR, "checkpoint_info.json"), "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("‚úÖ ƒê√É L∆ØU MODEL & TOKENIZER T·ªöI:", DEST_DIR)
print("üì¶ Ngu·ªìn checkpoint (m·ªõi nh·∫•t):", SRC_DIR)
print("üïí Saved (VN):", meta["saved_time_vietnam"])
print("üïí Saved (UTC):", meta["saved_time_utc"])
print("‚ÑπÔ∏è Load l·∫°i: AutoModelForSequenceClassification.from_pretrained(DEST_DIR)")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ ƒê√É L∆ØU MODEL & TOKENIZER T·ªöI: /content/drive/MyDrive/PBL6_DataSet/PBL6_Dataset/checkpoint_2057_26-09-2025
üì¶ Ngu·ªìn checkpoint (m·ªõi nh·∫•t): ./phobert_intent_vi/checkpoint-2535
üïí Saved (VN): 2025-09-26T20:57:49.821294+07:00
üïí Saved (UTC): 2025-09-26T13:57:49.821294+00:00
‚ÑπÔ∏è Load l·∫°i: AutoModelForSequenceClassification.from_pretrained(DEST_DIR)
