In [1]:
!pip install sacrebleu
!pip -q install wandb


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
import os
import sys
import html
import time
import random
import re
import unicodedata
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import GradScaler
from tqdm.notebook import tqdm
from datasets import Dataset as HfDataset, DatasetDict
from sacrebleu.metrics import BLEU

# Tokenizers
from tokenizers import (
    Tokenizer, models, trainers, pre_tokenizers, 
    normalizers, decoders, processors
)

# Th√™m ƒë∆∞·ªùng d·∫´n code (tr√™n kaggle)
CODE_DIR = "/kaggle/input/dataset1/Transformer/"
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

# --- REPRODUCIBILITY ---
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


In [None]:
# T·ªáp c·∫•u h√¨nh chi ti·∫øt cho m√¥ h√¨nh Transformer: bao g·ªìm thi·∫øt l·∫≠p thi·∫øt b·ªã,tham s·ªë d·ªØ li·ªáu, si√™u tham s·ªë hu·∫•n luy·ªán, ki·∫øn tr√∫c m√¥ h√¨nh v√† n∆°i l∆∞u tr·ªØ tr√™n kaggle.

CFG = {
    "seed": 42,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "use_cuda": torch.cuda.is_available(),
    
    "data_dir": "data",
    "src_lang": "en",
    "tgt_lang": "vi",
    "vocab_size": 16000,
    "max_len": 128,
    "num_workers": 4,
    
    "batch_size": 128,
    "accumulate_steps": 2,
    "epochs": 25,
    "base_lr": 1.0,
    "warmup_steps": 800,
    "lr_factor": 1.0,
    "label_smoothing": 0.1,
    "grad_clip": 1.0,
    "mixed_precision": "bf16", 
    
    "d_model": 512, 
    "num_heads": 8, 
    "num_layers": 6, 
    "d_ff": 2048, 
    "dropout": 0.2, 
    "tie_weights": True,
    
    "beam_size": 4, 
    "max_decode_len": 60, 
    "length_penalty_alpha": 0.8,
    "bleu_tokenize": "13a",   
    "bleu_lowercase": False,   
    
    "ckpt_dir": "./checkpoints",
    "avg_loss_window": 10,
    "avg_topk_loss_k": 5,
    "avg_topk_loss_name": "avg_top5_best_loss.pth",
}

CFG["avg_path"] = os.path.join(CFG["ckpt_dir"], CFG["avg_topk_loss_name"])
CFG["window_dir"] = os.path.join(CFG["ckpt_dir"], "window_checkpoints")

DATA_DIR = "/kaggle/input/dataset3/Transformer/data"
FILE_PATHS = {
    "train": {CFG["src_lang"]: os.path.join(DATA_DIR, f"train.{CFG['src_lang']}"), 
              CFG["tgt_lang"]: os.path.join(DATA_DIR, f"train.{CFG['tgt_lang']}")},
    "validation": {CFG["src_lang"]: os.path.join(DATA_DIR, f"validation.{CFG['src_lang']}"),
                   CFG["tgt_lang"]: os.path.join(DATA_DIR, f"validation.{CFG['tgt_lang']}")},
    "test": {CFG["src_lang"]: os.path.join(DATA_DIR, f"test.{CFG['src_lang']}"),
             CFG["tgt_lang"]: os.path.join(DATA_DIR, f"test.{CFG['tgt_lang']}")},
}

seed_everything(CFG["seed"])
os.makedirs(CFG["ckpt_dir"], exist_ok=True)
os.makedirs(CFG["window_dir"], exist_ok=True)


In [None]:
# C√°c h√†m ti·ªán √≠ch x·ª≠ l√Ω d·ªØ li·ªáu: n·∫°p v√† l√†m s·∫°ch vƒÉn b·∫£n t·ª´ t·ªáp ngu·ªìn, ƒë·ªìng b·ªô h√≥a c·∫∑p c√¢u song ng·ªØ v√† hu·∫•n luy·ªán b·ªô t√°ch t·ª´ BPE Tokenizer.

def load_and_clean_data(file_paths, src_lang, tgt_lang):
    raw_data = {}
    for split, paths in file_paths.items():
        with open(paths[src_lang], "r", encoding="utf-8", errors="replace") as f_src, \
             open(paths[tgt_lang], "r", encoding="utf-8", errors="replace") as f_tgt:
            src_lines = [html.unescape(line.strip()) for line in f_src]
            tgt_lines = [html.unescape(line.strip()) for line in f_tgt]
        
        min_len = min(len(src_lines), len(tgt_lines))
        data = [{"translation": {src_lang: s, tgt_lang: t}} for s, t in zip(src_lines[:min_len], tgt_lines[:min_len])]
        raw_data[split] = HfDataset.from_list(data)
        print(f"-> {split}: {len(data)} lines loaded.")
        
    return DatasetDict(raw_data)

def train_tokenizer(dataset, lang, vocab_size):
    tok = Tokenizer(models.BPE(unk_token="[UNK]"))
    tok.normalizer = normalizers.Sequence([normalizers.NFKC(), normalizers.StripAccents()]) if lang == "en" else normalizers.Sequence([normalizers.NFC()])
    tok.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.Metaspace(), pre_tokenizers.Punctuation()])
    
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"], min_frequency=2)
    tok.train_from_iterator((item["translation"][lang] for item in dataset), trainer)
    
    tok.post_processor = processors.TemplateProcessing(
        single="[BOS] $A [EOS]",
        special_tokens=[("[BOS]", tok.token_to_id("[BOS]")), ("[EOS]", tok.token_to_id("[EOS]"))]
    )
    tok.decoder = decoders.Metaspace()
    return tok

In [None]:
# 1. Load Data
print("Loading Data...")
datasets = load_and_clean_data(FILE_PATHS, CFG["src_lang"], CFG["tgt_lang"])

# 2. Train Tokenizers
print("Training Tokenizers...")
tokenizer_src = train_tokenizer(datasets["train"], CFG["src_lang"], CFG["vocab_size"])
tokenizer_tgt = train_tokenizer(datasets["train"], CFG["tgt_lang"], CFG["vocab_size"])

# 3. Global Constants
PAD_ID = tokenizer_src.token_to_id("[PAD]")
BOS_ID = tokenizer_tgt.token_to_id("[BOS]")
EOS_ID = tokenizer_tgt.token_to_id("[EOS]")
SRC_VOCAB_SIZE = tokenizer_src.get_vocab_size()
TGT_VOCAB_SIZE = tokenizer_tgt.get_vocab_size()


In [None]:
# ƒê·ªãnh nghƒ©a l·ªõp CleanCollate ƒë·ªÉ x·ª≠ l√Ω batch v√† h√†m get_loaders ƒë·ªÉ kh·ªüi t·∫°o c√°c DataLoader.

class CleanCollate:
    def __init__(self, tok_src, tok_tgt, cfg):
        self.tok_src = tok_src
        self.tok_tgt = tok_tgt
        self.src_lang = cfg["src_lang"]
        self.tgt_lang = cfg["tgt_lang"]
        self.max_len = cfg["max_len"]
        self.pad_id = tok_src.token_to_id("[PAD]")

    def __call__(self, batch):
        src_batch, tgt_batch = [], []
        for item in batch:
            s_txt = item["translation"][self.src_lang]
            t_txt = item["translation"][self.tgt_lang]
            
            s_ids = self.tok_src.encode(str(s_txt)).ids[:self.max_len]
            t_ids = self.tok_tgt.encode(str(t_txt)).ids[:self.max_len]
            
            src_batch.append(torch.tensor(s_ids, dtype=torch.long))
            tgt_batch.append(torch.tensor(t_ids, dtype=torch.long))
            
        src_padded = pad_sequence(src_batch, batch_first=True, padding_value=self.pad_id)
        tgt_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=self.pad_id)
        return src_padded, tgt_padded

def get_loaders(datasets, cfg):
    collate_fn = CleanCollate(tokenizer_src, tokenizer_tgt, cfg)
    
    train_loader = DataLoader(datasets["train"], batch_size=cfg["batch_size"], shuffle=True, collate_fn=collate_fn, num_workers=cfg["num_workers"], pin_memory=True, persistent_workers=True)
    val_loader = DataLoader(datasets["validation"], batch_size=cfg["batch_size"], shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(datasets["test"], batch_size=cfg["batch_size"], shuffle=False, collate_fn=collate_fn)
    
    return train_loader, val_loader, test_loader

train_loader, val_loader, test_loader = get_loaders(datasets, CFG)

In [None]:
from model.Transformer import Transformer 

# 1. Model Init
model = Transformer(
    src_vocab_size=SRC_VOCAB_SIZE, tgt_vocab_size=TGT_VOCAB_SIZE,
    d_model=CFG["d_model"], num_heads=CFG["num_heads"], d_ff=CFG["d_ff"],
    num_encoder_layers=CFG["num_layers"], num_decoder_layers=CFG["num_layers"],
    dropout=CFG["dropout"], pad_id=PAD_ID, tie_weights=CFG["tie_weights"]
).to(CFG["device"])

# Xavier Init
for p in model.parameters():
    if p.dim() > 1: nn.init.xavier_uniform_(p)

# Torch Compile (Optimization)
try:
    model = torch.compile(model, mode="default")
    print("Torch Compile Activated!")
except:
    print("Torch Compile Failed, using eager mode.")

# 2. Optimizer & Loss
optimizer = optim.Adam(model.parameters(), lr=CFG["base_lr"], betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=CFG["label_smoothing"])
scaler = GradScaler(enabled=CFG["use_cuda"])

# 3. Scheduler (Noam)
def rate(step, model_size, factor, warmup):
    if step == 0: step = 1
    return factor * (model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5)))

lr_scheduler = optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda step: rate(step, CFG["d_model"], CFG["lr_factor"], CFG["warmup_steps"])
)


Torch Compile Activated!
Model & Optimizer Setup Done.


  scaler = GradScaler(enabled=CFG["use_cuda"])


In [None]:
import re
import unicodedata
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from sacrebleu.metrics import BLEU
from tqdm.notebook import tqdm



_SENT_END_RE = re.compile(r'(^|[.!?]\s+)(["‚Äú‚Äù‚Äò‚Äô\(\[\{]*)([A-Za-z√Ä-·ªπ])')

def _normalize_text(s: str) -> str:
    # L√†m s·∫°ch text, x·ª≠ l√Ω unicode v√† artifact BPE
    s = unicodedata.normalize("NFC", s)
    s = s.replace("‚ñÅ", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _detok_punct(s: str) -> str:
    # G·∫Øn li·ªÅn d·∫•u c√¢u v√†o t·ª´ ƒë·ª©ng tr∆∞·ªõc
    s = re.sub(r"\s+([.,!?;:%)\]\}])", r"\1", s)
    s = re.sub(r"([(\[\{])\s+", r"\1", s)
    return s

def _trim_hyp_ids(ids):
    # C·∫Øt b·ªè token th·ª´a trong output model
    if len(ids) > 0 and ids[0] == BOS_ID:
        ids = ids[1:]
    if EOS_ID in ids:
        ids = ids[:ids.index(EOS_ID)]
    ids = [x for x in ids if x != PAD_ID]
    return ids


def beam_search_decode(model, src, src_mask, max_len, beam_size):
    memory = model.encoder(src, src_mask)
    memory = memory.repeat(beam_size, 1, 1)
    src_mask = src_mask.repeat(beam_size, 1, 1, 1)
    beam = [(0.0, [BOS_ID])]
    
    for _ in range(max_len):
        candidates = []
        beam_inputs = pad_sequence([torch.tensor(b[1], device=CFG["device"]) for b in beam], 
                                   batch_first=True, padding_value=PAD_ID)
        tgt_mask = model.make_tgt_mask(beam_inputs)
        
        with torch.no_grad():
            logits = model.decoder(beam_inputs, memory[:len(beam)], tgt_mask, src_mask[:len(beam)])[:, -1, :]
            log_probs = F.log_softmax(logits, dim=-1)
            
        for i, (score, seq) in enumerate(beam):
            if seq[-1] == EOS_ID: 
                candidates.append((score, seq)); continue
            
            vals, idxs = log_probs[i].topk(beam_size)
            for v, idx in zip(vals, idxs):
                candidates.append((score + v.item(), seq + [idx.item()]))
                
        beam = sorted(candidates, key=lambda x: x[0] / (len(x[1])**CFG["length_penalty_alpha"]), reverse=True)[:beam_size]
        if all(b[1][-1] == EOS_ID for b in beam): break
        
    return beam[0][1]

# Bleu caculation
bleu_metric = BLEU(tokenize=CFG["bleu_tokenize"], lowercase=CFG["bleu_lowercase"])

def calculate_bleu(model, loader, limit=None):
    model.eval()
    hyps, refs = [], []
    ds = loader.dataset
    
    # Logic ch·ªçn index r·∫£i ƒë·ªÅu
    n_total = len(ds)
    n_eval = min(limit, n_total) if limit else n_total
    step = max(1, n_total // n_eval)
    idxs = list(range(0, n_total, step))[:n_eval]
    
    with torch.no_grad():
        for i in tqdm(idxs, desc="Calculating BLEU", leave=False):
            item = ds[int(i)]
            src_text = item["translation"][CFG["src_lang"]]
            ref_text = item["translation"][CFG["tgt_lang"]]
            
            # Encode
            src_ids = torch.tensor(tokenizer_src.encode(src_text).ids).unsqueeze(0).to(CFG["device"])
            src_mask = (src_ids != PAD_ID).unsqueeze(1).unsqueeze(2)
            
            # Decode
            out_ids = beam_search_decode(model, src_ids, src_mask, CFG["max_decode_len"], CFG["beam_size"])
            out_ids = _trim_hyp_ids(out_ids)
            hyp_text = tokenizer_tgt.decode(out_ids, skip_special_tokens=True)
            
            hyp_clean = _detok_punct(_normalize_text(hyp_text))
            hyps.append(hyp_clean)
           
            ref_clean = _detok_punct(_normalize_text(ref_text))
            refs.append(ref_clean)
            
    return bleu_metric.corpus_score(hyps, [refs]).score


In [9]:
from kaggle_secrets import UserSecretsClient
import wandb

wandb_key = UserSecretsClient().get_secret("WANDB_API_KEY")
wandb.login(key=wandb_key)

wandb.init(
    project="lossvalid",
    config=CFG,
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvuxuandungnb2k5[0m ([33mvuxuandungnb2k5-cnn[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: setting up run 4wm2p593
[34m[1mwandb[0m: Tracking run with wandb version 0.22.2
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20251223_045119-4wm2p593[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfanciful-snow-10[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/vuxuandungnb2k5-cnn/lossvalid[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/vuxuandungnb2k5-cnn/lossvalid/runs/4wm2p593[0m


In [10]:
wandb.run.name = f"lr{CFG.get('lr','')}_bs{CFG.get('batch_size','')}"

In [11]:
global_step = 0

In [None]:
def train_epoch(model, train_loader, optimizer, scheduler, scaler):
    model.train()
    total_loss = 0
    dtype = torch.bfloat16 if CFG["mixed_precision"] == "bf16" else torch.float16

    pbar = tqdm(train_loader, desc="Training", leave=False)
    for i, (src, tgt) in enumerate(pbar):
        src, tgt = src.to(CFG["device"]), tgt.to(CFG["device"])
        tgt_in, tgt_out = tgt[:, :-1], tgt[:, 1:]

        with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=CFG["use_cuda"]):
            logits = model(src, tgt_in)
            loss = criterion(logits.reshape(-1, TGT_VOCAB_SIZE), tgt_out.reshape(-1))
            loss = loss / CFG["accumulate_steps"]

        if CFG["mixed_precision"] == "bf16":
            loss.backward()
            if (i + 1) % CFG["accumulate_steps"] == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), CFG["grad_clip"])
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
        else:
            scaler.scale(loss).backward()
            if (i + 1) % CFG["accumulate_steps"] == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), CFG["grad_clip"])
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

        loss_unscaled = loss.item() * CFG["accumulate_steps"]
        total_loss += loss_unscaled
        pbar.set_postfix(loss=f"{total_loss/(i+1):.4f}")

    return total_loss / len(train_loader)

def validate(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(CFG["device"]), tgt.to(CFG["device"])
            with torch.amp.autocast(device_type="cuda", enabled=CFG["use_cuda"]):
                logits = model(src, tgt[:, :-1])
                loss = criterion(logits.reshape(-1, TGT_VOCAB_SIZE), tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

def average_checkpoints(paths):
    # C·ªông trung b√¨nh tr·ªçng s·ªë c√°c file model
    print(f"Averaging {len(paths)} checkpoints...")
    avg_state = {}
    states = [torch.load(p, map_location="cpu") for p in paths]

    states = [s["model_state_dict"] if "model_state_dict" in s else s for s in states]
    
    for k in states[0].keys():
        params = [s[k] for s in states]
        if torch.is_floating_point(params[0]):
            avg_state[k] = sum(params) / len(params)
        else:
            avg_state[k] = params[0] 
            
    return avg_state



In [None]:
import time
import torch
import os

window_ckpts = [] 

print("START TRAINING (Per-Epoch Logging)...")
print(f"{'EPOCH':<6} | {'TRAIN LOSS':<12} | {'VAL LOSS':<12} |")
print("-" * 45)

for epoch in range(CFG["epochs"]):
    # Ch·∫°y 1 epoch train v√† l·∫•y loss trung b√¨nh
    train_loss_avg = train_epoch(model, train_loader, optimizer, lr_scheduler, scaler)
    
    # Ch·∫°y validate v√† l·∫•y loss trung b√¨nh
    val_loss_avg = validate(model, val_loader)
    
    # Log l√™n WandB sau khi k·∫øt th√∫c 1 epoch
    if wandb.run is not None:
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss_avg,
            "val_loss": val_loss_avg,
            "lr": optimizer.param_groups[0]['lr']
        })
    
    # In m√†n h√¨nh
    print(f"{epoch+1:<6} | {train_loss_avg:<12.4f} | {val_loss_avg:<12.4f} |")
    
    # L∆∞u checkpoint
    if epoch >= (CFG["epochs"] - CFG["avg_loss_window"]):
        path = os.path.join(CFG["window_dir"], f"ep{epoch+1}_loss{val_loss_avg:.4f}.pth")
        torch.save(model.state_dict(), path)
        window_ckpts.append((val_loss_avg, path))

print("\nTraining Finished. Performing Checkpoint Averaging...")

print("\nAveraging 5 checkpoints...")

# S·∫Øp x·∫øp c√°c checkpoint theo val_loss th·∫•p nh·∫•t
window_ckpts.sort(key=lambda x: x[0])

# L·∫•y Top-K checkpoint t·ªët nh·∫•t
top_k_paths = [p for _, p in window_ckpts[:CFG["avg_topk_loss_k"]]]

if top_k_paths:
    avg_weights = average_checkpoints(top_k_paths)
    torch.save(avg_weights, CFG["avg_path"])
    print(f"Saved Averaged Model (Top-{len(top_k_paths)} Best Loss) to: {CFG['avg_path']}")
else:
    print("error.")

START TRAINING (FAST MODE - LOSS ONLY)...
EPOCH  | TRAIN LOSS   | VAL LOSS     |
---------------------------------------------


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

1      | 5.6621       | 4.2024       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

2      | 3.8170       | 3.6726       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

3      | 3.4189       | 3.4697       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

4      | 3.2240       | 3.3684       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

5      | 3.0984       | 3.3062       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

6      | 3.0058       | 3.2593       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

7      | 2.9337       | 3.2145       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

8      | 2.8738       | 3.1927       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

9      | 2.8251       | 3.1769       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

10     | 2.7812       | 3.1578       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

11     | 2.7441       | 3.1457       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

12     | 2.7106       | 3.1462       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

13     | 2.6811       | 3.1371       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

14     | 2.6538       | 3.1318       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

15     | 2.6286       | 3.1339       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

16     | 2.6056       | 3.1298       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

17     | 2.5846       | 3.1297       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

18     | 2.5642       | 3.1289       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

19     | 2.5455       | 3.1257       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

20     | 2.5280       | 3.1289       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

21     | 2.5119       | 3.1296       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

22     | 2.4958       | 3.1303       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

23     | 2.4811       | 3.1305       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

24     | 2.4664       | 3.1326       |


Training:   0%|          | 0/1041 [00:00<?, ?it/s]

25     | 2.4536       | 3.1389       |

Training Finished. Performing Checkpoint Averaging...
Averaging 5 checkpoints...
Saved Averaged Model (Top-5 Best Loss) to: ./checkpoints/avg_top5_best_loss.pth


In [14]:
wandb.finish()

[34m[1mwandb[0m: updating run metadata
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:             lr ‚ñà‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
[34m[1mwandb[0m:     train/loss ‚ñà‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
[34m[1mwandb[0m: train/loss_avg ‚ñà‚ñá‚ñÑ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
[34m[1mwandb[0m:       val/loss ‚ñà‚ñÜ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:             lr 0.00039
[34m[1mwandb[0m:     train/loss 2.43701
[34m[1mwandb[0m: train/loss_avg 2.45339
[34m[1mwandb[0m:       val/loss 3.13257
[34m[1mwandb[0m: 
[34m[1mwandb[0m: üöÄ View run

In [None]:
import torch
import os
from tqdm.notebook import tqdm

if test_loader is not None:
    target_loader = test_loader
    split_name = "FULL TEST"
else:
    target_loader = val_loader
    split_name = "FULL VALIDATION (Thay th·∫ø)"

output_txt_file = "model_outputs.txt"

print(f" M·ª§C TI√äU: Ch·∫•m ƒëi·ªÉm & Xu·∫•t file b·∫£n d·ªãch tr√™n t·∫≠p {split_name}\n")

ckpt_path = CFG.get("avg_path", "path_not_found")

if os.path.exists(ckpt_path):
    print(f" ƒêang x·ª≠ l√Ω: MODEL AVERAGED")
    
    try:
        # 1. Load tr·ªçng s·ªë
        state_dict = torch.load(ckpt_path, map_location=CFG["device"])
        if isinstance(state_dict, dict) and ("model_state_dict" in state_dict):
            state_dict = state_dict["model_state_dict"]

        model.load_state_dict(state_dict)
        model.eval()
        print(f"   ƒê√£ t·∫£i checkpoint: {os.path.basename(ckpt_path)}")
        
        # 2. D·ªãch to√†n b·ªô t·∫≠p d·ªØ li·ªáu
        hyps, refs = [], []
        ds = target_loader.dataset
        
        with torch.no_grad():
            for i in tqdm(range(len(ds)), desc="Translating"):
                item = ds[i]
                src_text = item["translation"][CFG["src_lang"]]
                ref_text = item["translation"][CFG["tgt_lang"]]
                
                # Encode & Decode
                src_ids = torch.tensor(tokenizer_src.encode(src_text).ids).unsqueeze(0).to(CFG["device"])
                src_mask = (src_ids != PAD_ID).unsqueeze(1).unsqueeze(2)
                
                # S·ª≠ d·ª•ng Beam Search ƒë√£ ƒë·ªãnh nghƒ©a ·ªü Cell 8
                out_ids = beam_search_decode(model, src_ids, src_mask, CFG["max_decode_len"], CFG["beam_size"])
                out_ids = _trim_hyp_ids(out_ids)
                hyp_text = tokenizer_tgt.decode(out_ids, skip_special_tokens=True)
                
                # L√†m s·∫°ch text theo ƒë√∫ng logic post-processing
                hyp_clean = _detok_punct(_normalize_text(hyp_text))
                ref_clean = _detok_punct(_normalize_text(ref_text))
                
                hyps.append(hyp_clean)
                refs.append(ref_clean)

        # 3. T√≠nh ƒëi·ªÉm BLEU ƒë·ªÉ hi·ªÉn th·ªã
        score = bleu_metric.corpus_score(hyps, [refs]).score
        print(f"\n   K·∫æT QU·∫¢ {split_name}:")
        print(f"      + BLEU: {score:.2f}")

        # 4. XU·∫§T FILE CH·ªà CH·ª®A B·∫¢N D·ªäCH (M·ªói c√¢u m·ªôt d√≤ng)
        print(f"   ƒêang xu·∫•t b·∫£n d·ªãch ra file: {output_txt_file}")
        with open(output_txt_file, "w", encoding="utf-8") as f:
            for line in hyps:
                f.write(line + "\n")
        
        print(f"   Ho√†n t·∫•t! File '{output_txt_file}' ƒë√£ s·∫µn s√†ng trong ph·∫ßn Output.")
        
    except Exception as e:
        print(f"   L·ªói: {e}")

else:
    print(f" Kh√¥ng t√¨m th·∫•y checkpoint t·∫°i: {ckpt_path}")

print("\n DONE!")

 M·ª§C TI√äU: Ch·∫•m ƒëi·ªÉm & Xu·∫•t file b·∫£n d·ªãch tr√™n t·∫≠p FULL TEST

 ƒêang x·ª≠ l√Ω: MODEL AVERAGED
   ƒê√£ t·∫£i checkpoint: avg_top5_best_loss.pth


Translating:   0%|          | 0/1268 [00:00<?, ?it/s]


   K·∫æT QU·∫¢ FULL TEST:
      + BLEU: 29.94
   ƒêang xu·∫•t b·∫£n d·ªãch ra file: model_outputs.txt
   Ho√†n t·∫•t! File 'model_outputs.txt' ƒë√£ s·∫µn s√†ng trong ph·∫ßn Output.

 DONE!
