In [1]:
from pathlib import Path
import sys, math, time

import torch
from torch.utils.data import DataLoader

# 1) Locations of the project code and the data on Kaggle
CODE_DIR = Path("/kaggle/input/envi-nmt-code/src")
DATA_ROOT = Path("/kaggle/input/envi-nmt-data/data")
PROCESSED_DIR = DATA_ROOT / "processed"
SPM_MODEL = DATA_ROOT / "spm" / "spm_unigram.model"

print("CODE_DIR:", CODE_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)
print("SPM_MODEL:", SPM_MODEL)

# 2) Add src to sys.path so that tokenizer, dataset and model can be imported.
#    Guarded so that re-running this cell does not append the same entry again.
if str(CODE_DIR) not in sys.path:
    sys.path.append(str(CODE_DIR))

# 3) Import the classes that already exist in the project code
from tokenizer import SubwordTokenizer
from dataset import NMTDataset, collate_fn
from model import Transformer

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


CODE_DIR: /kaggle/input/envi-nmt-code/src
PROCESSED_DIR: /kaggle/input/envi-nmt-data/data/processed
SPM_MODEL: /kaggle/input/envi-nmt-data/data/spm/spm_unigram.model
Device: cuda


In [2]:
# Initialize the tokenizer from the SentencePiece model
tokenizer = SubwordTokenizer(str(SPM_MODEL))

# NOTE: the sentencepiece version on Kaggle exposes vocab_size(), not get_vocab_size()
vocab_size = tokenizer.sp.vocab_size()
pad_id = tokenizer.pad_id
bos_id = tokenizer.bos_id
eos_id = tokenizer.eos_id

print("Vocab size:", vocab_size)
print("pad/bos/eos:", pad_id, bos_id, eos_id)

# Basic hyperparameters
MAX_SRC_LEN = 70
MAX_TGT_LEN = 70
BATCH_SIZE = 64  # if you run out of memory, reduce to 32 or 16

# Build the datasets (train and valid share the tokenizer and length limits)
train_dataset = NMTDataset(
    data_dir=str(PROCESSED_DIR),
    split="train",
    tokenizer=tokenizer,
    max_src_len=MAX_SRC_LEN,
    max_tgt_len=MAX_TGT_LEN,
)

valid_dataset = NMTDataset(
    data_dir=str(PROCESSED_DIR),
    split="valid",
    tokenizer=tokenizer,
    max_src_len=MAX_SRC_LEN,
    max_tgt_len=MAX_TGT_LEN,
)

print("Train size:", len(train_dataset))
print("Valid size:", len(valid_dataset))

# Build the DataLoaders. The lambda forwards the notebook-level pad_id to
# collate_fn; only the training loader shuffles.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda batch: collate_fn(batch, pad_id=pad_id),
)

valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=lambda batch: collate_fn(batch, pad_id=pad_id),
)

# Sanity check: pull one batch and print the shape/dtype of every tensor in it
batch = next(iter(train_loader))
for k, v in batch.items():
    print(k, v.shape, v.dtype)


Vocab size: 8000
pad/bos/eos: 0 2 3
Train size: 132406
Valid size: 1550
src_ids torch.Size([64, 70]) torch.int64
tgt_in_ids torch.Size([64, 70]) torch.int64
tgt_out_ids torch.Size([64, 70]) torch.int64
src_padding_mask torch.Size([64, 70]) torch.bool
tgt_padding_mask torch.Size([64, 70]) torch.bool


In [3]:
# H√†m t·∫°o mask cho Encoder
def make_src_mask(src_ids: torch.Tensor, pad_id: int):
    """
    Build the padding mask for the encoder.

    src_ids: (B, S) source token ids.
    Returns a bool mask of shape (B, 1, 1, S): True = position is kept,
    False = position is padding and must be hidden.
    """
    # Mark every non-padding position.
    keep = src_ids.ne(pad_id)
    # Insert two singleton axes after the batch axis. The mask stays bool;
    # the original note says attention checks `mask == 0`, which works on bool.
    return keep[:, None, None]


# H√†m t·∫°o mask cho Decoder (pad + look-ahead)
def make_tgt_mask(tgt_ids: torch.Tensor, pad_id: int):
    """
    Build the decoder mask (padding + look-ahead).

    tgt_ids: (B, T) decoder input ids (BOS, w1, w2, ...).
    Returns a bool mask of shape (B, 1, T, T); entry [b, 0, i, j] is True only
    when key position j is not padding and j <= i.
    """
    _, seq_len = tgt_ids.shape

    # Look-ahead part: a lower-triangular (T, T) pattern, so each query row i
    # only sees positions up to and including i.
    positions = torch.arange(seq_len, device=tgt_ids.device)
    causal = positions.unsqueeze(0) <= positions.unsqueeze(1)  # (T, T)

    # Padding part: True where the key token is a real token.
    not_pad = tgt_ids.ne(pad_id)[:, None, None, :]  # (B, 1, 1, T)

    # A position is visible only if both conditions hold (broadcast to (B, 1, T, T)).
    return not_pad & causal[None, None, :, :]


# Build the baseline model together with its loss and optimizer.
# NOTE(review): within the cells visible here, `model`, `criterion` and
# `optimizer` are not referenced again (training uses model_v3 / ft_v3) —
# confirm whether this cell is still needed.
d_model = 512
n_layers = 4        # reduced to 4 layers to keep it light; it does not have to be 6 as in the paper
n_heads = 8
d_ff = 2048
dropout = 0.1
# Positional length limit: the larger of the source/target length limits.
max_len = max(MAX_SRC_LEN, MAX_TGT_LEN)

model = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=d_model,
    n_layers=n_layers,
    n_heads=n_heads,
    d_ff=d_ff,
    dropout=dropout,
    max_len=max_len,
).to(device)

# Padding targets are excluded from the loss via ignore_index.
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.98), eps=1e-9)

# Parameter count, reported in millions.
print("Model params:", sum(p.numel() for p in model.parameters()) / 1e6, "M")


Model params: 41.723712 M


In [4]:
import os, math, time
import torch
import torch.nn as nn
import torch.optim as optim

# Device used by the V3 training / fine-tuning cells (same selection rule as `device`).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("üöÄ Device:", DEVICE)

# ========= CONFIG V3 =========
# Architecture hyperparameters passed to Transformer(...) for the V3 model.
D_MODEL = 384
N_LAYERS = 4
N_HEADS  = 8
D_FF     = 1536
DROPOUT  = 0.1
MAX_LEN  = 5000

# Training schedule: number of epochs, Noam warmup length, label smoothing for the loss.
N_EPOCHS = 30
WARMUP_STEPS = 4000
LABEL_SMOOTHING = 0.1

# Where the best V3 state_dict (lowest validation loss) is written.
best_v3_path = "/kaggle/working/best_transformer_v3.pt"

print("V3 config:",
      f"d_model={D_MODEL}, n_layers={N_LAYERS}, n_heads={N_HEADS}, d_ff={D_FF}, dropout={DROPOUT}")

# ========= MASKS =========
def make_masks(src, tgt_in, pad_id):
    """
    Build the encoder and decoder attention masks for one batch.

    Args:
        src: source token ids; the returned src_mask has shape [B, 1, 1, S].
        tgt_in: decoder input token ids; the returned tgt_mask has shape [B, 1, T, T].
        pad_id: id of the padding token.

    Returns:
        (src_mask, tgt_mask), both bool tensors where True means "may be
        attended to". tgt_mask combines the padding mask with a
        lower-triangular look-ahead mask.
    """
    # src_mask: [B, 1, 1, S] — True at non-padding source positions
    src_mask = (src != pad_id).unsqueeze(1).unsqueeze(2)

    # tgt padding mask: [B, 1, 1, T]
    tgt_pad_mask = (tgt_in != pad_id).unsqueeze(1).unsqueeze(2)
    T = tgt_in.size(1)
    # Look-ahead mask: [1, 1, T, T], lower triangular. It is allocated on the
    # device of tgt_in (the tensor it is combined with) rather than on src's.
    nopeak_mask = torch.tril(torch.ones((1, 1, T, T), device=tgt_in.device)).bool()

    # tgt_mask: [B, 1, T, T] — visible only if not padding AND not in the future
    tgt_mask = tgt_pad_mask & nopeak_mask
    return src_mask, tgt_mask

# ========= NOAM SCHEDULER (simple + robust) =========
class NoamLR:
    """
    Noam learning-rate schedule applied directly to an optimizer's param groups.

    lr = factor * d_model^{-0.5} * min(step^{-0.5}, step * warmup^{-1.5})

    Args:
        optimizer: object exposing `param_groups` (a list of dicts with an "lr" key).
        d_model: model width used in the formula; must be > 0.
        warmup_steps: number of warmup steps; must be > 0.
        factor: constant multiplier applied to the schedule.

    Raises:
        ValueError: if d_model or warmup_steps is not positive.
    """
    def __init__(self, optimizer, d_model, warmup_steps=4000, factor=1.0):
        # Fail early: a non-positive value would otherwise surface later as a
        # ZeroDivisionError (or a complex number) inside rate().
        if d_model <= 0:
            raise ValueError(f"d_model must be > 0, got {d_model!r}")
        if warmup_steps <= 0:
            raise ValueError(f"warmup_steps must be > 0, got {warmup_steps!r}")

        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.factor = factor
        self.step_num = 0

        # Prime the optimizer with the first scheduled value. step() is called
        # after optimizer.step(), so without this the very first update would
        # run with whatever lr the optimizer was constructed with (0.0 here).
        self._set_lr(self.rate())

    def rate(self):
        """Return the scheduled lr for the current step (step 0 is treated as step 1)."""
        step = max(self.step_num, 1)
        return self.factor * (self.d_model ** -0.5) * min(step ** -0.5, step * (self.warmup_steps ** -1.5))

    def step(self):
        """Advance the step counter by one and write the new lr into every param group."""
        self.step_num += 1
        self._set_lr(self.rate())

    def _set_lr(self, lr):
        """Write `lr` into every param group of the wrapped optimizer."""
        for g in self.optimizer.param_groups:
            g["lr"] = lr

# ========= MODEL =========
# V3 model built from the CONFIG V3 constants and moved to DEVICE.
model_v3 = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    d_ff=D_FF,
    dropout=DROPOUT,
    max_len=MAX_LEN,
).to(DEVICE)

# Cross-entropy that ignores padding targets and applies label smoothing.
criterion_v3 = nn.CrossEntropyLoss(ignore_index=pad_id, label_smoothing=LABEL_SMOOTHING)

# NOTE: the optimizer lr starts at 0; the scheduler sets the lr following the Noam schedule
optimizer_v3 = optim.Adam(model_v3.parameters(), lr=0.0, betas=(0.9, 0.98), eps=1e-9)
scheduler_v3 = NoamLR(optimizer_v3, d_model=D_MODEL, warmup_steps=WARMUP_STEPS, factor=1.0)

def run_epoch(model, dataloader, optimizer, criterion, device, pad_id, is_train=True, scheduler=None):
    """
    Run one pass over `dataloader`, training or evaluating `model`.

    Args:
        model: called as model(src, tgt_in, src_mask, tgt_mask) -> logits [B, T, V].
        dataloader: yields dicts with "src_ids", "tgt_in_ids" and "tgt_out_ids".
        optimizer: stepped once per batch when is_train is True.
        criterion: loss applied to flattened logits and flattened targets.
        device: device the batch tensors are moved to.
        pad_id: padding token id (used for the masks and the token count).
        is_train: True -> train mode with gradient updates; False -> eval mode, no gradients.
        scheduler: optional object with .step(), called after each optimizer step.

    Returns:
        (avg_loss, ppl): token-weighted average loss and exp(avg_loss)
        (reported as inf unless avg_loss < 20).
    """
    if is_train:
        model.train()
    else:
        model.eval()

    loss_sum = 0.0
    token_count = 0
    batch_keys = ("src_ids", "tgt_in_ids", "tgt_out_ids")

    for batch in dataloader:
        src, tgt_in, tgt_out = (batch[key].to(device) for key in batch_keys)
        src_mask, tgt_mask = make_masks(src, tgt_in, pad_id)

        if is_train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(is_train):
            logits = model(src, tgt_in, src_mask, tgt_mask)  # [B, T, V]
            vocab_dim = logits.size(-1)
            loss = criterion(logits.view(-1, vocab_dim), tgt_out.view(-1))

            if is_train:
                loss.backward()
                # Clip the global gradient norm to 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

        # Weight each batch by its number of real (non-padding) target tokens
        # so that loss/ppl are averaged per token rather than per batch.
        n_tokens = tgt_out.ne(pad_id).sum().item()
        loss_sum += loss.item() * n_tokens
        token_count += n_tokens

    avg_loss = loss_sum / max(token_count, 1)
    if avg_loss < 20:
        ppl = math.exp(avg_loss)
    else:
        ppl = float("inf")
    return avg_loss, ppl

# Train V3 for N_EPOCHS epochs; after each epoch evaluate on the validation
# set and keep the state_dict with the lowest validation loss.
print("\nüöÄ START TRAINING V3")
best_valid_loss = float("inf")

for epoch in range(1, N_EPOCHS + 1):
    start = time.time()

    # Training pass: the Noam scheduler is stepped once per batch inside run_epoch.
    train_loss, train_ppl = run_epoch(
        model_v3, train_loader, optimizer_v3, criterion_v3, DEVICE, pad_id,
        is_train=True, scheduler=scheduler_v3
    )
    valid_loss, valid_ppl = run_epoch(
        model_v3, valid_loader, optimizer_v3, criterion_v3, DEVICE, pad_id,
        is_train=False, scheduler=None   # validation must NOT step the scheduler
    )

    # Elapsed minutes cover both the training and the validation pass.
    mins = (time.time() - start) / 60
    # lr currently stored in the optimizer, i.e. the value set by the last scheduler step.
    lr_now = optimizer_v3.param_groups[0]["lr"]
    print(f"Epoch {epoch:02d} | Time: {mins:.2f} min | lr: {lr_now:.6g}")
    print(f"  Train Loss: {train_loss:.4f} | PPL: {train_ppl:.2f}")
    print(f"  Valid Loss: {valid_loss:.4f} | PPL: {valid_ppl:.2f}")

    # Checkpoint only on a strict improvement of the validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_v3.state_dict(), best_v3_path)
        print(f"  ‚úÖ Saved new best V3 to {best_v3_path}")


üöÄ Device: cuda
V3 config: d_model=384, n_layers=4, n_heads=8, d_ff=1536, dropout=0.1

üöÄ START TRAINING V3
Epoch 01 | Time: 4.35 min | lr: 0.000417354
  Train Loss: 5.7000 | PPL: 298.86
  Valid Loss: 4.5653 | PPL: 96.09
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 02 | Time: 4.34 min | lr: 0.000793303
  Train Loss: 4.1893 | PPL: 65.98
  Valid Loss: 3.8176 | PPL: 45.50
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 03 | Time: 4.33 min | lr: 0.000647729
  Train Loss: 3.6328 | PPL: 37.82
  Valid Loss: 3.4817 | PPL: 32.52
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 04 | Time: 4.34 min | lr: 0.00056095
  Train Loss: 3.3496 | PPL: 28.49
  Valid Loss: 3.3379 | PPL: 28.16
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 05 | Time: 4.34 min | lr: 0.000501729
  Train Loss: 3.1889 | PPL: 24.26
  Valid Loss: 3.2584 | PPL: 26.01
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.p

In [5]:
import os, math, time
import torch
import torch.nn as nn
import torch.optim as optim

# Fine-tune the best V3 checkpoint for a few epochs with a small constant lr.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint to start from (same path the V3 training cell saves to).
ckpt_v3 = "/kaggle/working/best_transformer_v3.pt"
assert os.path.exists(ckpt_v3), f"Kh√¥ng th·∫•y checkpoint: {ckpt_v3}"
print("Fine-tune from:", ckpt_v3)

# Must match the config of the V3 model that was trained
D_MODEL = 384
N_LAYERS = 4
N_HEADS  = 8
D_FF     = 1536
DROPOUT  = 0.1
MAX_LEN  = 5000

ft_v3 = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    d_ff=D_FF,
    dropout=DROPOUT,
    max_len=MAX_LEN,
).to(DEVICE)

# Load the saved state_dict into the freshly built model.
ft_v3.load_state_dict(torch.load(ckpt_v3, map_location=DEVICE))

# Light fine-tune: usually a small LR; label_smoothing can be lowered or kept as you prefer
criterion_ft = nn.CrossEntropyLoss(ignore_index=pad_id, label_smoothing=0.1)
optimizer_ft = optim.Adam(ft_v3.parameters(), lr=5e-5, betas=(0.9, 0.98), eps=1e-9)

N_EPOCHS_FT = 3
best_ft_path = "/kaggle/working/best_transformer_v3_ft.pt"
# NOTE: this rebinds the notebook-level best_valid_loss used by the V3 training cell.
best_valid_loss = float("inf")

print("\nüèÅ START FINE-TUNE V3")
for epoch in range(1, N_EPOCHS_FT + 1):
    start = time.time()

    # No scheduler here: the lr stays at the value given to optimizer_ft.
    train_loss, train_ppl = run_epoch(
        ft_v3, train_loader, optimizer_ft, criterion_ft, DEVICE, pad_id,
        is_train=True, scheduler=None
    )
    valid_loss, valid_ppl = run_epoch(
        ft_v3, valid_loader, optimizer_ft, criterion_ft, DEVICE, pad_id,
        is_train=False, scheduler=None
    )

    mins = (time.time() - start) / 60
    print(f"[V3_FT] Epoch {epoch:02d} | Time: {mins:.2f} min")
    print(f"  Train Loss: {train_loss:.4f} | PPL: {train_ppl:.2f}")
    print(f"  Valid Loss: {valid_loss:.4f} | PPL: {valid_ppl:.2f}")

    # Save only when the validation loss strictly improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(ft_v3.state_dict(), best_ft_path)
        print(f"  ‚úÖ Saved new best V3_FT to {best_ft_path}")


Fine-tune from: /kaggle/working/best_transformer_v3.pt

üèÅ START FINE-TUNE V3
[V3_FT] Epoch 01 | Time: 4.35 min
  Train Loss: 2.4535 | PPL: 11.63
  Valid Loss: 3.0263 | PPL: 20.62
  ‚úÖ Saved new best V3_FT to /kaggle/working/best_transformer_v3_ft.pt
[V3_FT] Epoch 02 | Time: 4.34 min
  Train Loss: 2.4258 | PPL: 11.31
  Valid Loss: 3.0273 | PPL: 20.64
[V3_FT] Epoch 03 | Time: 4.34 min
  Train Loss: 2.4124 | PPL: 11.16
  Valid Loss: 3.0246 | PPL: 20.59
  ‚úÖ Saved new best V3_FT to /kaggle/working/best_transformer_v3_ft.pt


In [6]:
def load_version(version="v2_ft"):
    """
    Load one of the saved checkpoints into the notebook-level `eval_model`.

    Args:
        version: one of 'v1', 'v2', 'v2_ft', 'v3', 'v3_ft'.
            - 'v1' is a dict checkpoint: hyperparameters are read from it
              (with defaults) and the weights from its "model_state_dict" key.
            - the others are bare state_dicts with a fixed architecture.

    Raises:
        ValueError: if `version` is not one of the supported names.

    Side effects:
        Rebinds the global `eval_model` (in eval mode) and prints a confirmation.
        The global is replaced only after the checkpoint has loaded
        successfully, so a failed load leaves the previous model in place.

    NOTE(security): torch.load unpickles the file; only load checkpoints you
    created yourself.
    """
    global eval_model

    # Architecture tuples: (d_model, n_layers, n_heads, d_ff, dropout, max_len)
    small_arch = (256, 3, 4, 1024, 0.1, 5000)
    large_arch = (384, 4, 8, 1536, 0.1, 5000)

    # Checkpoints that store a bare state_dict: version -> (path, architecture)
    state_dict_checkpoints = {
        "v2": ("/kaggle/working/best_transformer_v2.pt", small_arch),
        "v2_ft": ("/kaggle/working/best_transformer_v2_ft.pt", small_arch),
        "v3": ("/kaggle/working/best_transformer_v3.pt", large_arch),
        "v3_ft": ("/kaggle/working/best_transformer_v3_ft.pt", large_arch),
    }
    spec = state_dict_checkpoints.get(version) if isinstance(version, str) else None

    if version == "v1":
        ckpt = torch.load("/kaggle/working/best_transformer_v1.pt", map_location=device)
        arch = (
            ckpt.get("d_model", 256),
            ckpt.get("n_layers", 3),
            ckpt.get("n_heads", 4),
            ckpt.get("d_ff", 1024),
            ckpt.get("dropout", 0.1),
            ckpt.get("max_len", 5000),
        )
        state_dict = ckpt["model_state_dict"]
    elif spec is not None:
        path, arch = spec
        state_dict = torch.load(path, map_location=device)
    else:
        raise ValueError("version ph·∫£i l√† 'v1', 'v2', 'v2_ft', 'v3' ho·∫∑c 'v3_ft'.")

    # Build and populate a local model first; publish it only once it is ready.
    loaded_model = Transformer(vocab_size, vocab_size, *arch).to(device)
    loaded_model.load_state_dict(state_dict)
    loaded_model.eval()

    eval_model = loaded_model
    print(f"‚úÖ Loaded {version} model.")

    


# Examples:
# load_version("v2_ft")   # use the fine-tuned V2
# load_version("v2")   # use V2
# load_version("v1")    # use V1
# load_version("v3")    # use V3
load_version("v3_ft")   # use the fine-tuned V3 (sets the global eval_model)


‚úÖ Loaded v3_ft model.


In [7]:
import torch
import re

@torch.no_grad()
def greedy_translate(
    model,
    tokenizer,
    src_sentence: str,
    max_len: int = 70,
    max_src_len: int = MAX_SRC_LEN,
) -> str:
    """
    Translate one EN sentence to VI with greedy search.

    Args:
        model: object exposing encode(), decode() and projection().
        tokenizer: object exposing encode_src() and decode().
        src_sentence: raw source sentence; it is stripped and lower-cased.
        max_len: maximum number of tokens to generate.
        max_src_len: source ids beyond this length are cut off.

    Returns:
        The decoded translation as a whitespace-normalized string.

    Relies on the notebook-level `device`, `pad_id`, `bos_id`, `eos_id`,
    `make_src_mask` and `make_tgt_mask`.
    """

    model.eval()

    # 0) Normalize the English sentence the same way as at training time (lower-case)
    src_clean_str = src_sentence.strip().lower()

    # 1) Encode the source sentence
    src_ids = tokenizer.encode_src(src_clean_str, add_bos=False, add_eos=True)
    if len(src_ids) > max_src_len:
        src_ids = src_ids[:max_src_len]

    src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)
    src_mask = make_src_mask(src, pad_id)
    enc_output = model.encode(src, src_mask)

    # 2) Greedy decoding: re-run the decoder on the whole prefix at every step
    #    and append the arg-max token of the last position.
    tgt_ids = [bos_id]
    for _ in range(max_len):
        tgt = torch.tensor(tgt_ids, dtype=torch.long, device=device).unsqueeze(0)
        tgt_mask = make_tgt_mask(tgt, pad_id)

        dec_output = model.decode(tgt, enc_output, src_mask, tgt_mask)
        logits = model.projection(dec_output)
        next_token = logits[:, -1, :].argmax(dim=-1).item()
        tgt_ids.append(next_token)
        if next_token == eos_id:
            break

    # 3) Drop BOS/EOS and decode to text
    out_ids = tgt_ids[1:]
    if out_ids and out_ids[-1] == eos_id:
        out_ids = out_ids[:-1]

    translation = tokenizer.decode(out_ids)
    # Remove placeholder characters and collapse repeated whitespace.
    translation = translation.replace("ÔøΩ", "").replace("‚Åá", "").strip()
    translation = " ".join(translation.split())

    return translation


In [8]:
import torch
import torch.nn.functional as F
import math
import random

@torch.no_grad()
def beam_translate(
    model,
    tokenizer,
    src_sentence: str,
    max_len: int = 70,
    max_src_len: int = MAX_SRC_LEN,
    beam_size: int = 4,
    length_penalty: float = 1.0,  # 1.0 is suggested so that outputs do not get too short
):
    """
    Beam-search decoding of one EN sentence into VI.

    Each hypothesis is a tuple (tokens, log_prob, finished). At every step
    every unfinished hypothesis is expanded with its `beam_size` most likely
    next tokens, and the `beam_size` best hypotheses by
    log_prob / len(tokens) ** length_penalty are kept. Decoding stops after
    `max_len` steps or once every kept hypothesis has produced EOS.

    Returns the cleaned-up text of the best-scoring hypothesis.
    """

    def normalized_score(hypothesis):
        # Length-normalized log-probability used both for pruning and for the final pick.
        tokens, log_prob, _ = hypothesis
        return log_prob / (len(tokens) ** length_penalty)

    model.eval()

    # 0) Normalize the English sentence the same way as at training time (lower-case)
    source_text = src_sentence.strip().lower()

    # 1) Encode the source sentence
    src_ids = tokenizer.encode_src(source_text, add_bos=False, add_eos=True)
    if len(src_ids) > max_src_len:
        src_ids = src_ids[:max_src_len]

    src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)  # (1, S)
    src_mask = make_src_mask(src, pad_id)
    enc_output = model.encode(src, src_mask)

    # 2) Start with a single hypothesis containing only BOS
    hypotheses = [([bos_id], 0.0, False)]

    for _ in range(max_len):
        candidates = []

        for tokens, log_prob, finished in hypotheses:
            # A hypothesis that already emitted EOS is carried over unchanged
            if finished:
                candidates.append((tokens, log_prob, True))
                continue

            # Run the decoder on the current prefix
            tgt = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)  # (1, T)
            tgt_mask = make_tgt_mask(tgt, pad_id)

            dec_out = model.decode(tgt, enc_output, src_mask, tgt_mask)  # (1, T, d_model)
            last_logits = model.projection(dec_out)[:, -1, :]            # (1, V)
            step_log_probs = F.log_softmax(last_logits, dim=-1)

            # Expand this hypothesis with its top-k continuations
            top_log_probs, top_ids = torch.topk(step_log_probs, beam_size, dim=-1)
            for token_id, token_lp in zip(top_ids[0].tolist(), top_log_probs[0].tolist()):
                candidates.append(
                    (tokens + [token_id], log_prob + token_lp, token_id == eos_id)
                )

        # 3) Keep the beam_size best candidates by length-normalized score
        hypotheses = sorted(candidates, key=normalized_score, reverse=True)[:beam_size]

        # Stop early once every kept hypothesis is finished
        if all(done for _, _, done in hypotheses):
            break

    # 4) Pick the best hypothesis
    best_tokens, _, _ = max(hypotheses, key=normalized_score)

    # Drop BOS and a trailing EOS
    out_ids = best_tokens[1:]
    if out_ids and out_ids[-1] == eos_id:
        out_ids = out_ids[:-1]

    # Tidy up for readability: drop placeholder characters, collapse whitespace
    raw_text = tokenizer.decode(out_ids)
    cleaned = raw_text.replace("ÔøΩ", "").replace("‚Åá", "").strip()
    return " ".join(cleaned.split())


In [9]:
# Demo English sentences used to compare greedy and beam-search translations.
sentences = [
    # The 3 initial sentences
    "I really like natural language processing.",
    "This is a small machine translation model.",
    "Thank you for your help.",

    # More sentences with the subject "I"
    "I am studying machine learning at the university.",
    "I don't understand this sentence very well.",
    "I will try to improve the translation quality.",

    # "We"
    "We are working on a neural machine translation project.",
    "We need more training data to get better results.",
    "We will present our model in the final report.",

    # "You"
    "You can run the code on Kaggle with a GPU.",
    "You should compare the greedy and beam search outputs.",

    # "He / She / They"
    "He likes to read research papers about deep learning.",
    "She is preparing a presentation about transformers.",
    "They want to build a better translation system.",

    # Longer sentences with subordinate clauses
    "When I first learned about attention, I was very confused.",
    "Even if the BLEU score is not very high, the model can still be useful.",
    "If we have more time, we will try a larger transformer model.",
]


In [10]:
# Translate every demo sentence with both decoders (using the currently
# loaded eval_model) and print the results side by side.
for s in sentences:
    vi_greedy = greedy_translate(eval_model, tokenizer, s, max_len=MAX_TGT_LEN)
    vi_beam   = beam_translate(eval_model, tokenizer, s, max_len=MAX_TGT_LEN, beam_size=4)

    print("EN     :", s)
    print("Greedy :", vi_greedy)
    print("Beam   :", vi_beam)
    # Visual separator between sentences.
    print("-" * 60)


EN     : I really like natural language processing.
Greedy : t√¥i th·ª±c s·ª± th√≠ch qu√° tr√¨nh x·ª≠ l√Ω ng√¥n ng·ªØ t·ª± nhi√™n .
Beam   : t√¥i r·∫•t th√≠ch qu√° tr√¨nh x·ª≠ l√Ω ng√¥n ng·ªØ t·ª± nhi√™n .
------------------------------------------------------------
EN     : This is a small machine translation model.
Greedy : ƒë√¢y l√† m·ªôt m√¥ h√¨nh d·ªãch thu·∫≠t nh·ªè .
Beam   : ƒë√¢y l√† m·ªôt m√¥ h√¨nh d·ªãch thu·∫≠t nh·ªè .
------------------------------------------------------------
EN     : Thank you for your help.
Greedy : c·∫£m ∆°n s·ª± gi√∫p ƒë·ª° c·ªßa c√°c b·∫°n .
Beam   : c·∫£m ∆°n s·ª± gi√∫p ƒë·ª° c·ªßa c√°c b·∫°n .
------------------------------------------------------------
EN     : I am studying machine learning at the university.
Greedy : t√¥i ƒëang nghi√™n c·ª©u m√°y m√≥c h·ªçc t·∫°i tr∆∞·ªùng ƒë·∫°i h·ªçc .
Beam   : t√¥i ƒëang nghi√™n c·ª©u m√°y m√≥c h·ªçc t·∫°i tr∆∞·ªùng ƒë·∫°i h·ªçc .
------------------------------------------------------------
EN     : I don't 

In [11]:
from torch.utils.data import DataLoader

# Test split, built with the same tokenizer and length limits as train/valid.
test_dataset = NMTDataset(
    data_dir=str(PROCESSED_DIR),
    split="test",
    tokenizer=tokenizer,
    max_src_len=MAX_SRC_LEN,
    max_tgt_len=MAX_TGT_LEN,
)

# Unshuffled loader with batch size 32; the lambda forwards the
# notebook-level pad_id to collate_fn.
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=lambda batch: collate_fn(batch, pad_id=pad_id),
)

print("Test size:", len(test_dataset))


Test size: 1262


In [12]:
!pip install -q sacrebleu
import sacrebleu


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
from tqdm.auto import tqdm

@torch.no_grad()
def compute_bleu(
    model,
    tokenizer,
    loader,
    max_len=70,
    max_sentences=None,
    use_beam=False,
    beam_size=4,
    length_penalty=1.0,
):
    """
    Translate the sentences in `loader` and return the corpus BLEU score.

    For every row, the source ids are decoded back to text (padding and EOS
    removed), translated with greedy or beam search, and paired with the
    reference decoded from "tgt_out_ids" (padding and EOS removed).

    Args:
        model, tokenizer: forwarded to the translate functions.
        loader: yields dicts with "src_ids" and "tgt_out_ids".
        max_len: maximum number of generated tokens per sentence.
        max_sentences: stop after this many sentences (None = use all).
        use_beam: True -> beam_translate, False -> greedy_translate.
        beam_size, length_penalty: used only when use_beam is True.

    Returns:
        The `.score` of sacrebleu.corpus_bleu over all hypotheses/references.
    """

    def strip_special(ids):
        # Remove padding and EOS ids before turning ids back into text.
        return [tid for tid in ids if tid not in (pad_id, eos_id)]

    def translate(text):
        # Dispatch to the selected decoding strategy.
        if use_beam:
            return beam_translate(
                model,
                tokenizer,
                text,
                max_len=max_len,
                beam_size=beam_size,
                length_penalty=length_penalty,
            )
        return greedy_translate(
            model,
            tokenizer,
            text,
            max_len=max_len,
        )

    def limit_reached(done):
        return (max_sentences is not None) and (done >= max_sentences)

    model.eval()
    references = []
    hypotheses = []
    n_done = 0

    for batch in tqdm(loader, desc="BLEU decoding"):
        src_ids = batch["src_ids"]
        tgt_out_ids = batch["tgt_out_ids"]

        for row in range(src_ids.size(0)):
            if limit_reached(n_done):
                break

            # --- rebuild the EN input text from its ids ---
            src_text = tokenizer.decode(strip_special(src_ids[row].tolist()))

            # --- greedy or beam hypothesis ---
            hyp = translate(src_text)

            # --- VI reference ---
            ref_text = tokenizer.decode(strip_special(tgt_out_ids[row].tolist()))

            hypotheses.append(hyp)
            references.append(ref_text)
            n_done += 1

        # Leave the batch loop as well once the sentence limit is hit.
        if limit_reached(n_done):
            break

    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    return bleu.score


In [14]:
# Corpus BLEU of the loaded eval_model on the full test set with greedy decoding.
bleu_greedy = compute_bleu(
    eval_model,
    tokenizer,
    test_loader,
    max_len=MAX_TGT_LEN,
    max_sentences=None,
    use_beam=False,       # or omit it, since the default is False
)
print("BLEU (greedy):", bleu_greedy)


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU (greedy): 29.78482464599802


In [15]:
# Corpus BLEU of the loaded eval_model on the full test set with beam search
# (beam size 5, length penalty 1.2 — the values echoed in the print label).
bleu_beam = compute_bleu(
    eval_model,
    tokenizer,
    test_loader,
    max_len=MAX_TGT_LEN,
    max_sentences=None,
    use_beam=True,
    beam_size=5,
    length_penalty=1.2,
)
print("BLEU (beam=5, lp=1.2):", bleu_beam)


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU (beam=5, lp=1.2): 30.59901819145823
