In [None]:
from pathlib import Path
import sys, math, time

import torch
from torch.utils.data import DataLoader

CODE_DIR = Path("/kaggle/input/envi-nmt-code/src")
DATA_ROOT = Path("/kaggle/input/envi-nmt-data/data")
PROCESSED_DIR = DATA_ROOT / "processed"
SPM_MODEL = Path("/kaggle/input/envi-nmt-data/spm/spm_unigram.model")

print("CODE_DIR:", CODE_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)
print("SPM_MODEL:", SPM_MODEL)

sys.path.append(str(CODE_DIR))

from tokenizer import SubwordTokenizer
from dataset import NMTDataset, collate_fn
from model import Transformer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


CODE_DIR: /kaggle/input/envi-nmt-code/src
PROCESSED_DIR: /kaggle/input/envi-nmt-data/data/processed
SPM_MODEL: /kaggle/input/envi-nmt-data/spm/spm_unigram.model
Device: cuda


In [None]:
from pathlib import Path
import shutil

DATASET_ROOT = Path("/kaggle/input/envi-nmt-data")  
OUT_DIR = Path("/kaggle/working/processed_norm")
OUT_DIR.mkdir(parents=True, exist_ok=True)

SPLITS = ["train", "valid", "val", "dev", "test"]

PAIR_CANDIDATES = [
    ("en", "vi"),
    ("src", "tgt"),
    ("source", "target"),
]

def find_file(split, ext):
    patterns = [
        f"{split}.{ext}",
        f"{split}.{ext}.txt",
        f"{split}_{ext}.txt",
        f"{split}-{ext}.txt",
        f"{split}.{ext}.tok",
        f"{split}.{ext}.bpe",
    ]
    for pat in patterns:
        hits = list(DATASET_ROOT.rglob(pat))
        if hits:
            hits_sorted = sorted(hits, key=lambda p: (("processed" not in str(p).lower()), len(str(p))))
            return hits_sorted[0]
    return None

def normalize_split(split_src, split_dst):
    for a, b in PAIR_CANDIDATES:
        fa = find_file(split_src, a)
        fb = find_file(split_src, b)
        if fa and fb:
            out_en = OUT_DIR / f"{split_dst}.en"
            out_vi = OUT_DIR / f"{split_dst}.vi"

            src_path = fa
            tgt_path = fb

            shutil.copyfile(src_path, out_en)
            shutil.copyfile(tgt_path, out_vi)

            print(f"[OK] {split_dst}:")
            print(f"  src -> {out_en}  (from {src_path})")
            print(f"  tgt -> {out_vi}  (from {tgt_path})")
            return True
    print(f"[MISS] Kh√¥ng t√¨m th·∫•y c·∫∑p file cho split='{split_src}'")
    return False

done_train = normalize_split("train", "train")

done_valid = False
for s in ["valid", "val", "dev"]:
    if normalize_split(s, "valid"):
        done_valid = True
        break

done_test = normalize_split("test", "test")

print("\n== Files in OUT_DIR ==")
for p in sorted(OUT_DIR.glob("*")):
    print(p.name, "-", p.stat().st_size, "bytes")

PROCESSED_DIR = OUT_DIR
print("\nPROCESSED_DIR =", PROCESSED_DIR)


[OK] train:
  src -> /kaggle/working/processed_norm/train.en  (from /kaggle/input/envi-nmt-data/processed/train.en)
  tgt -> /kaggle/working/processed_norm/train.vi  (from /kaggle/input/envi-nmt-data/processed/train.vi)
[OK] valid:
  src -> /kaggle/working/processed_norm/valid.en  (from /kaggle/input/envi-nmt-data/processed/valid.en)
  tgt -> /kaggle/working/processed_norm/valid.vi  (from /kaggle/input/envi-nmt-data/processed/valid.vi)
[OK] test:
  src -> /kaggle/working/processed_norm/test.en  (from /kaggle/input/envi-nmt-data/processed/test.en)
  tgt -> /kaggle/working/processed_norm/test.vi  (from /kaggle/input/envi-nmt-data/processed/test.vi)

== Files in OUT_DIR ==
test.en - 131113 bytes
test.vi - 181615 bytes
train.en - 44018207 bytes
train.vi - 58823626 bytes
valid.en - 140532 bytes
valid.vi - 188066 bytes

PROCESSED_DIR = /kaggle/working/processed_norm


In [None]:
tokenizer = SubwordTokenizer(str(SPM_MODEL))

vocab_size = tokenizer.sp.vocab_size()
pad_id = tokenizer.pad_id
bos_id = tokenizer.bos_id
eos_id = tokenizer.eos_id

print("Vocab size:", vocab_size)
print("pad/bos/eos:", pad_id, bos_id, eos_id)

MAX_SRC_LEN = 70
MAX_TGT_LEN = 70
BATCH_SIZE = 64  

train_dataset = NMTDataset(
    data_dir=str(PROCESSED_DIR),
    split="train",
    tokenizer=tokenizer,
    max_src_len=MAX_SRC_LEN,
    max_tgt_len=MAX_TGT_LEN,
)

valid_dataset = NMTDataset(
    data_dir=str(PROCESSED_DIR),
    split="valid",
    tokenizer=tokenizer,
    max_src_len=MAX_SRC_LEN,
    max_tgt_len=MAX_TGT_LEN,
)

print("Train size:", len(train_dataset))
print("Valid size:", len(valid_dataset))

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda batch: collate_fn(batch, pad_id=pad_id),
)

valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=lambda batch: collate_fn(batch, pad_id=pad_id),
)


batch = next(iter(train_loader))
for k, v in batch.items():
    print(k, v.shape, v.dtype)


Vocab size: 8000
pad/bos/eos: 0 2 3
Train size: 449692
Valid size: 1550
src_ids torch.Size([64, 70]) torch.int64
tgt_in_ids torch.Size([64, 62]) torch.int64
tgt_out_ids torch.Size([64, 62]) torch.int64
src_padding_mask torch.Size([64, 70]) torch.bool
tgt_padding_mask torch.Size([64, 62]) torch.bool


In [4]:
train_dataset = NMTDataset(
    data_dir=str(PROCESSED_DIR),
    split="train",
    tokenizer=tokenizer,
    max_src_len=MAX_SRC_LEN,
    max_tgt_len=MAX_TGT_LEN,
)

valid_dataset = NMTDataset(
    data_dir=str(PROCESSED_DIR),
    split="valid",
    tokenizer=tokenizer,
    max_src_len=MAX_SRC_LEN,
    max_tgt_len=MAX_TGT_LEN,
)

In [None]:
def make_src_mask(src_ids: torch.Tensor, pad_id: int):
    """
    src_ids: (B, S)
    Tr·∫£ v·ªÅ mask shape (B, 1, 1, S), 1 = kh√¥ng b·ªã mask, 0 = b·ªã che
    """
    mask = (src_ids != pad_id).unsqueeze(1).unsqueeze(2)  
    return mask  


def make_tgt_mask(tgt_ids: torch.Tensor, pad_id: int):
    """
    tgt_ids: (B, T) = input cho decoder (BOS, w1, w2, ...)
    Tr·∫£ v·ªÅ mask shape (B, 1, T, T)
    """
    B, T = tgt_ids.shape

    pad_mask = (tgt_ids != pad_id).unsqueeze(1).unsqueeze(2)  

    nopeak = torch.tril(torch.ones((T, T), device=tgt_ids.device)).bool()  
    nopeak = nopeak.unsqueeze(0).unsqueeze(1)  

    combined = pad_mask & nopeak 
    return combined


def make_masks(src, tgt_in, pad_id):
    src_mask = (src != pad_id).unsqueeze(1).unsqueeze(2)

    tgt_pad_mask = (tgt_in != pad_id).unsqueeze(1).unsqueeze(2)
    tgt_len = tgt_in.size(1)
    nopeak_mask = torch.tril(
        torch.ones((1, 1, tgt_len, tgt_len), device=tgt_in.device)
    ).bool()

    tgt_mask = tgt_pad_mask & nopeak_mask
    return src_mask, tgt_mask

d_model = 512
n_layers = 4       
n_heads = 8
d_ff = 2048
dropout = 0.1
max_len = max(MAX_SRC_LEN, MAX_TGT_LEN)

model = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=d_model,
    n_layers=n_layers,
    n_heads=n_heads,
    d_ff=d_ff,
    dropout=dropout,
    max_len=max_len,
).to(device)

criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.98), eps=1e-9)

print("Model params:", sum(p.numel() for p in model.parameters()) / 1e6, "M")


Model params: 41.723712 M


In [None]:
import math
import torch

def train_one_epoch(model, dataloader, optimizer, criterion, pad_id, device,
                    clip=1.0, scaler=None, scheduler=None):
    model.train()
    total_loss = 0.0
    total_tokens = 0

    for batch in dataloader:
        src = batch["src_ids"].to(device)
        tgt_in = batch["tgt_in_ids"].to(device)
        tgt_out = batch["tgt_out_ids"].to(device)

        src_mask, tgt_mask = make_masks(src, tgt_in, pad_id)

        optimizer.zero_grad(set_to_none=True)

        if scaler is not None:
            with torch.cuda.amp.autocast():
                logits = model(src, tgt_in, src_mask, tgt_mask)
                loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(src, tgt_in, src_mask, tgt_mask)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

        if scheduler is not None:
            scheduler.step()

        n_tok = (tgt_out != pad_id).sum().item()
        total_tokens += n_tok
        total_loss += loss.item() * max(n_tok, 1)  
    avg_loss = total_loss / max(total_tokens, 1)
    ppl = math.exp(min(avg_loss, 20))
    return avg_loss, ppl


@torch.no_grad()
def evaluate(model, dataloader, criterion, pad_id, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    for batch in dataloader:
        src = batch["src_ids"].to(device)
        tgt_in = batch["tgt_in_ids"].to(device)
        tgt_out = batch["tgt_out_ids"].to(device)

        src_mask, tgt_mask = make_masks(src, tgt_in, pad_id)

        logits = model(src, tgt_in, src_mask, tgt_mask)
        loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))

        n_tok = (tgt_out != pad_id).sum().item()
        total_tokens += n_tok
        total_loss += loss.item() * max(n_tok, 1)

    avg_loss = total_loss / max(total_tokens, 1)
    ppl = math.exp(min(avg_loss, 20))
    return avg_loss, ppl


In [7]:
batch = next(iter(train_loader))
src = batch["src_ids"].to(device)
tgt_in = batch["tgt_in_ids"].to(device)
src_mask, tgt_mask = make_masks(src, tgt_in, pad_id)
print("src", src.shape, "src_mask", src_mask.shape)
print("tgt_in", tgt_in.shape, "tgt_mask", tgt_mask.shape)
print("mask dtype", src_mask.dtype, tgt_mask.dtype)


src torch.Size([64, 63]) src_mask torch.Size([64, 1, 1, 63])
tgt_in torch.Size([64, 66]) tgt_mask torch.Size([64, 1, 66, 66])
mask dtype torch.bool torch.bool


In [None]:
import math
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from dataset import NMTDataset, collate_fn
from tokenizer import SubwordTokenizer
from model import Transformer

D_MODEL = 384
N_LAYERS = 4
N_HEADS  = 8
D_FF     = 1536
DROPOUT  = 0.1
MAX_LEN  = 5000

N_EPOCHS = 30
WARMUP_STEPS = 4000

print("V3 config:",
      f"d_model={D_MODEL}, n_layers={N_LAYERS}, n_heads={N_HEADS}, d_ff={D_FF}, dropout={DROPOUT}")

model_v3 = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    d_ff=D_FF,
    dropout=DROPOUT,
    max_len=MAX_LEN,
).to(device)

criterion_v3 = nn.CrossEntropyLoss(
    ignore_index=pad_id,
    label_smoothing=0.1,
)

optimizer_v3 = optim.Adam(
    model_v3.parameters(),
    lr=1.0,  
    betas=(0.9, 0.98),
    eps=1e-9,
)

class NoamLR(torch.optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, d_model, warmup_steps=4000, last_epoch=-1):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        step = max(self.last_epoch + 1, 1)
        scale = (self.d_model ** -0.5) * min(
            step ** -0.5, step * (self.warmup_steps ** -1.5)
        )
        return [base_lr * scale for base_lr in self.base_lrs]

scheduler_v3 = NoamLR(optimizer_v3, d_model=D_MODEL, warmup_steps=WARMUP_STEPS)


def run_epoch_v3(model, dataloader, optimizer, scheduler, criterion, device, is_train=True):
    if is_train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0

    for batch in dataloader:
        src = batch["src_ids"].to(device)
        tgt_in = batch["tgt_in_ids"].to(device)
        tgt_out = batch["tgt_out_ids"].to(device)

        src_mask, tgt_mask = make_masks(src, tgt_in, pad_id)

        if is_train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(is_train):
            logits = model(src, tgt_in, src_mask, tgt_mask)
            loss = criterion(
                logits.view(-1, logits.size(-1)),
                tgt_out.view(-1)
            )

            if is_train:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

best_valid_loss_v3 = float("inf")
best_v3_path = "/kaggle/working/best_transformer_v3.pt"

print("\nüöÄ START TRAINING V3")
for epoch in range(1, N_EPOCHS + 1):
    start_time = time.time()

    train_loss = run_epoch_v3(
        model_v3, train_loader, optimizer_v3, scheduler_v3, criterion_v3,
        device, is_train=True
    )
    valid_loss = run_epoch_v3(
        model_v3, valid_loader, optimizer_v3, scheduler_v3, criterion_v3,
        device, is_train=False
    )

    train_ppl = math.exp(train_loss) if train_loss < 20 else float("inf")
    valid_ppl = math.exp(valid_loss) if valid_loss < 20 else float("inf")

    mins = (time.time() - start_time) / 60
    print(f"Epoch {epoch:02d} | Time: {mins:.2f} min")
    print(f"  Train Loss: {train_loss:.4f} | PPL: {train_ppl:.2f}")
    print(f"  Valid Loss: {valid_loss:.4f} | PPL: {valid_ppl:.2f}")

    if valid_loss < best_valid_loss_v3:
        best_valid_loss_v3 = valid_loss
        torch.save(model_v3.state_dict(), best_v3_path)
        print(f"  ‚úÖ Saved new best V3 to {best_v3_path}")



V3 config: d_model=384, n_layers=4, n_heads=8, d_ff=1536, dropout=0.1

üöÄ START TRAINING V3
Epoch 01 | Time: 14.43 min
  Train Loss: 4.6259 | PPL: 102.10
  Valid Loss: 3.4500 | PPL: 31.50
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 02 | Time: 14.40 min
  Train Loss: 3.3890 | PPL: 29.64
  Valid Loss: 3.1285 | PPL: 22.84
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 03 | Time: 14.42 min
  Train Loss: 3.1550 | PPL: 23.45
  Valid Loss: 2.9885 | PPL: 19.86
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 04 | Time: 14.43 min
  Train Loss: 3.0390 | PPL: 20.88
  Valid Loss: 2.9022 | PPL: 18.21
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 05 | Time: 14.42 min
  Train Loss: 2.9636 | PPL: 19.37
  Valid Loss: 2.8484 | PPL: 17.26
  ‚úÖ Saved new best V3 to /kaggle/working/best_transformer_v3.pt
Epoch 06 | Time: 14.40 min
  Train Loss: 2.9083 | PPL: 18.32
  Valid Loss: 2.8118 | PPL: 16.64
  ‚úÖ

In [None]:
ckpt_v3 = "/kaggle/working/best_transformer_v3.pt"

D_MODEL  = 384
N_LAYERS = 4
N_HEADS  = 8
D_FF     = 1536
DROPOUT  = 0.1
MAX_LEN  = 5000

ft_v3 = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    d_ff=D_FF,
    dropout=DROPOUT,
    max_len=MAX_LEN,
).to(device)

ft_v3.load_state_dict(torch.load(ckpt_v3, map_location=device))
ft_v3.eval()
print("Loaded V3 for extra fine-tune.")

criterion_ft = nn.CrossEntropyLoss(ignore_index=pad_id, label_smoothing=0.1)
optimizer_ft = optim.Adam(ft_v3.parameters(), lr=5e-5, betas=(0.9, 0.98), eps=1e-9)

N_EPOCHS_FT_V3 = 3
best_valid_v3_ft = float("inf")
save_v3_ft = "/kaggle/working/best_transformer_v3_ft.pt"

for epoch in range(1, N_EPOCHS_FT_V3 + 1):
    start = time.time()
    train_loss, train_ppl = train_one_epoch(ft_v3, train_loader, optimizer_ft, criterion_ft, pad_id, device)
    valid_loss, valid_ppl = evaluate(ft_v3, valid_loader, criterion_ft, pad_id, device)
    elapsed = (time.time() - start) / 60

    print(f"[V3_FT] Epoch {epoch:02d} | Time: {elapsed:.2f} min")
    print(f"  Train Loss: {train_loss:.4f} | PPL: {train_ppl:.2f}")
    print(f"  Valid Loss: {valid_loss:.4f} | PPL: {valid_ppl:.2f}")

    if valid_loss < best_valid_v3_ft:
        best_valid_v3_ft = valid_loss
        torch.save(ft_v3.state_dict(), save_v3_ft)
        print(f"  ‚úÖ Saved new best V3_FT to {save_v3_ft}")


Loaded V3 for extra fine-tune.
[V3_FT] Epoch 01 | Time: 14.40 min
  Train Loss: 2.5013 | PPL: 12.20
  Valid Loss: 2.4871 | PPL: 12.03
  ‚úÖ Saved new best V3_FT to /kaggle/working/best_transformer_v3_ft.pt
[V3_FT] Epoch 02 | Time: 14.42 min
  Train Loss: 2.4891 | PPL: 12.05
  Valid Loss: 2.4822 | PPL: 11.97
  ‚úÖ Saved new best V3_FT to /kaggle/working/best_transformer_v3_ft.pt
[V3_FT] Epoch 03 | Time: 14.40 min
  Train Loss: 2.4832 | PPL: 11.98
  Valid Loss: 2.4751 | PPL: 11.88
  ‚úÖ Saved new best V3_FT to /kaggle/working/best_transformer_v3_ft.pt


In [None]:
def load_version(version="v2_ft"):
    global eval_model

    if version == "v1":
        ckpt = torch.load("/kaggle/working/best_transformer_v1.pt", map_location=device)
        d_model  = ckpt.get("d_model", 256)
        n_layers = ckpt.get("n_layers", 3)
        n_heads  = ckpt.get("n_heads", 4)
        d_ff     = ckpt.get("d_ff", 1024)
        dropout  = ckpt.get("dropout", 0.1)
        max_len  = ckpt.get("max_len", 5000)

        eval_model = Transformer(
            vocab_size, vocab_size,
            d_model, n_layers, n_heads, d_ff, dropout, max_len
        ).to(device)
        eval_model.load_state_dict(ckpt["model_state_dict"])

    elif version == "v2":
        path = "/kaggle/working/best_transformer_v2.pt"
        D_MODEL, N_LAYERS, N_HEADS, D_FF, DROPOUT, MAX_LEN = 256, 3, 4, 1024, 0.1, 5000
        eval_model = Transformer(
            vocab_size, vocab_size,
            D_MODEL, N_LAYERS, N_HEADS, D_FF, DROPOUT, MAX_LEN
        ).to(device)
        eval_model.load_state_dict(torch.load(path, map_location=device))

    elif version == "v2_ft":
        path = "/kaggle/working/best_transformer_v2_ft.pt"
        D_MODEL, N_LAYERS, N_HEADS, D_FF, DROPOUT, MAX_LEN = 256, 3, 4, 1024, 0.1, 5000
        eval_model = Transformer(
            vocab_size, vocab_size,
            D_MODEL, N_LAYERS, N_HEADS, D_FF, DROPOUT, MAX_LEN
        ).to(device)
        eval_model.load_state_dict(torch.load(path, map_location=device))

    elif version == "v3":
        path = "/kaggle/working/best_transformer_v3.pt"
        D_MODEL, N_LAYERS, N_HEADS, D_FF, DROPOUT, MAX_LEN = 384, 4, 8, 1536, 0.1, 5000
        eval_model = Transformer(
            vocab_size, vocab_size,
            D_MODEL, N_LAYERS, N_HEADS, D_FF, DROPOUT, MAX_LEN
        ).to(device)
        eval_model.load_state_dict(torch.load(path, map_location=device))

    elif version == "v3_ft":
        path = "/kaggle/working/best_transformer_v3_ft.pt"
        D_MODEL, N_LAYERS, N_HEADS, D_FF, DROPOUT, MAX_LEN = 384, 4, 8, 1536, 0.1, 5000
        eval_model = Transformer(
            vocab_size, vocab_size,
            D_MODEL, N_LAYERS, N_HEADS, D_FF, DROPOUT, MAX_LEN
        ).to(device)
        eval_model.load_state_dict(torch.load(path, map_location=device))

    else:
        raise ValueError("version ph·∫£i l√† 'v1', 'v2', 'v2_ft', 'v3' ho·∫∑c 'v3_ft'.")

    eval_model.eval()
    print(f"‚úÖ Loaded {version} model.")

    


load_version("v3_ft")   


‚úÖ Loaded v3_ft model.


In [None]:
import torch
import re

@torch.no_grad()
def greedy_translate(
    model,
    tokenizer,
    src_sentence: str,
    max_len: int = 70,
    max_src_len: int = MAX_SRC_LEN,
) -> None: 
    """
    D·ªãch 1 c√¢u EN -> VI b·∫±ng greedy search.
    """

    model.eval()

    src_clean_str = src_sentence.strip().lower()

    src_ids = tokenizer.encode_src(src_clean_str, add_bos=False, add_eos=True)
    if len(src_ids) > max_src_len:
        src_ids = src_ids[:max_src_len]

    src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)
    src_mask = make_src_mask(src, pad_id)
    enc_output = model.encode(src, src_mask)

    tgt_ids = [bos_id]
    for _ in range(max_len):
        tgt = torch.tensor(tgt_ids, dtype=torch.long, device=device).unsqueeze(0)
        tgt_mask = make_tgt_mask(tgt, pad_id)

        dec_output = model.decode(tgt, enc_output, src_mask, tgt_mask)
        logits = model.projection(dec_output)
        next_token = logits[:, -1, :].argmax(dim=-1).item()
        tgt_ids.append(next_token)
        if next_token == eos_id:
            break

    out_ids = tgt_ids[1:]
    if out_ids and out_ids[-1] == eos_id:
        out_ids = out_ids[:-1]

    translation = tokenizer.decode(out_ids)
    translation = translation.replace("ÔøΩ", "").replace("‚Åá", "").strip()
    translation = " ".join(translation.split())

    return translation


In [None]:
import torch
import torch.nn.functional as F
import math
import random

@torch.no_grad()
def beam_translate(
    model,
    tokenizer,
    src_sentence: str,
    max_len: int = 70,
    max_src_len: int = MAX_SRC_LEN,
    beam_size: int = 4,
    length_penalty: float = 1.0,  
):
    """
    Beam Search decode cho 1 c√¢u EN -> VI.
    """

    model.eval()

    src_clean_str = src_sentence.strip().lower()

    src_ids = tokenizer.encode_src(src_clean_str, add_bos=False, add_eos=True)
    if len(src_ids) > max_src_len:
        src_ids = src_ids[:max_src_len]

    src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)  
    src_mask = make_src_mask(src, pad_id)
    enc_output = model.encode(src, src_mask)


    beams = [([bos_id], 0.0, False)]

    for step in range(max_len):
        new_beams = []

        for tokens, log_prob, finished in beams:
            if finished:
                new_beams.append((tokens, log_prob, True))
                continue

            tgt = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)  
            tgt_mask = make_tgt_mask(tgt, pad_id)

            dec_out = model.decode(tgt, enc_output, src_mask, tgt_mask)  
            logits = model.projection(dec_out)                           
            next_log_probs = F.log_softmax(logits[:, -1, :], dim=-1)     

            topk_log_probs, topk_ids = torch.topk(next_log_probs, beam_size, dim=-1)

            for k in range(beam_size):
                token_id = topk_ids[0, k].item()
                token_lp = topk_log_probs[0, k].item()
                new_tokens = tokens + [token_id]
                new_log_prob = log_prob + token_lp
                new_finished = (token_id == eos_id)
                new_beams.append((new_tokens, new_log_prob, new_finished))

        beams = sorted(
            new_beams,
            key=lambda x: x[1] / (len(x[0]) ** length_penalty),
            reverse=True
        )[:beam_size]

        if all(f for _, _, f in beams):
            break

    best_tokens, best_log_prob, finished = max(
        beams,
        key=lambda x: x[1] / (len(x[0]) ** length_penalty)
    )

    out_ids = best_tokens[1:]
    if out_ids and out_ids[-1] == eos_id:
        out_ids = out_ids[:-1]

    translation = tokenizer.decode(out_ids)

    translation = translation.replace("ÔøΩ", "").replace("‚Åá", "").strip()
    translation = " ".join(translation.split())
    return translation


In [None]:
sentences = [
    "I really like natural language processing.",
    "This is a small machine translation model.",
    "Thank you for your help.",

    "I am studying machine learning at the university.",
    "I don't understand this sentence very well.",
    "I will try to improve the translation quality.",

    "We are working on a neural machine translation project.",
    "We need more training data to get better results.",
    "We will present our model in the final report.",

    "You can run the code on Kaggle with a GPU.",
    "You should compare the greedy and beam search outputs.",

    "He likes to read research papers about deep learning.",
    "She is preparing a presentation about transformers.",
    "They want to build a better translation system.",

    "When I first learned about attention, I was very confused.",
    "Even if the BLEU score is not very high, the model can still be useful.",
    "If we have more time, we will try a larger transformer model.",
]


In [14]:
for s in sentences:
    vi_greedy = greedy_translate(eval_model, tokenizer, s, max_len=MAX_TGT_LEN)
    vi_beam   = beam_translate(eval_model, tokenizer, s, max_len=MAX_TGT_LEN, beam_size=4)

    print("EN     :", s)
    print("Greedy :", vi_greedy)
    print("Beam   :", vi_beam)
    print("-" * 60)


EN     : I really like natural language processing.
Greedy : t√¥i th·ª±c s·ª± th√≠ch qu√° tr√¨nh x·ª≠ l√Ω ng√¥n ng·ªØ t·ª± nhi√™n.
Beam   : t√¥i th·ª±c s·ª± th√≠ch qu√° tr√¨nh x·ª≠ l√Ω ng√¥n ng·ªØ t·ª± nhi√™n.
------------------------------------------------------------
EN     : This is a small machine translation model.
Greedy : ƒë√¢y l√† m√¥ h√¨nh d·ªãch nh·ªè c·ªßa m√°y.
Beam   : ƒë√¢y l√† m·ªôt m√¥ h√¨nh d·ªãch nh·ªè.
------------------------------------------------------------
EN     : Thank you for your help.
Greedy : c·∫£m ∆°n s·ª± gi√∫p ƒë·ª° c·ªßa c√°c b·∫°n.
Beam   : c·∫£m ∆°n s·ª± gi√∫p ƒë·ª° c·ªßa c√°c b·∫°n.
------------------------------------------------------------
EN     : I am studying machine learning at the university.
Greedy : t√¥i ƒëang h·ªçc m√°y ·ªü tr∆∞·ªùng ƒë·∫°i h·ªçc.
Beam   : t√¥i ƒëang nghi√™n c·ª©u m√°y m√≥c t·∫°i tr∆∞·ªùng ƒë·∫°i h·ªçc.
------------------------------------------------------------
EN     : I don't understand this sentence very well.
Gree

In [15]:
from torch.utils.data import DataLoader

test_dataset = NMTDataset(
    data_dir=str(PROCESSED_DIR),
    split="test",
    tokenizer=tokenizer,
    max_src_len=MAX_SRC_LEN,
    max_tgt_len=MAX_TGT_LEN,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=lambda batch: collate_fn(batch, pad_id=pad_id),
)

print("Test size:", len(test_dataset))


Test size: 1262


In [16]:
!pip install -q sacrebleu
import sacrebleu


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from tqdm.auto import tqdm

@torch.no_grad()
def compute_bleu(
    model,
    tokenizer,
    loader,
    max_len=70,
    max_sentences=None,
    use_beam=False,
    beam_size=4,
    length_penalty=1.0,
):
    model.eval()
    all_refs = []
    all_hyps = []
    count = 0

    for batch in tqdm(loader, desc="BLEU decoding"):
        src_ids = batch["src_ids"]
        tgt_out_ids = batch["tgt_out_ids"]
        B = src_ids.size(0)

        for i in range(B):
            if (max_sentences is not None) and (count >= max_sentences):
                break

            src_seq = src_ids[i].tolist()
            src_clean = [tid for tid in src_seq if tid not in (pad_id, eos_id)]
            src_text = tokenizer.decode(src_clean)

            if use_beam:
                hyp = beam_translate(
                    model,
                    tokenizer,
                    src_text,
                    max_len=max_len,
                    beam_size=beam_size,
                    length_penalty=length_penalty,
                )
            else:
                hyp = greedy_translate(
                    model,
                    tokenizer,
                    src_text,
                    max_len=max_len,
                )

            tgt_seq = tgt_out_ids[i].tolist()
            tgt_clean = [tid for tid in tgt_seq if tid not in (pad_id, eos_id)]
            ref_text = tokenizer.decode(tgt_clean)

            all_hyps.append(hyp)
            all_refs.append(ref_text)
            count += 1

        if (max_sentences is not None) and (count >= max_sentences):
            break

    bleu = sacrebleu.corpus_bleu(all_hyps, [all_refs])
    return bleu.score


In [None]:
bleu_greedy = compute_bleu(
    eval_model,
    tokenizer,
    test_loader,
    max_len=MAX_TGT_LEN,
    max_sentences=None,
    use_beam=False,       
)
print("BLEU (greedy):", bleu_greedy)


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU (greedy): 36.64127507145566


In [19]:
bleu_beam = compute_bleu(
    eval_model,
    tokenizer,
    test_loader,
    max_len=MAX_TGT_LEN,
    max_sentences=None,
    use_beam=True,
    beam_size=4,
    length_penalty=1.0,
)
print("BLEU (beam=4, lp=1.0):", bleu_beam)


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU (beam=4, lp=1.0): 38.09516055648722


In [None]:
configs = [
    (3, 0.8),
    (4, 1.0),
    (5, 1.0),
    (5, 1.2),
    (6, 1.0),
]

for beam_size, lp in configs:
    bleu = compute_bleu(
        eval_model,
        tokenizer,
        test_loader,
        max_len=MAX_TGT_LEN,
        max_sentences=None,   
        use_beam=True,        
        beam_size=beam_size,
        length_penalty=lp,
    )
    print(f"V3_ft | beam={beam_size}, lp={lp} -> BLEU={bleu:.3f}")


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


V3_ft | beam=3, lp=0.8 -> BLEU=37.877


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


V3_ft | beam=4, lp=1.0 -> BLEU=38.095


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


V3_ft | beam=5, lp=1.0 -> BLEU=38.137


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


V3_ft | beam=5, lp=1.2 -> BLEU=38.187


BLEU decoding:   0%|          | 0/40 [00:00<?, ?it/s]

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


V3_ft | beam=6, lp=1.0 -> BLEU=38.221
