In [1]:
# %% [code] - Cài đặt và tải dữ liệu
!pip install spacy==3.8.2 torchtext==0.18.0 sacrebleu
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm


# import torch.nn as nn
# import torch.nn.functional as F

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import spacy
spacy_en = spacy.load("en_core_web_sm")
spacy_fr = spacy.load("fr_core_news_sm")

def tokenize_en(text):
    return [tok.text.lower() for tok in spacy_en.tokenizer(text)]

def tokenize_fr(text):
    return [tok.text.lower() for tok in spacy_fr.tokenizer(text)]

# Tải dữ liệu Multi30K
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.en.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.fr.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.en.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.fr.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/test_2016_flickr.en.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/test_2016_flickr.fr.gz

!gunzip -f *.gz

def load_lines(file):
    with open(file, encoding='utf-8') as f:
        return [line.strip() for line in f]

train_en = load_lines("train.en")
train_fr = load_lines("train.fr")
val_en   = load_lines("val.en")
val_fr   = load_lines("val.fr")
test_en  = load_lines("test_2016_flickr.en")
test_fr  = load_lines("test_2016_flickr.fr")

# %% [code] - Xây vocab (giới hạn 10k từ phổ biến nhất + 4 special tokens)
from collections import Counter

def build_vocab(sentences, tokenizer, max_size=10000):
    counter = Counter()
    for s in sentences:
        counter.update(tokenizer(s))

    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    for word, freq in counter.most_common(max_size - 4):
        vocab[word] = len(vocab)
    return vocab

SRC_vocab = build_vocab(train_en, tokenize_en)
TRG_vocab = build_vocab(train_fr, tokenize_fr)

print(f"EN vocab: {len(SRC_vocab):,} | FR vocab: {len(TRG_vocab):,}")

# %% [code] - Dataset + Collate_fn (sort + padding + packing)
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

class TranslationDataset(Dataset):
    def __init__(self, src_lines, trg_lines):
        self.src_lines = src_lines
        self.trg_lines = trg_lines

    def __len__(self):
        return len(self.src_lines)

    def __getitem__(self, idx):
        src = ["<sos>"] + tokenize_en(self.src_lines[idx]) + ["<eos>"]
        trg = ["<sos>"] + tokenize_fr(self.trg_lines[idx]) + ["<eos>"]

        src_ids = [SRC_vocab.get(t, SRC_vocab["<unk>"]) for t in src]
        trg_ids = [TRG_vocab.get(t, TRG_vocab["<unk>"]) for t in trg]

        return torch.tensor(src_ids), torch.tensor(trg_ids)

def collate_fn(batch):
    srcs, trgs = zip(*batch)
    src_lens = [len(s) for s in srcs]
    trg_lens = [len(t) for t in trgs]

    srcs_pad = pad_sequence(srcs, batch_first=True, padding_value=SRC_vocab["<pad>"])
    trgs_pad = pad_sequence(trgs, batch_first=True, padding_value=TRG_vocab["<pad>"])

    return srcs_pad, trgs_pad, src_lens, trg_lens

train_dataset = TranslationDataset(train_en, train_fr)
val_dataset   = TranslationDataset(val_en, val_fr)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset,   batch_size=64, shuffle=False, collate_fn=collate_fn)

# %% [code] - Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



Collecting spacy==3.8.2
  Downloading spacy-3.8.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting torchtext==0.18.0
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting langcodes<4.0.0,>=3.2.0 (from spacy==3.8.2)
  Downloading langcodes-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading spacy-3.8.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.8/31.8 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadi

In [3]:
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True)

    def forward(self, src, src_len):
        embedded = self.dropout(self.embedding(src))
        packed = pack_padded_sequence(embedded, src_len, batch_first=True, enforce_sorted=False)
        _, (h, c) = self.lstm(packed)
        return h, c



In [5]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True)
        self.fc = nn.Linear(hid_dim, vocab_size)

    def forward(self, input, h, c, lengths=None):
        embedded = self.dropout(self.embedding(input))
        if lengths is not None:
            embedded = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        output, (h, c) = self.lstm(embedded, (h, c))
        if lengths is not None:
            output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        logits = self.fc(output)
        return logits, h, c

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_len, trg, trg_len, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_vocab_size = len(TRG_vocab)
        max_len = trg.size(1)

        outputs = torch.zeros(batch_size, max_len-1, trg_vocab_size).to(src.device)

        h, c = self.encoder(src, src_len)

        input = trg[:, 0].unsqueeze(1)  # <sos>

        for t in range(1, max_len):
            logit, h, c = self.decoder(input, h, c)
            outputs[:, t-1] = logit.squeeze(1)

            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            # thay đổi
            top1 = logit.argmax(-1).squeeze(1)   # [batch]
            input = trg[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)


        return outputs

# %% [code] - Khởi tạo model
enc = Encoder(len(SRC_vocab), emb_dim=256, hid_dim=512, dropout=0.3)
dec = Decoder(len(TRG_vocab), emb_dim=256, hid_dim=512, dropout=0.3)
model = Seq2Seq(enc, dec).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_vocab["<pad>"])

# %% [code] - Training loop + Early stopping
import copy
best_val_loss = float('inf')
best_model = None
patience = 3
no_improve = 0

for epoch in range(1, 31):
    model.train()
    train_loss = 0
    for src, trg, src_len, trg_len in train_loader:
        src, trg = src.to(device), trg.to(device)
        src_len = torch.tensor(src_len)
        trg_len = torch.tensor(trg_len)

        optimizer.zero_grad()
        output = model(src, src_len, trg, trg_len, teacher_forcing_ratio=0.5)

        loss = criterion(output.reshape(-1, output.size(-1)), trg[:,1:].reshape(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, trg, src_len, trg_len in val_loader:
            src, trg = src.to(device), trg.to(device)
            src_len = torch.tensor(src_len)
            output = model(src, src_len, trg, trg_len, teacher_forcing_ratio=0.0)
            loss = criterion(output.reshape(-1, output.size(-1)), trg[:,1:].reshape(-1))
            val_loss += loss.item()

    val_loss /= len(val_loader)
    print(f"Epoch {epoch:02d} | Train loss: {train_loss/len(train_loader):.4f} | Val loss: {val_loss:.4f}")

    # Early stopping + save best
    if val_loss < best_val_loss - 1e-4:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model.state_dict())
        torch.save(best_model, "best_model.pt")
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping!")
            break

model.load_state_dict(torch.load("best_model.pt"))

# %% [code] - Inference (greedy)
def translate(sentence):
    model.eval()
    tokens = ["<sos>"] + tokenize_en(sentence) + ["<eos>"]
    src = torch.tensor([SRC_vocab.get(t, SRC_vocab["<unk>"]) for t in tokens]).unsqueeze(0).to(device)
    src_len = [len(tokens)]

    with torch.no_grad():
        h, c = model.encoder(src, src_len)

    input = torch.tensor([[TRG_vocab["<sos>"]]]).to(device)
    result = []
    for _ in range(50):
        with torch.no_grad():
            logit, h, c = model.decoder(input, h, c)
            pred = logit.argmax(-1).item()
        word = [k for k, v in TRG_vocab.items() if v == pred][0]
        if word == "<eos>":
            break
        result.append(word)
        input = torch.tensor([[pred]]).to(device)

    return " ".join(result)

# Test một câu
print(translate("A man is playing a guitar."))

# %% [code] - Tính BLEU score
import sacrebleu

# model.eval()
# refs = [[fr.split()] for fr in test_fr]
# preds = []

# for en in test_en[:200]:  # test nhanh 200 câu
#     pred = translate(en)
#     preds.append(pred.split())

model.eval()

preds = []
refs = []

for i in range(200):
    pred = translate(test_en[i])  # chuỗi
    preds.append(pred)

    refs.append([test_fr[i]])     # list chứa 1 câu tham chiếu
                                  # không split!


bleu = sacrebleu.corpus_bleu(preds, refs[:200])
print(f"BLEU score: {bleu.score:.2f}")

Epoch 01 | Train loss: 4.6094 | Val loss: 4.6386
Epoch 02 | Train loss: 3.6571 | Val loss: 4.2266
Epoch 03 | Train loss: 3.2261 | Val loss: 3.9327
Epoch 04 | Train loss: 2.9166 | Val loss: 3.7793
Epoch 05 | Train loss: 2.6730 | Val loss: 3.6974
Epoch 06 | Train loss: 2.4708 | Val loss: 3.6277
Epoch 07 | Train loss: 2.2995 | Val loss: 3.5747
Epoch 08 | Train loss: 2.1490 | Val loss: 3.5349
Epoch 09 | Train loss: 2.0045 | Val loss: 3.5039
Epoch 10 | Train loss: 1.8902 | Val loss: 3.4786
Epoch 11 | Train loss: 1.7743 | Val loss: 3.4897
Epoch 12 | Train loss: 1.6892 | Val loss: 3.5120
Epoch 13 | Train loss: 1.6031 | Val loss: 3.5084
Early stopping!
un homme joue de la guitare .
BLEU score: 57.74


In [1]:
# 6.2.1 Decoder với Luong Attention
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)  # for general (dot) score
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: (1, batch, hidden) - decoder hidden
        # encoder_outputs: (src_len, batch, hidden*2) - bi-LSTM
        src_len = encoder_outputs.shape[0]
        hidden = hidden.repeat(src_len, 1, 1)  # (src_len, batch, hidden)
        
        # General score: dot product after linear
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # (src_len, batch, hidden)
        attention = self.v(energy).squeeze(2)  # (src_len, batch)
        return torch.softmax(attention, dim=0)  # (src_len, batch)

class DecoderWithAttn(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_size, dropout):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim + hidden_size * 2, hidden_size, batch_first=False)  # input cat embed + context (hidden*2 bi)
        self.fc_out = nn.Linear(hidden_size * 3, output_dim)  # hidden + embed + context
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_size)

    def forward(self, input, hidden, cell, encoder_outputs):
        # input: (batch)
        input = input.unsqueeze(0)  # (1, batch)
        embedded = self.dropout(self.embedding(input))  # (1, batch, emb)
        
        # Attention
        attn = self.attention(hidden, encoder_outputs)  # (src_len, batch)
        context = torch.bmm(attn.unsqueeze(1), encoder_outputs.permute(1, 0, 2))  # (batch, 1, hidden*2)
        context = context.permute(1, 0, 2)  # (1, batch, hidden*2)
        
        # RNN input: cat embed + context
        rnn_input = torch.cat((embedded, context), dim=2)  # (1, batch, emb + hidden*2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        
        # Output: cat output + embed + context
        pred = self.fc_out(torch.cat((output, context, embedded), dim=2).squeeze(0))
        return pred, hidden, cell

# Seq2Seq với attention (encoder giữ nguyên, decoder mới)
class Seq2SeqAttn(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_len, trg, trg_len, teacher_forcing_ratio=0.5):
        batch_size = src.shape[1]
        trg_len_max = trg.shape[0]
        trg_vocab_size = self.decoder.fc_out.out_features
        
        outputs = torch.zeros(trg_len_max, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src, src_len)  # encoder_outputs for attn
        
        # Encoder last hidden/cell for decoder init (bi → concat to hidden_size)
        hidden = hidden.view(1, batch_size, -1)  # (layers=1, batch, hidden*2 → hidden)
        cell = cell.view(1, batch_size, -1)
        
        input = trg[0, :]  # <sos>
        for t in range(1, trg_len_max):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

NameError: name 'nn' is not defined