## 1. Import các thư viện cần thiết
Các yêu cầu hệ thống:
- Python phiên bản 3.10.18
- Pytorch phiên bản 2.1.0 (có hỗ trợ CUDA)
- Torchtext phiên bản 0.16.0
- NLTK phiên bản 3.9.1

In [30]:
import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn import LSTM, Embedding, Linear, CrossEntropyLoss, Module
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import copy
import nltk, torchtext

## 2. Tải dữ liệu train, validation và test

In [8]:
# Tìm thiết bị GPU để huấn luyện
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == 'cuda':
    print("CUDA is available.")
    print("Device name:", torch.cuda.get_device_name(0))
    print("CUDA version:", torch.version.cuda)
else:
    print("CUDA is not available, using CPU.")

CUDA is available.
Device name: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA version: 12.1


In [9]:
print("NLTK: ", nltk.__version__)
print("Torchtext: ", torchtext.__version__)
print("Torch: ", torch.__version__)

NLTK:  3.9.1
Torchtext:  0.16.0+cpu
Torch:  2.1.0+cu121


In [10]:
def getData(filePath):
    data = []
    
    with open(f'{filePath}', 'r') as file: 
        text = file.read()
        
    # Tách các câu theo ký tự \n
    lines = text.split("\n")
    
    # Thêm vào mảng
    for line in lines:
        arr = []
        arr.append(line)

        data.append(arr)
        
    return data   

train_en = getData("./data/train.en")
val_en = getData("./data/val.en")
train_fr = getData("./data/train.fr")
val_fr = getData("./data/val.fr")
test_en = getData("./data/test.en")
test_fr = getData("./data/test.fr")
        

In [11]:
# Kiểm tra dữ liệu
print("Train tiếng Anh: ")
print(train_en[:5])

print("\nValidation tiếng Anh: ")
print(val_en[:5])

print("\nTest tiếng Anh: ")
print(test_en[:5])

print("\nTrain tiếng Pháp: ")
print(train_fr[:5])

print("\nValidation tiếng Pháp: ")
print(val_fr[:5])

print("\nTest tiếng Pháp: ")
print(test_fr[:5])

Train tiếng Anh: 
[['Two young, White males are outside near many bushes.'], ['Several men in hard hats are operating a giant pulley system.'], ['A little girl climbing into a wooden playhouse.'], ['A man in a blue shirt is standing on a ladder cleaning a window.'], ['Two men are at the stove preparing food.']]

Validation tiếng Anh: 
[['A group of men are loading cotton onto a truck'], ['A man sleeping in a green room on a couch.'], ["A boy wearing headphones sits on a woman's shoulders."], ['Two men setting up a blue ice fishing hut on an iced over lake'], ['A balding man wearing a red life jacket is sitting in a small boat.']]

Test tiếng Anh: 
[['A young man participates in a career while the subject who records it smiles.'], ['The man is scratching the back of his neck while looking for a book in a book store.'], ['A person wearing goggles and a hat is sled riding.'], ['A girl in a pink coat and flowered goloshes sledding down a hill.'], ['Three girls are standing in front of a wi

## 3. Tokenization
Thực hiện chuyển các câu thành danh sách token dùng tokenizer của spacy

In [12]:
# Cài bằng lệnh python -m spacy download fr_core_news_sm/en_core_web_sm
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
fr_tokenizer = get_tokenizer('spacy', language='fr_core_news_sm')

# Tokenization
train_en_tokenized = [en_tokenizer(sentence[0]) for sentence in train_en] 
val_en_tokenized = [en_tokenizer(sentence[0]) for sentence in val_en]
test_en_tokenized = [en_tokenizer(sentence[0]) for sentence in test_en]

train_fr_tokenized = [fr_tokenizer(sentence[0]) for sentence in train_fr]
val_fr_tokenized = [fr_tokenizer(sentence[0]) for sentence in val_fr]
test_fr_tokenized = [fr_tokenizer(sentence[0]) for sentence in test_fr]


In [13]:
# Kiểm tra dữ liệu
print("Train tiếng Anh token: ", train_en_tokenized[:10])
print("Validation tiếng Anh token: ", val_en_tokenized[:10])
print("Test tiếng Anh token: ", test_en_tokenized[:10])

print("Train tiếng Pháp token: ", train_fr_tokenized[:10])
print("Validation tiếng Pháp token: ", val_fr_tokenized[:10])
print("Test tiếng Pháp token: ", test_fr_tokenized[:10])


Train tiếng Anh token:  [['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.'], ['Several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.'], ['A', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.'], ['A', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'ladder', 'cleaning', 'a', 'window', '.'], ['Two', 'men', 'are', 'at', 'the', 'stove', 'preparing', 'food', '.'], ['A', 'man', 'in', 'green', 'holds', 'a', 'guitar', 'while', 'the', 'other', 'man', 'observes', 'his', 'shirt', '.'], ['A', 'man', 'is', 'smiling', 'at', 'a', 'stuffed', 'lion'], ['A', 'trendy', 'girl', 'talking', 'on', 'her', 'cellphone', 'while', 'gliding', 'slowly', 'down', 'the', 'street', '.'], ['A', 'woman', 'with', 'a', 'large', 'purse', 'is', 'walking', 'by', 'a', 'gate', '.'], ['Boys', 'dancing', 'on', 'poles', 'in', 'the', 'middle', 'of', 'the', 'night', '.']]
Validation tiếng Anh token:  [['A', 'group', 'o

## 4. Xây dựng từ điển
Xây dựng từ điển dùng build_vocal_from_iterator của pytorchtext với các token đặc biệt là $<unk>, <pad>, <sos>, <eos>$ và giới hạn 10000 từ phổ biến nhất

In [14]:
vocab_size = 10000
vocab_en = build_vocab_from_iterator(
    train_en_tokenized,
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
    max_tokens=vocab_size
)

vocab_fr = build_vocab_from_iterator(
    train_fr_tokenized,
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
    max_tokens=vocab_size
)


# Thiết lập index mặc định là <unk>
vocab_en.set_default_index(vocab_en['<unk>'])
vocab_fr.set_default_index(vocab_fr['<unk>'])


In [15]:
# Kiểm tra 
print("20 từ đầu của từ điển tiếng Anh: ", vocab_en.get_itos()[:20])
print("20 từ đầu của từ điển tiếng Pháp: ", vocab_fr.get_itos()[:20])


20 từ đầu của từ điển tiếng Anh:  ['<unk>', '<pad>', '<sos>', '<eos>', 'a', '.', 'A', 'in', 'the', 'on', 'is', 'and', 'man', 'of', 'with', ',', 'woman', 'are', 'to', 'Two']
20 từ đầu của từ điển tiếng Pháp:  ['<unk>', '<pad>', '<sos>', '<eos>', '.', 'un', 'une', 'Un', 'de', 'en', "d'", 'dans', 'sur', 'homme', 'et', 'avec', 'Une', 'des', 'la', 'à']


## 5. Xây dựng tensor
Thực hiện chuyển đổi dữ liệu từ dạng chuỗi sang dạng tensor.

In [16]:
# Chuyển đổi dữ liệu về dạng tensor
train_en_tensor = [
    torch.tensor([vocab_en[token] for token in sentence], dtype=torch.long)
    for sentence in train_en_tokenized
]
val_en_tensor = [
    torch.tensor([vocab_en[token] for token in sentence], dtype=torch.long)
    for sentence in val_en_tokenized
]
test_en_tensor = [
    torch.tensor([vocab_en[token] for token in sentence], dtype=torch.long)
    for sentence in test_en_tokenized
]
train_fr_tensor = [
    torch.tensor([vocab_fr[token] for token in sentence], dtype=torch.long)
    for sentence in train_fr_tokenized
]
val_fr_tensor = [
    torch.tensor([vocab_fr[token] for token in sentence], dtype=torch.long)
    for sentence in val_fr_tokenized
]
test_fr_tensor = [
    torch.tensor([vocab_fr[token] for token in sentence], dtype=torch.long)
    for sentence in test_fr_tokenized
]

In [17]:
print("Tensor tiếng Anh (train)", train_en_tensor)
print("Tensor tiếng Anh (val)", val_en_tensor)
print("Tensor tiếng Pháp (train)", train_fr_tensor)
print("Tensor tiếng Pháp (val)", val_fr_tensor)

Tensor tiếng Anh (train) [tensor([  19,   25,   15, 1169,  808,   17,   57,   84,  336, 1339,    5]), tensor([ 165,   36,    7,  335,  287,   17, 1224,    4,  758, 4496, 2957,    5]), tensor([   6,   61,   33,  232,   71,    4,  253, 4460,    5]), tensor([  6,  12,   7,   4,  30,  23,  10,  37,   9,   4, 589, 586,   4, 242,
          5]), tensor([  19,   36,   17,   20,    8, 1203,  375,  134,    5]), tensor([   6,   12,    7,   51,  144,    4,  126,   29,    8,   75,   12, 1748,
          27,   23,    5]), tensor([   6,   12,   10,  136,   20,    4,  941, 2599]), tensor([   6, 6104,   33,  121,    9,   45,  295,   29, 3165, 4582,   41,    8,
          40,    5]), tensor([   6,   16,   14,    4,   62,  665,   10,   42,   48,    4, 1045,    5]), tensor([1779,  241,    9, 1061,    7,    8,  181,   13,    8,  312,    5]), tensor([   6, 1884,  634,   13,  720,  110,   97,    7, 5901,    5]), tensor([145, 388, 222,  21, 287,  54, 978,  17,  97,  20,   8, 113,  13,   4,
        984,   5]), t

## 6. DataLoader
Chuẩn bị dữ liệu theo batch, padding và sắp xếp theo độ dài giảm dần

In [18]:
def collate_fn(batch):
    en_batch = [item[0] for item in batch]
    fr_batch = [item[1] for item in batch]

    # Pad = 1
    pad_en = pad_sequence(en_batch, batch_first=True, padding_value=1)
    pad_fr = pad_sequence(fr_batch, batch_first=True, padding_value=1)

    # Độ dài đúng
    lengths_en = (pad_en != 1).sum(dim=1)
    lengths_fr = (pad_fr != 1).sum(dim=1)

    lengths_en[lengths_en == 0] = 1
    lengths_fr[lengths_fr == 0] = 1

    # Sort chung một perm
    lengths_en_sorted, perm = lengths_en.sort(descending=True)
    pad_en = pad_en[perm]
    pad_fr = pad_fr[perm]
    lengths_fr_sorted = lengths_fr[perm]

    return pad_en, pad_fr, lengths_en_sorted, lengths_fr_sorted


# Gom dữ liệu train và test lại
train_zip_data = list(zip(train_en_tensor, train_fr_tensor))
val_zip_data = list(zip(val_en_tensor, val_fr_tensor))


train_loader = DataLoader(
    train_zip_data,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    val_zip_data,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)


## 7. Khởi tạo mô hình Sequence To Sequence và Luong Attention

In [19]:
class LuongAttention(Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = Linear(hidden_size, hidden_size, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        # decoder_hidden: [B, H]
        # encoder_outputs: [B, T, H]

        # [B, 1, H]
        dec = decoder_hidden.unsqueeze(1)

        # [B, T, H]
        score = torch.bmm(dec, self.Wa(encoder_outputs).transpose(1, 2))

        # [B, T]
        attn_weights = F.softmax(score.squeeze(1), dim=-1)

        # context = sum(attention * encoder_out)
        # [B, H]
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)

        return context, attn_weights

In [20]:
class Seq2SeqAttention(Module):
    def __init__(self, 
                 input_vocab_size, 
                 output_vocab_size, 
                 embedding_dim=512, 
                 hidden_size=512, 
                 lstm_layers=2, 
                 dropout=0.4,
                 pad_idx=1,
                 sos_idx=2):
        super().__init__()

        self.hidden_size = hidden_size
        self.lstm_layers = lstm_layers
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx

        # Encoder
        self.encoder_embedding = Embedding(input_vocab_size, embedding_dim, padding_idx=pad_idx)
        self.encoder_lstm = LSTM(embedding_dim, hidden_size, lstm_layers,
                                 batch_first=True, dropout=dropout, bidirectional=False)

        # Decoder
        self.decoder_embedding = Embedding(output_vocab_size, embedding_dim, padding_idx=pad_idx)
        self.decoder_lstm = LSTM(embedding_dim + hidden_size, hidden_size, lstm_layers,
                                 batch_first=True, dropout=dropout)

        # Attention
        self.attention = LuongAttention(hidden_size)

        # Context + decoder hidden => vocab
        self.out_linear = Linear(hidden_size*2, output_vocab_size)

    def forward(self, src, src_lengths, trg=None, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1) if trg is not None else 20
        vocab_size = self.out_linear.out_features

        # ---------- Encoder ----------
        enc_emb = self.encoder_embedding(src)

        packed = pack_padded_sequence(
            enc_emb, src_lengths.cpu(), batch_first=True, enforce_sorted=True
        )
        packed_out, (h, c) = self.encoder_lstm(packed)
        encoder_outputs, _ = pad_packed_sequence(packed_out, batch_first=True)

        # ---------- Decoder ----------
        decoder_input = torch.full(
            (batch_size, 1), self.sos_idx, dtype=torch.long, device=src.device
        )

        dec_h, dec_c = h, c
        outputs = torch.zeros(batch_size, trg_len, vocab_size, device=src.device)

        for t in range(trg_len - 1):
            dec_emb = self.decoder_embedding(decoder_input)

            # Lấy hidden state top layer: [B, H]
            dec_hidden_top = dec_h[-1]

            # Attention: context vector
            context, _ = self.attention(dec_hidden_top, encoder_outputs)

            # Ghép (B,1,H+H)
            dec_input = torch.cat([dec_emb, context.unsqueeze(1)], dim=-1)

            dec_out, (dec_h, dec_c) = self.decoder_lstm(dec_input, (dec_h, dec_c))

            # concat decoder output + context
            output = self.out_linear(
                torch.cat([dec_out.squeeze(1), context], dim=1)
            )

            outputs[:, t, :] = output

            # Teacher forcing
            use_tf = (trg is not None) and (torch.rand(1).item() < teacher_forcing_ratio)
            if use_tf:
                decoder_input = trg[:, t+1].unsqueeze(1)
            else:
                decoder_input = output.argmax(1).unsqueeze(1)

        return outputs


## 8. Huấn luyện mô hình (có attention)

In [26]:
# Các tham số huấn luyện
embedding_dim = 512
hidden_size = 512
lstm_layers = 2
dropout = 0.4
pad_idx = 1
sos_idx = 2
teacher_forcing_ratio = 0.5
num_epochs = 12
early_stop_patience = 3

# Khởi tạo mô hình
model = Seq2SeqAttention(
    input_vocab_size=len(vocab_en),
    output_vocab_size=len(vocab_fr),
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    lstm_layers=lstm_layers,
    dropout=dropout,
    pad_idx=pad_idx,
    sos_idx=sos_idx
).to(device)

# Khởi tạo các thành phần
criterion = CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.5, patience=1, verbose=True
)

# Early stopping
best_val_loss = float('inf')
epochs_no_improve = 0
best_model = None


# Vòng lặp huấn luyện
for epoch in range(1, num_epochs + 1):

    # Train
    model.train()
    train_loss = 0.0

    for pad_en, pad_fr, lengths_en, lengths_fr in train_loader:

        # Chuyển dữ liệu sang dạng cho GPU
        pad_en = pad_en.to(device)
        pad_fr = pad_fr.to(device)
        lengths_en = lengths_en.to(device)

        optimizer.zero_grad()

        logits = model(
            src=pad_en,
            src_lengths=lengths_en,
            trg=pad_fr,
            teacher_forcing_ratio=teacher_forcing_ratio
        )

        loss = criterion(
            logits.reshape(-1, logits.size(-1)),
            pad_fr.reshape(-1)
        )

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for pad_en, pad_fr, lengths_en, lengths_fr in val_loader:

            # Chuyển dữ liệu sang dạng cho GPU
            pad_en = pad_en.to(device)
            pad_fr = pad_fr.to(device)
            lengths_en = lengths_en.to(device)

            logits = model(
                src=pad_en,
                src_lengths=lengths_en,
                trg=pad_fr,
                teacher_forcing_ratio=1.0  # always TF=1.0 in validation
            )

            loss = criterion(
                logits.reshape(-1, logits.size(-1)),
                pad_fr.reshape(-1)
            )

            val_loss += loss.item()

    val_loss /= len(val_loader)

    print(f"Epoch {epoch}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")

    scheduler.step(val_loss)

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= early_stop_patience:
            print(f"Early stopping at epoch {epoch}")
            break


# Lưu model tốt nhất
torch.save(best_model, "best_seq2seq_attention_model_update1.pth")
print("Best model saved.")

Epoch 1: Train Loss=4.0510, Val Loss=1.6171
Epoch 2: Train Loss=2.6298, Val Loss=0.8908
Epoch 3: Train Loss=2.0673, Val Loss=0.6827


KeyboardInterrupt: 

## 9. Dự đoán và đánh giá mô hình

In [31]:
def calculate_perplexity(model, src_tensor, trg_tensor, pad_idx=1):
    model.eval()
    with torch.no_grad():
        # Tính src_lengths 
        src_lengths = (src_tensor != pad_idx).sum(dim=1)

        # Forward
        logits = model(
            src_tensor,
            src_lengths=src_lengths,
            trg=trg_tensor,
            teacher_forcing_ratio=0.0
        )

        # Tính cross entropy
        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)),
            trg_tensor.view(-1),
            ignore_index=pad_idx
        )

        ppl = torch.exp(loss)
        
    return ppl.item()


In [32]:
#  Beam search decode 
def beam_search_decode(model, h, c, beam_size, max_len, stoi_fr, itos_fr, device, encoder_outputs):
    eos_idx = stoi_fr["<eos>"]
    beams = [([model.sos_idx], 0.0, h, c)]  # (tokens, log_prob, h, c)
    finished = []

    for _ in range(max_len):
        new_beams = []
        for tokens, log_prob, h_t, c_t in beams:
            last_token = tokens[-1]

            if last_token == eos_idx:
                finished.append((tokens, log_prob))
                continue

            dec_emb = model.decoder_embedding(torch.tensor([[last_token]], device=device))
            
            # Lấy hidden top layer
            dec_hidden_top = h_t[-1]

            # Tính context vector
            context, _ = model.attention(dec_hidden_top, encoder_outputs)

            # Ghép embedding + context
            dec_input_combined = torch.cat([dec_emb, context.unsqueeze(1)], dim=-1)

            dec_out, (new_h, new_c) = model.decoder_lstm(dec_input_combined, (h_t, c_t))

            # concat decoder output + context
            logits = model.out_linear(torch.cat([dec_out.squeeze(1), context], dim=1))
            log_probs = F.log_softmax(logits, dim=-1)

            topk_log_probs, topk_idx = log_probs.topk(beam_size, dim=-1)
            for next_log_p, token_id in zip(topk_log_probs[0], topk_idx[0]):
                new_beams.append((
                    tokens + [token_id.item()],
                    log_prob + next_log_p.item(),
                    new_h,
                    new_c
                ))

        # Chọn beam_size beams tốt nhất
        new_beams.sort(key=lambda x: x[1], reverse=True)
        beams = new_beams[:beam_size]
        if not beams:
            break

    if not finished:
        finished = [(b[0], b[1]) for b in beams]

    best_tokens = max(finished, key=lambda x: x[1])[0]
    best_tokens = [t for t in best_tokens[1:] if t != eos_idx]  # bỏ <sos> và <eos>
    return " ".join(itos_fr[t] for t in best_tokens)


In [39]:
#  Load checkpoint 
model.load_state_dict(torch.load("best_seq2seq_attention_model.pth", map_location=device))
model.eval()

# Dùng Smoothing Function làm hàm đánh giá
smooth_fn = SmoothingFunction().method1

#  Dự đoán 5 câu đầu 
beam_size = 5

for i in range(5):
    en_sentence = " ".join(test_en_tokenized[i])
    fr_reference = " ".join(test_fr_tokenized[i])

    #  Tensor cho encoder 
    stoi_en = vocab_en.get_stoi()
    stoi_fr = vocab_fr.get_stoi()
    itos_fr = vocab_fr.get_itos()
    unk_idx = stoi_en.get("<unk>", 0)
    eos_idx_en = stoi_en.get("<eos>", 3)

    src_ids = [stoi_en.get(t, unk_idx) for t in test_en_tokenized[i]] + [eos_idx_en]
    src_tensor = torch.tensor([src_ids], dtype=torch.long).to(device)
    src_lengths = torch.tensor([len(src_ids)], dtype=torch.long)

    with torch.no_grad():
        enc_emb = model.encoder_embedding(src_tensor)
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            enc_emb, src_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        
        packed_out, (h, c) = model.encoder_lstm(packed)
        encoder_outputs, _ = pad_packed_sequence(packed_out, batch_first=True)

    fr_pred = beam_search_decode(model, h, c, beam_size, max_len=50,
                             stoi_fr=stoi_fr, itos_fr=itos_fr,
                             device=device, encoder_outputs=encoder_outputs)

    # BLEU & Perplexity 
    trg_tensor = torch.tensor([[vocab_fr[token] for token in test_fr_tokenized[i]]],
                              dtype=torch.long).to(device)
    bleu = sentence_bleu([test_fr_tokenized[i]], fr_pred.split(), smoothing_function=smooth_fn)
    ppl = calculate_perplexity(model, src_tensor, trg_tensor, pad_idx=pad_idx)

    print(f"EN: {en_sentence}")
    print(f"FR actual: {fr_reference}")
    print(f"FR predicted: {fr_pred}")
    print(f"BLEU: {bleu:.4f}")
    print(f"Perplexity: {ppl:.2f}")
    print("-------------------------")

EN: A young man participates in a career while the subject who records it smiles .
FR actual: Un jeune homme participe à une course pendant que le sujet qui le filme sourit .
FR predicted: Un jeune homme participe dans un un tandis tandis qu' il est est la la main . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
BLEU: 0.0483
Perplexity: 136.00
-------------------------
EN: The man is scratching the back of his neck while looking for a book in a book store .
FR actual: L' homme se gratte l' arrière du cou tout en cherchant un livre dans une librairie .
FR predicted: L' homme est tient le le dos tout tout en attendant attendant livre livre dans une livre livre livre livre . . livre livre . un un livre livre . . . . livre livre . . . . livre livre . . . . livre livre . . .
BLEU: 0.0300
Perplexity: 114.91
-------------------------
EN: A person wearing goggles and a hat is sled riding .
FR actual: Une personne portant des lunettes de protection et un chapeau fait de la lu

In [1]:
bleu_scores = []

# Chuẩn bị các từ điển
stoi_en = vocab_en.get_stoi()
stoi_fr = vocab_fr.get_stoi()
itos_fr = vocab_fr.get_itos()
unk_idx = stoi_en.get("<unk>", 0)
eos_idx_en = stoi_en.get("<eos>", 3)

for i in range(len(test_en_tokenized)):
    # Tensor cho encoder
    src_ids = [stoi_en.get(t, unk_idx) for t in test_en_tokenized[i]] + [eos_idx_en]
    src_tensor = torch.tensor([src_ids], dtype=torch.long).to(device)
    src_lengths = torch.tensor([len(src_ids)], dtype=torch.long)

    with torch.no_grad():
        # Encoder
        enc_emb = model.encoder_embedding(src_tensor)
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            enc_emb, src_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, (h, c) = model.encoder_lstm(packed)
        encoder_outputs, _ = pad_packed_sequence(packed_out, batch_first=True)  # **sử dụng packed_out**

        # Beam search decode
        fr_pred_sentence = beam_search_decode(
            model, h, c, beam_size=5, max_len=50,
            stoi_fr=stoi_fr, itos_fr=itos_fr, device=device,
            encoder_outputs=encoder_outputs
        )

    # BLEU score
    fr_reference = test_fr_tokenized[i]  # danh sách token
    fr_pred_tokens = fr_pred_sentence.split()
    bleu = sentence_bleu([fr_reference], fr_pred_tokens, smoothing_function=smooth_fn)
    bleu_scores.append(bleu)

avg_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU score: {avg_bleu:.4f}")


NameError: name 'vocab_en' is not defined