# Dự án Dịch Máy: Seq2Seq với Attention (EN → FR)

## 1. Cài đặt và Tải Dữ Liệu

In [2]:
# Install the libraries (needed when running in Colab or a fresh environment)
# NOTE(review): the saved output of this cell ends with
# "ModuleNotFoundError: No module named 'spacy.pipeline.factories'", and spaCy's own
# message asks for a runtime restart after installing — presumably the kernel has to be
# restarted before the imports below succeed; confirm.
!pip install spacy==3.8.2 torchtext==0.18.0 sacrebleu
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

# Import the required libraries
import random
from collections import Counter

import matplotlib.pyplot as plt
import sacrebleu
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sacrebleu import sentence_bleu
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

# Load the spaCy pipelines; only their tokenizers are used below
spacy_en = spacy.load("en_core_web_sm")
spacy_fr = spacy.load("fr_core_news_sm")

# English tokenizer
def tokenize_en(text):
    """Tokenize `text` with the spaCy English tokenizer and return lower-cased token strings."""
    lowered = []
    for token in spacy_en.tokenizer(text):
        lowered.append(token.text.lower())
    return lowered

# French tokenizer
def tokenize_fr(text):
    """Tokenize `text` with the spaCy French tokenizer and return lower-cased token strings."""
    lowered = []
    for token in spacy_fr.tokenizer(text):
        lowered.append(token.text.lower())
    return lowered

# Download the Multi30K (task 1, raw) EN/FR files from GitHub: train, val and the 2016 Flickr test set
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.en.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.fr.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.en.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.fr.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/test_2016_flickr.en.gz
!wget -q https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/test_2016_flickr.fr.gz

# Decompress the archives; the glob matches every .gz file in the working directory,
# and -f overwrites any previously extracted copy
!gunzip -f *.gz

# Read a text file into a list of stripped lines
def load_lines(file):
    """Return every line of the UTF-8 text file `file`, with surrounding whitespace removed.

    Blank lines are kept (as empty strings), so line numbers stay aligned
    between parallel files.
    """
    stripped = []
    with open(file, encoding='utf-8') as handle:
        for raw in handle:
            stripped.append(raw.strip())
    return stripped

# Load the train, validation and test splits (one list of sentences per language)
train_en = load_lines("train.en")
train_fr = load_lines("train.fr")
val_en   = load_lines("val.en")
val_fr   = load_lines("val.fr")
test_en  = load_lines("test_2016_flickr.en")
test_fr  = load_lines("test_2016_flickr.fr")

Collecting spacy==3.8.2
  Downloading spacy-3.8.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting torchtext==0.18.0
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting langcodes<4.0.0,>=3.2.0 (from spacy==3.8.2)
  Downloading langcodes-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading spacy-3.8.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.8/31.8 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m130.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m127.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

ModuleNotFoundError: No module named 'spacy.pipeline.factories'

## 2. Xây Dựng Từ Vựng (Vocab)

In [None]:
# Build a token -> id vocabulary from a list of sentences
def build_vocab(sentences, tokenizer, max_size=10000):
    """Build a vocabulary mapping tokens to contiguous integer ids.

    Args:
        sentences: iterable of raw sentence strings.
        tokenizer: callable mapping a sentence to an iterable of tokens.
        max_size: maximum vocabulary size, including the four special tokens.

    Returns:
        dict with "<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3, followed by the
        most frequent corpus tokens in descending frequency order.
    """
    counter = Counter()  # token frequencies over the whole corpus
    for s in sentences:
        counter.update(tokenizer(s))
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}  # reserved special tokens
    for word, _freq in counter.most_common():
        if len(vocab) >= max_size:
            break
        # A corpus token that spells a special token must not overwrite its reserved id
        # (the models rely on <pad> being 0 for padding_idx / ignore_index).
        if word in vocab:
            continue
        vocab[word] = len(vocab)  # next free id
    return vocab

# Build the source (EN) and target (FR) vocabularies from the training split only
SRC_vocab = build_vocab(train_en, tokenize_en)
TRG_vocab = build_vocab(train_fr, tokenize_fr)

print(f"EN vocab: {len(SRC_vocab):,} | FR vocab: {len(TRG_vocab):,}")

EN vocab: 9,797 | FR vocab: 10,000


## 3. Dataset và Collate Function

In [None]:
# Parallel-corpus dataset for machine translation
class TranslationDataset(Dataset):
    """Pairs of (source, target) sentences, converted to id tensors on access.

    Tokenization and vocabulary lookup happen in `__getitem__`, using the
    module-level tokenizers and the `SRC_vocab` / `TRG_vocab` dictionaries.
    """

    def __init__(self, src_lines, trg_lines):
        self.src_lines = src_lines  # source sentences (EN)
        self.trg_lines = trg_lines  # target sentences (FR)

    def __len__(self):
        # The dataset size is defined by the source side
        return len(self.src_lines)

    def __getitem__(self, idx):
        # Wrap both token sequences in <sos> ... <eos>
        src_tokens = ["<sos>", *tokenize_en(self.src_lines[idx]), "<eos>"]
        trg_tokens = ["<sos>", *tokenize_fr(self.trg_lines[idx]), "<eos>"]

        # Out-of-vocabulary tokens fall back to <unk>
        src_unk = SRC_vocab["<unk>"]
        trg_unk = TRG_vocab["<unk>"]
        src_ids = [SRC_vocab.get(tok, src_unk) for tok in src_tokens]
        trg_ids = [TRG_vocab.get(tok, trg_unk) for tok in trg_tokens]

        return torch.tensor(src_ids), torch.tensor(trg_ids)

# Collate function: pad a list of (src, trg) tensor pairs into batch tensors
def collate_fn(batch):
    """Pad the sequences in `batch` and return them with their original lengths.

    Returns:
        (srcs_pad, trgs_pad, src_lens, trg_lens), where the padded tensors are
        batch-first and padded with the respective <pad> id, and the lengths
        are plain Python lists.
    """
    srcs, trgs = zip(*batch)
    src_lens = list(map(len, srcs))
    trg_lens = list(map(len, trgs))
    src_pad_id = SRC_vocab["<pad>"]
    trg_pad_id = TRG_vocab["<pad>"]
    srcs_pad = pad_sequence(srcs, batch_first=True, padding_value=src_pad_id)
    trgs_pad = pad_sequence(trgs, batch_first=True, padding_value=trg_pad_id)
    return srcs_pad, trgs_pad, src_lens, trg_lens

# Create the datasets
train_dataset = TranslationDataset(train_en, train_fr)
val_dataset   = TranslationDataset(val_en, val_fr)
test_dataset  = TranslationDataset(test_en, test_fr)

# Create the DataLoaders (batch size 128; only the training set is shuffled)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset,   batch_size=128, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset,  batch_size=128, shuffle=False, collate_fn=collate_fn)

## 4. Mô Hình (Model)

In [None]:
# Device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encoder: embedding followed by a single-layer unidirectional LSTM
class Encoder(nn.Module):
    """Encode a padded batch of source token ids.

    Args:
        vocab_size: size of the source vocabulary.
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)  # id 0 is <pad>
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True, bidirectional=False)

    def forward(self, src, src_len):
        """Run the encoder.

        Args:
            src: (B, L) tensor of source token ids.
            src_len: true (unpadded) length of each sequence in the batch.

        Returns:
            outputs: (B, max(src_len), hid_dim) per-step outputs, zero at padded steps.
            h, c: (1, B, hid_dim) final hidden and cell states, used to initialise the decoder.
        """
        embedded = self.dropout(self.embedding(src))  # (B, L, emb_dim)
        # Pack so the LSTM skips padded steps; the batch does not need to be length-sorted
        packed = pack_padded_sequence(embedded, src_len, batch_first=True, enforce_sorted=False)
        packed_out, (h, c) = self.lstm(packed)
        # pad_packed_sequence comes from torch.nn.utils.rnn (see the import cell)
        outputs, _ = pad_packed_sequence(packed_out, batch_first=True)
        # With one layer and one direction, h and c are already (1, B, hid_dim),
        # so no reshape is needed
        return outputs, h, c

# Attention: additive (Bahdanau-style) scoring of encoder outputs against the decoder state
class Attention(nn.Module):
    """Compute attention weights over the source positions.

    NOTE(review): scores are not masked, so padded source positions also
    receive a share of the attention weight.
    """

    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim + hid_dim, hid_dim)  # projects [hidden; encoder output]
        self.v = nn.Parameter(torch.rand(hid_dim))  # scoring vector

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised weights of shape (B, src_len).

        Args:
            hidden: (1, B, hid_dim) decoder hidden state.
            encoder_outputs: (B, src_len, hid_dim) encoder outputs.
        """
        batch_size, src_len = encoder_outputs.size(0), encoder_outputs.shape[1]
        # (1, B, H) -> (B, 1, H), then broadcast along the source axis -> (B, src_len, H)
        query = hidden.transpose(0, 1).expand(-1, src_len, -1)
        combined = torch.cat((query, encoder_outputs), dim=2)  # (B, src_len, 2H)
        energy = torch.tanh(self.attn(combined))  # (B, src_len, H)
        scorer = self.v.expand(batch_size, 1, -1)  # (B, 1, H)
        scores = torch.bmm(scorer, energy.transpose(1, 2)).squeeze(1)  # (B, src_len)
        return F.softmax(scores, dim=1)

# Decoder: single-step LSTM decoder that attends over the encoder outputs
class Decoder(nn.Module):
    """Predict the next target token from the previous token and attention context."""

    def __init__(self, vocab_size, emb_dim, hid_dim, dropout=0.5):
        super().__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        # The LSTM input is the token embedding concatenated with the context vector
        self.lstm = nn.LSTM(emb_dim + hid_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, vocab_size)  # projection to vocabulary logits
        self.attention = Attention(hid_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one time step.

        Args:
            input: (B,) ids of the previous target tokens.
            hidden, cell: (1, B, hid_dim) LSTM state.
            encoder_outputs: (B, src_len, hid_dim) encoder outputs.

        Returns:
            (logits, hidden, cell), with logits of shape (B, vocab_size).
        """
        step_tokens = input.unsqueeze(1)  # (B, 1)
        embedded = self.dropout(self.embedding(step_tokens))  # (B, 1, emb_dim)
        weights = self.attention(hidden, encoder_outputs)  # (B, src_len)
        # Weighted sum of encoder outputs -> (B, 1, hid_dim)
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs)
        step_input = torch.cat((embedded, context), dim=2)  # (B, 1, emb_dim + hid_dim)
        rnn_out, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        logits = self.fc_out(rnn_out.squeeze(1))  # (B, vocab_size)
        return logits, hidden, cell

# Seq2Seq: ties the encoder and the decoder together
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, src_len, teacher_forcing_ratio=0.5):
        """Return logits of shape (B, trg_len, trg_vocab_size).

        Position 0 of the output is left as zeros because decoding starts from
        the <sos> token in `trg[:, 0]`. At every step the next decoder input is
        the gold token with probability `teacher_forcing_ratio`, otherwise the
        decoder's own argmax prediction.
        """
        n_batch, n_steps = src.shape[0], trg.shape[1]
        n_vocab = self.decoder.fc_out.out_features
        outputs = torch.zeros(n_batch, n_steps, n_vocab, device=self.device)

        encoder_outputs, hidden, cell = self.encoder(src, src_len)

        input = trg[:, 0]  # <sos>
        for t in range(1, n_steps):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = output
            # One random draw per step decides between gold token and model prediction
            if random.random() < teacher_forcing_ratio:
                input = trg[:, t]
            else:
                input = output.argmax(1)
        return outputs

## 5. Huấn Luyện (Training)

In [None]:
# Model hyper-parameters and construction
INPUT_DIM = len(SRC_vocab)   # source vocabulary size
OUTPUT_DIM = len(TRG_vocab)  # target vocabulary size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DROPOUT)
attn_model = Seq2Seq(enc, dec, device).to(device)

# Optimizer and loss
optimizer = optim.Adam(attn_model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=TRG_vocab["<pad>"])  # padded target positions do not contribute to the loss

# Train for one epoch
def train(model, iterator, optimizer, criterion, clip=1):
    """Run one training pass over `iterator` and return the mean batch loss.

    Args:
        model: the Seq2Seq model.
        iterator: yields (src, trg, src_len, trg_len) batches.
        optimizer: optimizer updating `model`'s parameters.
        criterion: loss taking (logits, targets).
        clip: maximum gradient norm.
    """
    model.train()
    batch_losses = []
    for src, trg, src_len, _ in iterator:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        logits = model(src, trg, src_len)

        # Drop position 0 (<sos>) and flatten batch and time for the loss
        vocab_dim = logits.shape[-1]
        flat_logits = logits[:, 1:].reshape(-1, vocab_dim)
        flat_gold = trg[:, 1:].reshape(-1)

        loss = criterion(flat_logits, flat_gold)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_losses.append(loss.item())
    return sum(batch_losses) / len(iterator)

# Evaluate for one epoch
def evaluate(model, iterator, criterion):
    """Return the mean batch loss over `iterator` without updating the model.

    Teacher forcing is disabled (ratio 0), so the decoder is fed its own predictions.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, trg, src_len, _ in iterator:
            src = src.to(device)
            trg = trg.to(device)
            logits = model(src, trg, src_len, 0)

            # Drop position 0 (<sos>) and flatten batch and time for the loss
            vocab_dim = logits.shape[-1]
            flat_logits = logits[:, 1:].reshape(-1, vocab_dim)
            flat_gold = trg[:, 1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_gold).item()
    return running_loss / len(iterator)

# Training loop (10 epochs as an example); per-epoch losses are kept for plotting
N_EPOCHS = 10
train_losses = []
val_losses = []

for epoch in range(N_EPOCHS):
    train_loss = train(attn_model, train_loader, optimizer, criterion)
    val_loss = evaluate(attn_model, val_loader, criterion)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {val_loss:.3f}')

## 6. Vẽ Biểu Đồ Loss

In [None]:
# Plot the training and validation loss curves
# NOTE(review): x positions are 0-based list indices, so tick 0 corresponds to the
# epoch printed as "01" in the training log.
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Training Loss', marker='o')
plt.plot(val_losses, label='Validation Loss', marker='o')
plt.title('Training and Validation Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Cross Entropy Loss')
plt.legend()
plt.grid(True)
plt.xticks(range(0, len(train_losses), 2))  # one tick every 2 epochs
plt.show()

## 7. Hàm Dịch Một Câu (Translate Sentence)

In [None]:
# Reverse target vocabulary (id -> word), used to turn predicted ids back into text
id2word_trg = {v: k for k, v in TRG_vocab.items()}

# Translate one sentence EN -> FR with greedy decoding
def translate(sentence: str, max_len: int = 50) -> str:
    """Translate an English sentence with the global `attn_model`.

    Decoding is greedy and stops at the first <eos> prediction or after
    `max_len` steps. The result is the predicted tokens joined by spaces.
    """
    attn_model.eval()
    with torch.no_grad():
        # Index the source sentence, wrapped in <sos> ... <eos>
        unk_id = SRC_vocab["<unk>"]
        src_tokens = ["<sos>", *tokenize_en(sentence), "<eos>"]
        src_ids = [SRC_vocab.get(tok, unk_id) for tok in src_tokens]
        src_tensor = torch.LongTensor(src_ids).unsqueeze(0).to(device)  # (1, src_len)
        src_len = [len(src_ids)]

        encoder_outputs, hidden, cell = attn_model.encoder(src_tensor, src_len)

        # Start decoding from <sos>
        eos_id = TRG_vocab["<eos>"]
        input_token = torch.tensor([TRG_vocab["<sos>"]]).to(device)
        translated_tokens = []
        for _ in range(max_len):
            output, hidden, cell = attn_model.decoder(input_token, hidden, cell, encoder_outputs)
            pred_token = output.argmax(1).item()
            if pred_token == eos_id:
                break
            translated_tokens.append(id2word_trg[pred_token])
            # Feed the prediction back in as the next decoder input
            input_token = torch.tensor([pred_token]).to(device)
        return " ".join(translated_tokens)

## 8. Tính BLEU Score Trên Test Set

In [None]:
# Greedy decoding for one already-indexed source sentence.
# The original cell called `translate_sentence` without defining it anywhere.
def translate_sentence(model, src_tensor, src_len, max_len=50):
    """Greedily decode one source sentence and return the predicted target tokens.

    Args:
        model: Seq2Seq model exposing `.encoder` and `.decoder`.
        src_tensor: (1, L) tensor of source ids (may include right padding).
        src_len: one-element list with the true source length.
        max_len: maximum number of decoding steps.

    Returns:
        list of target token strings, without <sos> and <eos>.
    """
    encoder_outputs, hidden, cell = model.encoder(src_tensor, src_len)
    input_token = torch.tensor([TRG_vocab["<sos>"]], device=src_tensor.device)
    tokens = []
    for _ in range(max_len):
        output, hidden, cell = model.decoder(input_token, hidden, cell, encoder_outputs)
        input_token = output.argmax(1)  # the prediction is the next decoder input
        pred_token = input_token.item()
        if pred_token == TRG_vocab["<eos>"]:
            break
        tokens.append(id2word_trg[pred_token])
    return tokens


# Collect hypotheses and references
hypotheses = []  # translated sentences (str)
references_single = []  # reference sentences (str)

attn_model.eval()
with torch.no_grad():
    for src, trg, src_len, trg_len in test_loader:
        src = src.to(device)
        for i in range(src.size(0)):
            src_single = src[i:i+1]  # (1, padded_len); the encoder packs by true length
            src_len_single = [src_len[i]]
            hyp_tokens = translate_sentence(attn_model, src_single, src_len_single)
            hypotheses.append(" ".join(hyp_tokens))

            # Reference: drop <sos> and everything from <eos> on (including padding)
            trg_ids = trg[i].tolist()
            eos_idx = trg_ids.index(TRG_vocab["<eos>"]) if TRG_vocab["<eos>"] in trg_ids else len(trg_ids)
            ref_ids = trg_ids[1:eos_idx]
            ref_tokens = [id2word_trg[j] for j in ref_ids]
            references_single.append(" ".join(ref_tokens))

# Average of sentence-level BLEU scores.
# NOTE(review): this is not corpus-level BLEU (sacrebleu.corpus_bleu), which is what is
# usually reported; references are also rebuilt from vocabulary ids, so out-of-vocabulary
# reference words appear as "<unk>".
sentence_scores = []
for hyp, ref in zip(hypotheses, references_single):
    score = sentence_bleu(hyp, [ref])
    sentence_scores.append(score.score)

avg_bleu = sum(sentence_scores) / len(sentence_scores)
print(f"Test BLEU score: {avg_bleu:.2f}")

## 9. Test Với Các Câu Ví Dụ

In [None]:
# Example sentences to translate
example_sentences = [
    "A man is playing a guitar.",
    "Two dogs are running in the park.",
    "A woman is riding a horse.",
    "A group of people are dancing."
]

# Print each English sentence next to the model's French translation
print("=== Kết quả dịch ===\n")
for en_sent in example_sentences:
    fr_sent = translate(en_sent)
    print(f"EN: {en_sent}")
    print(f"FR: {fr_sent}")
    print("-" * 50)