## Import libary và load datasets

In [57]:
import os
from tokenizers import Tokenizer, pre_tokenizers, trainers, models
from datasets import load_dataset
import torch

In [2]:
ds = load_dataset("thainq107/iwslt2015-en-vi")

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [4]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## Encoding

### Defined toknize

In [58]:
# Word-based Tokenizer
tokenizer_en = Tokenizer(models.WordLevel(unk_token="<unk>")) # Tạo 2 tokenizer
tokenizer_vi = Tokenizer(models.WordLevel(unk_token="<unk>"))
tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace() # Tách dựa trên khoảng trắng
tokenizer_vi.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.WordLevelTrainer(
    vocab_size=15000, 
    min_frequency=2,  # xác định tần suất xuất hiện tối thiểu để được đưa vào vocab
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
)

# Train tokenizer
tokenizer_en.train_from_iterator(ds["train"]["en"], trainer) # Vừa xây dựng vocab vừa tokenize
tokenizer_vi.train_from_iterator(ds["train"]["vi"], trainer)

# Save tokenizer
tokenizer_en.save("tokenizer_en.json")
tokenizer_vi.save("tokenizer_vi.json")

### Encoding

In [59]:
MAX_LENGTH = 75
from transformers import  PreTrainedTokenizerFast
tokenizer_en = PreTrainedTokenizerFast(tokenizer_file="tokenizer_en.json", unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos")
tokenizer_vi = PreTrainedTokenizerFast(tokenizer_file="tokenizer_vi.json",
                                       unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos")

In [60]:
added_tokens_encoder = tokenizer_en.added_tokens_encoder
added_tokens_encoder['<pad>']

0

In [99]:
def preprocess_function(examples):
    src_text = examples["en"]
    tgt_text = ["<bos> " + text + " <eos>" for text in examples["vi"]]

    src_encodings = tokenizer_en(
        src_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tgt_encodings = tokenizer_vi(
        tgt_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    return {
        # Trả về list, không phải tensor
        "input_ids": src_encodings["input_ids"],
        "labels": tgt_encodings["input_ids"],  # Trả về list, không phải tensor
    }


# Áp dụng map() và set_format("torch") để tự động chuyển thành tensor
preprocessed_train = ds['train'].select(
    range(100)).map(preprocess_function, batched=True)
preprocessed_val = ds['validation'].select(
    range(100)).map(preprocess_function, batched=True)
preprocessed_test = ds['test'].select(
    range(100)).map(preprocess_function, batched=True)

# Định dạng dữ liệu dưới dạng torch.Tensor
preprocessed_train.set_format(type="torch", columns=["input_ids", "labels"])
preprocessed_val.set_format(type="torch", columns=["input_ids", "labels"])
preprocessed_test.set_format(type="torch", columns=["input_ids", "labels"])

Map: 100%|██████████| 100/100 [00:00<00:00, 3497.56 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3491.01 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 4240.35 examples/s]


In [100]:
# Kiểm tra đầu ra
print(preprocessed_train[0])
print(type(preprocessed_train[0]["input_ids"]))  # Phải là torch.Tensor

{'input_ids': tensor([6675,    1,   57,   60,  339,  604,   13,  744, 5643,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]), 'labels': tensor([   2, 1960,   66, 1157,  131,    8,  376,  113,   38,  417,  735,    3,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 

## Modeling

### RNNs

In [None]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqRNNConfig(PretrainedConfig):
    def __init__(self, vocab_size_src = 10000, vocab_size_tgt = 10000, embedding_dim = 128, hidden_size = 128, dropout = 0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_src = vocab_size_src
        self.vocab_size_tgt = vocab_size_tgt
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.dropout = dropout


class EncoderRNN(nn.Module):
    def __init__(self, input_size=10000, embedding_dim=128, hidden_size=128, dropout=0.1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True, dropout=dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded)
        return output, hidden
    
class DecoderRNN(nn.Module):
    def __init__(self, output_size=10000, embedding_dim=128, hidden_size=128, dropout=0.1):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True, dropout=dropout)
        self.out = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden_decoder):
        embedded = self.embedding(x)
        output, last_hidden = self.gru(embedded, hidden_decoder)
        output = self.out(output)
        return output, last_hidden
    
class Seq2SeqRNNmodel(PreTrainedModel):
    def __init__(self, config: Seq2SeqRNNConfig, tokenizer_en: PreTrainedTokenizerFast):
        super().__init__(config)
        self.encoder = EncoderRNN(config.vocab_size_src, config.embedding_dim, config.hidden_size, config.dropout).to(device)
        self.decoder = DecoderRNN(
            config.vocab_size_tgt, config.embedding_dim, config.hidden_size, config.dropout).to(device)
        self.bos_idx = tokenizer_en.added_tokens_encoder['<bos>']
        self.loss_fn = nn.CrossEntropyLoss(
            ignore_index=tokenizer_en.added_tokens_encoder['<pad>'])

    def forward(self, input_ids, labels):
        input_ids = input_ids.to("cuda")
        labels = labels.to("cuda")

        batch_size, seq_len  = labels.size()
        encoder_output, encoder_hidden = self.encoder(input_ids)
        # print(encoder_output.shape, encoder_hidden.shape)

        inputs_decoder = torch.full((batch_size, 1), self.bos_idx, dtype=torch.long).to(input_ids.device) # BOS token
        hidden_decoder = encoder_hidden
        output_decoder_lst = []

        # teacher forcing
        for i in range(seq_len):
            output_decoder, hidden_decoder = self.decoder(inputs_decoder, hidden_decoder)
            # print(output_decoder.shape, hidden_decoder.shape)
            inputs_decoder = labels[:, i].unsqueeze(1)
            output_decoder_lst.append(output_decoder)

        # print(len(output_decoder_lst), output_decoder_lst[0].shape)
        logits = torch.cat(output_decoder_lst, dim=1) # (batch_size, seq_len, vocab_size_tgt)
        loss = self.loss_fn(logits.permute(0, 2, 1), labels)
        
        return {"loss": loss, "logits": logits}

In [131]:
config = Seq2SeqRNNConfig()
model = Seq2SeqRNNmodel(config, tokenizer_en)
model.forward(preprocessed_train[0: 3]['input_ids'],
              preprocessed_train[0: 3]['labels'])

{'loss': tensor(9.2516, grad_fn=<NllLoss2DBackward0>),
 'logits': tensor([[[-0.0581, -0.1600,  0.0921,  ...,  0.2343, -0.4025, -0.4931],
          [-0.0602, -0.0598,  0.0117,  ...,  0.2292, -0.3739, -0.3102],
          [ 0.0197, -0.0962, -0.1172,  ..., -0.0404, -0.2372, -0.0765],
          ...,
          [ 0.0766,  0.1735, -0.1656,  ...,  0.3689, -0.1773,  0.6316],
          [ 0.0766,  0.1735, -0.1656,  ...,  0.3689, -0.1773,  0.6316],
          [ 0.0766,  0.1735, -0.1656,  ...,  0.3689, -0.1773,  0.6316]],
 
         [[-0.0581, -0.1601,  0.0921,  ...,  0.2342, -0.4017, -0.4931],
          [-0.0602, -0.0598,  0.0117,  ...,  0.2291, -0.3734, -0.3102],
          [-0.1354, -0.0927,  0.2444,  ...,  0.0566,  0.0292, -0.4020],
          ...,
          [ 0.1929, -0.1134, -0.0746,  ..., -0.3125,  0.1782,  0.2041],
          [ 0.1298, -0.0669, -0.1370,  ..., -0.2732,  0.4786, -0.0553],
          [ 0.1379,  0.1780, -0.0321,  ..., -0.0070,  0.5041, -0.1037]],
 
         [[-0.0581, -0.1600,  0.092

In [None]:
# Disable wandb
from transformers import Trainer, TrainingArguments
import os
os.environ['WANDB_DISABLED'] = 'true'


# Training
training_args = TrainingArguments(
    output_dir="./en-vi-machine-translation",
    logging_dir="logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=25,
    learning_rate=2e-5,
    save_total_limit=1,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_train,
    eval_dataset=preprocessed_val
)

In [136]:
! pip install sacrebleu==2.5.1

Collecting sacrebleu==2.5.1
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu==2.5.1)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting tabulate>=0.8.9 (from sacrebleu==2.5.1)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting lxml (from sacrebleu==2.5.1)
  Downloading lxml-5.3.1-cp310-cp310-win_amd64.whl.metadata (3.8 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading lxml-5.3.1-cp310-cp310-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 3.8/3.8 MB 22.7 MB/s eta 0:00:00
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: tabulate, portalocker, lxml, sacrebleu
Successfully installed lxml-5.3.1 portalocker-3.1.1 sacrebleu-2.5.1 tabulate-0.9.0


In [146]:
import sacrebleu
def greedy_decode(model, src, max_len, tokenizer, device="cpu"):
    src = torch.tensor(src).unsqueeze(0).to(device)
    memory, hidden = model.encoder(src)
    y_start = torch.full((1, 1), tokenizer.added_tokens_encoder["<bos>"], dtype=torch.long).to(device)
    output = []

    # teacher forcing
    for i in range(max_len):
        output_decoder, hidden = model.decoder(y_start, hidden)
        _, next_word = output_decoder.max(dim=-1)
        y_start = next_word
        output.append(next_word.item())

        if next_word.item() == tokenizer.added_tokens_encoder["<eos>"]:
            break
    return output




# Test
def translate():
    model.eval()
    src = preprocessed_test[0]["input_ids"]
    tgt = preprocessed_test[0]["labels"]
    output = greedy_decode(model, src, MAX_LENGTH,
                           tokenizer_vi, device="cpu")
    
    print("Input:", tokenizer_en.decode(src))
    print("Target:", tokenizer_vi.decode(tgt))
    print("Predict:", tokenizer_vi.decode(output))

    bleu_score = sacrebleu.corpus_bleu(
        [tokenizer_vi.decode(output)], [[tokenizer_vi.decode(tgt)]], force=True).score
    print("BLEU Score:", bleu_score)

translate()

Input: When I was little , I thought my country was the best on the planet , and I grew up singing a song called & quot ; Nothing To <unk> . & quot ; <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Target: <bos> Khi tôi còn nhỏ , Tôi nghĩ rằng <unk> Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài & quot ; Chúng ta chẳng có gì phải ghen tị . & quot ; <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predict: proton 1959 Ba bin Agnes nhào Treasure hạn Piano đãng hợm Húp Mad Nuôi ôtô xuân MySpace dốt nhiện Frederic Libby lộn Tứ JB JR Vietnam Tyler 380 quyệt Aldo xê Tweets giành Tuổi Idol Collusion Ellen nếu Treatment 

  src = torch.tensor(src).unsqueeze(0).to(device)
