## Import libary và load datasets

In [104]:
import os
from tokenizers import Tokenizer, pre_tokenizers, trainers, models
from datasets import load_dataset
import torch
import torch.nn as nn

In [105]:
ds = load_dataset("thainq107/iwslt2015-en-vi")

In [106]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [107]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## Encoding

### Defined toknize

In [108]:
# Word-based Tokenizer
tokenizer_en = Tokenizer(models.WordLevel(unk_token="<unk>")) # Tạo 2 tokenizer
tokenizer_vi = Tokenizer(models.WordLevel(unk_token="<unk>"))
tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace() # Tách dựa trên khoảng trắng
tokenizer_vi.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.WordLevelTrainer(
    vocab_size=15000, 
    min_frequency=2,  # xác định tần suất xuất hiện tối thiểu để được đưa vào vocab
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
)

# Train tokenizer
tokenizer_en.train_from_iterator(ds["train"]["en"], trainer) # Vừa xây dựng vocab vừa tokenize
tokenizer_vi.train_from_iterator(ds["train"]["vi"], trainer)

# Save tokenizer
tokenizer_en.save("./transformer/tokenizer_en.json")
tokenizer_vi.save("./transformer/tokenizer_vi.json")

### Encoding

In [109]:
MAX_LENGTH = 75
from transformers import  PreTrainedTokenizerFast
tokenizer_en = PreTrainedTokenizerFast(tokenizer_file="./transformer/tokenizer_en.json", unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos")
tokenizer_vi = PreTrainedTokenizerFast(tokenizer_file="./transformer/tokenizer_vi.json",
                                       unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos")

In [110]:
vocab_size_src = len(tokenizer_en.get_vocab())
vocab_size_tgt = len(tokenizer_vi.get_vocab())
vocab_size_src

15001

In [111]:
added_tokens_encoder = tokenizer_en.added_tokens_encoder
added_tokens_encoder['<pad>']

0

In [112]:
def preprocess_function(examples):
    src_text = examples["en"]
    tgt_text = ["<bos> " + text + " <eos>" for text in examples["vi"]]

    src_encodings = tokenizer_en(
        src_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    tgt_encodings = tokenizer_vi(
        tgt_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    return {
        # Trả về list, không phải tensor
        "input_ids": src_encodings["input_ids"],
        "labels": tgt_encodings["input_ids"],  # Trả về list, không phải tensor
    }


# Áp dụng map() và set_format("torch") để tự động chuyển thành tensor
preprocessed_train = ds['train'].select(
    range(100)).map(preprocess_function, batched=True)
preprocessed_val = ds['validation'].select(
    range(100)).map(preprocess_function, batched=True)
preprocessed_test = ds['test'].select(
    range(100)).map(preprocess_function, batched=True)

# Định dạng dữ liệu dưới dạng torch.Tensor
preprocessed_train.set_format(type="torch", columns=["input_ids", "labels"])
preprocessed_val.set_format(type="torch", columns=["input_ids", "labels"])
preprocessed_test.set_format(type="torch", columns=["input_ids", "labels"])

In [113]:
# Kiểm tra đầu ra
print(preprocessed_train[0])
print(type(preprocessed_train[0]["input_ids"]))  # Phải là torch.Tensor

{'input_ids': tensor([6675,    1,   57,   60,  339,  604,   13,  744, 5643,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]), 'labels': tensor([   2, 1960,   66, 1157,  131,    8,  376,  113,   38,  417,  735,    3,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 

## Modeling

#### Example padding mask

In [114]:
import torch
import torch.nn.functional as F

# Từ điển ánh xạ từ thành ID
vocab = {"Hello": 1, "world": 2, "!": 3, "<PAD>": 0}

# Câu đã tokenized
sentence = ["Hello", "world", "!", "<PAD>", "<PAD>"]
sentence_ids = torch.tensor([[vocab[word] for word in sentence]])

# Tạo Padding Mask (True nếu là <PAD>)
padding_mask = (sentence_ids == vocab["<PAD>"])
print("Padding Mask:\n", padding_mask)

# Tạo ma trận Attention Scores ngẫu nhiên (giả lập)
seq_len = sentence_ids.shape[1]
attention_scores = torch.rand(seq_len, seq_len)  # Ma trận Attention (5x5)

print("\nAttention Scores trước khi áp dụng mask:\n", attention_scores)

# Áp dụng padding mask: Gán -inf cho vị trí <PAD> để loại bỏ khi tính softmax
attention_scores = attention_scores.masked_fill(padding_mask, float('-inf'))

print("\nAttention Scores sau khi áp dụng mask:\n", attention_scores)

# Tính Softmax để thấy sự khác biệt
attention_probs = F.softmax(attention_scores, dim=-1)
print("\nAttention Weights sau Softmax:\n", attention_probs)

Padding Mask:
 tensor([[False, False, False,  True,  True]])

Attention Scores trước khi áp dụng mask:
 tensor([[5.6528e-01, 1.5462e-01, 6.3104e-01, 8.0898e-01, 1.9997e-01],
        [2.5445e-01, 2.3181e-01, 5.8909e-01, 9.9554e-01, 3.3283e-01],
        [2.6358e-01, 4.9669e-01, 5.1804e-02, 5.0553e-01, 9.1042e-01],
        [2.4338e-01, 3.2681e-01, 5.3528e-01, 4.2900e-01, 3.2967e-04],
        [6.0340e-01, 6.4141e-01, 1.9927e-01, 4.3930e-01, 8.6750e-01]])

Attention Scores sau khi áp dụng mask:
 tensor([[0.5653, 0.1546, 0.6310,   -inf,   -inf],
        [0.2545, 0.2318, 0.5891,   -inf,   -inf],
        [0.2636, 0.4967, 0.0518,   -inf,   -inf],
        [0.2434, 0.3268, 0.5353,   -inf,   -inf],
        [0.6034, 0.6414, 0.1993,   -inf,   -inf]])

Attention Weights sau Softmax:
 tensor([[0.3661, 0.2428, 0.3910, 0.0000, 0.0000],
        [0.2963, 0.2897, 0.4140, 0.0000, 0.0000],
        [0.3256, 0.4110, 0.2634, 0.0000, 0.0000],
        [0.2919, 0.3173, 0.3908, 0.0000, 0.0000],
        [0.3695, 0.3

### Transformer

In [None]:
def generate_square_subsequent_mask(sz, device):
    mask = torch.triu(torch.ones(sz, sz, device=device)) == 1  
    mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt, tokenizer_en, tokenizer_vi, device):
    # src = tgt = [batch_size, seq_len]
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]
    mask_decoder = generate_square_subsequent_mask(tgt_seq_len, device)
    mask_encoder = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
    
    src_padding_mask = (src == tokenizer_en.added_tokens_encoder['<pad>'])
    tgt_padding_mask = (tgt == tokenizer_vi.added_tokens_encoder['<pad>'])
    return mask_encoder, mask_decoder,src_padding_mask, tgt_padding_mask

In [137]:
from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqTransformerConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size_src=15000,
        vocab_size_tgt=15000,
        max_seq_length=50,
        d_model=256,
        num_heads=8,
        num_layers=2,
        dropout=0.1,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size_src = vocab_size_src
        self.vocab_size_tgt = vocab_size_tgt
        self.max_seq_length = max_seq_length
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout = dropout


class EncoderTransformer(nn.Module):
    def __init__(self, config:Seq2SeqTransformerConfig):
        super().__init__(*args, **kwargs)
class Seq2SeqTransformer(PreTrainedModel):
    def __init__(self, config: Seq2SeqTransformerConfig, tokenizer_vi: PreTrainedTokenizerFast):
        super().__init__(config)
        self.embedding_src = nn.Embedding(config.vocab_size_src, config.d_model)
        self.embedding_tgt = nn.Embedding(config.vocab_size_tgt, config.d_model)

        self.position_embedding_src = nn.Embedding(
            config.max_seq_length, config.d_model
        )
        self.position_embedding_tgt = nn.Embedding(
            config.max_seq_length, config.d_model
        )
        
        self.transformer = nn.Transformer(
            d_model=config.d_model,
            nhead=config.num_heads,
            num_decoder_layers=config.num_layers,
            num_encoder_layers=config.num_layers,
            dropout=config.dropout,
            batch_first=True
        ) # --> [B, Seq_length, E]

        self.classifier = nn.Linear(
            config.d_model, config.vocab_size_tgt
        )

        self.loss_fn = nn.CrossEntropyLoss(
            ignore_index=tokenizer_vi.added_tokens_encoder['<pad>'])

    def forward(self, input_ids, labels):
        # teacher forcing
        tgt_input = labels[:, :-1]
        tgt_output = labels[:, 1:]

        batch_size , seq_len_src = input_ids.shape
        _, seq_len_tgt = tgt_input.shape

        src_positions = torch.arange(seq_len_src, device=input_ids.device).unsqueeze(0)
        tgt_positions = torch.arange(
            seq_len_tgt, device=tgt_input.device).unsqueeze(0)

        src_embedded = self.embedding_src(input_ids) + self.position_embedding_src(src_positions) # broad casting để cộng lại
        tgt_embedded = self.embedding_tgt(tgt_input) + self.position_embedding_tgt(tgt_positions)
        src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask = create_mask(
            input_ids, tgt_input, tokenizer_en, tokenizer_vi, device=input_ids.device)

        outs = self.transformer(
            src_embedded, tgt_embedded, src_mask, tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

        logits = self.classifier(outs)
        loss = self.loss_fn(logits.permute(0, 2, 1), tgt_output)

        return {"loss": loss, "logits": logits}
    
    def encoder(self, src, src_mask):
        _, seq_len_src = src.shape
        src_positions = torch.arange(seq_len_src, device=src.device).unsqueeze(0)
        src_embedded = self.embedding_src(src) + self.position_embedding_src(src_positions)
        return self.transformer.encoder(src_embedded, src_mask)

    def decoder(self, tgt, encoder_output, tgt_mask):
        _, seq_len_tgt = tgt.shape
        tgt_positions = torch.arange(seq_len_tgt, device=tgt.device).unsqueeze(0)
        tgt_embedded = self.embedding_tgt(tgt) + self.position_embedding_tgt(tgt_positions)
        out = self.transformer.decoder(tgt_embedded, encoder_output, tgt_mask)
        out = self.classifier(out)
        return out
    

In [None]:
# config = Seq2SeqTransformerConfig(vocab_size_src, vocab_size_tgt, MAX_LENGTH)
# model = Seq2SeqTransformer(config, tokenizer_vi)
# pred = model.forward(preprocessed_train[0: 1]['input_ids'],
#               preprocessed_train[0: 1]['labels'])
# pred["logits"].shape

75 74




torch.Size([1, 74, 13685])

In [138]:
config = Seq2SeqTransformerConfig(vocab_size_src, vocab_size_tgt, MAX_LENGTH)
model = Seq2SeqTransformer(config, tokenizer_vi)

In [None]:
# Disable wandb
from transformers import Trainer, TrainingArguments
import os
os.environ['WANDB_DISABLED'] = 'true'

# Training
training_args = TrainingArguments(
    output_dir="/kaggle/working/en-vi-machine-translation",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    learning_rate=2e-5,
    save_total_limit=1,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_train,
    eval_dataset=preprocessed_val
)
trainer.train()

In [None]:
# ! pip install sacrebleu==2.5.1

Collecting sacrebleu==2.5.1
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu==2.5.1)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting tabulate>=0.8.9 (from sacrebleu==2.5.1)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting lxml (from sacrebleu==2.5.1)
  Downloading lxml-5.3.1-cp310-cp310-win_amd64.whl.metadata (3.8 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading lxml-5.3.1-cp310-cp310-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 3.8/3.8 MB 22.7 MB/s eta 0:00:00
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: tabulate, portalocker, lxml, sacrebleu
Successfully installed lxml-5.3.1 portalocker-3.1.1 sacrebleu-2.5.1 tabulate-0.9.0


In [141]:
import sacrebleu
def greedy_decode(model, src, max_len, tokenizer, device="cpu"):
    src = torch.tensor(src).unsqueeze(0).to(device)
    mask_encoder = torch.zeros(
        (src.size()[1], src.size()[1]), device=device).type(torch.bool)
    context = model.encoder(src, mask_encoder)
    y_start = torch.full((1, 1), tokenizer.added_tokens_encoder["<bos>"], dtype=torch.long).to(device)

    output = []
    # teacher forcing
    for i in range(max_len):
        mask_decoder = generate_square_subsequent_mask(y_start.shape[1], device)
        output_decoder = model.decoder(y_start, context, mask_decoder)
        _, next_word = output_decoder.max(dim=-1)
        y_start = next_word
        output.append(next_word.item())

        if next_word.item() == tokenizer.added_tokens_encoder["<eos>"]:
            break
    return output




# Test
def translate():
    model.eval()
    src = preprocessed_test[0]["input_ids"]
    tgt = preprocessed_test[0]["labels"]
    output = greedy_decode(model, src, MAX_LENGTH,
                           tokenizer_vi, device="cpu")
    
    print("Input:", tokenizer_en.decode(src))
    print("Target:", tokenizer_vi.decode(tgt))
    print("Predict:", tokenizer_vi.decode(output))

    bleu_score = sacrebleu.corpus_bleu(
        [tokenizer_vi.decode(output)], [[tokenizer_vi.decode(tgt)]], force=True).score
    print("BLEU Score:", bleu_score)

translate()

  src = torch.tensor(src).unsqueeze(0).to(device)


Input: When I was little , I thought my country was the best on the planet , and I grew up singing a song called & quot ; Nothing To <unk> . & quot ; <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Target: <bos> Khi tôi còn nhỏ , Tôi nghĩ rằng <unk> Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài & quot ; Chúng ta chẳng có gì phải ghen tị . & quot ; <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predict: Heatherwick Free ăn Quỷ tiêng xứng ogami tấ TQ ngập Quỷ tiêng xứng ogami tấ TQ ngập Quỷ tiêng xứng ogami tấ TQ ngập Quỷ tiêng xứng ogami tấ TQ ngập Quỷ tiêng xứng ogami tấ TQ ngập Quỷ tiêng xứng ogami tấ TQ ngập