## Load dataset and pip install

In [1]:
import torch

In [None]:
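# Uncomment on a fresh environment to install the dependencies into the kernel's environment
# (prefer %pip over !pip so the packages land in the environment the kernel is running in):
# %pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu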

In [3]:
from datasets import load_dataset

# English-Vietnamese parallel corpus (IWSLT 2015), addressed by its dataset repo id.
ds = load_dataset(path="thainq107/iwslt2015-en-vi")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Display the loaded splits together with their column names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [5]:
# Peek at the first training pair: a dict with the 'en' source and its 'vi' translation.
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## Tokenizer

### Import tokenizer

In [6]:
from transformers import AutoTokenizer

# Checkpoint name used to fetch the matching pretrained tokenizer.
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
    use_fast=True,
)

In [7]:
# Show the id the tokenizer uses for padding; the preprocessing step masks this id in the labels.
tokenizer.pad_token_id

0

### Tokenize

In [None]:
MAX_LEN = 75  # maximum number of tokens kept for both source and target sequences


def preprocess_function(examples):
    """Tokenize a batch of English/Vietnamese pairs for seq2seq fine-tuning.

    Args:
        examples: A batch from ``datasets.Dataset.map(..., batched=True)``;
            a mapping with parallel lists under the keys ``"en"`` and ``"vi"``.

    Returns:
        dict with three parallel lists of length-``MAX_LEN`` integer lists:
        ``input_ids`` and ``attention_mask`` for the prefixed English source,
        and ``labels`` for the Vietnamese target with padding replaced by -100.
    """
    prefix = "translate English to Vietnamese: "
    src_inputs = [prefix + text for text in examples["en"]]

    # Plain Python lists are enough here: `datasets.map` stores the result in
    # Arrow anyway, so building torch tensors first is wasted work.
    model_inputs = tokenizer(
        src_inputs, padding="max_length", truncation=True, max_length=MAX_LEN
    )
    target_ids = tokenizer(
        examples["vi"], padding="max_length", truncation=True, max_length=MAX_LEN
    )["input_ids"]

    # Replace padding in the targets with -100 so those positions are ignored
    # by the loss.
    pad_token_id = tokenizer.pad_token_id
    labels = [
        [token_id if token_id != pad_token_id else -100 for token_id in ids]
        for ids in target_ids
    ]

    return {
        "input_ids": model_inputs["input_ids"],
        # Without the mask, the max_length padding added above would be
        # attended to as if it were real input.
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels,
    }

In [12]:
# Tokenize the first 100 examples of each split (small subset for a quick run).
ds_train, ds_val, ds_test = (
    ds[split_name].select(range(100)).map(preprocess_function, batched=True)
    for split_name in ('train', 'validation', 'test')
)

Map: 100%|██████████| 100/100 [00:00<00:00, 2095.43 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 2746.60 examples/s]


## Model

In [8]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained encoder-decoder weights for the same checkpoint as the tokenizer.
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_name
)

In [10]:
# Display the module tree of the loaded model.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

## Trainer

### Test training

In [15]:
# Sanity check on a single sentence pair before any fine-tuning.
input_text = "translate English to Vietnamese: This is a pen."
target_text = "Đây là một cái bút."

# Encode the source and the reference translation as PyTorch tensors.
encoded_input = tokenizer(input_text, return_tensors="pt")
labels = tokenizer(target_text, return_tensors="pt")
print(encoded_input["attention_mask"])

# Generate at most 20 tokens from the source sentence.
pred = model.generate(
    encoded_input["input_ids"],
    attention_mask=encoded_input["attention_mask"],
    max_length=20,
)

# Raw decode: special tokens are kept so they are visible in the output.
tokenizer.batch_decode(pred)

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


['<pad> Das ist ein Stift.</s>']

In [None]:
# Decode the generated ids back to text, dropping special tokens.
decoded_pred = tokenizer.batch_decode(
    pred, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# `labels` is the tokenizer's output object, not a tensor of ids: iterating it
# yields its keys, so the token ids have to be selected explicitly.
decoded_label = tokenizer.batch_decode(
    labels["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print("decoded_pred", decoded_pred)
print("decoded_label", decoded_label)

decoded_pred ['Rachel Pike: Khoa học đằng sau một tiêu đề về khí hậu']
decoded_label ['Khoa học đằng sau một tiêu đề về khí hậu']


### Compute metrics

Với T5, prefix "translate English to Vietnamese: " chỉ dùng để báo cho mô hình biết tác vụ cần thực hiện là dịch (T5 thường dùng "instruction/prefix" như vậy). Thông thường, mô hình sẽ không sinh lại prefix đó trong output mà chỉ sinh ra nội dung câu dịch.

==> Không cần xử lý prefix khi hậu xử lý output; bạn có thể xem ví dụ ở trên.

In [None]:
import numpy as np
import evaluate

# Load the SacreBLEU metric once; compute_metrics reuses it on every evaluation.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for BLEU scoring.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions and a list
        in which every stripped reference is wrapped in its own one-item list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each prediction gets a list of references; here there is exactly one.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute the corpus BLEU score for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first item holds the
            generated ids.

    Returns:
        dict with a single key ``"bleu"`` holding the SacreBLEU score.
    """
    preds, labels = eval_preds
    # Only the first element of a tuple prediction contains decodable ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id and cannot be decoded, so swap it for the
    # pad id. np.where builds new arrays instead of overwriting the caller's.
    pad_token_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_pred = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    decoded_label = tokenizer.batch_decode(
        labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    decoded_pred, decoded_label = postprocess_text(decoded_pred, decoded_label)

    result = metric.compute(predictions=decoded_pred,
                            references=decoded_label)

    # Report only the headline score.
    return {"bleu": result["score"]}

### Trainer

In [None]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
import os

# Disable Weights & Biases logging for this run.
os.environ["WANDB_DISABLED"] = "true"


# Evaluate, save, and log once per epoch; keep a single checkpoint on disk and
# reload the best one when training finishes.
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/T5small/en-vi-t5small",
    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=1,
    num_train_epochs=5,
    load_best_model_at_end=True,
)

# Label padding uses -100, the same value the preprocessing step writes.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,  # the missing comma here used to make the whole cell a SyntaxError
    data_collator=data_collator,
    tokenizer=tokenizer,  # You can also use 'processing_class=tokenizer' if needed
    compute_metrics=compute_metrics,
)

trainer.train()

## Inference (greedy vs. beam search)

In [None]:
# Persist the trained model, then reload tokenizer and model from that directory.
trainer.save_model(output_dir="/kaggle/working/T5small/en-vi-t5small")
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="/kaggle/working/T5small/en-vi-t5small"
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path="/kaggle/working/T5small/en-vi-t5small"
)

In [None]:
# Build a translation pipeline around the fine-tuned model and compare decoding strategies.
import sacrebleu
from transformers import pipeline

# The model was fine-tuned on inputs carrying this prefix, so inference has to
# add it as well.
# NOTE(review): the pipeline is not expected to add a prefix for the
# "translation_en_to_vi" task on its own — confirm against the model config.
TRANSLATION_PREFIX = "translate English to Vietnamese: "

translator = pipeline(task="translation_en_to_vi", model=model, tokenizer=tokenizer)


def translate(sentences, **generate_kwargs):
    """Translate English sentences and return plain Vietnamese strings.

    Args:
        sentences: Iterable of English source strings (without the prefix).
        **generate_kwargs: Decoding options forwarded to the pipeline,
            e.g. ``num_beams``.

    Returns:
        list[str]: one translation per input sentence, in input order.
    """
    outputs = translator(
        [TRANSLATION_PREFIX + sentence for sentence in sentences],
        batch_size=8,
        # Cap generation at the training length instead of the short default.
        max_length=MAX_LEN,
        **generate_kwargs,
    )
    # The pipeline returns one dict per input; BLEU needs the bare strings.
    return [output["translation_text"] for output in outputs]


# Test a sample with beam search
translated_text = translator(
    TRANSLATION_PREFIX + "I go to school", num_beams=2, max_length=MAX_LEN
)
print(translated_text)

# Materialize the columns as plain lists of strings.
source_sentences = list(ds_test["en"])
reference_sentences = list(ds_test["vi"])

# Greedy search
pred_sentences_greedy = translate(source_sentences, num_beams=1, do_sample=False)

# Beam search
pred_sentences_beam = translate(source_sentences, num_beams=5)

# Corpus-level BLEU for each decoding strategy (one reference per sentence).
bleu_score_greedy = sacrebleu.corpus_bleu(pred_sentences_greedy, [reference_sentences], force=True)
bleu_score_beam = sacrebleu.corpus_bleu(pred_sentences_beam, [reference_sentences], force=True)

print("BLEU (greedy):", bleu_score_greedy)
print("BLEU (beam):", bleu_score_beam)