## Load dataset and pip install

In [1]:
import torch

In [2]:
! pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from datasets import load_dataset

ds = load_dataset("thainq107/iwslt2015-en-vi")

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [5]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## Tokenizer

### Import tokenizer

In [6]:
from transformers import AutoTokenizer

model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
tokenizer.pad_token_id

0

### Tokenize

In [8]:
MAX_LEN = 75

def preprocess_function(examples):
    prefix = "translate English to Vietnamese: "
    src_inputs = [prefix + text for text in examples["en"]]
    input_ids = tokenizer(
        src_inputs, padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt"
    )['input_ids']
    labels = tokenizer(
        examples["vi"], padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt"
    )['input_ids']

    # Lấy ra ID của token pad
    pad_token_id = tokenizer.pad_token_id
    labels[labels == pad_token_id] = -100
    return {
        'input_ids' : input_ids, 
        'labels' : labels
    }

In [9]:
ds_train = ds['train'].map(
    preprocess_function, batched=True)
ds_val = ds['validation'].map(
    preprocess_function, batched=True)
ds_test = ds['test'].select(range(100)).map(
    preprocess_function, batched=True)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## Model

In [10]:
from transformers import AutoModelForSeq2SeqLM

model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

## Trainner

### Test trainning

In [12]:
# Ví dụ với một cặp câu
input_text = "translate English to Vietnamese: This is a pen."
target_text = "Đây là một cái bút."

# Mã hóa đầu vào và đầu ra
encoded_input = tokenizer(input_text, return_tensors="pt")
labels = tokenizer(target_text, return_tensors="pt")
print(encoded_input.attention_mask)
pred = model.generate(
    encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    max_length=20
)

tokenizer.batch_decode(pred)

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


['<pad> Das ist ein Stift.</s>']

In [13]:
decoded_pred = tokenizer.batch_decode(
    pred, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

decoded_label = tokenizer.batch_decode(
    labels.input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print("decoded_pred", decoded_pred)
print("decoded_label", decoded_label)

decoded_pred ['Das ist ein Stift.']
decoded_label ['ây là mt cái bt.']


### Compute metrics

Trong trường hợp T5, prefix "translate English to Vietnamese: " thường chỉ dùng để báo cho mô hình biết đang cần thực hiện tác vụ dịch (vì T5 hay dùng “instruction/prefix” như vậy). Thông thường, mô hình sẽ không sinh ra prefix đó trong output, mà chỉ sinh ra nội dung câu dịch
==> kh cần xử lý prefix, bạn có thể coi ở ví dụ trên

In [14]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # Lấy ra ID của token pad
    pad_token_id = tokenizer.pad_token_id
    labels[labels == -100] = pad_token_id

    decoded_pred = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    decoded_label = tokenizer.batch_decode(
        labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    decoded_pred, decoded_label = postprocess_text(decoded_pred, decoded_label)

    result = metric.compute(predictions=decoded_pred,
                            references=decoded_label)

    result = {"bleu": result["score"]}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

### Trainner

In [15]:
# Disable wandb
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
import os
os.environ["WANDB_DISABLED"] = "true"


training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/T5small/en-vi-t5small",
    predict_with_generate=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=1,
    num_train_epochs=5,
    load_best_model_at_end=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
    tokenizer=tokenizer,  # You can also use 'processing_class=tokenizer' if needed
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Bleu
1,1.622,1.267473,1.717399
2,1.309,1.11024,2.463202
3,1.1959,1.036011,2.885743
4,1.139,1.006736,3.088758
5,1.1138,0.992542,3.108382


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=41665, training_loss=1.2759198649195969, metrics={'train_runtime': 4625.3553, 'train_samples_per_second': 144.115, 'train_steps_per_second': 9.008, 'total_flos': 1.3215353720832e+16, 'train_loss': 1.2759198649195969, 'epoch': 5.0})

## Inferences (beam-greedy search)

In [16]:
trainer.save_model("/kaggle/working/T5small/en-vi-t5small")
tokenizer = AutoTokenizer.from_pretrained(
    "/kaggle/working/T5small/en-vi-t5small")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "/kaggle/working/T5small/en-vi-t5small")

In [17]:
# Download model
import sacrebleu
from transformers import pipeline

translator = pipeline(task="translation_en_to_vi", model=model, tokenizer = tokenizer)

# Greedy search
pred_sentences_greedy = translator(ds_test["en"], batch_size=8, num_beams=1, do_sample=False)
# Chuyển List[Dict["translation_text"]] => List[str]
pred_sentences_greedy = [item["translation_text"] for item in pred_sentences_greedy]


# Beam search
pred_sentences_beam = translator(ds_test["en"], batch_size=8, num_beams=5)
# Chuyển List[Dict["translation_text"]] => List[str]
pred_sentences_beam = [item["translation_text"] for item in pred_sentences_beam]
# Tính BLEU cho từng kiểu suy luận
bleu_score_greedy = sacrebleu.corpus_bleu(pred_sentences_greedy, [ds_test["vi"]], force=True)
bleu_score_beam = sacrebleu.corpus_bleu(pred_sentences_beam, [ds_test["vi"]], force=True)

print("BLEU (greedy):", bleu_score_greedy)
print("BLEU (beam):", bleu_score_beam)

Device set to use cuda:0
Your input_length: 49 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 39 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 56 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 61 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 33 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 41 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 25 is bigger than 0.9 * max_leng

BLEU (greedy): BLEU = 0.03 22.3/4.6/1.4/0.2 (BP = 0.011 ratio = 0.181 hyp_len = 485 ref_len = 2685)
BLEU (beam): BLEU = 0.05 25.6/5.5/1.5/0.2 (BP = 0.019 ratio = 0.201 hyp_len = 539 ref_len = 2685)
