## Load dataset and pip install

In [1]:
import torch

In [2]:
! pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from datasets import load_dataset

ds = load_dataset("thainq107/iwslt2015-en-vi")

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [5]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## Tokenizer

### Import tokenizer

In [6]:
from transformers import AutoTokenizer

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [7]:
tokenizer.pad_token_id

1

### Tokenize

In [8]:
MAX_LEN = 75

def preprocess_function(examples):
    input_ids = tokenizer(
        examples["en"], padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt"
    )['input_ids']
    labels = tokenizer(
        examples["vi"], padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt"
    )['input_ids']

    # Lấy ra ID của token pad
    pad_token_id = tokenizer.pad_token_id
    labels[labels == pad_token_id] = -100
    return {
        'input_ids' : input_ids, 
        'labels' : labels
    }

In [9]:
ds_train = ds['train'].select(range(20)).map(preprocess_function, batched=True)
ds_val = ds['validation'].select(range(20)).map(preprocess_function, batched=True)
ds_test = ds['test'].select(range(10)).map(preprocess_function, batched=True)
ds_train

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['en', 'vi', 'input_ids', 'labels'],
    num_rows: 20
})

## Model

In [10]:
from transformers import AutoModelForSeq2SeqLM

model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [11]:
model

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

## Trainner

### Test trainning

In [12]:
preds_sample = torch.tensor(ds_train[0]['input_ids']).unsqueeze(0)
labels_sample = torch.tensor(ds_train[0]['labels']).unsqueeze(0)

pad_token_id = tokenizer.pad_token_id
labels_sample[labels_sample == -100] = pad_token_id

preds = model.generate(input_ids=preds_sample)
preds

tensor([[     2, 250024, 127055,  66937,     13,     12,  67766,   2546, 218877,
            858,    889,  10037,   6248,   1893,  17964,  42254,      2]])

In [13]:
decoded_pred = tokenizer.batch_decode(
    preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

decoded_label = tokenizer.batch_decode(
    labels_sample, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print("decoded_pred", decoded_pred)
print("decoded_label", decoded_label)

decoded_pred ['Rachel Pike: Khoa học đằng sau một tiêu đề về khí hậu']
decoded_label ['Khoa học đằng sau một tiêu đề về khí hậu']


### Compute metrics

In [14]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # Lấy ra ID của token pad
    pad_token_id = tokenizer.pad_token_id
    labels[labels == -100] = pad_token_id

    decoded_pred = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    decoded_label = tokenizer.batch_decode(
        labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    decoded_pred, decoded_label = postprocess_text(decoded_pred, decoded_label)

    result = metric.compute(predictions=decoded_pred,
                            references=decoded_label)

    result = {"bleu": result["score"]}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

### Trainner

In [15]:
# Disable wandb
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
import os
os.environ["WANDB_DISABLED"] = "true"


training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/mBart50/en-vi-mbart50",
    predict_with_generate=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=1,
    num_train_epochs=3,
    load_best_model_at_end=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
    tokenizer=tokenizer,  # You can also use 'processing_class=tokenizer' if needed
    compute_metrics=compute_metrics,
)
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu
1,4.0104,2.681265,4.091715
2,1.9263,2.431512,2.903776
3,1.481,2.314733,8.130164


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=3, training_loss=2.4725913206736245, metrics={'train_runtime': 131.5272, 'train_samples_per_second': 0.456, 'train_steps_per_second': 0.023, 'total_flos': 9523519488000.0, 'train_loss': 2.4725913206736245, 'epoch': 3.0})

## Inferences (beam-greedy search)

In [16]:
trainer.save_model("/kaggle/working/mBart50/en-vi-mbart50")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/mBart50/en-vi-mbart50")
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/mBart50/en-vi-mbart50")

In [17]:
# Download model
import sacrebleu
from transformers import pipeline

translator = pipeline(task="translation_en_to_vi", model=model, tokenizer = tokenizer)

# Greedy search
pred_sentences_greedy = translator(ds_test["en"], batch_size=8, num_beams=1, do_sample=False)
# Chuyển List[Dict["translation_text"]] => List[str]
pred_sentences_greedy = [item["translation_text"] for item in pred_sentences_greedy]


# Beam search
pred_sentences_beam = translator(ds_test["en"], batch_size=8, num_beams=5)
# Chuyển List[Dict["translation_text"]] => List[str]
pred_sentences_beam = [item["translation_text"] for item in pred_sentences_beam]
# Tính BLEU cho từng kiểu suy luận
bleu_score_greedy = sacrebleu.corpus_bleu(pred_sentences_greedy, [ds_test["vi"]], force=True)
bleu_score_beam = sacrebleu.corpus_bleu(pred_sentences_beam, [ds_test["vi"]], force=True)

print("BLEU (greedy):", bleu_score_greedy)
print("BLEU (beam):", bleu_score_beam)

Device set to use cuda:0


BLEU (greedy): BLEU = 0.48 11.8/1.2/0.3/0.2 (BP = 0.512 ratio = 0.599 hyp_len = 178 ref_len = 297)
BLEU (beam): BLEU = 0.53 10.4/0.9/0.2/0.1 (BP = 0.713 ratio = 0.747 hyp_len = 222 ref_len = 297)
