In [1]:
# %pip install -U datasets sacrebleu

In [2]:
import numpy as np
import datasets

## 1. Dataset

### 1.1 Load the English–Vietnamese dataset

In [3]:
# Load the "iwslt2015-en-vi" configuration of the `mt_eng_vietnamese` dataset.
# The result is a DatasetDict with "train", "validation" and "test" splits.
en_vi_dataset = datasets.load_dataset("mt_eng_vietnamese", name="iwslt2015-en-vi")

In [4]:
# Show the available splits, their features and row counts.
en_vi_dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 133318
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
})

In [5]:
# Inspect one raw example: a dict with one sentence per language code.
en_vi_dataset["train"][0]["translation"]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

In [6]:
from transformers import AutoTokenizer

# Checkpoint identifier; the tokenizer is loaded from it here and the model is
# loaded from the same checkpoint in section 2, so the vocabularies match.
model_name = "ngocquanofficial/machine_translation_VinAI"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
# Keys used to pick the source / target sentence out of each "translation" dict.
source_lang = "en"
target_lang = "vi"
# Task prompt prepended to every source sentence before tokenization.
prefix = "translate English to Vietnamese: "
# Passed as `max_length` (with truncation enabled) when tokenizing sources / targets.
max_input_length = 128    # limit for the prefixed source text
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: A batch whose "translation" entry is an iterable of dicts
            keyed by language code (``source_lang`` and ``target_lang``).

    Returns:
        The tokenizer output for the prefixed source sentences, with an extra
        "labels" entry holding the ``input_ids`` of the tokenized targets.
    """
    # Build the source texts (task prefix + sentence) and the target texts.
    source_texts = [prefix + pair[source_lang] for pair in examples["translation"]]
    target_texts = [pair[target_lang] for pair in examples["translation"]]

    # Sources and targets are tokenized separately, each truncated to its own limit.
    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)
    encoded_targets = tokenizer(target_texts, max_length=max_target_length, truncation=True)

    # The target token ids become the labels the model is trained to produce.
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [8]:
# Tokenize every split. With batched=True, preprocess_function receives a batch
# of examples at a time (it iterates over examples["translation"]).
# NOTE: this rebinds `en_vi_dataset`, so after this cell the name refers to the
# mapped dataset rather than the object returned by load_dataset.
en_vi_dataset = en_vi_dataset.map(preprocess_function, batched=True)

## 2. Model

In [9]:
from transformers import AutoModelForSeq2SeqLM

# Load the seq2seq model from the same checkpoint as the tokenizer. Reusing
# `model_name` (instead of repeating the literal) keeps the two in sync if the
# checkpoint is ever changed.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [10]:
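# Alternative setup (disabled): build the T5 architecture from the checkpoint's
# config only, so the weights are freshly initialised instead of being loaded
# from the pre-trained checkpoint.

# from transformers.models.t5 import T5ForConditionalGeneration, T5Config

# t5_config = T5Config.from_pretrained("ngocquanofficial/machine_translation_VinAI")

# model = T5ForConditionalGeneration(config=t5_config)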

## 3. Evaluate

In [11]:
import evaluate
# sacreBLEU metric object; compute_metrics calls metric.compute(...) on decoded text.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape references for the BLEU metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` pair where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, one
        reference per prediction.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    # Each reference is wrapped in its own list: one reference per prediction.
    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        A dict with "bleu" (the metric's "score") and "gen_len" (mean number
        of non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding / ignore marker, not a vocabulary id, so it cannot be
    # decoded. Map it to the pad token in BOTH arrays (the original only did
    # this for labels), which also keeps it out of the length count below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-padding token ids in each prediction row.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## 4. Training

In [12]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer


# Per-device batch size shared by training and evaluation.
batch_size = 64

training_args = Seq2SeqTrainingArguments(
    output_dir="out_dir",
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=1e-5,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # --- evaluate and checkpoint once per epoch, then reload the best checkpoint ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- evaluation predictions come from generate(), so they can be decoded for BLEU ---
    predict_with_generate=True,
)

In [13]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Train on the "train" split; the per-epoch evaluation uses "validation".
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=en_vi_dataset["train"],
    eval_dataset=en_vi_dataset["validation"],
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning; the recorded run took ~3994 s for 5 epochs (10420 steps).
# The returned TrainOutput is displayed as the cell result.
trainer.train()

  0%|          | 0/10420 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.4829, 'learning_rate': 1.9040307101727448e-05, 'epoch': 0.24}
{'loss': 1.4554, 'learning_rate': 1.8080614203454897e-05, 'epoch': 0.48}
{'loss': 1.4407, 'learning_rate': 1.7120921305182344e-05, 'epoch': 0.72}
{'loss': 1.4327, 'learning_rate': 1.616122840690979e-05, 'epoch': 0.96}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.2264691591262817, 'eval_bleu': 18.8423, 'eval_gen_len': 16.6793, 'eval_runtime': 9.7903, 'eval_samples_per_second': 129.618, 'eval_steps_per_second': 2.043, 'epoch': 1.0}
{'loss': 1.3704, 'learning_rate': 1.5201535508637238e-05, 'epoch': 1.2}
{'loss': 1.3575, 'learning_rate': 1.4241842610364684e-05, 'epoch': 1.44}
{'loss': 1.3622, 'learning_rate': 1.3282149712092132e-05, 'epoch': 1.68}
{'loss': 1.3649, 'learning_rate': 1.2322456813819578e-05, 'epoch': 1.92}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.222044587135315, 'eval_bleu': 18.8403, 'eval_gen_len': 16.6777, 'eval_runtime': 9.6858, 'eval_samples_per_second': 131.017, 'eval_steps_per_second': 2.065, 'epoch': 2.0}
{'loss': 1.3248, 'learning_rate': 1.1362763915547026e-05, 'epoch': 2.16}
{'loss': 1.3088, 'learning_rate': 1.0403071017274472e-05, 'epoch': 2.4}
{'loss': 1.3098, 'learning_rate': 9.44337811900192e-06, 'epoch': 2.64}
{'loss': 1.3039, 'learning_rate': 8.483685220729368e-06, 'epoch': 2.88}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.2216891050338745, 'eval_bleu': 18.9997, 'eval_gen_len': 16.6359, 'eval_runtime': 9.4092, 'eval_samples_per_second': 134.868, 'eval_steps_per_second': 2.126, 'epoch': 3.0}
{'loss': 1.2802, 'learning_rate': 7.523992322456814e-06, 'epoch': 3.12}
{'loss': 1.2641, 'learning_rate': 6.5642994241842614e-06, 'epoch': 3.36}
{'loss': 1.2683, 'learning_rate': 5.6046065259117085e-06, 'epoch': 3.6}
{'loss': 1.2739, 'learning_rate': 4.644913627639156e-06, 'epoch': 3.84}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.2213001251220703, 'eval_bleu': 19.0839, 'eval_gen_len': 16.647, 'eval_runtime': 9.4148, 'eval_samples_per_second': 134.788, 'eval_steps_per_second': 2.124, 'epoch': 4.0}
{'loss': 1.2641, 'learning_rate': 3.687140115163148e-06, 'epoch': 4.08}
{'loss': 1.2443, 'learning_rate': 2.727447216890595e-06, 'epoch': 4.32}
{'loss': 1.2433, 'learning_rate': 1.7677543186180424e-06, 'epoch': 4.56}
{'loss': 1.2417, 'learning_rate': 8.080614203454896e-07, 'epoch': 4.8}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.2219176292419434, 'eval_bleu': 19.0503, 'eval_gen_len': 16.6344, 'eval_runtime': 9.5111, 'eval_samples_per_second': 133.423, 'eval_steps_per_second': 2.103, 'epoch': 5.0}
{'train_runtime': 3994.2477, 'train_samples_per_second': 166.887, 'train_steps_per_second': 2.609, 'train_loss': 1.3263574289040008, 'epoch': 5.0}


TrainOutput(global_step=10420, training_loss=1.3263574289040008, metrics={'train_runtime': 3994.2477, 'train_samples_per_second': 166.887, 'train_steps_per_second': 2.609, 'train_loss': 1.3263574289040008, 'epoch': 5.0})

In [15]:
# Evaluate on the held-out "test" split (metric keys keep the "eval_" prefix).
# NOTE(review): the metrics recorded for this call (eval_loss 1.2213001251220703,
# BLEU 19.0839, gen_len 16.647) are identical to the epoch-4 validation metrics
# logged during training, and both splits report 1269 rows - confirm that
# "test" and "validation" really contain different examples.
trainer.evaluate(eval_dataset=en_vi_dataset["test"])

  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.2213001251220703,
 'eval_bleu': 19.0839,
 'eval_gen_len': 16.647,
 'eval_runtime': 9.4567,
 'eval_samples_per_second': 134.191,
 'eval_steps_per_second': 2.115,
 'epoch': 5.0}

In [16]:
# pre-trained model: ngocquanofficial/machine_translation_VinAI

# - eval_only - 0 epoch
#     - loss: 1.7319
#     - BLEU: 16.7368

# - fine_tune - 5 epochs 2e-5
#     - loss: 1.2213
#     - BLEU: 19.0839
    
# - cfg_only - 10 epochs 1e-4
#     - loss: 3.209
#     - BLEU: 6.9639


## 5. Inference

In [17]:
text_input = "I am learning how to translate"

# Every training source sentence was prepended with `prefix`, so inference must
# use the same prompt format; the original encoded the bare sentence.
# The input is truncated to the same limit that was used during preprocessing.
inputs_ids = tokenizer.encode(
    prefix + text_input,
    max_length=max_input_length,
    truncation=True,
    return_tensors="pt",
).to(model.device)  # follow the model's device instead of hard-coding "cuda"

output_ids = model.generate(inputs_ids, max_length=max_target_length, num_return_sequences=1)
tokenizer.decode(output_ids[0], skip_special_tokens=True)

'Tôi đang học cách dịch nó ra tiếng'