#VI-EN Machine Translation using mBART50 Model
**Dataset: IWSLT15-en-vi**

**khuongvd00@gmail.com**

In [67]:
!nvidia-smi

Wed Mar  1 20:24:03 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    50W / 400W |  15875MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
%cd /content/drive/MyDrive/NLP/machine_translation

In [3]:
!pip install -q transformers==4.26.1 sentencepiece==0.1.97 datasets==2.9.0 sacrebleu==2.3.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [68]:
import os
import numpy as np

import sacrebleu

import sentencepiece

import torch
from torch.utils.data import Dataset

from datasets import load_dataset, load_metric
from transformers import *

##1.Prepare Data

In [5]:
class NMTDataset(Dataset):
    def __init__(self, cfg, data_type="train"):
        super().__init__()
        self.cfg = cfg

        self.src_texts, self.tgt_texts = self.read_data(data_type)

        self.src_input_ids = self.texts_to_sequences(self.src_texts)
        self.labels = self.texts_to_sequences(self.tgt_texts)

    def read_data(self, data_type):
        data = load_dataset(
            "mt_eng_vietnamese", 
            "iwslt2015-en-vi", 
            split=data_type
        )
        src_texts = [sample["translation"][self.cfg.src_lang] for sample in data]
        tgt_texts = [sample["translation"][self.cfg.tgt_lang] for sample in data]
        return src_texts, tgt_texts
    
    def texts_to_sequences(self, texts):
        data_inputs = self.cfg.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.cfg.max_len,
            return_tensors='pt'
        )
        return data_inputs.input_ids

    def __getitem__(self, idx):
        return {
            "input_ids": self.src_input_ids[idx],
            "labels": self.labels[idx]
        }

    def __len__(self):
        return np.shape(self.src_input_ids)[0]

##2.Config

In [8]:
class BaseConfig:
    """ base Encoder Decoder config """

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

class NMTConfig(BaseConfig):
    # Data
    src_lang = 'vi'
    tgt_lang = 'en'
    max_len = 100

    # Model
    model_name = "facebook/mbart-large-50-many-to-many-mmt"

    # Training
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    learning_rate = 5e-5
    train_batch_size = 32
    eval_batch_size = 32
    num_train_epochs = 3
    ckpt_dir = model_name.split('/')[1]
    eval_steps = 500

    # Inference
    beam_size = 5

cfg = NMTConfig()

##3.Load Tokenizer and Model

In [None]:
cfg.tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name, src_lang="vi_VN",tgt_lang = "en_XX")
model = MBartForConditionalGeneration.from_pretrained(cfg.model_name)

train_dataset = NMTDataset(cfg, data_type="train")
valid_dataset = NMTDataset(cfg, data_type="validation")
test_dataset = NMTDataset(cfg, data_type="test")

##4.Load Metric

In [11]:
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = cfg.tokenizer.batch_decode(preds, skip_special_tokens=True)

    decoded_labels = cfg.tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != cfg.tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    
    return result

  metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

##5.Training

In [12]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    save_strategy='steps',
    save_steps=cfg.eval_steps,
    eval_steps=cfg.eval_steps,
    output_dir=cfg.ckpt_dir,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    learning_rate=cfg.learning_rate,
    weight_decay=0.005,
    num_train_epochs=cfg.num_train_epochs,
)

data_collator = DataCollatorForSeq2Seq(
    cfg.tokenizer, 
    model=model
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=cfg.tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [13]:
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Gen Len
500,0.5946,0.305772,35.3575,31.0489
1000,0.2415,0.300835,35.6363,31.1411
1500,0.2445,0.30054,34.8787,31.8463
2000,0.2431,0.301833,35.524,31.1292
2500,0.2405,0.300886,35.9667,30.7833
3000,0.2398,0.297898,35.9793,31.2065
3500,0.2382,0.293183,36.1534,31.2695
4000,0.2366,0.296028,35.3871,31.647
4500,0.1883,0.318681,35.3111,30.8432
5000,0.1675,0.315368,36.5669,30.6375


***** Running Evaluation *****
  Num examples = 1269
  Batch size = 32
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_t

KeyboardInterrupt: ignored

##6.Evaluate

In [18]:
def load_model(cfg, checkpoint_name):
    # Load Tokenizer
    tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name, src_lang="vi_VN",tgt_lang = "en_XX")

    # Load Model
    model_save_path = f"{cfg.ckpt_dir}/{checkpoint_name}"
    model = MBartForConditionalGeneration.from_pretrained(model_save_path)

    # Inference Param
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    return tokenizer, model, device

In [21]:
from tqdm import tqdm
def inference(
    text, 
    src_tokenizer, 
    tgt_tokenizer, 
    model, 
    device="cpu", 
    max_length=75,
    beam_size=5
    ):
    inputs = src_tokenizer(
        text, 
        padding="max_length", 
        truncation=True, 
        max_length=max_length, 
        return_tensors="pt"
        )
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    model.to(device)

    outputs = model.generate(
        input_ids, 
        attention_mask=attention_mask, 
        max_length=max_length, 
        early_stopping=True, 
        num_beams=beam_size, 
        length_penalty=2.0
    )

    output_str = tgt_tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return output_str

def inference_bath(
    texts, 
    tokenizer, 
    model, 
    device="cpu", 
    max_length=50,
    beam_size=5,
    batch_size=32
    ):

    pred_texts = []

    if len(texts) < batch_size:
        batch_size = len(texts)

    for x in tqdm(range(0, len(texts), batch_size)):
        text = texts[x:x+batch_size]

        inputs = tokenizer(
            text, 
            padding="max_length", 
            truncation=True, 
            max_length=max_length, 
            return_tensors="pt"
            )
        
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)
        model.to(device)

        outputs = model.generate(
            input_ids, 
            attention_mask=attention_mask, 
            max_length=max_length, 
            early_stopping=True, 
            num_beams=beam_size, 
            length_penalty=2.0
        )

        output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        pred_texts.extend(output_str)
        torch.cuda.empty_cache()

    return pred_texts

In [16]:
data = load_dataset("mt_eng_vietnamese", "iwslt2015-en-vi", split="test")
src_texts = [sample["translation"]["vi"] for sample in data]
tgt_texts = [sample["translation"]["en"] for sample in data]



###Checkpoint-5000 (Best Valid Bleu_Score)

In [None]:
tokenizer, model, device = load_model(cfg, checkpoint_name="checkpoint-5000")

**Beam size = 1**

In [None]:
pred_texts = inference_bath(src_texts, tokenizer, model, device, beam_size=1)

In [24]:
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 31.50 63.8/40.1/28.1/20.3 (BP = 0.906 ratio = 0.910 hyp_len = 25756 ref_len = 28297)

**Beam size = 5**

In [None]:
pred_texts = inference_bath(src_texts, tokenizer, model, device)

In [26]:
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 33.46 63.4/41.0/29.0/21.1 (BP = 0.942 ratio = 0.943 hyp_len = 26697 ref_len = 28297)

###Checkpoint-3500 (Best Valid Loss)

In [None]:
tokenizer, model, device = load_model(cfg, checkpoint_name="checkpoint-3500")

**Beam size = 1**

In [None]:
pred_texts = inference_bath(src_texts, tokenizer, model, device, beam_size=1)

In [29]:
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 32.69 64.3/41.3/29.0/20.9 (BP = 0.917 ratio = 0.920 hyp_len = 26041 ref_len = 28297)

**Beam size = 5**

In [None]:
pred_texts = inference_bath(src_texts, tokenizer, model, device)

In [40]:
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 34.07 62.8/40.9/29.1/21.2 (BP = 0.960 ratio = 0.961 hyp_len = 27185 ref_len = 28297)