# VI-EN Machine Translation using a BERT-to-GPT2 Model
**Dataset: IWSLT15-en-vi**

**khuongvd00@gmail.com**

In [1]:
!nvidia-smi

Wed Mar  1 17:19:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
!pip install -q transformers==4.26.1 datasets==2.9.0 sacrebleu==2.3.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m103.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%cd /content/drive/MyDrive/NLP/machine_translation

In [6]:
import os
import numpy as np

import sacrebleu

import torch
from torch.utils.data import Dataset

from datasets import load_dataset, load_metric
from transformers import *

## 1. Prepare Data

In [7]:
class NMTDataset(Dataset):
    """Pre-tokenized parallel corpus for encoder-decoder NMT training.

    On construction the requested split is loaded and every source and target
    sentence is tokenized and padded to a fixed length, so ``__getitem__`` is
    a plain tensor lookup.

    Args:
        cfg: Configuration object. This class reads ``src_lang``, ``tgt_lang``,
            ``src_tokenizer``, ``tgt_tokenizer``, ``src_max_len``,
            ``tgt_max_len`` and ``add_special_tokens`` from it.
        data_type: Split name forwarded to ``load_dataset`` (e.g. "train").
    """

    def __init__(self, cfg, data_type="train"):
        super().__init__()
        self.cfg = cfg

        self.src_texts, self.tgt_texts = self.read_data(data_type)

        self.src_input_ids, self.src_attention_mask = self.texts_to_sequences(self.src_texts)
        self.tgt_input_ids, self.tgt_attention_mask, self.labels = self.texts_to_sequences(
            self.tgt_texts,
            is_src=False
        )

    def read_data(self, data_type):
        """Load one split of IWSLT'15 en-vi and return (source texts, target texts)."""
        data = load_dataset(
            "mt_eng_vietnamese",
            "iwslt2015-en-vi",
            split=data_type
        )
        src_texts = [sample["translation"][self.cfg.src_lang] for sample in data]
        tgt_texts = [sample["translation"][self.cfg.tgt_lang] for sample in data]
        return src_texts, tgt_texts

    def texts_to_sequences(self, texts, is_src=True):
        """Tokenize ``texts`` into fixed-length tensors.

        Args:
            texts: List of raw sentences.
            is_src: When True use the source tokenizer, otherwise the target
                tokenizer (optionally wrapping each sentence in BOS/EOS).

        Returns:
            ``(input_ids, attention_mask)`` for the source side, or
            ``(input_ids, attention_mask, labels)`` for the target side, where
            ``labels`` is a copy of ``input_ids`` with every padding position
            set to -100.
        """
        if is_src:
            src_inputs = self.cfg.src_tokenizer(
                texts,
                padding='max_length',
                truncation=True,
                max_length=self.cfg.src_max_len,
                return_tensors='pt'
            )
            return (
                src_inputs.input_ids,
                src_inputs.attention_mask
            )

        tgt_tokenizer = self.cfg.tgt_tokenizer
        if self.cfg.add_special_tokens:
            # The target tokenizer does not add BOS/EOS by itself in this
            # mode, so they are spliced into the raw text before tokenizing.
            texts = [
                ' '.join([tgt_tokenizer.bos_token, text, tgt_tokenizer.eos_token])
                for text in texts
            ]
        tgt_inputs = tgt_tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.cfg.tgt_max_len,
            return_tensors='pt'
        )

        input_ids = tgt_inputs.input_ids
        pad_token_id = tgt_tokenizer.pad_token_id
        if pad_token_id is None:
            # Nothing to mask; still return an independent tensor.
            labels = input_ids.clone()
        else:
            # Single vectorized op instead of a Python loop over every token
            # of the corpus. masked_fill returns a new tensor, so input_ids
            # itself is left untouched.
            labels = input_ids.masked_fill(input_ids == pad_token_id, -100)
        labels = labels.long()

        return (
            input_ids,
            tgt_inputs.attention_mask,
            labels
        )

    def __getitem__(self, idx):
        """Return the tensors of example ``idx`` keyed by model argument name."""
        return {
            "input_ids": self.src_input_ids[idx],
            "attention_mask": self.src_attention_mask[idx],
            "decoder_input_ids": self.tgt_input_ids[idx],
            "decoder_attention_mask": self.tgt_attention_mask[idx],
            "labels": self.labels[idx]
        }

    def __len__(self):
        """Number of sentence pairs (rows of the source id tensor)."""
        return len(self.src_input_ids)

## 2. Load Tokenizer and Model

In [8]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded hypotheses and references.

    Each reference is additionally wrapped in a one-element list, giving one
    list of references per hypothesis.

    Returns:
        Tuple ``(stripped_predictions, wrapped_references)``.
    """
    stripped_preds = []
    for hypothesis in preds:
        stripped_preds.append(hypothesis.strip())

    wrapped_refs = []
    for reference in labels:
        wrapped_refs.append([reference.strip()])

    return stripped_preds, wrapped_refs

def load_tokenizer(model_name_or_path):
    """Load a fast tokenizer for a model name or a saved tokenizer directory.

    The tokenizer class is chosen from the '-'-separated tokens of the last
    path component only: 'bert' selects ``BertTokenizerFast``, 'gpt2' selects
    ``GPT2TokenizerFast``, anything else falls back to ``AutoTokenizer``.

    Args:
        model_name_or_path: Model identifier or local directory.

    Returns:
        The tokenizer returned by the selected class's ``from_pretrained``.
    """
    # Look only at the final component: a parent directory such as
    # "bert-base-multilingual-cased_to_gpt2/" must not force the BERT
    # tokenizer class onto whatever is stored inside it.
    base_name = os.path.basename(os.path.normpath(model_name_or_path))
    name_parts = base_name.split('-')

    if 'bert' in name_parts:
        return BertTokenizerFast.from_pretrained(model_name_or_path)
    if 'gpt2' in name_parts:
        return GPT2TokenizerFast.from_pretrained(model_name_or_path)
    return AutoTokenizer.from_pretrained(model_name_or_path)

In [9]:
class Manager():
    """Wires tokenizers, model, metric and datasets together for training.

    Args:
        cfg: Configuration object. Tokenizers and special-token settings are
            written back onto it (``src_tokenizer``, ``tgt_tokenizer``,
            ``add_special_tokens``, ``bos_token_id``, ``eos_token_id``,
            ``pad_token_id``).
        is_train: When True the train and validation datasets are built in
            the constructor.
    """

    def __init__(self, cfg, is_train=True):
        self.cfg = cfg

        print("Loading Tokenizer...")
        self.get_tokenizer()

        print("Loading Model...")
        self.get_model()

        print("Loading Metric...")
        self.bleu_metric = load_metric("sacrebleu")

        print("Check Save Model Path")
        # makedirs also handles nested checkpoint paths and an already
        # existing directory.
        os.makedirs(self.cfg.ckpt_dir, exist_ok=True)

        if is_train:
            # Load dataloaders
            print("Loading Dataset...")
            self.train_dataset = NMTDataset(self.cfg, data_type="train")
            self.valid_dataset = NMTDataset(self.cfg, data_type="validation")

        print("Setting finished.")

    def _tokenizer_dir(self, lang, model_name):
        """Directory under ``ckpt_dir`` where the tokenizer for ``lang`` is stored."""
        return os.path.join(self.cfg.ckpt_dir, f"{lang}_tokenizer_{model_name}")

    def get_tokenizer(self):
        """Load both tokenizers, configure special tokens and save them.

        With ``cfg.load_model_from_path`` the tokenizers are read back from
        the directories this method saves them to; otherwise they are loaded
        by model name. The special-token settings are applied in both cases.
        """
        src_dir = self._tokenizer_dir(self.cfg.src_lang, self.cfg.src_model_name)
        tgt_dir = self._tokenizer_dir(self.cfg.tgt_lang, self.cfg.tgt_model_name)

        if self.cfg.load_model_from_path:
            # Saved tokenizer folders record their own class, so AutoTokenizer
            # restores the right one regardless of the folder name.
            self.cfg.src_tokenizer = AutoTokenizer.from_pretrained(src_dir)
            self.cfg.tgt_tokenizer = AutoTokenizer.from_pretrained(tgt_dir)
        else:
            self.cfg.src_tokenizer = load_tokenizer(self.cfg.src_model_name)
            self.cfg.tgt_tokenizer = load_tokenizer(self.cfg.tgt_model_name)

        if "bert" in self.cfg.tgt_model_name.split('-'):
            # BERT-style target: reuse its CLS/SEP tokens as BOS/EOS.
            self.cfg.add_special_tokens = False
            self.cfg.bos_token_id = self.cfg.tgt_tokenizer.cls_token_id
            self.cfg.eos_token_id = self.cfg.tgt_tokenizer.sep_token_id
            self.cfg.pad_token_id = self.cfg.tgt_tokenizer.pad_token_id
        else:
            # Register explicit BOS/EOS/PAD tokens on the target tokenizer.
            # NOTE(review): for a reloaded tokenizer these tokens should
            # already be present, making this call a no-op — confirm.
            self.cfg.add_special_tokens = True
            self.cfg.tgt_tokenizer.add_special_tokens(
                {
                    "bos_token": "[BOS]",
                    "eos_token": "[EOS]",
                    "pad_token": "[PAD]"
                }
            )
            self.cfg.bos_token_id = self.cfg.tgt_tokenizer.bos_token_id
            self.cfg.eos_token_id = self.cfg.tgt_tokenizer.eos_token_id
            self.cfg.pad_token_id = self.cfg.tgt_tokenizer.pad_token_id

        self.cfg.src_tokenizer.save_pretrained(src_dir)
        self.cfg.tgt_tokenizer.save_pretrained(tgt_dir)

    def get_model(self):
        """Create the encoder-decoder model or restore it from a checkpoint.

        A fresh model is assembled from the two pretrained model names, its
        decoder embeddings are resized to the target tokenizer, and the
        generation defaults are written to ``model.config``.
        """
        if self.cfg.load_model_from_path:
            save_model_path = os.path.join(self.cfg.ckpt_dir, self.cfg.ckpt_name)
            self.model = EncoderDecoderModel.from_pretrained(save_model_path)
        else:
            self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                self.cfg.src_model_name,
                self.cfg.tgt_model_name
            )
            # The target tokenizer may have gained special tokens.
            self.model.decoder.resize_token_embeddings(len(self.cfg.tgt_tokenizer))
            self.model.config.decoder_start_token_id = self.cfg.bos_token_id
            self.model.config.eos_token_id = self.cfg.eos_token_id
            self.model.config.pad_token_id = self.cfg.pad_token_id
            self.model.config.vocab_size = len(self.cfg.tgt_tokenizer)
            self.model.config.max_length = self.cfg.max_length_decoder
            self.model.config.min_length = self.cfg.min_length_decoder
            self.model.config.no_repeat_ngram_size = 3
            self.model.config.early_stopping = True
            self.model.config.length_penalty = 2.0
            self.model.config.num_beams = self.cfg.beam_size

    def train(self):
        """Fine-tune the model with ``Seq2SeqTrainer``.

        Evaluation and checkpointing run every ``cfg.eval_steps`` steps when
        ``cfg.use_eval_steps`` is set, otherwise once per epoch.
        """
        print("Training...")

        # Arguments shared by both evaluation schedules.
        common_args = dict(
            predict_with_generate=True,
            output_dir=self.cfg.ckpt_dir,
            per_device_train_batch_size=self.cfg.train_batch_size,
            per_device_eval_batch_size=self.cfg.eval_batch_size,
            learning_rate=self.cfg.learning_rate,
            weight_decay=0.005,
            num_train_epochs=self.cfg.num_train_epochs
        )
        if self.cfg.use_eval_steps:
            schedule_args = dict(
                evaluation_strategy="steps",
                save_strategy='steps',
                save_steps=self.cfg.eval_steps,
                eval_steps=self.cfg.eval_steps
            )
        else:
            schedule_args = dict(
                evaluation_strategy="epoch",
                save_strategy='epoch'
            )
        training_args = Seq2SeqTrainingArguments(**common_args, **schedule_args)

        data_collator = DataCollatorForSeq2Seq(
            self.cfg.tgt_tokenizer,
            model=self.model
        )

        trainer = Seq2SeqTrainer(
            self.model,
            training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.valid_dataset,
            data_collator=data_collator,
            tokenizer=self.cfg.tgt_tokenizer,
            compute_metrics=self.compute_metrics
        )

        trainer.train()

    def compute_metrics(self, eval_preds):
        """Compute BLEU and mean generated length for one evaluation pass.

        Args:
            eval_preds: Pair ``(predictions, labels)`` of token-id arrays;
                ``predictions`` may itself be a tuple whose first item holds
                the token ids.

        Returns:
            Dict with ``bleu_score`` and ``gen_len``, rounded to 4 decimals.
        """
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]

        pad_token_id = self.cfg.tgt_tokenizer.pad_token_id
        # -100 is not a valid token id and cannot be decoded; map it back to
        # the pad token in the labels and, defensively, in the predictions.
        preds = np.where(preds != -100, preds, pad_token_id)
        labels = np.where(labels != -100, labels, pad_token_id)

        decoded_preds = self.cfg.tgt_tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.cfg.tgt_tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        result = self.bleu_metric.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )

        result = {"bleu_score": result["score"]}

        # Generated length = number of non-padding tokens per prediction.
        prediction_lens = [np.count_nonzero(pred != pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}

        return result

## 3. Config

In [10]:
class BaseConfig:
    """Minimal configuration base: keyword arguments become attributes."""

    def __init__(self, **kwargs):
        # Each keyword is set on the instance, shadowing any class-level
        # default of the same name.
        for option_name in kwargs:
            setattr(self, option_name, kwargs[option_name])

class NMTConfig(BaseConfig):
    """Default settings for the vi -> en BERT-to-GPT2 translation experiment.

    Every class attribute is a default that can be overridden per instance
    through keyword arguments, e.g. ``NMTConfig(beam_size=1)``.
    """

    # Data
    src_lang = 'vi'
    tgt_lang = 'en'
    src_max_len = 75   # source tokenizer max_length
    tgt_max_len = 75   # target tokenizer max_length

    # Model
    src_model_name = "bert-base-multilingual-cased"
    tgt_model_name = "gpt2"

    # Training
    load_model_from_path = False
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    learning_rate = 3e-5
    train_batch_size = 32
    eval_batch_size = 32
    num_train_epochs = 25
    ckpt_dir = src_model_name + '_to_' + tgt_model_name
    use_eval_steps = False   # True: evaluate/save every `eval_steps` steps
    eval_steps = 2000

    # Inference
    max_length_decoder = 75
    min_length_decoder = 25
    beam_size = 5

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The class-level ckpt_dir is derived from the default model names.
        # If a model name is overridden without an explicit ckpt_dir,
        # recompute it so checkpoints are not written to the default folder.
        names_overridden = "src_model_name" in kwargs or "tgt_model_name" in kwargs
        if names_overridden and "ckpt_dir" not in kwargs:
            self.ckpt_dir = self.src_model_name + '_to_' + self.tgt_model_name

cfg = NMTConfig()

## 4. Training

In [None]:
manager = Manager(cfg, is_train=True)

In [None]:
manager.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Training...


***** Running training *****
  Num examples = 133318
  Num Epochs = 25
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 104175
  Number of trainable parameters = 330662400
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu Score,Gen Len
1,1.9818,1.788188,12.708,38.2206
2,1.6482,1.503495,18.6612,35.8101
3,1.4398,1.411196,21.6888,34.8597
4,1.3073,1.370574,22.6092,34.9338
5,1.1986,1.343941,23.176,35.2356
6,1.1116,1.347576,23.7805,34.8582
7,1.0383,1.350916,24.0384,34.6942
8,0.968,1.373242,23.6123,35.1001
9,0.9102,1.401362,23.9068,34.4113
10,0.8471,1.422908,23.9192,34.7029


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  "num_beams": 5,
  "pad_token_id": 50259,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 50257,
  "early_stopping": true,
  "eos_token_id": 50258,
  "length_penalty": 2.0,
  "max_length": 75,
  "min_length": 25,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 50259,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 50257,
  "early_stopping": true,
  "eos_token_id": 50258,
  "length_penalty": 2.0,
  "max_length": 75,
  "min_length": 25,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 50259,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 50257,
  "early_stopping": true,
  "eos_token_id": 50258,
  "length_penalty": 2.0,
  "max_length": 75,
  "min_length": 25,
  "no_repeat_ngram_size": 3,
  "num_beams": 5,
  "pad_token_id": 50259,
  "t

Epoch,Training Loss,Validation Loss,Bleu Score,Gen Len
1,1.9818,1.788188,12.708,38.2206
2,1.6482,1.503495,18.6612,35.8101
3,1.4398,1.411196,21.6888,34.8597
4,1.3073,1.370574,22.6092,34.9338
5,1.1986,1.343941,23.176,35.2356
6,1.1116,1.347576,23.7805,34.8582
7,1.0383,1.350916,24.0384,34.6942
8,0.968,1.373242,23.6123,35.1001
9,0.9102,1.401362,23.9068,34.4113
10,0.8471,1.422908,23.9192,34.7029


KeyboardInterrupt: ignored

## 5. Evaluate

In [11]:
def load_model(cfg, checkpoint_name):
    """Restore the saved tokenizers and one model checkpoint for inference.

    Args:
        cfg: Configuration providing ``ckpt_dir``, ``src_lang``, ``tgt_lang``,
            ``src_model_name`` and ``tgt_model_name``.
        checkpoint_name: Sub-folder of ``cfg.ckpt_dir`` holding the model.

    Returns:
        Tuple ``(src_tokenizer, tgt_tokenizer, model, device)`` where
        ``device`` is CUDA when available and CPU otherwise.
    """
    root = cfg.ckpt_dir

    # Tokenizers are stored next to the checkpoints, one folder per language.
    src_tokenizer = BertTokenizerFast.from_pretrained(
        f"{root}/{cfg.src_lang}_tokenizer_{cfg.src_model_name}"
    )
    tgt_tokenizer = GPT2TokenizerFast.from_pretrained(
        f"{root}/{cfg.tgt_lang}_tokenizer_{cfg.tgt_model_name}"
    )

    model = EncoderDecoderModel.from_pretrained(f"{root}/{checkpoint_name}")

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    return src_tokenizer, tgt_tokenizer, model, device

In [12]:
from tqdm import tqdm
def inference(
    text, 
    src_tokenizer, 
    tgt_tokenizer, 
    model, 
    device="cpu", 
    max_length=75,
    beam_size=5
    ):
    """Translate ``text`` with beam search and return the decoded strings.

    Args:
        text: Input passed directly to ``src_tokenizer``.
        src_tokenizer: Tokenizer used to encode the source text.
        tgt_tokenizer: Tokenizer used to decode the generated ids.
        model: Model exposing ``to`` and ``generate``.
        device: Device the inputs and the model are moved to.
        max_length: Padding/truncation length for the source and maximum
            generation length.
        beam_size: Number of beams.

    Returns:
        The list returned by ``tgt_tokenizer.batch_decode`` with special
        tokens skipped.
    """
    tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    encoded = src_tokenizer(text, **tokenizer_kwargs)

    source_ids = encoded.input_ids.to(device)
    source_mask = encoded.attention_mask.to(device)
    model.to(device)

    generation_kwargs = {
        "attention_mask": source_mask,
        "max_length": max_length,
        "early_stopping": True,
        "num_beams": beam_size,
        "length_penalty": 2.0,
    }
    generated = model.generate(source_ids, **generation_kwargs)

    return tgt_tokenizer.batch_decode(generated, skip_special_tokens=True)

def inference_bath(
    texts, 
    src_tokenizer, 
    tgt_tokenizer, 
    model, 
    device="cpu", 
    max_length=75,
    beam_size=5,
    batch_size=32
    ):
    """Translate a list of sentences in mini-batches with beam search.

    Args:
        texts: List of source sentences.
        src_tokenizer: Tokenizer used to encode each batch.
        tgt_tokenizer: Tokenizer used to decode the generated ids.
        model: Model exposing ``to`` and ``generate``.
        device: Device the inputs and the model are moved to.
        max_length: Padding/truncation length for the source and maximum
            generation length.
        beam_size: Number of beams.
        batch_size: Number of sentences per ``generate`` call.

    Returns:
        List of decoded translations in the same order as ``texts``; an empty
        list when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    pred_texts = []

    # Empty input: nothing to translate. Clamping batch_size to len(texts)
    # here would give range() a zero step and raise ValueError.
    if len(texts) == 0:
        return pred_texts

    # Moving the model does not depend on the batch; do it once.
    model.to(device)

    for start in tqdm(range(0, len(texts), batch_size)):
        # Slicing already yields a shorter final batch.
        batch_texts = texts[start:start + batch_size]

        inputs = src_tokenizer(
            batch_texts, 
            padding="max_length", 
            truncation=True, 
            max_length=max_length, 
            return_tensors="pt"
            )

        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        outputs = model.generate(
            input_ids, 
            attention_mask=attention_mask, 
            max_length=max_length, 
            early_stopping=True, 
            num_beams=beam_size, 
            length_penalty=2.0
        )

        output_str = tgt_tokenizer.batch_decode(outputs, skip_special_tokens=True)
        pred_texts.extend(output_str)
        torch.cuda.empty_cache()

    return pred_texts

In [None]:
# Load the IWSLT'15 en-vi test split and separate it into Vietnamese sources
# and English references.
data = load_dataset(
    "mt_eng_vietnamese",
    "iwslt2015-en-vi",
    split="test",
)
src_texts = [row["translation"]["vi"] for row in data]
tgt_texts = [row["translation"]["en"] for row in data]

### Checkpoint-50004 (Best Valid BLEU Score)

In [None]:
src_tokenizer, tgt_tokenizer, model, device = load_model(cfg, checkpoint_name="checkpoint-50004")

**Beam_size=1**

In [None]:
pred_texts = inference_bath(src_texts, src_tokenizer, tgt_tokenizer, model, device, beam_size=1)

In [16]:
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 23.42 48.7/28.3/18.2/12.0 (BP = 1.000 ratio = 1.243 hyp_len = 35182 ref_len = 28297)

**Beam_size=5**

In [None]:
pred_texts = inference_bath(src_texts, src_tokenizer, tgt_tokenizer, model, device)

In [18]:
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 23.46 48.6/28.3/18.3/12.1 (BP = 1.000 ratio = 1.271 hyp_len = 35974 ref_len = 28297)

### Checkpoint-20835 (Best Valid Loss)

In [None]:
src_tokenizer, tgt_tokenizer, model, device = load_model(cfg, checkpoint_name="checkpoint-20835")

**Beam_size=1**

In [None]:
pred_texts = inference_bath(src_texts, src_tokenizer, tgt_tokenizer, model, device, beam_size=1)

In [21]:
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 23.56 49.1/28.5/18.4/12.0 (BP = 1.000 ratio = 1.217 hyp_len = 34451 ref_len = 28297)

**Beam_size=5**

In [None]:
pred_texts = inference_bath(src_texts, src_tokenizer, tgt_tokenizer, model, device)

In [24]:
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 23.19 48.1/28.1/18.1/11.8 (BP = 1.000 ratio = 1.277 hyp_len = 36131 ref_len = 28297)