# En-Vi Translator in a Low-Resource Setting

## Installation

In [None]:
# !pip install -q torch torchvision torchaudio
# !pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

## Libraries

In [None]:
import os
from tqdm.autonotebook import tqdm
import numpy as np

import torch
from torch.utils.data import Dataset

from datasets import load_dataset
import evaluate
from transformers import PreTrainedTokenizer, PreTrainedModel
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

## Build Dataset

In [None]:
class NMTDataset(Dataset):
    """Map-style dataset of tokenized En-Vi sentence pairs.

    The sentence pairs come from the ``iwslt2015-en-vi`` configuration of the
    ``mt_eng_vietnamese`` dataset (loaded with ``datasets.load_dataset``).
    Source and target sentences are tokenized once, up front, into fixed-size
    ``(num_samples, cfg.max_length)`` tensors of token ids.

    Args:
        cfg: Configuration object. This class reads ``cfg.tokenizer``,
            ``cfg.max_length``, ``cfg.src_lang``, ``cfg.tgt_lang`` and
            ``cfg.cache_dir``.
        split: Dataset split forwarded to ``load_dataset``.
        prefix: String prepended to every source sentence before tokenizing.
    """

    def __init__(self, cfg, split='train', prefix=''):
        self.cfg = cfg

        src_texts, tgt_texts = self.read_data(split, prefix)

        # Both tensors keep the raw, pad-token-padded ids; masking for the
        # loss/attention is applied lazily in __getitem__.
        self.src_input_ids = self.text_to_sequence(src_texts)
        self.labels = self.text_to_sequence(tgt_texts)

    def read_data(self, split, prefix):
        """Load one split and return ``(source_texts, target_texts)`` lists."""
        dataset = load_dataset('mt_eng_vietnamese',
                               'iwslt2015-en-vi',
                               split=split,
                               cache_dir=self.cfg.cache_dir)

        src_lang, tgt_lang = self.cfg.src_lang, self.cfg.tgt_lang
        src_texts, tgt_texts = [], []
        # Single pass over the split instead of iterating it once per language.
        for sample in dataset:
            pair = sample['translation']
            src_texts.append(prefix + pair[src_lang])
            tgt_texts.append(pair[tgt_lang])

        return src_texts, tgt_texts

    def text_to_sequence(self, text):
        """Tokenize ``text`` and return the ``input_ids`` tensor.

        Every sequence is padded/truncated to exactly ``cfg.max_length`` tokens.
        """
        inputs = self.cfg.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.cfg.max_length,
            return_tensors='pt'
        )

        return inputs.input_ids

    def __getitem__(self, index):
        """Return one training example as a dict of tensors.

        Besides ``input_ids`` and ``labels`` the dict carries an
        ``attention_mask`` (0 on padding), and padded label positions are set
        to -100 so that they are ignored by the loss. The stored tensors are
        left untouched.
        """
        input_ids = self.src_input_ids[index]
        labels = self.labels[index]

        pad_id = self.cfg.tokenizer.pad_token_id
        if pad_id is None:
            # No pad token configured: nothing can be masked.
            return {'input_ids': input_ids, 'labels': labels}

        return {
            'input_ids': input_ids,
            # Without an explicit mask the padded positions would be treated
            # as real tokens by the encoder.
            'attention_mask': (input_ids != pad_id).long(),
            # -100 is the label id ignored by the Hugging Face seq2seq loss.
            'labels': labels.masked_fill(labels == pad_id, -100),
        }

    def __len__(self):
        """Number of sentence pairs in the dataset."""
        return len(self.src_input_ids)

## Configuration

In [None]:
class BaseConfig:
    """Minimal config container: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)


class NMTConfig(BaseConfig):
    """Settings for the En-Vi translation experiments.

    Values are class attributes, so they can be read from the class itself
    (``NMTConfig.device``) or overridden per instance via keyword arguments
    (``NMTConfig(max_length=128)``).
    """

    # Data
    src_lang = 'en'                 # key of the source sentence in each 'translation' record
    tgt_lang = 'vi'                 # key of the target sentence in each 'translation' record
    max_length = 75                 # token length used for padding/truncation and generation
    add_special_token = True        # not referenced in the visible code
    augmented_data_size = 0.0001    # fraction of the extra corpus that is back-translated

    # Model
    model_name = "VietAI/envit5-translation"
    cache_dir = './.cache/'

    # Training
    # Prefer CUDA, then Apple MPS (guarded: older torch builds have no
    # `torch.backends.mps`), then CPU.
    if torch.cuda.is_available():
        device = 'cuda'
    elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
        device = 'mps'
    else:
        device = 'cpu'
    # Always defined (previously only set on MPS machines), so that
    # `cfg.use_mps_device` can be read on any device.
    use_mps_device = device == 'mps'

    learning_rate = 1e-5
    train_batch_size = 16
    eval_batch_size = 16
    num_train_epochs = 2
    save_total_limit = 1
    ckpt_dir = './checkpoints'
    eval_steps = 1000               # not referenced in the visible code

    # Inference
    beam_size = 5

In [None]:
cfg = NMTConfig()
model_dir = './models/finetuned-EnViT5'

try:
    # Load model locally if exist
    print('Try load model locally!')
    cfg.tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, local_files_only=True)

except Exception as err:
    # `Exception` (not a bare except) so Ctrl-C / SystemExit still abort the
    # cell, and the actual cause of the failure is shown instead of hidden.
    print(f'Local load error: {err}')
    print('Loading local model failed!\nDownloading from Huggingface!')
    cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, cache_dir=cfg.cache_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_name, cache_dir=cfg.cache_dir)

    # The downloaded base model still has to be fine-tuned.
    finetuned = False

else:
    print('Loading local model successfully!')
    # A local checkpoint exists, so the first fine-tuning stage is skipped.
    finetuned = True

## Setup evaluation metrics

In [None]:
# SacreBLEU scorer used by compute_metrics; its files are cached under cfg.cache_dir.
metric = evaluate.load('sacrebleu', cache_dir=cfg.cache_dir)

In [None]:
def postprocess_text(preds, labels):
    """Normalize decoded texts into the layout passed to the BLEU metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(predictions, references)`` tuple: predictions is a list of
        whitespace-stripped strings, references is a list of one-element lists
        (one stripped reference per prediction).
    """
    cleaned_preds = []
    for hypothesis in preds:
        cleaned_preds.append(hypothesis.strip())

    wrapped_labels = []
    for reference in labels:
        # Each prediction gets a list of references, here always exactly one.
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation run.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If the
            predictions are a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (SacreBLEU score) and ``"gen_len"`` (mean number
        of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = cfg.tokenizer.pad_token_id

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Labels carry it for ignored positions; predictions may carry it
    # as well when batches of different lengths are concatenated.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = cfg.tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = cfg.tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length of a prediction = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Training

### Fine-tune on the En-Vi dataset

In [None]:
# Training arguments shared by every Seq2SeqTrainer built in this notebook.
training_args = Seq2SeqTrainingArguments(
    # Evaluate with model.generate() so compute_metrics receives token ids.
    predict_with_generate=True,
    # Save and evaluate on the same schedule (once per epoch), which
    # load_best_model_at_end relies on.
    save_strategy='epoch',
    evaluation_strategy='epoch',
    output_dir=cfg.ckpt_dir,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    # Fall back to False when the config does not define `use_mps_device`,
    # instead of failing with AttributeError on non-MPS machines.
    use_mps_device=getattr(cfg, 'use_mps_device', False),
    save_total_limit=cfg.save_total_limit,
    learning_rate=cfg.learning_rate,
    num_train_epochs=cfg.num_train_epochs,
    load_best_model_at_end=True
)

# Batches the dataset items; it is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=cfg.tokenizer,
    model=model
)

In [None]:
# First training stage: only runs when no fine-tuned checkpoint was loaded.
if not finetuned:
    # Tokenize the parallel corpus (train split first, then validation).
    train_dataset, valid_dataset = (
        NMTDataset(cfg, split_name) for split_name in ('train', 'validation')
    )

    trainer = Seq2SeqTrainer(
        # what to train and how
        model=model,
        args=training_args,
        # data
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
        tokenizer=cfg.tokenizer,
        # evaluation
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Persist the fine-tuned model so later runs can load it from disk.
    model_dir = './models/finetuned-EnViT5'
    trainer.save_model(output_dir=model_dir)

### Create synthetic data by back-translating

In [None]:
class Augmented_NMTDataset(NMTDataset):
    """``NMTDataset`` extended with extra (synthetic) sentence pairs.

    The base split is loaded and tokenized by ``NMTDataset``; the extra pairs
    are tokenized with the same ``text_to_sequence`` and appended after it.
    Data loading, tokenization, ``__getitem__`` and ``__len__`` are inherited
    unchanged from the base class.

    Args:
        cfg: Configuration object, forwarded to ``NMTDataset``.
        augmented_src: Extra source sentences (``None`` or empty for none).
        augmented_tgt: Extra target sentences, aligned with ``augmented_src``.
        split: Base dataset split to load.
        prefix: Prefix for the base source sentences.
            NOTE(review): the prefix is not applied to ``augmented_src``;
            confirm that this is intended.

    Raises:
        ValueError: If only one of the two augmentation inputs is given, or
            if they do not tokenize to the same number of rows.
    """

    def __init__(self, cfg, augmented_src=None, augmented_tgt=None, split='train', prefix=''):
        super().__init__(cfg, split, prefix)

        has_src, has_tgt = bool(augmented_src), bool(augmented_tgt)
        if not has_src and not has_tgt:
            # Nothing to add: behave exactly like the base dataset instead of
            # sending an empty batch to the tokenizer.
            return
        if has_src != has_tgt:
            raise ValueError('augmented_src and augmented_tgt must both be provided')

        augmented_src_sequence = self.text_to_sequence(augmented_src)
        augmented_tgt_sequence = self.text_to_sequence(augmented_tgt)

        if augmented_src_sequence.shape[0] != augmented_tgt_sequence.shape[0]:
            # Misaligned pairs would otherwise only surface later as an
            # indexing error while iterating the dataset.
            raise ValueError(
                'augmented_src and augmented_tgt must contain the same number of sentences '
                f'({augmented_src_sequence.shape[0]} != {augmented_tgt_sequence.shape[0]})'
            )

        # Append the synthetic pairs after the original ones (row-wise).
        self.src_input_ids = torch.cat((self.src_input_ids, augmented_src_sequence), dim=0)
        self.labels = torch.cat((self.labels, augmented_tgt_sequence), dim=0)

In [None]:
def inference(text,
              tokenizer: PreTrainedTokenizer,
              model: PreTrainedModel,
              device=NMTConfig.device,
              max_length=NMTConfig.max_length,
              beam_size=NMTConfig.beam_size):
    """Translate ``text`` with beam search and return the decoded strings.

    Args:
        text: A sentence or a list of sentences to translate.
        tokenizer: Tokenizer used for both encoding and decoding.
        model: Seq2seq model; it is moved to ``device`` as a side effect.
        device: Target device (name or ``torch.device``).
        max_length: Token length used for input padding/truncation and as the
            generation length limit.
        beam_size: Number of beams.

    Returns:
        List of decoded output strings (special tokens removed); an empty list
        for an empty input batch.
    """
    # An empty batch has nothing to translate; skip the tokenizer and model.
    if isinstance(text, (list, tuple)) and len(text) == 0:
        return []

    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    device = torch.device(device)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    model.to(device)

    # Generate in eval mode so dropout cannot perturb the output, then put the
    # model back in whatever mode the caller left it in.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(input_ids,
                                 attention_mask=attention_mask,
                                 max_length=max_length,
                                 early_stopping=True,
                                 num_beams=beam_size,
                                 length_penalty=2.0)
    finally:
        model.train(was_training)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
def load_data(file_path):
    """Read a UTF-8 text file into a list of lines.

    Args:
        file_path: Path of the file to read.

    Returns:
        One string per line of the file, with leading and trailing whitespace
        (including the newline) removed. Blank lines are kept as ``''``.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

In [None]:
# Vietnamese target-side sentences; only the first `augmented_data_size`
# fraction of each file is kept.
train_augmented_tgt_texts = load_data('./data/PhoMT/tokenization/train/train.vi')
train_augmented_tgt_texts = train_augmented_tgt_texts[:int(cfg.augmented_data_size * len(train_augmented_tgt_texts))]

eval_augmented_tgt_texts = load_data('./data/PhoMT/tokenization/dev/dev.vi')
eval_augmented_tgt_texts = eval_augmented_tgt_texts[:int(cfg.augmented_data_size * len(eval_augmented_tgt_texts))]

# Filled in place below with the back-translated (synthetic source) sentences.
train_augmented_src_texts = []
eval_augmented_src_texts = []

batch_size = 32
lang_tag = 'en:'
# NOTE(review): the Vietnamese text is passed to `model` without any language
# prefix; confirm the model translates vi -> en for such input.
for s, tgt, src in zip(('train', 'validation'),
                        (train_augmented_tgt_texts, eval_augmented_tgt_texts),
                        (train_augmented_src_texts, eval_augmented_src_texts)):
    print(f'Back translation {s} set')
    # Step through the data in strides of `batch_size`; unlike
    # `len(tgt) // batch_size + 1` this never yields an empty final batch.
    for start in tqdm(range(0, len(tgt), batch_size)):
        batch_data = tgt[start:start + batch_size]
        output_strs = inference(batch_data, cfg.tokenizer, model)
        for output in output_strs:
            output = output.strip()
            # Remove the leading language tag only. `str.strip('en: ')` strips
            # any of the characters 'e', 'n', ':' and ' ' from BOTH ends and
            # therefore eats real letters (e.g. a sentence ending in "n").
            if output.startswith(lang_tag):
                output = output[len(lang_tag):].lstrip()
            src.append(output)

### Retrain on the combined data (original + synthetic)

In [None]:
# Build the train and validation datasets (in that order): each one is the
# original split plus the matching back-translated source / target sentences.
train_augmented_dataset, eval_augmented_dataset = (
    Augmented_NMTDataset(cfg,
                         augmented_src=synthetic_src,
                         augmented_tgt=extra_tgt,
                         split=split_name)
    for split_name, synthetic_src, extra_tgt in (
        ('train', train_augmented_src_texts, train_augmented_tgt_texts),
        ('validation', eval_augmented_src_texts, eval_augmented_tgt_texts),
    )
)

In [None]:
# Trainer for the second stage: same model, arguments, collator and metrics,
# but fed with the augmented train/validation datasets.
trainer = Seq2SeqTrainer(
    # what to train and how
    model=model,
    args=training_args,
    # data
    train_dataset=train_augmented_dataset,
    eval_dataset=eval_augmented_dataset,
    data_collator=data_collator,
    tokenizer=cfg.tokenizer,
    # evaluation
    compute_metrics=compute_metrics,
)

In [None]:
# Second training stage: train `model` on the original + back-translated data.
trainer.train()

In [None]:
# Save the model trained on the augmented data in its own directory, separate
# from the './models/finetuned-EnViT5' checkpoint.
model_dir = './models/augmented-EnViT5'
trainer.save_model(output_dir=model_dir)