In [1]:
!pip install torch~=2.3.0 torchvision
!pip install transformers datasets
!pip install transformers[torch]
!pip install sacrebleu

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch~=2.3.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch~=2.3.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch~=2.3.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch~=2.3.0)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch~=2.3.0)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch~=2.3.0)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch~=2.3.0)
  Using cached nvidia_curand_cu12-10.3.2.106-p

In [1]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import DatasetDict, Dataset, load_metric
import numpy as np
import os

# Load the dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def load_phoMT_data(data_dir):
  splits = ['train', 'dev', 'test']
  datasets = {}

  for split in splits:
    en_file = os.path.join(data_dir, 'tokenization', split, f"{split}.en")
    vi_file = os.path.join(data_dir, 'tokenization', split, f"{split}.vi")

    with open(en_file, 'r', encoding='utf-8') as f:
      en_lines = f.readlines()
    with open(vi_file, 'r', encoding='utf-8') as f:
      vi_lines = f.readlines()

    assert len(en_lines) == len(vi_lines), f"Msimatch in number of lines between {en_file} and {vi_file}"

    data = {'translation': [{'en': en.strip(), 'vi': vi.strip()} for en, vi in zip(en_lines, vi_lines)]}
    datasets[split] = Dataset.from_dict(data)

  return DatasetDict(datasets)

data_dir = '/content/drive/My Drive/PhoMT'
phoMT_datasets = load_phoMT_data(data_dir)

In [5]:
print("Training dataset size: ")
print(len(phoMT_datasets['train']))

print("Dev dataset size: ")
print(len(phoMT_datasets['dev']))

print("Test dataset size: ")
print(len(phoMT_datasets['test']))

Training dataset size: 
2977999
Dev dataset size: 
18719
Test dataset size: 
19151


# Preview the dataset

In [None]:
import pandas as pd

train_df = pd.DataFrame(phoMT_datasets['train'][:5])
dev_df = pd.DataFrame(phoMT_datasets['dev'][:5])
test_df = pd.DataFrame(phoMT_datasets['test'][:5])

print("Training Set Preview:")
print(train_df)

print("\nDevelopment Set Preview:")
print(dev_df)

print("\nTest Set Preview:")
print(test_df)

Training Set Preview:
                                         translation
0  {'en': 'It begins with a countdown .', 'vi': '...
1  {'en': 'On August 14th , 1947 , a woman in Bom...
2  {'en': 'Across India , people hold their breat...
3  {'en': 'And at the stroke of midnight , a squi...
4  {'en': 'These events form the foundation of " ...

Development Set Preview:
                                         translation
0  {'en': '﻿Hurricane Dorian , one of the most po...
1  {'en': 'Dorian is especially dangerous due to ...
2  {'en': 'The storm passed by the Leeward Island...
3  {'en': 'The United States branch office contin...
4  {'en': 'At this time , there have been no repo...

Test Set Preview:
                                         translation
0  {'en': 'Brother Albert Barnett and his wife , ...
1  {'en': 'Severe storms ripped through parts of ...
2  {'en': 'Two days of heavy rain , high winds , ...
3  {'en': 'Sadly , Brother Albert Barnett and his...
4  {'en': 'The United States bra

# Preprocess the dataset

In [None]:
from transformers import MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-vi'
tokenizer = MarianTokenizer.from_pretrained(model_name)

def preprocess(examples):
  inputs = [example['en'] for example in examples['translation']]
  targets = [example['vi'] for example in examples['translation']]

  model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')
  model_inputs['labels'] = labels['input_ids']

  return model_inputs

tokenized_datasets = phoMT_datasets.map(preprocess, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



Map:   0%|          | 0/2977999 [00:00<?, ? examples/s]



Map:   0%|          | 0/18719 [00:00<?, ? examples/s]

Map:   0%|          | 0/19151 [00:00<?, ? examples/s]

## Save the tokenized datasets

In [None]:
tokenized_datasets.save_to_disk('/content/drive/My Drive/tokenized_datasets')

# Load model and tokenizer

In [None]:
model_name = "Helsinki-NLP/opus-mt-en-vi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Load the tokenized datasets

In [None]:
from datasets import load_from_disk

tokenized_datasets = load_from_disk('/content/drive/My Drive/tokenized_datasets')

# Load the model and define training arguments

In [None]:
model_name = 'Helsinki-NLP/opus-mt-en-vi'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [None]:
model_path = '/content/drive/My Drive/fine-tuned-mt-en-vi'
model = MarianMTModel.from_pretrained(model_path)
tokenizer = MarianTokenizer.from_pretrained(model_path)



In [None]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='/content/results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

metric = load_metric("sacrebleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)


    bleu = metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return {"bleu": bleu['score']}

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'].shuffle(seed=12).select(range(15000)),
    eval_dataset=tokenized_datasets['dev'].shuffle(seed=12).select(range(8000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

The repository for sacrebleu contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/sacrebleu.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


## Train and save the model

In [None]:
trainer.train()

model.save_pretrained('/content/drive/My Drive/fine-tuned-mt-en-vi')
tokenizer.save_pretrained('/content/drive/My Drive/fine-tuned-mt-en-vi')

Epoch,Training Loss,Validation Loss,Bleu
1,0.2993,0.252391,35.834319
2,0.2569,0.253251,35.860697
3,0.2453,0.25417,35.827233


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53684]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53684]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53684]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53684]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53684]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53684]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53684]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[53684]], 'forced_eos_token_id': 0}


('/content/drive/My Drive/fine-tuned-mt-en-vi/tokenizer_config.json',
 '/content/drive/My Drive/fine-tuned-mt-en-vi/special_tokens_map.json',
 '/content/drive/My Drive/fine-tuned-mt-en-vi/vocab.json',
 '/content/drive/My Drive/fine-tuned-mt-en-vi/added_tokens.json')

## Inference the model on dev and test set

In [None]:
eval_args = Seq2SeqTrainingArguments(
    output_dir='content/results',
    per_device_eval_batch_size=8,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

results = trainer.evaluate()
print(f"Test dataset BLEU score: {results['eval_bleu']}")

trainer = Seq2SeqTrainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_datasets['dev'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

results = trainer.evaluate()
print(f"Dev dataset BLEU score: {results['eval_bleu']}")

Test dataset BLEU score: 34.81295215528332


Dev dataset BLEU score: 35.26301846136086


# Inference

In [None]:
model_path = '/content/drive/My Drive/fine-tuned-mt-en-vi'
model = MarianMTModel.from_pretrained(model_path)
tokenizer = MarianTokenizer.from_pretrained(model_path)

def translate(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated_tokens = model.generate(**inputs)
    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return translation

# Example usage
text = "Hurricane Dorian, one of the most powerful storms ever recorded in the Atlantic Ocean, made landfall as a Category 5 storm on Great Abaco Island in the northern Bahamas on Sunday morning, September 1, 2019."
translation = translate(text)
print(translation)