In [None]:
# Install OpenNMT-py 3.x
!pip3 install OpenNMT-py



# Prepare Your Datasets


In [None]:
# Attach Google Drive to the Colab VM so the datasets and checkpoints are reachable
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Open the folder where you saved your prepared datasets from the processing step.
# %cd changes the notebook's working directory, so every relative path used in
# later cells is resolved against this folder.
%cd /content/drive/MyDrive/nmt/
!ls

/content/drive/MyDrive/nmt
compute-bleu.py				      merged.vi-filtered.vi.subword.train
config.yaml				      model_released.pt
merged.source				      models
merged.source-filtered.source		      MT-Preparation
merged.source-filtered.source.subword	      README
merged.source-filtered.source.subword.dev     run
merged.source-filtered.source.subword.test    source.model
merged.source-filtered.source.subword.train   source.vocab
merged.vi				      target.model
merged.vi-filtered.vi			      target.vocab
merged.vi-filtered.vi.subword		      test.translated
merged.vi-filtered.vi.subword.dev	      test.translated.desubword
merged.vi-filtered.vi.subword.test	      train.log
merged.vi-filtered.vi.subword.test.desubword


# Create the Training Configuration File

The following config file matches most of the recommended values for the Transformer model [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762). As the current dataset is small, we reduced the following values:
* `train_steps` - for datasets with a few million sentences, consider using a value between 100000 and 200000, or more! Enabling the option `early_stopping` can help stop the training when there is no considerable improvement.
* `valid_steps` - 10000 can be good if the value `train_steps` is big enough.
* `warmup_steps` - obviously, its value must be less than `train_steps`. Try 4000 and 8000 values.

In [None]:
# Create the YAML configuration file
#
# The YAML text below is written verbatim to config.yaml in the current working
# directory; onmt_build_vocab and onmt_train both read it via "-config config.yaml".

config = '''# config.yaml


## Where the samples will be written
save_data: run

# Training files
data:
    corpus_1:
        path_src: merged.source-filtered.source.subword.train
        path_tgt: merged.vi-filtered.vi.subword.train
        transforms: [filtertoolong]
    valid:
        path_src: merged.source-filtered.source.subword.dev
        path_tgt: merged.vi-filtered.vi.subword.dev
        transforms: [filtertoolong]

# Vocabulary files, generated by onmt_build_vocab
src_vocab: run/source.vocab
tgt_vocab: run/target.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 150000
tgt_vocab_size: 50000

# Filter out source/target longer than n if [filtertoolong] enabled
src_seq_length: 150
tgt_seq_length: 150

# Tokenization options
src_subword_model: source.model
tgt_subword_model: target.model

# Where to save the log file and the output models/checkpoints
log_file: train.log
save_model: models/model.enfrde

# Stop training if it does not improve after n validations
early_stopping: 4

# Default: 5000 - Save a model checkpoint for each n
save_checkpoint_steps: 1000

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 3435

# Default: 100000 - Train the model to max n steps
# Increase to 200000 or more for large datasets
# For fine-tuning, add up the required steps to the original steps
train_steps: 20000

# Default: 10000 - Run validation after n steps
valid_steps: 1000

# Default: 4000 - for large datasets, try up to 8000
warmup_steps: 1000
report_every: 100

# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]

# Batching
bucket_size: 262144
num_workers: 0  # Default: 2, set to 0 when RAM out of memory
batch_type: "tokens"
batch_size: 8192   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 4096
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
# warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
'''

# Plain write mode is enough (the file is never read back here); the explicit
# encoding keeps the result independent of the platform default.
with open("config.yaml", "w", encoding="utf-8") as config_yaml:
  config_yaml.write(config)

# Build Vocabulary

For large datasets, it is not feasible to use all words/tokens found in the corpus. Instead, a specific set of vocabulary is extracted from the training dataset, usually between 32k and 100k words. This is the main purpose of the vocabulary building step.

In [None]:
# Find the number of CPUs/cores on the machine
# (use this value for -num_threads when building the vocabulary)
!nproc --all

2


In [None]:
# Build Vocabulary

# -config: path to your config.yaml file
# -n_sample: use -1 to build the vocabulary on all the segments in the training dataset
# -num_threads: change it to match the number of CPUs (see `nproc --all`) to run it faster

!onmt_build_vocab -config config.yaml -n_sample -1 -num_threads 2

2023-12-01 05:41:05.769306: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 05:41:05.769360: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 05:41:05.769402: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 05:41:05.777254: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-01 05:41:08.437972: I tensorflow/c

From the **Runtime menu** > **Change runtime type**, make sure that the "**Hardware accelerator**" is "**GPU**".


In [None]:
# Check if the GPU is active
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-23c970b2-6d3f-928a-7bea-458ceaa39ad0)


In [None]:
# Check if the GPU is visible to PyTorch

import torch

cuda_available = torch.cuda.is_available()
print(cuda_available)

if cuda_available:
    print(torch.cuda.get_device_name(0))

    # mem_get_info returns (free_bytes, total_bytes); shown here in MiB
    gpu_memory = torch.cuda.mem_get_info(0)
    print("Free GPU memory:", gpu_memory[0]/1024**2, "out of:", gpu_memory[1]/1024**2)
else:
    # Querying device 0 without CUDA raises; give an actionable message instead
    print("No CUDA device found - switch the runtime's hardware accelerator to GPU before training.")

True
Tesla V100-SXM2-16GB
Free GPU memory: 15842.125 out of: 16150.875


# Training


In [None]:
!rm -rf drive/MyDrive/nmt/models/

In [None]:
# Train the NMT model
# All settings (data paths, model size, steps, checkpoint location) are read
# from config.yaml in the current working directory.
!onmt_train -config config.yaml

2023-12-01 05:43:02.330533: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 05:43:02.330590: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 05:43:02.330632: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 05:43:02.338998: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-01 05:43:04.969490: I tensorflow/c

In [None]:
# For error debugging try:
# !dmesg -T

In [None]:
# Continue training with the same config, starting from a saved checkpoint
# (-train_from). The step-3000 file name is specific to this run - point it at
# whichever checkpoint exists in models/ when resuming.
!onmt_train -config config.yaml -train_from models/model.enfrde_step_3000.pt

2023-12-01 10:39:33.687015: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 10:39:33.687099: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 10:39:33.687143: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 10:39:33.695406: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-01 10:39:36.342830: I tensorflow/c

In [None]:
# Combine the step-18000 and step-20000 checkpoints into a single averaged model,
# written to models/model_avg.pt
!onmt_average_models -models /content/drive/MyDrive/nmt/models/model.enfrde_step_18000.pt /content/drive/MyDrive/nmt/models/model.enfrde_step_20000.pt -output /content/drive/MyDrive/nmt/models/model_avg.pt

2023-12-01 13:07:00.897398: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 13:07:00.897455: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 13:07:00.897498: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 13:07:00.907958: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-01 13:07:04.357158: I tensorflow/c

# Translation

Translation Options:
* `-model` - specify the last model checkpoint name; try testing the quality of multiple checkpoints
* `-src` - the subworded test dataset, source file
* `-output` - give any file name to the new translation output file
* `-gpu` - GPU ID, usually 0 if you have one GPU. Otherwise, it will translate on CPU, which would be slower.
* `-min_length` - [optional] to avoid empty translations
* `-verbose` - [optional] if you want to print translations

Refer to [OpenNMT-py translation options](https://opennmt.net/OpenNMT-py/options/translate.html) for more details.

In [None]:
# Translate the "subworded" source file of the test dataset
!onmt_translate -model model_avg.pt -src merged.source-filtered.source.subword.test -output test.translated -gpu 0 -min_length 1

2023-12-02 14:10:42.991695: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-02 14:10:42.991753: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-02 14:10:42.991791: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-02 14:10:42.999895: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-02 14:10:45.464835: I tensorflow/c

In [None]:
# Check the first 5 lines of the translation file
!head -n 5 test.translated

▁Sau ▁đó ▁chúng ▁tôi ▁hỏi , ▁" Bạn ▁có ▁muốn ▁làm ▁một ▁cái ▁khác ▁với ▁ $ 2 , 7 0 ▁không ?"
▁B ất ▁cứ ▁ai ▁cũng ▁có ▁thể ▁nhấn ▁vào ▁nút ▁" refresh " ▁liên ▁quan ▁đến ▁chức ▁năng ▁phổi , ▁và ▁nó ▁sẽ ▁đưa ▁bản ▁báo ▁cáo ▁của ▁tôi .
▁Có ▁thể ▁cô ▁ấy ▁ngồi ▁với ▁tôi ▁hàng ▁giờ ▁liền , ▁và ▁giơ ▁tay ▁lên ▁cho ▁tôi ▁xem ▁thế ▁giới ▁bị ▁nghèo ▁đói .
▁Vâng , ▁vâng , ▁đây ▁lại ▁là ▁một ▁điều ▁khó ▁khăn .
▁Điề u ▁này ▁dẫn ▁đến ▁một ▁tổ ▁tiên ▁duy ▁nhất ▁cách ▁đây ▁khoảng ▁ 2 0 , 0 0 0 ▁đến ▁ 2 5 , 0 0 0 ▁năm ?


In [None]:
# If needed install/update sentencepiece
!pip3 install --upgrade -q sentencepiece

# Desubword the translation file
!python3 MT-Preparation/subwording/3-desubword.py target.model test.translated

Done desubwording! Output: test.translated.desubword


In [None]:
# Check the first 5 lines of the desubworded translation file
!head -n 5 test.translated.desubword

Sau đó chúng tôi hỏi, "Bạn có muốn làm một cái khác với $2,70 không?"
Bất cứ ai cũng có thể nhấn vào nút "refresh" liên quan đến chức năng phổi, và nó sẽ đưa bản báo cáo của tôi.
Có thể cô ấy ngồi với tôi hàng giờ liền, và giơ tay lên cho tôi xem thế giới bị nghèo đói.
Vâng, vâng, đây lại là một điều khó khăn.
Điều này dẫn đến một tổ tiên duy nhất cách đây khoảng 20,000 đến 25,000 năm?


In [None]:
# Desubword the target file (reference) of the test dataset
# so that it is in the same plain-text form as the desubworded translation
!python3 MT-Preparation/subwording/3-desubword.py target.model merged.vi-filtered.vi.subword.test

Done desubwording! Output: merged.vi-filtered.vi.subword.test.desubword


In [None]:
# Check the first 5 lines of the desubworded reference
!head -n 5 merged.vi-filtered.vi.subword.test.desubword

Sau đó chúng tôi hỏi họ, "Bạn có muốn lắp một cái khác với $2,70 không?"
Vì vậy bất kỳ người dùng nào đều có thể đi vào đây và bấm vào "báo cáo chức năng phổi" và nó sẽ lấy những số liệu đó để cho ra bản báo cáo này mà tôi đã tạo ra.
Cô ngồi bên tôi hàng giờ đồng hồ, và mở mắt cho tôi về thế giới của đói nghèo.
Được rồi. Vâng, đây lại là một câu khó.
Phải chăng nó thực sự sẽ dẫn đến một ông tổ chung duy nhất nào đó sống cách đây khoảng vài 20 hay 25 nghìn năm?


# MT Evaluation
Evaluation using BLEU. Files must be detokenized/desubworded beforehand.

In [None]:
# Download the BLEU script
!wget https://raw.githubusercontent.com/ymoslem/MT-Evaluation/main/BLEU/compute-bleu.py

--2023-12-01 06:21:18--  https://raw.githubusercontent.com/ymoslem/MT-Evaluation/main/BLEU/compute-bleu.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 957 [text/plain]
Saving to: ‘compute-bleu.py’


2023-12-01 06:21:18 (19.3 MB/s) - ‘compute-bleu.py’ saved [957/957]



In [None]:
# Install sacrebleu
# NOTE(review): presumably required by compute-bleu.py - confirm against the script
!pip3 install sacrebleu



In [None]:
# Evaluate the translation (without subwording)
!python3 compute-bleu.py test.translated.desubword merged.vi-filtered.vi.subword.test.desubword

Reference 1st sentence: Sau đó chúng tôi hỏi, "Bạn có muốn làm một cái khác với $2,70 không?"
MTed 1st sentence: Sau đó chúng tôi hỏi họ, "Bạn có muốn lắp một cái khác với $2,70 không?"
BLEU:  30.04134039264607


In [None]:
# Convert the averaged model into a release model: reads models/model_avg.pt
# and writes models/model_released.pt
!onmt_release_model --model "/content/drive/MyDrive/nmt/models/model_avg.pt" --output "/content/drive/MyDrive/nmt/models/model_released.pt"

2023-12-01 13:13:55.878968: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 13:13:55.879020: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 13:13:55.879054: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 13:13:55.886703: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-01 13:13:59.230108: I tensorflow/c