# Install dependencies

In [None]:
!pip install fairseq
!pip install sentencepiece
!pip install wandb

Collecting fairseq
  Downloading fairseq-0.12.2.tar.gz (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq)
  Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting omegaconf<2.1 (from fairseq)
  Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)
Collecting sacrebleu>=1.4.12 (from fairseq)
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray (from fairseq)
 

In [None]:
!git clone https://github.com/google/sentencepiece.git
!mkdir sentencepiece/build
!cd sentencepiece/build && cmake ..
!cd sentencepiece/build && make -j $(nproc)

Cloning into 'sentencepiece'...
remote: Enumerating objects: 4823, done.[K
remote: Counting objects: 100% (1450/1450), done.[K
remote: Compressing objects: 100% (320/320), done.[K
remote: Total 4823 (delta 1175), reused 1195 (delta 1089), pack-reused 3373[K
Receiving objects: 100% (4823/4823), 26.77 MiB | 28.74 MiB/s, done.
Resolving deltas: 100% (3314/3314), done.
  Compatibility with CMake < 3.5 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value or use a ...<max> suffix to tell
  CMake that the project does not need compatibility with older versions.

[0m
-- VERSION: 0.2.00
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - don

In [None]:
# Export SPM so that later shell commands can call the SentencePiece binaries
# as $SPM/<tool>; the echo just confirms the value is visible to the shell.
%env SPM=/content/sentencepiece/build/src
!echo $SPM

env: SPM=/content/sentencepiece/build/src
/content/sentencepiece/build/src


In [None]:
!wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz
!tar -xzvf mbart.cc25.v2.tar.gz

--2023-08-16 08:41:49--  https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.224.2.21, 13.224.2.6, 13.224.2.42, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.224.2.21|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5618826847 (5.2G) [application/gzip]
Saving to: ‘mbart.cc25.v2.tar.gz’


2023-08-16 08:43:55 (42.6 MB/s) - ‘mbart.cc25.v2.tar.gz’ saved [5618826847/5618826847]

mbart.cc25.v2/
mbart.cc25.v2/sentence.bpe.model
mbart.cc25.v2/dict.txt
mbart.cc25.v2/model.pt


In [None]:
# Import necessary libs
import os
import torch
from glob import glob
from typing import List

from google.colab import drive
from fairseq.data import Dictionary
from fairseq.tokenizer import tokenize_line

In [None]:
# Mount Google Drive; later cells read the dataset files and the SentencePiece
# model from drive/MyDrive/.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create stored data folder
!mkdir tokenized
!mkdir model

# Dataset pre-processing
1. Split the dataset into train, validation, and test sets.
2. Use the pretrained SentencePiece model to pre-process the data (apply the same step to the validation/test sets). It splits the sentences into **subwords**, adding the special symbol `▁` to the first subword of each word.





In [None]:
# Encode the raw text stored on Drive into SentencePiece pieces
# (--output_format=piece), writing one file per split and language to tokenized/.
# Note the rename: Drive's "valid.*" inputs are written out as "val.spm.*".
# NOTE(review): only the train and valid splits are encoded here; no test split
# is processed in this cell.
!$SPM/spm_encode --model="drive/MyDrive/sentencepiece.bpe.model" --output_format=piece < "drive/MyDrive/train.en" > tokenized/train.spm.en
!$SPM/spm_encode --model="drive/MyDrive/sentencepiece.bpe.model" --output_format=piece < "drive/MyDrive/train.vi" > tokenized/train.spm.vi

!$SPM/spm_encode --model="drive/MyDrive/sentencepiece.bpe.model" --output_format=piece < "drive/MyDrive/valid.en" > tokenized/val.spm.en
!$SPM/spm_encode --model="drive/MyDrive/sentencepiece.bpe.model" --output_format=piece < "drive/MyDrive/valid.vi" > tokenized/val.spm.vi

# Pruning the pre-trained model
Most of the words in the large vocabulary of the original pre-trained model are never used during fine-tuning, so this redundant information can be removed.

Reduce the size of the pre-trained model by pruning the word embeddings for fine-tuning:

- Firstly, build a new vocab for our dataset

In [None]:
# Build new vocab
def pad_dict(d: Dictionary, num_extra_symbols: int, padding_factor: int = 8) -> None:
    """Grow `d` in place with filler symbols until its padded size is aligned.

    Filler symbols named ``madeupword0000``, ``madeupword0001``, ... are added
    (with count 0) until ``len(d) + num_extra_symbols`` is a multiple of
    ``padding_factor``.

    Args:
        d: Dictionary to extend; only ``len(d)`` and ``d.add_symbol`` are used.
        num_extra_symbols: Number of symbols that will be appended to the
            dictionary later and must be counted towards the aligned size.
        padding_factor: The size multiple to reach.
    """
    filler_idx = 0
    while True:
        padded_size = len(d) + num_extra_symbols
        if padded_size % padding_factor == 0:
            break
        # The size is re-read on every pass rather than pre-computed, so a
        # filler name that does not grow the dictionary is simply skipped over.
        d.add_symbol(f"madeupword{filler_idx:04d}", n=0)
        filler_idx += 1

# Language codes passed to fairseq as --langs; kept as a list because the
# pruning cell reuses `langs` when loading the dictionaries.
langs = 'ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN'.split(",")
data_dir = "./tokenized/*.spm.*"
output = './model/dict.txt'

# Accumulate the tokens of every tokenized file into one fresh dictionary.
# The trailing 4 is presumably the number of worker processes -- TODO confirm
# against the installed fairseq version.
ft_dict = Dictionary()
tokenized_files = glob(data_dir)
for tokenized_file in tokenized_files:
    Dictionary.add_file_to_dictionary(tokenized_file, ft_dict, tokenize_line, 4)

# Finalize with padding_factor=0 and pad by hand instead, reserving room for the
# symbols appended when the dictionary is loaded again: one tag per language
# plus "<mask>", hence len(langs) + 1.
ft_dict.finalize(padding_factor=0)
pad_dict(ft_dict, len(langs) + 1)
ft_dict.save(output)

- Then, prune the word embeddings based on the newly built vocab

In [None]:
# Pruning word embeddings by the new vocab

def load_dict(langs: List[str], path: str) -> Dictionary:
    """Load a dictionary file and append the language tags and the mask symbol.

    Args:
        langs: Language codes; each one is added as the symbol ``[<code>]``,
            in the given order.
        path: Path of the dictionary file passed to ``Dictionary.load``.

    Returns:
        The loaded dictionary with one ``[<code>]`` symbol per language
        followed by ``<mask>`` added after the file's own entries.
    """
    vocab = Dictionary.load(path)
    special_symbols = [f"[{code}]" for code in langs]
    special_symbols.append("<mask>")
    for symbol in special_symbols:
        vocab.add_symbol(symbol)
    return vocab

pre_train_dir = './mbart.cc25.v2'
ft_dict_path = './model/dict.txt'
output = './model/model.pt'

# Load both vocabularies through load_dict so each one gets the same language
# tags and <mask> symbol appended after its own entries.
pre_dict = load_dict(langs, os.path.join(pre_train_dir, "dict.txt"))
ft_dict = load_dict(langs, ft_dict_path)

# map_location="cpu" keeps loading independent of the device the checkpoint
# was saved from, so the cell also works on a runtime without a GPU.
data = torch.load(os.path.join(pre_train_dir, "model.pt"), map_location="cpu")
model = data["model"]

# mapping[k] is the row of the pre-trained embedding matrix holding the k-th
# word of the fine-tuning vocabulary.
# NOTE(review): words missing from the pre-trained dictionary get whatever
# `pre_dict.index` returns for unknown words (presumably <unk>) -- confirm.
mapping: List[int] = [pre_dict.index(ft_dict[i]) for i in range(len(ft_dict))]
row_index = torch.tensor(mapping, dtype=torch.long)

for name in ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]:
    pre_tensor: torch.Tensor = model[name]
    # Gather the kept rows in one indexing operation. The result is a new
    # tensor with pre_tensor's dtype and device, and its width follows
    # pre_tensor instead of a hard-coded 1024.
    model[name] = pre_tensor[row_index.to(pre_tensor.device)]

# `model` is the same object as data["model"], so saving `data` writes the
# pruned embeddings together with the untouched rest of the checkpoint.
torch.save(data, output)

# Binarize pre-process data

Binarize the data to the Fairseq format.

In [None]:
# Binarize the SentencePiece-encoded train/val files (source vi, target en)
# into bin/, using model/dict.txt as both the source and target dictionary.
# NOTE(review): --workers 70 is well above the 4 threads reported in the
# training log for this runtime -- consider matching the available cores.
!fairseq-preprocess \
  --source-lang "vi" \
  --target-lang "en" \
  --trainpref "tokenized/train.spm" \
  --validpref "tokenized/val.spm" \
  --destdir "bin" \
  --bpe sentencepiece \
  --thresholdtgt 0 \
  --thresholdsrc 0 \
  --srcdict "model/dict.txt" \
  --tgtdict "model/dict.txt" \
  --workers 70

2023-08-16 08:49:50 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-08-16 08:49:50 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe='sen

# Training

Enable half precision (`--fp16`) or define/use a smaller model to speed up training. Alternatively, use `--memory-efficient-fp16` (as in the command below) for a more memory-efficient variant of FP16 training.

If Colab times out, adjust the `--keep-interval-updates` and `--no-epoch-checkpoints` flags so that intermediate checkpoints are saved, then resume training from the last checkpoint.

In [None]:
#%reset # Release memory

In [None]:
!fairseq-train \
  "bin" \
  --encoder-normalize-before --decoder-normalize-before \
  --arch mbart_large --layernorm-embedding \
  --task translation_from_pretrained_bart \
  --source-lang vi --target-lang en \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
  --dataset-impl mmap \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
  --lr-scheduler polynomial_decay --lr 5e-05 --warmup-updates 2500 --total-num-update 40000 \
  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
  --max-tokens 4000 --update-freq 2 \
  --save-interval 1 --save-interval-updates 1000 --keep-interval-updates 10 --no-epoch-checkpoints \
  --seed 222 --log-format simple --log-interval 2 \
  --restore-file model/model.pt \
  --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
  --langs ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN \
  --memory-efficient-fp16
  # --ddp-backend no_c10d \
  --wandb-project "finetune mBart"

2023-08-16 08:58:49 | INFO | numexpr.utils | NumExpr defaulting to 4 threads.
2023-08-16 08:58:50 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-08-16 08:58:52 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 2, 'log_format': 'simple', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 222, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_si

# Generate on Vi-En

Compute the SacreBLEU score of the fine-tuned vi-en model

In [None]:
# !fairseq-generate "bin" \
#   --path checkpoints/checkpoint_best.pt \
#   --task translation_from_pretrained_bart \
#   --gen-subset valid \
#   --source-lang vi --target-lang en \
#   --bpe 'sentencepiece' --sentencepiece-model drive/MyDrive/sentencepiece.bpe.model \
#   --sacrebleu --remove-bpe 'sentencepiece' \
#   --beam 5 --nbest 1 \
#   --batch-size 32 --langs ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN > result.txt


In [None]:
# !tail -n 20 result.txt

In [None]:
# # Write test data to file
# with open('vi_en.ref', 'w') as f:
#     for line in en_test:
#         f.write(f"{line}\n")

In [None]:
# !cat result.txt | grep -P "^H" | sort -V | cut -f 3- | sed 's/\[en\]//g' > vi_en.hyp
# # #!cat result.txt | grep -P "^T" | sort -V | cut -f 2- | sed 's/\[en\]//g' | $SPM/spm_decode --model drive/MyDrive/mbart.cc25.v2/mbart.cc25.v2/sentencepiece.bpe.model > vi_en.ref
# !sacrebleu -tok 'none' -s 'none' drive/MyDrive/valid.en < vi_en.hyp