In [None]:
import pandas as pd
from google.colab import drive

In [None]:
drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


In [None]:
%cd '/content/MyDrive/MyDrive/Colab_Notebooks'

/content/MyDrive/MyDrive/Colab_Notebooks


In [None]:
def read(path) -> list:
    with open(path, 'r', encoding = 'UTF-8') as f:
        data = f.readlines()
    return data

In [None]:
vitrain = read('data/Train/train2023.vi')
lotrain = read('data/Train/train2023.lo')
videv = read('data/Dev/dev2023.vi')
lodev = read('data/Dev/dev2023.lo')
vitest = read('data/VLSP2023.TestSet/test_vi.txt')
lotest = read('data/VLSP2023.TestSet/test_lo.txt')

In [None]:
!pip install sentencepiece --q
!pip install fairseq --q
!pip install sacrebleu --q
!pip install sacremoses --q

In [None]:
import sentencepiece as sp
sp.SentencePieceTrainer.train(input = ['data/Train/train2023.lo', 'data/Train/train2023.vi'],
                              model_prefix = 'spm', vocab_size = 18000,
                              character_coverage = 0.998,
                              model_type = 'bpe',
                              max_sentence_length = 128,
                              num_threads = 8,
                              bos_id=0, pad_id=1, eos_id=2, unk_id=3
                              )

In [None]:
!cut -f1 spm.vocab | tail -n +5 | sed "s/$/ 100/g" > dict.txt

In [None]:
spp = sp.SentencePieceProcessor(model_file = 'spm.model')

In [None]:
def encoder(data: list, output: str):
    with open(output, 'w', encoding='utf-8') as fo:
        for line in data:
            encoded = spp.encode(line, out_type = str)
            encoded_line = ' '.join(encoded)
            fo.write(encoded_line + "\n")

In [None]:
encoder(vitrain, 'train.spm.vi')
encoder(lotrain, 'train.spm.lo')
encoder(videv, 'dev.spm.vi')
encoder(lodev, 'dev.spm.lo')

In [None]:
!fairseq-preprocess --trainpref "train.spm" --validpref "dev.spm" \
    --destdir "bin" \
    --joined-dictionary \
    --srcdict "dict.txt"\
    --source-lang "lo" \
    --target-lang "vi" \
    --bpe sentencepiece \
    --workers 16

2023-12-03 11:29:05.590522: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-03 11:29:05.590579: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-03 11:29:05.590615: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-03 11:29:05.598240: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-03 11:29:08 | INFO | fairseq.tasks

In [None]:
!fairseq-train "bin" \
    --source-lang "lo" \
    --target-lang "vi" \
    --fp16 \
    --max-epoch 30 \
    --max-tokens 2048 \
    --arch 'fconv_wmt_en_de' \
    --optimizer 'adam' \
    --adam-betas '(0.9, 0.98)' \
    --lr 3e-5 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.1 --weight-decay 0.0001 \
    --criterion 'label_smoothed_cross_entropy' \
    --scoring 'bleu' \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok 'moses' \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric 'bleu' \
    --maximize-best-checkpoint-metric \
    --no-epoch-checkpoints \
    --num-workers 4

2023-12-03 11:29:14.947121: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-03 11:29:14.947190: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-03 11:29:14.947230: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-03 11:29:14.958398: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-03 11:29:18 | INFO | numexpr.utils

In [None]:
encoder(vitest, 'test.spm.vi')
encoder(lotest, 'test.spm.lo')

In [36]:
!fairseq-preprocess --trainpref "train.spm" --validpref "dev.spm" --testpref "test.spm" \
    --destdir "bin" \
    --joined-dictionary \
    --srcdict "dict.txt"\
    --source-lang "lo" \
    --target-lang "vi" \
    --bpe sentencepiece \
    --workers 16

2023-12-14 12:05:49.290693: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-14 12:05:49.290749: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-14 12:05:49.290795: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-14 12:05:49.299099: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-14 12:05:52 | INFO | fairseq.tasks.text_to

In [38]:
!fairseq-generate "bin" \
    --path "checkpoints/checkpoint_last.pt" \
    --batch-size 32 --beam 5

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
D-475	-1.641184687614441	▁Và ▁có ▁thể ▁ở ▁đây ▁, ▁chúng ▁ta ▁đang ▁bắt ▁đầu ▁thay ▁đổi ▁.
P-475	-0.1434 -1.2655 -1.4784 -5.5222 -1.6748 -1.8546 -3.1806 -0.5294 -1.4806 -1.3254 -0.0323 -5.2246 -0.0698 -0.8295 -0.0069
S-99	▁ສະ ▁ຫມ ອງ ▁ບໍ່ ເປັນຫຍັງ . ▁ບໍ່ມີ ປັນ ຫາ ຫຍັງ ກັບ ໃຈ .
T-99	▁N ão ▁bộ ▁không ▁bị ▁làm ▁sao ▁. ▁Tr í ▁óc ▁không ▁bị ▁làm ▁sao ▁cả ▁.
H-99	-2.1488168239593506	▁Không ▁có ▁gì ▁không ▁có ▁gì ▁. ▁Không ▁có ▁vấn ▁đề ▁gì ▁.
D-99	-2.1488168239593506	▁Không ▁có ▁gì ▁không ▁có ▁gì ▁. ▁Không ▁có ▁vấn ▁đề ▁gì ▁.
P-99	-4.7568 -1.7701 -2.0221 -2.9790 -3.0081 -1.2713 -3.4752 -0.5603 -1.3058 -5.3166 -0.0554 -2.3244 -1.1575 -0.0805
S-709	▁ທຸກ ຢ່າງ ແມ່ນ ປອມ , ▁ແຕ່ ພວກເຮົາ ຄິດ ວ່າມັນ ເປັນ ຂອງ ແທ້ .
T-709	▁V ạn ▁hữu ▁là ▁giả ▁huy ễn ▁mà ▁chúng ▁ta ▁lại ▁cho ▁là ▁thật .
H-709	-1.3367853164672852	▁Tất ▁cả ▁mọi ▁thứ ▁, ▁nhưng ▁chúng ▁ta ▁nghĩ ▁rằng ▁nó ▁là ▁một ▁cái ▁gì ▁đó ▁.
D-709	-1.3367853164672852	▁Tất ▁cả ▁mọi ▁thứ 