### Install dependencies

In [None]:
# Install XLM (editable) and the Python requirements, build fastBPE, and install apex.
# `%pip` installs into the environment of the running kernel, and the `test -d ... ||`
# guards skip a clone whose checkout already exists, so the cell can be re-run safely.
!test -d XLM || git clone https://github.com/facebookresearch/XLM
%pip install -e XLM
%pip install pythainlp
%pip install -r ../requirements.txt
# Compile the fastBPE binary to XLM/tools/fastBPE/fast.
!test -d fastBPE || git clone https://github.com/glample/fastBPE
!mkdir -p XLM/tools/fastBPE
!g++ -std=c++11 -pthread -O3 fastBPE/fastBPE/main.cc -IfastBPE -o XLM/tools/fastBPE/fast
# Install NVIDIA apex from source.
# NOTE(review): `python3` here may not be the kernel's interpreter — confirm they match.
!test -d apex || git clone https://github.com/NVIDIA/apex
!cd apex && python3 setup.py install

### Download the data

In [None]:
# Download the corpora and lay out data/mono (training text) and data/para (valid/test sets).
# Paths are spelled out instead of using `{a,b}` brace expansion, so the cell depends neither
# on the shell that `!` invokes nor on IPython's own `{...}` interpolation.
# `-p` / `-o` make the directory creation and extraction safe to re-run.
!mkdir -p data/para data/mono
!mkdir -p data/mono/eu data/mono/en

# Training data: the first 130000 lines of each side of CCMatrix en-eu.
!wget https://object.pouta.csc.fi/OPUS-CCMatrix/v1/moses/en-eu.txt.zip
!unzip -o en-eu.txt.zip CCMatrix.en-eu.en CCMatrix.en-eu.eu
# The `>` redirect is required: without it `head` treats the destination path as a second
# input file and the dataset files are never written.
!head -n 130000 CCMatrix.en-eu.en > data/mono/dataset.en
!head -n 130000 CCMatrix.en-eu.eu > data/mono/dataset.eu
# Tab-separated eu<TAB>en file; this is the input passed to the cleaner (-s dataset.eu-en).
!paste data/mono/dataset.eu data/mono/dataset.en > dataset.eu-en
!rm en-eu.txt.zip

# Validation data: FLORES-200 dev followed by devtest, for Basque and English.
!wget --trust-server-names https://tinyurl.com/flores200dataset
!tar -xvf flores200_dataset.tar.gz
!cat flores200_dataset/dev/eus_Latn.dev flores200_dataset/devtest/eus_Latn.devtest > data/para/valid.eu
!cat flores200_dataset/dev/eng_Latn.dev flores200_dataset/devtest/eng_Latn.devtest > data/para/valid.en

# Test data: Tatoeba en-eu with lines that contain no non-whitespace character dropped.
# NOTE(review): each side is filtered independently; if a line is blank on only one side
# the two files lose their line alignment — confirm the corpus has no such lines.
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v2022-03-03/moses/en-eu.txt.zip
!unzip -o en-eu.txt.zip Tatoeba.en-eu.en Tatoeba.en-eu.eu
!cat Tatoeba.en-eu.eu | grep '\S' > data/para/test.eu
!cat Tatoeba.en-eu.en | grep '\S' > data/para/test.en
!rm en-eu.txt.zip

# Output directory passed to training as --dump_path.
!mkdir -p models

### Clean the dataset

# Run the cleaner on dataset.eu-en (-s) and write the result to dataset.eu-en.fixed (-t),
# which the later "replace dataset" step splits back into per-language files.
# The leading `!` is required so the line runs as a shell command instead of being parsed as Python.
!python3 ../src/cleaner.py -s dataset.eu-en --src_lang eu --tgt_lang en -t dataset.eu-en.fixed --verify_langs --filter_by_sims

### Preprocess the data

In [None]:
# Run the project's preprocessing script for the eu/en language pair.
# NOTE(review): presumably this writes its output under data/processed/en-eu, the path the
# training cells pass as --data_path — confirm against preprocess.sh.
!bash preprocess.sh --src eu --tgt en

### Train on uncleaned data

In [None]:
# Baseline experiment ("uncleaned_data"): run XLM's train.py on data/processed/en-eu.
# Only the eu-en machine-translation step is enabled (--clm_steps and --mlm_steps are empty)
# and --encoder_only is false; experiment output is dumped under models/ (--dump_path).
# NOTE(review): the ",50" in --stopping_criterion is presumably XLM's patience, i.e. the number
# of evaluations without improvement of valid_eu-en_mt_bleu before stopping — confirm in XLM.
!python XLM/train.py \
    --exp_name uncleaned_data \
    --data_path data/processed/en-eu \
    --dump_path models \
    --lgs en-eu \
    --clm_steps "" \
    --mlm_steps "" \
    --mt_steps "eu-en" \
    --encoder_only false \
    --emb_dim 128 \
    --n_layers 4 \
    --n_heads 8 \
    --dropout 0.1  \
    --attention_dropout 0.1  \
    --epoch_size 100000 \
    --max_epoch 10000 \
    --batch_size 32 \
    --optimizer adam,lr=0.00001 \
    --eval_bleu true \
    --stopping_criterion 'valid_eu-en_mt_bleu,50' 

### Replace the uncleaned dataset with the cleaned one

# Discard the uncleaned training files and rebuild data/mono from the cleaner's output.
!rm -rf data/mono
!mkdir -p data/mono
# Column 1 -> eu, column 2 -> en. This assumes dataset.eu-en.fixed keeps the tab-separated
# eu<TAB>en layout of the cleaner's input (built with `paste ...dataset.eu ...dataset.en`) — TODO confirm.
!cut -f1 dataset.eu-en.fixed > data/mono/dataset.eu
!cut -f2 dataset.eu-en.fixed > data/mono/dataset.en
# Re-run preprocessing on the cleaned files.
# NOTE(review): confirm preprocess.sh regenerates data/processed/en-eu rather than reusing
# files left over from the earlier run.
!bash preprocess.sh --src eu --tgt en

### Train on cleaned data

In [None]:
# Comparison experiment ("cleaned_data"): the same train.py invocation as the uncleaned run;
# only --exp_name differs, so the two runs differ only in the data found at --data_path.
# Only the eu-en machine-translation step is enabled (--clm_steps and --mlm_steps are empty);
# experiment output is dumped under models/ (--dump_path).
!python XLM/train.py \
    --exp_name cleaned_data \
    --data_path data/processed/en-eu \
    --dump_path models \
    --lgs en-eu \
    --clm_steps "" \
    --mlm_steps "" \
    --mt_steps "eu-en" \
    --encoder_only false \
    --emb_dim 128 \
    --n_layers 4 \
    --n_heads 8 \
    --dropout 0.1  \
    --attention_dropout 0.1  \
    --epoch_size 100000 \
    --max_epoch 10000 \
    --batch_size 32 \
    --optimizer adam,lr=0.00001 \
    --eval_bleu true \
    --stopping_criterion 'valid_eu-en_mt_bleu,50' 