In [None]:
!pip install fairseq

In [None]:
# Detect whether the notebook is running on Google Colab by probing for the
# `google.colab` package, and mount Google Drive when it is.
# Only ImportError is caught: a bare `except:` would also hide unrelated
# failures (and KeyboardInterrupt) and silently report "not on Colab".
try:
    import google.colab  # noqa: F401 -- imported only to test availability
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive

    # Expose Google Drive under /content/drive for the rest of the notebook.
    drive.mount('/content/drive')

In [None]:
import os

In [None]:
# Dataset locations: every path is derived from a single base folder.
BASE_PATH = "/content/drive/MyDrive/NLP/CA5"
TOKENIZED_DATA_PATH = os.path.join(BASE_PATH, "tokenized_data")

# One path prefix per corpus split, all living under the tokenized-data folder.
TRAIN_DATA_PATH, VALID_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(TOKENIZED_DATA_PATH, split_name)
    for split_name in ("train", "valid", "test")
)

# Not referenced by the cells visible in this notebook section.
VOCAB_SIZE = 10_000

# Values interpolated into the fairseq-train command line further down.
LEARNING_RATE = 0.0025
LABEL_SMOOTHING = 0.2
ADAM_BETA11, ADAM_BETA22 = 0.9, 0.98
DROPOUT = 0.25

In [None]:
!mkdir -p ./data_bin

fairseq_preprocess_command = f"""
    !fairseq-preprocess --source-lang en --target-lang fa \
  --trainpref {TRAIN_DATA_PATH} \
  --validpref {VALID_DATA_PATH} \
  --testpref {TEST_DATA_PATH} \
  --destdir ./data_bin/
  """




In [None]:
%%bash -s "$fairseq_preprocess_command"

In [None]:
# Shell command line that trains an LSTM translation model with fairseq-train
# on the data in ./data_bin/, validating with BLEU and keeping checkpoints and
# TensorBoard logs under ./data_bin/.
#
# Notes on the string itself:
#  * It is an f-string, so the literal JSON braces passed to --eval-bleu-args
#    must be doubled ({{ ... }}). Single braces are parsed as a replacement
#    field and raise "ValueError: Invalid format specifier".
#  * Each backslash-newline is consumed by Python as a line continuation, so
#    the resulting command is a single line.
#  * It holds a plain shell command: no IPython "!" escape inside the string.
fairseq_train_command = f"""
fairseq-train \
    "./data_bin/" \
    --arch lstm --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '({ADAM_BETA11},{ADAM_BETA22})' --clip-norm 0.0 \
    --lr {LEARNING_RATE} --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout {DROPOUT} --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing {LABEL_SMOOTHING} \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}}' \
    --eval-bleu-detok moses \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --fp16 --memory-efficient-fp16 \
    --max-epoch 5 \
    --save-dir ./data_bin/checkpoints/ \
    --tensorboard-logdir ./data_bin/logs
"""

In [None]:
# Debugging aid (kept disabled): shell-style echo of the assembled training command.
# echo "$fairseq_train_command"

In [None]:
%%bash -s "$fairseq_train_command"