In [0]:
# Environment setup: unpack the corpus, fetch and install OpenNMT-py, upgrade
# torchtext, show the available GPU and create the output folders.
# Every step is written so the cell can be re-run on an already prepared machine.
# NOTE(review): later cells call OpenNMT-py/preprocess.py and train.py directly;
# presumably these scripts only exist in older OpenNMT-py releases, so the clone
# may need to be pinned to the release used for this run -- TODO confirm.

# -o: overwrite already extracted files instead of stopping to ask.
!unzip -o ./Corpora.zip -d ./
# Skip the clone when the checkout already exists (git refuses a non-empty target).
![ -d ./OpenNMT-py ] || git clone https://github.com/OpenNMT/OpenNMT-py.git
# setuptools expects to be run from the project root, so install from inside the repo.
# The `cd` only affects this one shell command, not the notebook's working directory.
!cd ./OpenNMT-py && python setup.py install
!pip install -U torchtext
!nvidia-smi
# -p: do not fail when the directories already exist.
!mkdir -p ./data ./models ./predictions

In [0]:
# Preprocess the En->Fa parallel corpus with OpenNMT-py's preprocess.py.
# Inputs are the Train and Dev splits; the results are saved under the prefix
# ./data/trans, which the training cell passes to train.py as -data.
# Flags passed: -src_seq_length / -tgt_seq_length 50 and -lower.
!python OpenNMT-py/preprocess.py \
-train_src ./Corpora/En2Fa-Translation/Train/train.en \
-train_tgt ./Corpora/En2Fa-Translation/Train/train.fa \
-valid_src ./Corpora/En2Fa-Translation/Dev/dev.en \
-valid_tgt ./Corpora/En2Fa-Translation/Dev/dev.fa \
-save_data ./data/trans \
-src_seq_length 50 \
-tgt_seq_length 50 \
-lower

In [0]:
# Train the translation model with OpenNMT-py's train.py on ./data/trans.
# How the flags connect to the rest of the notebook:
#   -save_model ./models/trans, -save_checkpoint_steps 2500:
#       the evaluation cell looks for ./models/trans_step_<N>.pt files.
#   -train_steps 50000:
#       the test-translation cell loads ./models/trans_step_50000.pt, and the
#       log-parsing cell searches for the literal '/50000' in each report line.
#   -log_file "stats.txt", -report_every 500:
#       stats.txt is parsed later for the accuracy / perplexity curves.
# NOTE(review): -warmup_steps 1000 is passed together with -decay_method "none";
# presumably warmup has no effect in that mode -- confirm against the OpenNMT-py docs.
!python OpenNMT-py/train.py \
-data ./data/trans \
-src_word_vec_size 512 \
-tgt_word_vec_size 512 \
-encoder_type "rnn" \
-decoder_type "rnn" \
-layers 6 \
-rnn_size 512 \
-rnn_type "LSTM" \
-save_model ./models/trans \
-save_checkpoint_steps 2500 \
-gpu_ranks 0 \
-world_size 1 \
-seed 2020 \
-batch_size 32 \
-normalization "sents" \
-train_steps 50000 \
-early_stopping 0 \
-optim "adam" \
-max_grad_norm 5 \
-dropout 0.4 \
-learning_rate 0.001 \
-learning_rate_decay 0.8 \
-start_decay_steps 10000 \
-decay_steps 5000 \
-decay_method "none" \
-warmup_steps 1000 \
-report_every 500 \
-log_file "stats.txt"

In [0]:
import os
import re
import subprocess

import matplotlib.pyplot as plt

In [0]:
def execute(cmd):
    """Run ``cmd`` through the shell and return ``(stdout, stderr)`` as text.

    Both streams are captured, stripped of trailing whitespace and decoded as
    UTF-8. A non-zero exit status does not raise: the command and its stderr
    are printed and the captured output is still returned.
    """

    def as_text(raw):
        # Captured streams arrive as bytes; drop trailing whitespace, then decode.
        return raw.rstrip().decode("utf-8")

    completed = subprocess.run(
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    if completed.returncode != 0:
        print ("Error: failed to execute command: ", cmd)
        print (as_text(completed.stderr))
    return as_text(completed.stdout), as_text(completed.stderr)

In [0]:
# Score every saved checkpoint on the dev set: translate Dev/dev.en with it,
# compute BLEU against Dev/dev.fa, and log one "<step> <multi-bleu output>"
# line per checkpoint to bleu_results.txt.

# Keep only files named trans_step_<N>.pt. Filtering happens BEFORE the step is
# parsed, so unrelated entries in ./models cannot break the numeric sort.
prefix, suffix = 'trans_step_', '.pt'
steps_by_file = {}
for name in os.listdir('./models'):
    digits = name[len(prefix):-len(suffix)]
    if name.startswith(prefix) and name.endswith(suffix) and digits.isdecimal():
        steps_by_file[name] = int(digits)
files_list = sorted(steps_by_file, key=steps_by_file.get)

# `with` closes (and flushes) the results file even if an evaluation raises.
with open('bleu_results.txt', 'w') as bleu_file:
    for file in files_list:
        num_step = steps_by_file[file]
        pred_file = f'pred_step_{num_step}.txt'
        # execute() reports a failing command itself; a failed translation
        # leaves no prediction file, so the BLEU step below fails as well.
        execute(
            f'python OpenNMT-py/translate.py -model ./models/{file} '
            '-src ./Corpora/En2Fa-Translation/Dev/dev.en '
            f'-output ./predictions/{pred_file} -replace_unk'
        )
        bleu_line, _ = execute(
            'perl OpenNMT-py/tools/multi-bleu.perl '
            './Corpora/En2Fa-Translation/Dev/dev.fa '
            f'< ./predictions/{pred_file}'
        )
        print(f'Model {file} evaluation finished.')
        if not bleu_line:
            # Skip the entry: a line without a BLEU score would crash the
            # cell that parses bleu_results.txt.
            print(f'No BLEU output for {file}; not recorded.')
            continue
        print(bleu_line)
        bleu_file.write(f'{num_step} {bleu_line}\n')

In [0]:
# Translate the test set with the final checkpoint (step 50000) and write the
# predictions to ./predictions/test.txt, the file the test BLEU cell reads.
# With -verbose the run prints each source sentence with its prediction and
# score (the SENT / PRED / PRED SCORE lines in the output below).
!python OpenNMT-py/translate.py \
-model ./models/trans_step_50000.pt \
-src ./Corpora/En2Fa-Translation/Test/test.en \
-output ./predictions/test.txt \
-replace_unk -verbose

[2020-05-28 15:41:54,759 INFO] Translating shard 0.

SENT 1: ['hello', ',', 'do', 'we', 'drive', 'together', 'to', 'Hanover', 'on', 'the', 'twenty-eighth', 'of', 'March', '?']
PRED 1: سلام ، ما میتوانیم در مزرعه پرورش صحبت کنیم ؟
PRED SCORE: -9.2768

SENT 2: ['it', 'is', 'more', 'comfortable', 'by', 'train', '.']
PRED 2: آن کمی pool .
PRED SCORE: -5.8298

SENT 3: ['do', 'you', 'go', 'by', 'car', 'and', 'I', 'go', 'by', 'train', '?']
PRED 3: آیا شما تا یازده به خانه من بیایی ؟
PRED SCORE: -7.6610

SENT 4: ['I', 'would', 'like', 'to', 'go', 'by', 'train', '.', 'and', 'what', 'would', 'you', 'like', '?']
PRED 4: من دوست دارم تا شما بپرسم . اگر شما برای شما چطور است ؟
PRED SCORE: -13.5178

SENT 5: ['if', 'we', 'take', 'the', '$I-$C-$E', 'train', 'at', 'six', 'past', 'seven', ',', 'we', 'will', 'arrive', 'at', 'twenty-five', 'past', 'eight', '.']
PRED 5: اگر ما ساعت ساعت شش صبح ، ما ما میتوانیم ساعت ساعت شش بعد از ظهر .
PRED SCORE: -15.0150

SENT 6: ['which', 'cafe', '?']
PRED 6: کجا ؟
PRED

In [0]:
# Multi-reference BLEU of the test predictions against test.fa0 ... test.fa3.
# multi-bleu.perl takes a single reference *stem* and loads <stem>0, <stem>1, ...
# on its own. Only the first positional argument is read, so passing the four
# files individually would score against the first one only.
!perl OpenNMT-py/tools/multi-bleu.perl \
./Corpora/En2Fa-Translation/Test/test.fa < ./predictions/test.txt

In [0]:
# Collect (step, accuracy, perplexity) from every training report line in stats.txt.
# A report line is expected to contain "Step <n>/<total>; acc: <a>; ppl: <p>;"
# (shape inferred from the fields extracted here). Matching on that pattern, rather
# than on fixed character offsets and a literal '/50000', keeps the cell working
# when -train_steps or the log spacing changes.
train_report = re.compile(
    r'Step\s+(\d+)/\d+;\s*acc:\s*([^;]+);\s*ppl:\s*([^;]+);'
)

accuracies = []
steps = []
perplexities = []
# `with` guarantees the log file is closed once it has been read.
with open('./stats.txt', 'r') as results:
    for line in results:
        match = train_report.search(line)
        if match is None:
            # Not a training report (e.g. validation or checkpoint messages).
            continue
        step_text, acc_text, ppl_text = match.groups()
        # Convert everything first so the three lists always stay the same length.
        step, accuracy, perplexity = int(step_text), float(acc_text), float(ppl_text)
        steps.append(step)
        accuracies.append(accuracy)
        perplexities.append(perplexity)

In [0]:
# Training curves: one figure each for accuracy and perplexity over training steps.
# Explicit figure/axes objects are used so the plots do not depend on whichever
# figure happens to be "current" in pyplot's global state.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(steps, accuracies)
ax_acc.set(title='Train data accuracy', xlabel='steps', ylabel='accuracy')

fig_ppl, ax_ppl = plt.subplots()
ax_ppl.plot(steps, perplexities)
ax_ppl.set(title='Train data perplexity', xlabel='steps', ylabel='perplexity')

# Render the figures without echoing the return value of the last call.
plt.show()

In [0]:
# Load the per-checkpoint BLEU scores from bleu_results.txt, whose lines have
# the form "<step> BLEU = <score>, ..." (as written by the evaluation cell).
bleu_marker = 'BLEU = '
bleu_scores = []
steps = []  # checkpoint steps in thousands, matching the 'steps (k)' axis label
# `with` guarantees the results file is closed once it has been read.
with open('./bleu_results.txt', 'r') as results:
    for line in results:
        step_text, _, bleu_text = line.strip().partition(' ')
        if not bleu_text.startswith(bleu_marker):
            # Blank or malformed line, e.g. a checkpoint whose evaluation failed.
            continue
        # The score is the number between "BLEU = " and the first comma.
        score = float(bleu_text[len(bleu_marker):bleu_text.index(',')])
        # Parse the whole step number and scale it, instead of chopping digits.
        step_k = int(step_text) / 1000
        bleu_scores.append(score)
        steps.append(step_k)

In [0]:
# BLEU per checkpoint. The scores in bleu_results.txt come from translating the
# dev set (Dev/dev.en scored against Dev/dev.fa), so the figure is titled accordingly.
fig_bleu, ax_bleu = plt.subplots()
ax_bleu.plot(steps, bleu_scores)
ax_bleu.set(title='Dev data BLEU', xlabel='steps (k)', ylabel='BLEU')

# Render the figure without echoing the return value of the last call.
plt.show()

In [0]:
# Disabled experiment: the same training command as the LSTM run earlier in the
# notebook, differing only in -rnn_size (256 instead of 512) and -rnn_type
# ("GRU" instead of "LSTM").
# It reuses the same -save_model prefix (./models/trans) and -log_file
# (stats.txt), so the outputs of the two runs would collide if both were
# executed in the same workspace; change those paths before enabling it.
# !python OpenNMT-py/train.py \
# -data ./data/trans \
# -src_word_vec_size 512 \
# -tgt_word_vec_size 512 \
# -encoder_type "rnn" \
# -decoder_type "rnn" \
# -layers 6 \
# -rnn_size 256 \
# -rnn_type "GRU" \
# -save_model ./models/trans \
# -save_checkpoint_steps 2500 \
# -gpu_ranks 0 \
# -world_size 1 \
# -seed 2020 \
# -batch_size 32 \
# -normalization "sents" \
# -train_steps 50000 \
# -early_stopping 0 \
# -optim "adam" \
# -max_grad_norm 5 \
# -dropout 0.4 \
# -learning_rate 0.001 \
# -learning_rate_decay 0.8 \
# -start_decay_steps 10000 \
# -decay_steps 5000 \
# -decay_method "none" \
# -warmup_steps 1000 \
# -report_every 500 \
# -log_file "stats.txt"