In [1]:
import mxnet as mx
import gluonnlp as nlp

import numpy as np
import os
import random
import sacremoses
import time
from tqdm.notebook import tqdm as tqdm

# Local Libraries
import nmt
import dataprocessor
import utils
import nmt.transformer_hparams

# Seeds for reproducibility: NumPy, Python's `random` and MXNet each keep their
# own generator, so every one of them is seeded explicitly.
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

# Device selection: train on the first GPU when MXNet can see one, otherwise
# fall back to the CPU so that the notebook still runs on a GPU-less machine.
# To force the CPU, replace the expression below with `mx.cpu()`.
ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()

[nltk_data] Downloading package punkt to /home/andreto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andreto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/andreto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# IWSLT2015 corpus: training, validation and test splits

# Language pair, and the maximum source/target lengths handed to the data transform
src_lang, tgt_lang = "vi", "en"
src_max_len, tgt_max_len = 50, 50

# The three splits are loaded with identical language settings.
iwslt_train_text, iwslt_val_text, iwslt_test_text = (
    nlp.data.IWSLT2015(segment, src_lang=src_lang, tgt_lang=tgt_lang)
    for segment in ("train", "val", "test"))

# Source and target vocabularies are taken from the training split.
iwslt_src_vocab, iwslt_tgt_vocab = (
    iwslt_train_text.src_vocab,
    iwslt_train_text.tgt_vocab)

  'Detected a corrupted index in the deserialize vocabulary. '


In [3]:
# Number of sentence pairs in each split, one line per split
split_sizes = (
    ("Length of train set:", len(iwslt_train_text)),
    ("Length of val set  :", len(iwslt_val_text)),
    ("Length of test set :", len(iwslt_test_text)))
print(*("{} {}".format(label, size) for label, size in split_sizes), sep="\n")

Length of train set: 133166
Length of val set  : 1553
Length of test set : 1268


In [4]:
# Dataset processing with the project's TrainValDataTransform
# (clipping, tokenizing, indexing and adding of EOS (src/tgt) / BOS (tgt))
def _process_split(text_dataset):
    """Eagerly apply a freshly built TrainValDataTransform to one text split."""
    split_transform = dataprocessor.TrainValDataTransform(
        iwslt_src_vocab, iwslt_tgt_vocab, src_max_len, tgt_max_len)
    return text_dataset.transform(split_transform, lazy=False)


iwslt_train_processed = _process_split(iwslt_train_text)
iwslt_val_processed = _process_split(iwslt_val_text)
iwslt_test_processed = _process_split(iwslt_test_text)

In [5]:
# Reference target sentences (validation and test), later passed to the BLEU computation
def fetch_tgt_sentence(src, tgt):
    """Return the target side of a (src, tgt) pair as a list of whitespace-separated tokens."""
    return tgt.split()


val_tgt_sentences = list(iwslt_val_text.transform(fetch_tgt_sentence))
test_tgt_sentences = list(iwslt_test_text.transform(fetch_tgt_sentence))

In [6]:
# Gluon datasets whose samples carry the sequence lengths next to the token ids.
# Validation/test samples additionally carry their position in the split, which
# the evaluation code uses to put translations back in corpus order; the
# training samples are consumed without such an index.
def _with_lengths(src, tgt):
    """Extend a (src, tgt) pair with the lengths of both sequences."""
    return src, tgt, len(src), len(tgt)


def _indexed_dataset(processed):
    """Wrap ``processed`` in a SimpleDataset of (src, tgt, len(src), len(tgt), position) tuples."""
    return mx.gluon.data.SimpleDataset(
        [(pair[0], pair[1], len(pair[0]), len(pair[1]), position)
         for position, pair in enumerate(processed)])


iwslt_train_transformed = iwslt_train_processed.transform(_with_lengths, lazy=False)
iwslt_val_dataset = _indexed_dataset(iwslt_val_processed)
iwslt_test_dataset = _indexed_dataset(iwslt_test_processed)

In [7]:
# Hyperparameters for Dataloaders and Training.
# `hparams` is just another name for the project module `nmt.transformer_hparams`
# (imported in the first cell), so assigning attributes on `hparams` changes the
# values stored on that module.
hparams = nmt.transformer_hparams

In [8]:
# Values set on `hparams` for this run.
# Learning-rate notes from earlier runs:
#   hparams.lr = 0.0003 achieves 21.44 test_bleu: qualitative evaluation didn't work
#   hparams.lr = 0.0001 achieves 19.66 test_bleu: qualitative evaluation worked
hparam_overrides = {
    "num_hidden": 1024,
    "num_layers": 2,
    "dropout": 0.2,
    "num_buckets": 5,
    "lr": 0.0002,
    "clip": 5,
    "epochs": 12,
}
for hparam_name, hparam_value in hparam_overrides.items():
    setattr(hparams, hparam_name, hparam_value)

In [9]:
# Create Gluon Samplers and DataLoaders

# Helper: collect the (source length, target length) pair of every sample
def get_data_lengths(dataset):
    """Return a list with one ``(src_len, tgt_len)`` tuple per sample of ``dataset``.

    The two lengths are read from the third and fourth field of each sample;
    the samples are visited through ``dataset.transform``.
    """
    def _pick_lengths(*fields):
        return fields[2], fields[3]

    return list(dataset.transform(_pick_lengths))

# Bucket scheme shared by the train / val / test samplers
bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)

# (src_len, tgt_len) of every sample, per split
iwslt_train_lengths, iwslt_val_lengths, iwslt_test_lengths = map(
    get_data_lengths,
    (iwslt_train_transformed, iwslt_val_dataset, iwslt_test_dataset))

_batchify = nlp.data.batchify

# Training batches: padded source, padded target, source lengths, target lengths
train_batchify_fn = _batchify.Tuple(
    _batchify.Pad(),
    _batchify.Pad(),
    _batchify.Stack(dtype='float32'),
    _batchify.Stack(dtype='float32'))

# Evaluation batches have one more stacked field: the sample's position index
test_batchify_fn = _batchify.Tuple(
    _batchify.Pad(),
    _batchify.Pad(),
    _batchify.Stack(dtype='float32'),
    _batchify.Stack(dtype='float32'),
    _batchify.Stack())

# Target-side lengths only (last entry of every length pair)
target_val_lengths = [length_pair[-1] for length_pair in iwslt_val_lengths]
target_test_lengths = [length_pair[-1] for length_pair in iwslt_test_lengths]

# Sampler settings that are identical for the train / val / test samplers
_shared_sampler_args = dict(
    num_buckets=hparams.num_buckets,
    ratio=0,
    use_average_length=False,
    bucket_scheme=bucket_scheme)

# Training: shuffled bucketed batches, served through the sharded data loader
train_batch_sampler = nlp.data.FixedBucketSampler(
    lengths=iwslt_train_lengths,
    batch_size=hparams.batch_size,
    shuffle=True,
    num_shards=0,
    **_shared_sampler_args)

train_data_loader = nlp.data.ShardedDataLoader(
    iwslt_train_transformed,
    batch_sampler=train_batch_sampler,
    batchify_fn=train_batchify_fn,
    num_workers=8)


def _build_eval_pipeline(dataset, lengths):
    """Return ``(batch_sampler, data_loader)`` for an evaluation split (no shuffling)."""
    sampler = nlp.data.FixedBucketSampler(
        lengths=lengths,
        batch_size=hparams.test_batch_size,
        shuffle=False,
        **_shared_sampler_args)
    loader = mx.gluon.data.DataLoader(
        dataset,
        batch_sampler=sampler,
        batchify_fn=test_batchify_fn,
        num_workers=8)
    return sampler, loader


# Validation and test share the evaluation batch size and batchify function
val_batch_sampler, val_data_loader = _build_eval_pipeline(
    iwslt_val_dataset, iwslt_val_lengths)
test_batch_sampler, test_data_loader = _build_eval_pipeline(
    iwslt_test_dataset, iwslt_test_lengths)

  'Padding value is not given and will be set automatically to 0 '


In [10]:
# Transformer Model
# The factory returns the encoder, the decoder and a one-step-ahead decoder.
transformer_blocks = nlp.model.transformer.get_transformer_encoder_decoder(
    num_layers=hparams.num_layers,
    hidden_size=hparams.num_hidden,
    dropout=hparams.dropout)
transformer_encoder, transformer_decoder, transformer_one_step_ahead_decoder = transformer_blocks

# Full translation model built around the three blocks and both vocabularies.
# embed_size is taken from hparams.num_units
# (alternative kept for reference: embed_size=hparams.num_hidden).
transformer_model = nlp.model.translation.NMTModel(
    src_vocab=iwslt_src_vocab,
    tgt_vocab=iwslt_tgt_vocab,
    encoder=transformer_encoder,
    decoder=transformer_decoder,
    one_step_ahead_decoder=transformer_one_step_ahead_decoder,
    embed_size=hparams.num_units,
    prefix='transformer_')

# static_alloc is reused when the loss function is hybridized.
static_alloc = True
transformer_model.initialize(init=mx.init.Xavier(magnitude=1.0), ctx=ctx)
transformer_model.hybridize(static_alloc=static_alloc)

In [11]:
# Beam-search translator around the transformer model; the scorer's
# alpha and K come from the length-penalty settings in hparams.
beam_scorer = nlp.model.BeamSearchScorer(alpha=hparams.lp_alpha, K=hparams.lp_k)

transformer_translator = nmt.translation.BeamSearchTranslator(
    model=transformer_model,
    beam_size=hparams.beam_size,
    scorer=beam_scorer,
    max_length=150)

In [12]:
# Loss function
# The notebook calls it as loss_function(pred, label, valid_length) in both the
# evaluation function and the training loop. It is hybridized with the same
# `static_alloc` flag that the model cell uses for the model itself.
loss_function = nlp.loss.MaskedSoftmaxCELoss()
loss_function.hybridize(static_alloc=static_alloc)

In [13]:
# Evaluation function (also called from the training loop for validation)
def evaluate(
    data_loader,
    model,
    translator,
    loss_function,
    tgt_vocab):
    """Compute the average loss and the beam-search translations for a data loader.

    Parameters
    ----------
    data_loader : iterable
        Yields batches ``(src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids)``.
    model : callable
        Called as ``model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)``;
        its first return value is handed to ``loss_function``.
    translator : object
        Provides ``translate(src_seq=..., src_valid_length=...)`` returning a 3-tuple whose
        first and third entries are the samples and their valid lengths.
    loss_function : callable
        Called as ``loss_function(pred, label, valid_length)``.
    tgt_vocab : object
        ``tgt_vocab.idx_to_token`` maps target token ids back to tokens.

    Returns
    -------
    avg_loss
        Accumulated loss divided by the summed ``tgt_valid_length - 1`` of all batches.
    real_translation_out : list
        Token lists of the first beam entry; each list is stored at the index given by
        the sample's instance id.

    Notes
    -----
    Batches are moved to the notebook-level ``ctx`` device.
    """
    instance_ids = []
    hypotheses = []
    token_total = 0
    loss_total = 0.0

    for src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids in tqdm(data_loader):
        src_seq, tgt_seq, src_valid_length, tgt_valid_length = (
            batch_part.as_in_context(ctx)
            for batch_part in (src_seq, tgt_seq, src_valid_length, tgt_valid_length))

        # Loss: the model sees the target without its last position and is
        # scored against the target shifted by one position.
        decoder_input, decoder_target = tgt_seq[:, :-1], tgt_seq[:, 1:]
        target_steps = tgt_valid_length - 1
        out, _ = model(src_seq, decoder_input, src_valid_length, target_steps)
        batch_loss = loss_function(out, decoder_target, target_steps).sum().asscalar()

        instance_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
        loss_total += batch_loss * (tgt_seq.shape[1] - 1)
        token_total += target_steps.sum().asscalar()

        # Translation: keep only the first beam entry of every sentence
        samples, _, sample_valid_length = translator.translate(
            src_seq=src_seq,
            src_valid_length=src_valid_length)
        top_samples = samples[:, 0, :].asnumpy()
        top_lengths = sample_valid_length[:, 0].asnumpy()

        # The first and the last valid position are cut off (presumably BOS / EOS).
        hypotheses.extend(
            [tgt_vocab.idx_to_token[token_id] for token_id in sample[1:(length - 1)]]
            for sample, length in zip(top_samples, top_lengths))

    avg_loss = loss_total / token_total

    # Bucketed batching changes the sample order; the instance ids restore it.
    real_translation_out = [None] * len(instance_ids)
    for position, sentence in zip(instance_ids, hypotheses):
        real_translation_out[position] = sentence

    return avg_loss, real_translation_out

In [14]:
# Let's train
# Trainer over all model parameters; optimizer name and learning rate come from hparams.
trainer = mx.gluon.Trainer(
    params=transformer_model.collect_params(),
    optimizer=hparams.optimizer,
    optimizer_params={'learning_rate': hparams.lr})

# Best validation BLEU seen so far; decides when the parameters are saved.
best_valid_bleu = 0.0

# Training loop. Every epoch: one pass over train_data_loader, a validation run,
# a checkpoint when the validation BLEU improves, and (late in training) a
# learning-rate decay.
for epoch_id in tqdm(range(hparams.epochs)):

    # Running statistics for the progress line; they are reset at the start of
    # every epoch and after every printed progress line.
    log_loss = 0
    log_denom = 0
    log_avg_gnorm = 0
    log_wc = 0
    log_start_time = time.time()
    
    for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length) in enumerate(tqdm(train_data_loader)):
        
        # Move the batch to the training device
        src_seq = src_seq.as_in_context(ctx)
        tgt_seq = tgt_seq.as_in_context(ctx)
        src_valid_length = src_valid_length.as_in_context(ctx)
        tgt_valid_length = tgt_valid_length.as_in_context(ctx)
        
        with mx.autograd.record():
            # The model sees the target without its last position and is scored
            # against the target shifted by one position.
            out, _ = transformer_model(
                src_seq,
                tgt_seq[:, :-1],
                src_valid_length,
                tgt_valid_length - 1)

            # Batch mean of the loss, rescaled by the padded target length
            # (presumably the loss averages over the padded time axis, so this
            # recovers a per-sentence sum -- TODO confirm).
            loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
            loss = loss * (tgt_seq.shape[1] - 1)
            # Logging statistics: loss scaled by the batch size, and the number of
            # target positions. Both are NDArrays at this point and are converted
            # to scalars after the optimizer step.
            log_loss += loss * tgt_seq.shape[0]
            log_denom += (tgt_valid_length - 1).sum()
            # Normalise by the mean valid target length before back-propagation
            loss = loss / (tgt_valid_length - 1).mean()
            loss.backward()
        
        # Clip the global gradient norm to hparams.clip; the returned norm is logged.
        grads = [p.grad(ctx) for p in transformer_model.collect_params().values() if p.grad_req != "null"]
        gnorm = mx.gluon.utils.clip_global_norm(grads, hparams.clip)
        # step(1): the loss was already normalised above, so a batch size of 1 is passed.
        trainer.step(1)
        
        # Word counts (source + target) feed the throughput figure
        src_wc = src_valid_length.sum().asscalar()
        tgt_wc = (tgt_valid_length - 1).sum().asscalar()
        log_loss = log_loss.asscalar()
        log_denom = log_denom.asscalar()
        log_avg_gnorm += gnorm
        log_wc += src_wc + tgt_wc
        
        # Progress line every hparams.log_interval batches. Batches after the last
        # full interval of an epoch are not reported.
        if (batch_id + 1) % hparams.log_interval == 0:
            wps = log_wc / (time.time() - log_start_time)
            print("[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, "
                         "throughput={:.2f}K wps, wc={:.2f}K"
                         .format(epoch_id, batch_id + 1, len(train_data_loader),
                                 log_loss / log_denom,
                                 np.exp(log_loss / log_denom),
                                 log_avg_gnorm / hparams.log_interval,
                                 wps / 1000, log_wc / 1000))
            
            log_start_time = time.time()
            log_loss = 0
            log_denom = 0
            log_avg_gnorm = 0
            log_wc = 0
    
    # End of epoch: loss and translations on the validation split
    valid_loss, valid_translation_out = evaluate(
        val_data_loader,
        transformer_model,
        transformer_translator,
        loss_function,
        iwslt_tgt_vocab)

    # Only the first of the five values returned by compute_bleu is used.
    valid_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([val_tgt_sentences], valid_translation_out)
    print("[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}"
          .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))

    # Checkpoint whenever the validation BLEU improves on the best value so far
    if valid_bleu_score > best_valid_bleu:
        best_valid_bleu = valid_bleu_score
        print("Save best parameters to {}".format(hparams.file_name))
        transformer_model.save_parameters(hparams.file_name)
    
    # From epoch number (2 * epochs) // 3 onwards (1-based), the learning rate is
    # multiplied by hparams.lr_update_factor after every epoch.
    if epoch_id + 1 >= (hparams.epochs * 2) // 3:
        new_lr = trainer.learning_rate * hparams.lr_update_factor
        print("Learning rate change to {}".format(new_lr))
        trainer.set_learning_rate(new_lr)

# Reload the saved parameters (when the file exists) and score the validation split again
best_params_path = hparams.file_name
if os.path.exists(best_params_path):
    transformer_model.load_parameters(best_params_path)

valid_loss, valid_translation_out = evaluate(
    data_loader=val_data_loader,
    model=transformer_model,
    translator=transformer_translator,
    loss_function=loss_function,
    tgt_vocab=iwslt_tgt_vocab)

# Only the first of the five values returned by compute_bleu is used.
valid_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu(
    [val_tgt_sentences], valid_translation_out)
best_valid_summary = "Best model valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}".format(
    valid_loss, np.exp(valid_loss), valid_bleu_score * 100)
print(best_valid_summary)

# Score the test split with the parameters currently held by transformer_model
test_loss, test_translation_out = evaluate(
    test_data_loader,
    transformer_model,
    transformer_translator,
    loss_function,
    iwslt_tgt_vocab)

# Only the first of the five values returned by compute_bleu (the BLEU score) is used;
# it is reported as a percentage next to the loss and its exponential (perplexity).
test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([test_tgt_sentences], test_translation_out)
print("Best model test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}"
      .format(test_loss, np.exp(test_loss), test_bleu_score * 100))

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/1043 [00:00<?, ?it/s]

Extension horovod.torch has not been built: /home/ubuntu/anaconda3/envs/mxnet_p37/lib/python3.7/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-37m-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
[2022-06-19 14:06:32.626 ip-172-31-28-47:4312 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-06-19 14:06:32.654 ip-172-31-28-47:4312 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
[Epoch 0 Batch 100/1043] loss=6.7084, ppl=819.2355, gnorm=1.8441, throughput=66.23K wps, wc=606.57K
[Epoch 0 Batch 200/1043] loss=5.4545, ppl=233.7993, gnorm=0.8658, throughput=78.02K wps, wc=584.18K
[Epoch 0 Batch 300/1043] loss=5.0439, ppl=155.0811, gnorm=0.7771, throughput=79.05K wps, wc=592.78K
[Epoch 0 Batch 400/1043] loss=4.7569, ppl=116.3892, gnorm=0.7737, throughput=77.17K wps, wc=571.26K
[Epoch 0 Batch 500/1043] loss=4.5697, ppl=96.5154

  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 0] valid Loss=3.6623, valid ppl=38.9501, valid bleu=8.60
Save best parameters to transformer_vi_en_512.params


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 1 Batch 100/1043] loss=3.9303, ppl=50.9234, gnorm=0.9466, throughput=78.88K wps, wc=593.66K
[Epoch 1 Batch 200/1043] loss=3.8821, ppl=48.5280, gnorm=0.8841, throughput=80.44K wps, wc=631.62K
[Epoch 1 Batch 300/1043] loss=3.7671, ppl=43.2557, gnorm=0.8781, throughput=78.21K wps, wc=599.65K
[Epoch 1 Batch 400/1043] loss=3.6893, ppl=40.0150, gnorm=0.9162, throughput=77.84K wps, wc=585.70K
[Epoch 1 Batch 500/1043] loss=3.5871, ppl=36.1288, gnorm=0.9167, throughput=74.58K wps, wc=567.14K
[Epoch 1 Batch 600/1043] loss=3.5016, ppl=33.1693, gnorm=0.9948, throughput=74.13K wps, wc=521.63K
[Epoch 1 Batch 700/1043] loss=3.5757, ppl=35.7195, gnorm=0.9630, throughput=78.12K wps, wc=591.94K
[Epoch 1 Batch 800/1043] loss=3.4869, ppl=32.6842, gnorm=0.9859, throughput=76.83K wps, wc=563.74K
[Epoch 1 Batch 900/1043] loss=3.4072, ppl=30.1794, gnorm=0.9480, throughput=77.68K wps, wc=578.06K
[Epoch 1 Batch 1000/1043] loss=3.3057, ppl=27.2668, gnorm=0.9951, throughput=73.23K wps, wc=523.53K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 1] valid Loss=3.1175, valid ppl=22.5907, valid bleu=12.38
Save best parameters to transformer_vi_en_512.params


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 2 Batch 100/1043] loss=3.2211, ppl=25.0565, gnorm=1.0404, throughput=75.46K wps, wc=542.87K
[Epoch 2 Batch 200/1043] loss=3.2101, ppl=24.7816, gnorm=1.0168, throughput=77.13K wps, wc=575.69K
[Epoch 2 Batch 300/1043] loss=3.1919, ppl=24.3334, gnorm=1.0130, throughput=77.58K wps, wc=568.29K
[Epoch 2 Batch 400/1043] loss=3.1759, ppl=23.9478, gnorm=1.0601, throughput=77.36K wps, wc=578.89K
[Epoch 2 Batch 500/1043] loss=3.1396, ppl=23.0946, gnorm=1.0487, throughput=77.97K wps, wc=590.93K
[Epoch 2 Batch 600/1043] loss=3.1260, ppl=22.7835, gnorm=1.1220, throughput=76.77K wps, wc=561.47K
[Epoch 2 Batch 700/1043] loss=3.1379, ppl=23.0560, gnorm=1.0278, throughput=79.60K wps, wc=624.59K
[Epoch 2 Batch 800/1043] loss=3.0243, ppl=20.5802, gnorm=1.0761, throughput=76.45K wps, wc=564.54K
[Epoch 2 Batch 900/1043] loss=3.0408, ppl=20.9211, gnorm=1.0561, throughput=77.88K wps, wc=583.36K
[Epoch 2 Batch 1000/1043] loss=2.9381, ppl=18.8802, gnorm=1.1090, throughput=74.20K wps, wc=528.11K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 2] valid Loss=2.8164, valid ppl=16.7168, valid bleu=15.07
Save best parameters to transformer_vi_en_512.params


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 3 Batch 100/1043] loss=2.9372, ppl=18.8634, gnorm=1.0504, throughput=80.26K wps, wc=630.11K
[Epoch 3 Batch 200/1043] loss=2.8474, ppl=17.2424, gnorm=1.1350, throughput=77.10K wps, wc=572.70K
[Epoch 3 Batch 300/1043] loss=2.8795, ppl=17.8057, gnorm=1.1045, throughput=76.46K wps, wc=598.93K
[Epoch 3 Batch 400/1043] loss=2.7426, ppl=15.5277, gnorm=1.1372, throughput=74.76K wps, wc=537.22K
[Epoch 3 Batch 500/1043] loss=2.7151, ppl=15.1064, gnorm=1.1466, throughput=73.82K wps, wc=529.51K
[Epoch 3 Batch 600/1043] loss=2.8010, ppl=16.4608, gnorm=1.0932, throughput=78.91K wps, wc=610.71K
[Epoch 3 Batch 700/1043] loss=2.6975, ppl=14.8428, gnorm=1.1676, throughput=73.69K wps, wc=519.76K
[Epoch 3 Batch 800/1043] loss=2.6737, ppl=14.4930, gnorm=1.1357, throughput=74.42K wps, wc=539.30K
[Epoch 3 Batch 900/1043] loss=2.7145, ppl=15.0971, gnorm=1.1404, throughput=76.50K wps, wc=581.69K
[Epoch 3 Batch 1000/1043] loss=2.7309, ppl=15.3462, gnorm=1.1503, throughput=76.93K wps, wc=569.12K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 3] valid Loss=2.6274, valid ppl=13.8376, valid bleu=17.55
Save best parameters to transformer_vi_en_512.params


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 4 Batch 100/1043] loss=2.5427, ppl=12.7137, gnorm=1.1242, throughput=73.01K wps, wc=568.46K
[Epoch 4 Batch 200/1043] loss=2.5178, ppl=12.4007, gnorm=1.1387, throughput=75.09K wps, wc=542.90K
[Epoch 4 Batch 300/1043] loss=2.5404, ppl=12.6849, gnorm=1.1521, throughput=75.18K wps, wc=555.44K
[Epoch 4 Batch 400/1043] loss=2.5353, ppl=12.6207, gnorm=1.1682, throughput=75.14K wps, wc=543.93K
[Epoch 4 Batch 500/1043] loss=2.5616, ppl=12.9569, gnorm=1.1864, throughput=77.47K wps, wc=579.34K
[Epoch 4 Batch 600/1043] loss=2.5642, ppl=12.9904, gnorm=1.1868, throughput=78.87K wps, wc=601.62K
[Epoch 4 Batch 700/1043] loss=2.5646, ppl=12.9956, gnorm=1.1445, throughput=78.02K wps, wc=597.87K
[Epoch 4 Batch 800/1043] loss=2.4197, ppl=11.2420, gnorm=1.1771, throughput=72.84K wps, wc=507.35K
[Epoch 4 Batch 900/1043] loss=2.5877, ppl=13.2992, gnorm=1.1411, throughput=80.15K wps, wc=624.02K
[Epoch 4 Batch 1000/1043] loss=2.5248, ppl=12.4888, gnorm=1.2029, throughput=77.75K wps, wc=580.31K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 4] valid Loss=2.5413, valid ppl=12.6965, valid bleu=18.59
Save best parameters to transformer_vi_en_512.params


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 5 Batch 100/1043] loss=2.2943, ppl=9.9173, gnorm=1.1877, throughput=70.90K wps, wc=529.43K
[Epoch 5 Batch 200/1043] loss=2.4243, ppl=11.2946, gnorm=1.1439, throughput=78.70K wps, wc=611.54K
[Epoch 5 Batch 300/1043] loss=2.3747, ppl=10.7475, gnorm=1.1691, throughput=77.44K wps, wc=582.88K
[Epoch 5 Batch 400/1043] loss=2.3706, ppl=10.7039, gnorm=1.2060, throughput=75.52K wps, wc=556.31K
[Epoch 5 Batch 500/1043] loss=2.1880, ppl=8.9173, gnorm=1.2242, throughput=69.79K wps, wc=472.55K
[Epoch 5 Batch 600/1043] loss=2.3917, ppl=10.9321, gnorm=1.1742, throughput=77.69K wps, wc=586.79K
[Epoch 5 Batch 700/1043] loss=2.3410, ppl=10.3916, gnorm=1.2015, throughput=75.84K wps, wc=561.02K
[Epoch 5 Batch 800/1043] loss=2.3298, ppl=10.2760, gnorm=1.1815, throughput=75.56K wps, wc=555.63K
[Epoch 5 Batch 900/1043] loss=2.4026, ppl=11.0523, gnorm=1.1827, throughput=79.09K wps, wc=600.79K
[Epoch 5 Batch 1000/1043] loss=2.4892, ppl=12.0520, gnorm=1.1765, throughput=81.92K wps, wc=659.40K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 5] valid Loss=2.4685, valid ppl=11.8044, valid bleu=19.57
Save best parameters to transformer_vi_en_512.params


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 6 Batch 100/1043] loss=2.2380, ppl=9.3742, gnorm=1.1522, throughput=77.89K wps, wc=603.59K
[Epoch 6 Batch 200/1043] loss=2.1875, ppl=8.9132, gnorm=1.1880, throughput=76.24K wps, wc=562.11K
[Epoch 6 Batch 300/1043] loss=2.2740, ppl=9.7180, gnorm=1.1601, throughput=79.83K wps, wc=624.24K
[Epoch 6 Batch 400/1043] loss=2.2598, ppl=9.5811, gnorm=1.2093, throughput=78.05K wps, wc=603.10K
[Epoch 6 Batch 500/1043] loss=2.2568, ppl=9.5524, gnorm=1.2441, throughput=76.24K wps, wc=561.10K
[Epoch 6 Batch 600/1043] loss=2.2465, ppl=9.4548, gnorm=1.2140, throughput=76.14K wps, wc=565.55K
[Epoch 6 Batch 700/1043] loss=2.2451, ppl=9.4411, gnorm=1.2296, throughput=76.12K wps, wc=555.39K
[Epoch 6 Batch 800/1043] loss=2.2161, ppl=9.1716, gnorm=1.2531, throughput=71.75K wps, wc=537.32K
[Epoch 6 Batch 900/1043] loss=2.2698, ppl=9.6774, gnorm=1.2136, throughput=77.77K wps, wc=591.29K
[Epoch 6 Batch 1000/1043] loss=2.2374, ppl=9.3689, gnorm=1.2336, throughput=75.75K wps, wc=554.05K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 6] valid Loss=2.4405, valid ppl=11.4787, valid bleu=19.49


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 7 Batch 100/1043] loss=2.1473, ppl=8.5616, gnorm=1.1758, throughput=74.77K wps, wc=587.59K
[Epoch 7 Batch 200/1043] loss=2.1825, ppl=8.8688, gnorm=1.1932, throughput=79.83K wps, wc=622.04K
[Epoch 7 Batch 300/1043] loss=2.0657, ppl=7.8909, gnorm=1.2132, throughput=73.69K wps, wc=540.54K
[Epoch 7 Batch 400/1043] loss=2.0335, ppl=7.6411, gnorm=1.2544, throughput=71.88K wps, wc=502.07K
[Epoch 7 Batch 500/1043] loss=2.1515, ppl=8.5977, gnorm=1.2361, throughput=76.74K wps, wc=583.79K
[Epoch 7 Batch 600/1043] loss=2.1973, ppl=9.0004, gnorm=1.2101, throughput=78.78K wps, wc=609.75K
[Epoch 7 Batch 700/1043] loss=2.1568, ppl=8.6438, gnorm=1.2432, throughput=75.91K wps, wc=577.30K
[Epoch 7 Batch 800/1043] loss=2.0550, ppl=7.8065, gnorm=1.2672, throughput=72.33K wps, wc=505.00K
[Epoch 7 Batch 900/1043] loss=2.1399, ppl=8.4989, gnorm=1.2154, throughput=77.48K wps, wc=590.09K
[Epoch 7 Batch 1000/1043] loss=2.1901, ppl=8.9359, gnorm=1.2096, throughput=78.45K wps, wc=599.85K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 7] valid Loss=2.4142, valid ppl=11.1805, valid bleu=20.13
Save best parameters to transformer_vi_en_512.params
Learning rate change to 0.0001


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 8 Batch 100/1043] loss=1.9423, ppl=6.9748, gnorm=1.1630, throughput=75.39K wps, wc=556.55K
[Epoch 8 Batch 200/1043] loss=2.0301, ppl=7.6145, gnorm=1.1573, throughput=78.74K wps, wc=601.04K
[Epoch 8 Batch 300/1043] loss=2.0363, ppl=7.6622, gnorm=1.1539, throughput=79.10K wps, wc=614.51K
[Epoch 8 Batch 400/1043] loss=1.9333, ppl=6.9120, gnorm=1.2045, throughput=74.46K wps, wc=539.14K
[Epoch 8 Batch 500/1043] loss=1.9802, ppl=7.2445, gnorm=1.1950, throughput=76.23K wps, wc=561.68K
[Epoch 8 Batch 600/1043] loss=1.9891, ppl=7.3091, gnorm=1.2018, throughput=73.70K wps, wc=567.04K
[Epoch 8 Batch 700/1043] loss=2.0401, ppl=7.6916, gnorm=1.1765, throughput=78.33K wps, wc=600.47K
[Epoch 8 Batch 800/1043] loss=2.0376, ppl=7.6720, gnorm=1.1732, throughput=79.50K wps, wc=610.25K
[Epoch 8 Batch 900/1043] loss=1.9602, ppl=7.1007, gnorm=1.1951, throughput=75.80K wps, wc=562.08K
[Epoch 8 Batch 1000/1043] loss=1.9308, ppl=6.8950, gnorm=1.2208, throughput=73.77K wps, wc=530.23K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 8] valid Loss=2.3786, valid ppl=10.7895, valid bleu=20.54
Save best parameters to transformer_vi_en_512.params
Learning rate change to 5e-05


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 9 Batch 100/1043] loss=1.9284, ppl=6.8782, gnorm=1.1618, throughput=73.38K wps, wc=576.48K
[Epoch 9 Batch 200/1043] loss=1.8704, ppl=6.4907, gnorm=1.1736, throughput=75.75K wps, wc=550.86K
[Epoch 9 Batch 300/1043] loss=2.0102, ppl=7.4649, gnorm=1.1480, throughput=80.92K wps, wc=633.26K
[Epoch 9 Batch 400/1043] loss=1.9573, ppl=7.0804, gnorm=1.1929, throughput=76.97K wps, wc=579.86K
[Epoch 9 Batch 500/1043] loss=1.8593, ppl=6.4192, gnorm=1.1888, throughput=75.05K wps, wc=543.97K
[Epoch 9 Batch 600/1043] loss=1.8956, ppl=6.6567, gnorm=1.1880, throughput=75.21K wps, wc=547.23K
[Epoch 9 Batch 700/1043] loss=1.8804, ppl=6.5562, gnorm=1.1867, throughput=76.76K wps, wc=580.76K
[Epoch 9 Batch 800/1043] loss=1.8278, ppl=6.2201, gnorm=1.1924, throughput=73.34K wps, wc=533.16K
[Epoch 9 Batch 900/1043] loss=1.9034, ppl=6.7089, gnorm=1.1740, throughput=77.18K wps, wc=583.95K
[Epoch 9 Batch 1000/1043] loss=1.9554, ppl=7.0671, gnorm=1.1755, throughput=78.62K wps, wc=601.85K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 9] valid Loss=2.3591, valid ppl=10.5811, valid bleu=20.73
Save best parameters to transformer_vi_en_512.params
Learning rate change to 2.5e-05


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 10 Batch 100/1043] loss=1.9649, ppl=7.1339, gnorm=1.1642, throughput=79.40K wps, wc=616.40K
[Epoch 10 Batch 200/1043] loss=1.8010, ppl=6.0560, gnorm=1.1976, throughput=73.72K wps, wc=530.46K
[Epoch 10 Batch 300/1043] loss=1.7824, ppl=5.9442, gnorm=1.1884, throughput=74.01K wps, wc=528.15K
[Epoch 10 Batch 400/1043] loss=1.8086, ppl=6.1018, gnorm=1.1734, throughput=75.41K wps, wc=552.92K
[Epoch 10 Batch 500/1043] loss=1.7766, ppl=5.9097, gnorm=1.1915, throughput=73.25K wps, wc=518.86K
[Epoch 10 Batch 600/1043] loss=1.9344, ppl=6.9196, gnorm=1.1462, throughput=80.35K wps, wc=630.69K
[Epoch 10 Batch 700/1043] loss=1.8438, ppl=6.3204, gnorm=1.1790, throughput=75.78K wps, wc=555.67K
[Epoch 10 Batch 800/1043] loss=1.8859, ppl=6.5924, gnorm=1.1687, throughput=77.33K wps, wc=582.86K
[Epoch 10 Batch 900/1043] loss=1.9531, ppl=7.0504, gnorm=1.1451, throughput=81.08K wps, wc=647.88K
[Epoch 10 Batch 1000/1043] loss=1.9252, ppl=6.8563, gnorm=1.1675, throughput=78.71K wps, wc=611.42K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 10] valid Loss=2.3550, valid ppl=10.5381, valid bleu=21.05
Save best parameters to transformer_vi_en_512.params
Learning rate change to 1.25e-05


  0%|          | 0/1043 [00:00<?, ?it/s]

[Epoch 11 Batch 100/1043] loss=1.8764, ppl=6.5302, gnorm=1.1583, throughput=74.52K wps, wc=594.99K
[Epoch 11 Batch 200/1043] loss=1.7861, ppl=5.9664, gnorm=1.1738, throughput=75.36K wps, wc=550.29K
[Epoch 11 Batch 300/1043] loss=1.8272, ppl=6.2166, gnorm=1.1700, throughput=76.28K wps, wc=567.04K
[Epoch 11 Batch 400/1043] loss=1.8277, ppl=6.2198, gnorm=1.1865, throughput=74.99K wps, wc=535.70K
[Epoch 11 Batch 500/1043] loss=1.8415, ppl=6.3062, gnorm=1.1643, throughput=76.84K wps, wc=582.91K
[Epoch 11 Batch 600/1043] loss=1.7840, ppl=5.9537, gnorm=1.1722, throughput=75.41K wps, wc=555.74K
[Epoch 11 Batch 700/1043] loss=1.8103, ppl=6.1121, gnorm=1.1841, throughput=74.93K wps, wc=553.90K
[Epoch 11 Batch 800/1043] loss=1.9372, ppl=6.9392, gnorm=1.1759, throughput=79.58K wps, wc=618.74K
[Epoch 11 Batch 900/1043] loss=1.8652, ppl=6.4573, gnorm=1.1608, throughput=77.92K wps, wc=593.70K
[Epoch 11 Batch 1000/1043] loss=1.8979, ppl=6.6716, gnorm=1.1896, throughput=77.66K wps, wc=589.68K


  0%|          | 0/27 [00:00<?, ?it/s]

[Epoch 11] valid Loss=2.3541, valid ppl=10.5284, valid bleu=20.95
Learning rate change to 6.25e-06


  0%|          | 0/27 [00:00<?, ?it/s]

Best model valid Loss=2.3550, valid ppl=10.5381, valid bleu=21.05


  0%|          | 0/23 [00:00<?, ?it/s]

Best model test Loss=2.2326, test ppl=9.3240, test bleu=22.87'


In [18]:
print("Qualitative Evaluation: Translating from Vietnamese to English")

# Reference English sentence and its Vietnamese source (taken from Google Translate).
# NOTE(review): the source ends in "kỹ thuật", which the expected English text
# does not seem to cover -- confirm that the pair really matches.
expected_tgt_seq = "I like to read books."
src_seq = "Tôi thích đọc sách kỹ thuật."
print("Expected translation:")
print(expected_tgt_seq)
print("In Vietnamese (from Google Translate):")
print(src_seq)

# Whitespace-split the sentence, map the tokens to vocabulary ids and append EOS.
# NOTE(review): split() leaves the final period attached to the last word;
# verify this matches how the training text was tokenized.
src_sentence = iwslt_src_vocab[src_seq.split()] + [iwslt_src_vocab[iwslt_src_vocab.eos_token]]
src_npy = np.array(src_sentence, dtype=np.int32)

# Batch of one sentence on the training device, together with its length
src_nd = mx.nd.array(src_npy).reshape((1, -1)).as_in_context(ctx)
src_valid_length = mx.nd.array([src_nd.shape[1]]).as_in_context(ctx)

samples, _, sample_valid_length = transformer_translator.translate(
    src_seq=src_nd, src_valid_length=src_valid_length)

# Keep the first beam entry of every sentence
max_score_sample = samples[:, 0, :].asnumpy()
sample_valid_length = sample_valid_length[:, 0].asnumpy()

# Ids back to tokens, without the first and the last valid position
translation_out = [
    [iwslt_tgt_vocab.idx_to_token[ele] for ele in best_sample[1:(best_length - 1)]]
    for best_sample, best_length in zip(max_score_sample, sample_valid_length)]

print("The English translation is:")
print(" ".join(translation_out[0]))

Qualitative Evaluation: Translating from Vietnamese to English
Expected translation:
I like to read books.
In Vietnamese (from Google Translate):
Tôi thích đọc sách kỹ thuật.
The English translation is:
I love reading books .
