In [1]:
import mxnet as mx
import gluonnlp as nlp

import time
import random
import numpy as np
import sacremoses
from tqdm import tqdm_notebook as tqdm

# Local Libraries
import nmt
import dataprocessor
import utils
import nmt.gnmt_hparams

# Seeds for reproducibility
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

# CPU setup
# ctx = mx.cpu()
# Single GPU setup
ctx = mx.gpu(0)

[nltk_data] Downloading package punkt to /home/andreto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andreto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/andreto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Dataset Parameters
src_lang, tgt_lang = 'en', 'de'
# No limit on sentences length
src_max_len, tgt_max_len = -1, -1

In [3]:
# WMT2016 Dataset (Train and Evaluation)
wmt_train_text_bpe = nlp.data.WMT2016BPE("train", # BPE: cheapest --> cheap@@, est
                                         src_lang=src_lang,
                                         tgt_lang=tgt_lang)

wmt_train_text     = nlp.data.WMT2016("train",
                                      src_lang=src_lang,
                                      tgt_lang=tgt_lang)

wmt_test_text_bpe  = nlp.data.WMT2016BPE("newstest2016", # BPE: cheapest --> cheap@@, est
                                         src_lang=src_lang,
                                         tgt_lang=tgt_lang)

wmt_test_text      = nlp.data.WMT2016("newstest2016",
                                     src_lang=src_lang,
                                     tgt_lang=tgt_lang)

wmt_src_vocab = wmt_train_text_bpe.src_vocab
wmt_tgt_vocab = wmt_train_text_bpe.tgt_vocab

  'Detected a corrupted index in the deserialize vocabulary. '


In [4]:
# Processing datasets
# Filtering training data to a maximum number of samples,
# so that training can be handled in a reasonable time (~8 hrs)
# in single GPU setups
max_samples = int(1e6)
wmt_train_text_bpe = mx.gluon.data.SimpleDataset([wmt_train_text_bpe[i] for i in range(max_samples)])
wmt_train_text     = mx.gluon.data.SimpleDataset([wmt_train_text[i] for i in range(max_samples)])
wmt_test_text_bpe  = mx.gluon.data.SimpleDataset(wmt_test_text_bpe)
wmt_test_text      = mx.gluon.data.SimpleDataset(wmt_test_text)

In [5]:
# Dataset example (human-readable): English and German
print(wmt_test_text[16][0])
print(wmt_test_text[16][1])

By the end of the day, there would be one more death: Lamb took his own life as police closed in on him.
Bis zum Ende des Tages gab es einen weiteren Tod: Lamm nahm sich das Leben, als die Polizei ihn einkesselte.


In [6]:
# Retrieve (split) translated sequences (target)
wmt_train_tgt_sentences = wmt_train_text.transform(lambda src, tgt: tgt)
wmt_test_tgt_sentences  = wmt_test_text.transform(lambda src, tgt: tgt)
print("Sample target sentence:")
print(wmt_test_tgt_sentences[16])

Sample target sentence:
Bis zum Ende des Tages gab es einen weiteren Tod: Lamm nahm sich das Leben, als die Polizei ihn einkesselte.


In [7]:
# Dataset processing: clipping, tokenizing, indexing and adding of EOS (src/tgt) / BOS (tgt)
wmt_transform_fn = dataprocessor.TrainValDataTransform(wmt_src_vocab, wmt_tgt_vocab)

wmt_train_processed = wmt_train_text_bpe.transform(wmt_transform_fn, lazy=False)
wmt_test_processed  = wmt_test_text_bpe.transform(wmt_transform_fn, lazy=False)

wmt_train_text_with_len = wmt_train_processed.transform(nmt.utils.get_length_index_fn(), lazy=False)
wmt_test_text_with_len  = wmt_test_processed.transform(nmt.utils.get_length_index_fn(), lazy=False)

print(wmt_test_text_with_len[16][0])
print(wmt_test_text_with_len[16][1])

[ 2083 28753 16760 23875 28753 15230    28 28783 31223 12931 24017 23247
 15259   569  5971 12813 29083 20097 24348 22312 12290 24829 14439 20585
 24004 20061    62     3]
[    2  1897 31601  3259 15535  9414 18646 17382 16407 30851  9629   569
  5971 22642 23439 27119 15199  6041    28 11681 15681  7670 20454 16394
 21488 26868 28535    62     3]


In [8]:
# Batcher
wmt_batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(),                   # Source Token IDs
    nlp.data.batchify.Pad(),                   # Target Token IDs
    nlp.data.batchify.Stack(dtype='float32'),  # Source Sequence Length
    nlp.data.batchify.Stack(dtype='float32'),  # Target Sequence Length
    nlp.data.batchify.Stack())                 # Index

  'Padding value is not given and will be set automatically to 0 '


In [9]:
# Hyperparameters
hparams = nmt.gnmt_hparams

In [10]:
# Samplers
wmt_train_batch_sampler = nlp.data.FixedBucketSampler(
    lengths=wmt_train_text_with_len.transform(lambda src, tgt, src_len, tgt_len, idx: (src_len, tgt_len)),
    num_buckets=hparams.num_buckets,
    batch_size=hparams.batch_size)
print(wmt_train_batch_sampler.stats())

wmt_test_batch_sampler = nlp.data.FixedBucketSampler(
    lengths=wmt_test_text_with_len.transform(lambda src, tgt, src_len, tgt_len, idx: (src_len, tgt_len)),
    num_buckets=hparams.num_buckets,
    batch_size=hparams.test_batch_size)
print(wmt_test_batch_sampler.stats())

FixedBucketSampler:
  sample_num=1000000, batch_num=15628
  key=[(28, 29), (55, 56), (82, 83), (109, 110), (136, 137)]
  cnt=[484394, 419332, 89416, 6836, 22]
  batch_size=[64, 64, 64, 64, 64]
FixedBucketSampler:
  sample_num=2999, batch_num=97
  key=[(23, 26), (43, 48), (63, 70), (83, 92), (103, 114)]
  cnt=[1417, 1191, 329, 56, 6]
  batch_size=[32, 32, 32, 32, 32]


In [11]:
# DataLoaders
wmt_train_data_loader = mx.gluon.data.DataLoader(
    wmt_train_text_with_len,
    batch_sampler=wmt_train_batch_sampler,
    batchify_fn=wmt_batchify_fn,
    num_workers=8)
print('Number of training batches:', len(wmt_train_data_loader))

wmt_test_data_loader = mx.gluon.data.DataLoader(
    wmt_test_text_with_len,
    batch_sampler=wmt_test_batch_sampler,
    batchify_fn=wmt_batchify_fn,
    num_workers=8)
print('Number of testing batches:', len(wmt_test_data_loader))

Number of training batches: 15628
Number of testing batches: 97


In [12]:
# Model
encoder, decoder, one_step_ahead_decoder = nmt.gnmt.get_gnmt_encoder_decoder(
    hidden_size=hparams.num_hidden,
    dropout=hparams.dropout,
    num_layers=hparams.num_layers,
    num_bi_layers=hparams.num_bi_layers)

gnmt_model = nlp.model.translation.NMTModel(
    src_vocab=wmt_src_vocab,
    tgt_vocab=wmt_tgt_vocab,
    encoder=encoder,
    decoder=decoder,
    one_step_ahead_decoder=one_step_ahead_decoder,
    embed_size=hparams.num_hidden,
    prefix='gnmt_')

gnmt_model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
static_alloc = True
gnmt_model.hybridize(static_alloc=static_alloc)

In [13]:
scorer=nlp.model.BeamSearchScorer(
    alpha=hparams.lp_alpha,
    K=hparams.lp_k)

gnmt_translator = nmt.translation.BeamSearchTranslator(
    model=gnmt_model,
    beam_size=hparams.beam_size,
    scorer=scorer,
    max_length=tgt_max_len + 100)

print("Use beam_size={}, alpha={}, K={}".format(hparams.beam_size, hparams.lp_alpha, hparams.lp_k))

Use beam_size=10, alpha=1.0, K=5


In [14]:
# Evaluation (Baseline)
eval_start_time = time.time()
wmt_loss_function = nlp.loss.MaskedSoftmaxCELoss()
wmt_loss_function.hybridize()
wmt_detokenizer = nlp.data.SacreMosesDetokenizer()

gnmt_test_loss, gnmt_test_translation_out = nmt.utils.evaluate(
    gnmt_model,
    wmt_test_data_loader,
    wmt_loss_function,
    gnmt_translator,
    wmt_tgt_vocab,
    wmt_detokenizer,
    ctx)

gnmt_test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu(
    [wmt_test_tgt_sentences],
    gnmt_test_translation_out,
    tokenized=False,
    tokenizer=hparams.bleu,
    split_compound_word=False,
    bpe=False)

print('WMT16 EN-DE SOTA model test loss: %.2f; test bleu score: %.2f; time cost %.2fs' %(gnmt_test_loss, gnmt_test_bleu_score * 100, (time.time() - eval_start_time)))

  0%|          | 0/97 [00:00<?, ?it/s]

Extension horovod.torch has not been built: /home/ubuntu/anaconda3/envs/mxnet_p37/lib/python3.7/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-37m-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
[2022-06-04 21:12:09.664 ip-172-31-28-47:2314 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-06-04 21:12:09.694 ip-172-31-28-47:2314 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
WMT16 EN-DE SOTA model test loss: 7.75; test bleu score: 0.00; time cost 125.44s


In [15]:
# Training
trainer = mx.gluon.Trainer(gnmt_model.collect_params(), 'adam', {'learning_rate': hparams.lr})

In [16]:
nmt.utils.train(
    gnmt_model,
    wmt_train_data_loader,
    wmt_test_data_loader,
    wmt_loss_function,
    trainer,
    gnmt_translator,
    wmt_tgt_vocab,
    wmt_detokenizer,
    wmt_test_tgt_sentences,
    hparams.save_dir,
    hparams,
    ctx)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15628 [00:00<?, ?it/s]

[Epoch 0 Batch 10/15628] loss=9.6243, ppl=15128.5737, gnorm=1.9159, throughput=12.34K wps, wc=99.61K
[Epoch 0 Batch 20/15628] loss=7.9087, ppl=2720.7845, gnorm=0.6544, throughput=19.67K wps, wc=105.05K
[Epoch 0 Batch 30/15628] loss=7.6519, ppl=2104.5417, gnorm=0.4327, throughput=19.34K wps, wc=105.35K
[Epoch 0 Batch 40/15628] loss=7.5384, ppl=1878.8451, gnorm=0.2846, throughput=20.33K wps, wc=105.56K
[Epoch 0 Batch 50/15628] loss=7.5069, ppl=1820.6161, gnorm=0.2547, throughput=20.39K wps, wc=105.68K
[Epoch 0 Batch 60/15628] loss=7.4718, ppl=1757.8342, gnorm=0.2675, throughput=20.41K wps, wc=105.29K
[Epoch 0 Batch 70/15628] loss=7.4867, ppl=1784.1696, gnorm=0.2836, throughput=20.22K wps, wc=105.16K
[Epoch 0 Batch 80/15628] loss=7.4260, ppl=1679.0376, gnorm=0.2879, throughput=20.06K wps, wc=105.24K
[Epoch 0 Batch 90/15628] loss=7.3874, ppl=1615.4975, gnorm=0.3212, throughput=19.21K wps, wc=104.38K
[Epoch 0 Batch 100/15628] loss=7.3273, ppl=1521.2603, gnorm=0.2824, throughput=19.18K wps, 

[Epoch 0 Batch 830/15628] loss=5.2158, ppl=184.1577, gnorm=0.2711, throughput=19.23K wps, wc=77.95K
[Epoch 0 Batch 840/15628] loss=5.2508, ppl=190.7132, gnorm=0.2761, throughput=18.61K wps, wc=78.97K
[Epoch 0 Batch 850/15628] loss=5.1554, ppl=173.3676, gnorm=0.2798, throughput=18.69K wps, wc=77.99K
[Epoch 0 Batch 860/15628] loss=5.1844, ppl=178.4695, gnorm=0.2858, throughput=20.21K wps, wc=79.14K
[Epoch 0 Batch 870/15628] loss=5.1232, ppl=167.8785, gnorm=0.2685, throughput=18.62K wps, wc=78.52K
[Epoch 0 Batch 880/15628] loss=5.0713, ppl=159.3846, gnorm=0.2799, throughput=19.24K wps, wc=77.92K
[Epoch 0 Batch 890/15628] loss=5.2012, ppl=181.4908, gnorm=0.2749, throughput=18.37K wps, wc=78.71K
[Epoch 0 Batch 900/15628] loss=5.1875, ppl=179.0148, gnorm=0.2652, throughput=19.21K wps, wc=79.19K
[Epoch 0 Batch 910/15628] loss=5.1658, ppl=175.1717, gnorm=0.2670, throughput=18.73K wps, wc=79.05K
[Epoch 0 Batch 920/15628] loss=5.1539, ppl=173.0987, gnorm=0.2533, throughput=18.54K wps, wc=78.10K


[Epoch 0 Batch 1650/15628] loss=4.6844, ppl=108.2469, gnorm=0.3247, throughput=17.10K wps, wc=47.66K
[Epoch 0 Batch 1660/15628] loss=4.7090, ppl=110.9453, gnorm=0.3314, throughput=17.29K wps, wc=47.13K
[Epoch 0 Batch 1670/15628] loss=4.7722, ppl=118.1776, gnorm=0.3263, throughput=17.38K wps, wc=46.58K
[Epoch 0 Batch 1680/15628] loss=4.5803, ppl=97.5462, gnorm=0.3160, throughput=17.74K wps, wc=47.93K
[Epoch 0 Batch 1690/15628] loss=4.6665, ppl=106.3281, gnorm=0.3271, throughput=17.72K wps, wc=47.36K
[Epoch 0 Batch 1700/15628] loss=4.6613, ppl=105.7778, gnorm=0.3208, throughput=18.04K wps, wc=47.34K
[Epoch 0 Batch 1710/15628] loss=4.6217, ppl=101.6682, gnorm=0.3146, throughput=18.66K wps, wc=47.34K
[Epoch 0 Batch 1720/15628] loss=4.6342, ppl=102.9432, gnorm=0.3101, throughput=17.01K wps, wc=46.90K
[Epoch 0 Batch 1730/15628] loss=4.5949, ppl=98.9793, gnorm=0.3097, throughput=15.84K wps, wc=46.05K
[Epoch 0 Batch 1740/15628] loss=4.7189, ppl=112.0496, gnorm=0.3175, throughput=17.81K wps, wc

[Epoch 0 Batch 2470/15628] loss=4.3917, ppl=80.7753, gnorm=0.3176, throughput=17.84K wps, wc=46.92K
[Epoch 0 Batch 2480/15628] loss=4.3385, ppl=76.5888, gnorm=0.3105, throughput=17.76K wps, wc=47.65K
[Epoch 0 Batch 2490/15628] loss=4.3495, ppl=77.4423, gnorm=0.3021, throughput=17.45K wps, wc=46.17K
[Epoch 0 Batch 2500/15628] loss=4.2718, ppl=71.6531, gnorm=0.3110, throughput=17.29K wps, wc=46.79K
[Epoch 0 Batch 2510/15628] loss=4.2050, ppl=67.0208, gnorm=0.3010, throughput=17.96K wps, wc=47.41K
[Epoch 0 Batch 2520/15628] loss=4.3976, ppl=81.2541, gnorm=0.3015, throughput=18.48K wps, wc=47.41K
[Epoch 0 Batch 2530/15628] loss=4.3108, ppl=74.4966, gnorm=0.2974, throughput=18.03K wps, wc=47.03K
[Epoch 0 Batch 2540/15628] loss=4.5109, ppl=91.0021, gnorm=0.3145, throughput=16.98K wps, wc=47.98K
[Epoch 0 Batch 2550/15628] loss=4.3145, ppl=74.7754, gnorm=0.3024, throughput=18.42K wps, wc=46.81K
[Epoch 0 Batch 2560/15628] loss=4.4121, ppl=82.4416, gnorm=0.3041, throughput=17.44K wps, wc=47.17K


[Epoch 0 Batch 3290/15628] loss=4.0375, ppl=56.6854, gnorm=0.2964, throughput=17.45K wps, wc=47.82K
[Epoch 0 Batch 3300/15628] loss=4.1398, ppl=62.7876, gnorm=0.3104, throughput=18.85K wps, wc=47.90K
[Epoch 0 Batch 3310/15628] loss=4.0667, ppl=58.3614, gnorm=0.3074, throughput=16.85K wps, wc=47.16K
[Epoch 0 Batch 3320/15628] loss=4.0911, ppl=59.8050, gnorm=0.3000, throughput=18.30K wps, wc=48.03K
[Epoch 0 Batch 3330/15628] loss=4.0488, ppl=57.3306, gnorm=0.2989, throughput=17.82K wps, wc=47.82K
[Epoch 0 Batch 3340/15628] loss=4.1154, ppl=61.2779, gnorm=0.3094, throughput=16.93K wps, wc=47.39K
[Epoch 0 Batch 3350/15628] loss=4.1085, ppl=60.8556, gnorm=0.3084, throughput=17.16K wps, wc=46.76K
[Epoch 0 Batch 3360/15628] loss=4.0102, ppl=55.1579, gnorm=0.3026, throughput=18.22K wps, wc=47.52K
[Epoch 0 Batch 3370/15628] loss=3.9692, ppl=52.9431, gnorm=0.2944, throughput=16.74K wps, wc=47.08K
[Epoch 0 Batch 3380/15628] loss=3.9812, ppl=53.5800, gnorm=0.3079, throughput=18.63K wps, wc=46.99K


[Epoch 0 Batch 4110/15628] loss=3.5139, ppl=33.5773, gnorm=0.3234, throughput=17.57K wps, wc=47.61K
[Epoch 0 Batch 4120/15628] loss=3.6485, ppl=38.4155, gnorm=0.3362, throughput=16.60K wps, wc=46.80K
[Epoch 0 Batch 4130/15628] loss=3.6365, ppl=37.9597, gnorm=0.3254, throughput=18.39K wps, wc=48.28K
[Epoch 0 Batch 4140/15628] loss=3.4712, ppl=32.1760, gnorm=0.3273, throughput=18.43K wps, wc=47.02K
[Epoch 0 Batch 4150/15628] loss=3.5866, ppl=36.1123, gnorm=0.3227, throughput=18.19K wps, wc=47.72K
[Epoch 0 Batch 4160/15628] loss=3.4281, ppl=30.8177, gnorm=0.3234, throughput=18.39K wps, wc=48.55K
[Epoch 0 Batch 4170/15628] loss=3.3550, ppl=28.6464, gnorm=0.3291, throughput=16.17K wps, wc=47.12K
[Epoch 0 Batch 4180/15628] loss=3.6053, ppl=36.7923, gnorm=0.3446, throughput=18.49K wps, wc=47.09K
[Epoch 0 Batch 4190/15628] loss=3.5705, ppl=35.5349, gnorm=0.3432, throughput=17.47K wps, wc=47.51K
[Epoch 0 Batch 4200/15628] loss=3.5446, ppl=34.6260, gnorm=0.3325, throughput=18.90K wps, wc=48.12K


[Epoch 0 Batch 4930/15628] loss=3.2916, ppl=26.8855, gnorm=0.3582, throughput=17.47K wps, wc=47.61K
[Epoch 0 Batch 4940/15628] loss=3.1489, ppl=23.3099, gnorm=0.3497, throughput=17.03K wps, wc=46.68K
[Epoch 0 Batch 4950/15628] loss=3.0962, ppl=22.1136, gnorm=0.3348, throughput=16.71K wps, wc=46.61K
[Epoch 0 Batch 4960/15628] loss=3.1986, ppl=24.4989, gnorm=0.3395, throughput=16.57K wps, wc=46.95K
[Epoch 0 Batch 4970/15628] loss=3.1274, ppl=22.8146, gnorm=0.3381, throughput=17.38K wps, wc=47.98K
[Epoch 0 Batch 4980/15628] loss=3.0774, ppl=21.7013, gnorm=0.3404, throughput=17.13K wps, wc=47.19K
[Epoch 0 Batch 4990/15628] loss=3.4202, ppl=30.5763, gnorm=0.3424, throughput=17.32K wps, wc=47.69K
[Epoch 0 Batch 5000/15628] loss=3.0838, ppl=21.8410, gnorm=0.3345, throughput=17.35K wps, wc=48.69K
[Epoch 0 Batch 5010/15628] loss=3.3011, ppl=27.1435, gnorm=0.3464, throughput=17.21K wps, wc=47.01K
[Epoch 0 Batch 5020/15628] loss=3.1142, ppl=22.5157, gnorm=0.3355, throughput=17.64K wps, wc=47.51K


[Epoch 0 Batch 5750/15628] loss=2.9519, ppl=19.1419, gnorm=0.3395, throughput=17.53K wps, wc=47.51K
[Epoch 0 Batch 5760/15628] loss=3.2880, ppl=26.7893, gnorm=0.3473, throughput=18.61K wps, wc=47.06K
[Epoch 0 Batch 5770/15628] loss=2.9352, ppl=18.8248, gnorm=0.3410, throughput=17.01K wps, wc=47.06K
[Epoch 0 Batch 5780/15628] loss=2.8665, ppl=17.5753, gnorm=0.3488, throughput=17.70K wps, wc=47.24K
[Epoch 0 Batch 5790/15628] loss=2.8711, ppl=17.6560, gnorm=0.3410, throughput=17.31K wps, wc=47.26K
[Epoch 0 Batch 5800/15628] loss=3.1127, ppl=22.4811, gnorm=0.3681, throughput=16.83K wps, wc=47.58K
[Epoch 0 Batch 5810/15628] loss=3.0025, ppl=20.1363, gnorm=0.3361, throughput=17.39K wps, wc=47.17K
[Epoch 0 Batch 5820/15628] loss=2.9901, ppl=19.8882, gnorm=0.3411, throughput=18.04K wps, wc=47.64K
[Epoch 0 Batch 5830/15628] loss=2.9718, ppl=19.5270, gnorm=0.3371, throughput=16.73K wps, wc=46.90K
[Epoch 0 Batch 5840/15628] loss=2.9161, ppl=18.4700, gnorm=0.3403, throughput=17.77K wps, wc=48.03K


[Epoch 0 Batch 6570/15628] loss=2.6576, ppl=14.2619, gnorm=0.3379, throughput=17.28K wps, wc=47.91K
[Epoch 0 Batch 6580/15628] loss=3.2218, ppl=25.0730, gnorm=0.3743, throughput=17.93K wps, wc=46.99K
[Epoch 0 Batch 6590/15628] loss=2.7054, ppl=14.9607, gnorm=0.3381, throughput=17.97K wps, wc=46.96K
[Epoch 0 Batch 6600/15628] loss=2.6508, ppl=14.1649, gnorm=0.3512, throughput=18.61K wps, wc=47.63K
[Epoch 0 Batch 6610/15628] loss=2.6302, ppl=13.8760, gnorm=0.3424, throughput=17.01K wps, wc=46.98K
[Epoch 0 Batch 6620/15628] loss=2.7175, ppl=15.1419, gnorm=0.3437, throughput=16.78K wps, wc=47.60K
[Epoch 0 Batch 6630/15628] loss=2.8089, ppl=16.5917, gnorm=0.3478, throughput=17.75K wps, wc=47.71K
[Epoch 0 Batch 6640/15628] loss=2.8059, ppl=16.5427, gnorm=0.3416, throughput=16.86K wps, wc=47.14K
[Epoch 0 Batch 6650/15628] loss=2.7743, ppl=16.0273, gnorm=0.3388, throughput=18.20K wps, wc=47.59K
[Epoch 0 Batch 6660/15628] loss=2.6610, ppl=14.3103, gnorm=0.3389, throughput=16.47K wps, wc=47.51K


[Epoch 0 Batch 7390/15628] loss=2.7072, ppl=14.9880, gnorm=0.3518, throughput=17.03K wps, wc=47.06K
[Epoch 0 Batch 7400/15628] loss=2.7328, ppl=15.3765, gnorm=0.3358, throughput=17.22K wps, wc=47.01K
[Epoch 0 Batch 7410/15628] loss=2.3871, ppl=10.8819, gnorm=0.3312, throughput=16.94K wps, wc=47.82K
[Epoch 0 Batch 7420/15628] loss=2.4734, ppl=11.8623, gnorm=0.3251, throughput=17.70K wps, wc=47.40K
[Epoch 0 Batch 7430/15628] loss=2.5763, ppl=13.1486, gnorm=0.3352, throughput=17.45K wps, wc=47.23K
[Epoch 0 Batch 7440/15628] loss=2.5065, ppl=12.2619, gnorm=0.3438, throughput=15.56K wps, wc=46.43K
[Epoch 0 Batch 7450/15628] loss=2.5186, ppl=12.4114, gnorm=0.3388, throughput=18.03K wps, wc=47.59K
[Epoch 0 Batch 7460/15628] loss=2.5739, ppl=13.1175, gnorm=0.3453, throughput=16.69K wps, wc=47.93K
[Epoch 0 Batch 7470/15628] loss=2.7144, ppl=15.0948, gnorm=0.3339, throughput=17.31K wps, wc=46.62K
[Epoch 0 Batch 7480/15628] loss=2.7165, ppl=15.1280, gnorm=0.3462, throughput=16.73K wps, wc=46.69K


[Epoch 0 Batch 8210/15628] loss=2.7116, ppl=15.0528, gnorm=0.4307, throughput=15.99K wps, wc=22.36K
[Epoch 0 Batch 8220/15628] loss=2.7412, ppl=15.5049, gnorm=0.4276, throughput=16.00K wps, wc=22.17K
[Epoch 0 Batch 8230/15628] loss=2.7282, ppl=15.3049, gnorm=0.4299, throughput=16.16K wps, wc=21.82K
[Epoch 0 Batch 8240/15628] loss=2.6328, ppl=13.9128, gnorm=0.4212, throughput=15.98K wps, wc=22.83K
[Epoch 0 Batch 8250/15628] loss=2.6000, ppl=13.4634, gnorm=0.4236, throughput=15.44K wps, wc=22.48K
[Epoch 0 Batch 8260/15628] loss=2.7952, ppl=16.3662, gnorm=0.4352, throughput=15.55K wps, wc=22.39K
[Epoch 0 Batch 8270/15628] loss=2.5992, ppl=13.4524, gnorm=0.4206, throughput=15.75K wps, wc=22.23K
[Epoch 0 Batch 8280/15628] loss=2.5491, ppl=12.7950, gnorm=0.4247, throughput=15.69K wps, wc=22.16K
[Epoch 0 Batch 8290/15628] loss=2.5804, ppl=13.2023, gnorm=0.4370, throughput=15.45K wps, wc=21.68K
[Epoch 0 Batch 8300/15628] loss=2.6459, ppl=14.0960, gnorm=0.4194, throughput=15.85K wps, wc=22.84K


[Epoch 0 Batch 9030/15628] loss=2.5762, ppl=13.1465, gnorm=0.4196, throughput=15.29K wps, wc=21.34K
[Epoch 0 Batch 9040/15628] loss=2.6445, ppl=14.0767, gnorm=0.4209, throughput=15.55K wps, wc=22.42K
[Epoch 0 Batch 9050/15628] loss=2.5823, ppl=13.2276, gnorm=0.4138, throughput=14.80K wps, wc=21.92K
[Epoch 0 Batch 9060/15628] loss=2.6693, ppl=14.4303, gnorm=0.4065, throughput=15.55K wps, wc=21.99K
[Epoch 0 Batch 9070/15628] loss=2.4976, ppl=12.1529, gnorm=0.4124, throughput=14.88K wps, wc=22.43K
[Epoch 0 Batch 9080/15628] loss=2.4863, ppl=12.0170, gnorm=0.4071, throughput=15.44K wps, wc=22.42K
[Epoch 0 Batch 9090/15628] loss=2.4978, ppl=12.1555, gnorm=0.4145, throughput=16.21K wps, wc=22.18K
[Epoch 0 Batch 9100/15628] loss=2.5146, ppl=12.3611, gnorm=0.4213, throughput=15.49K wps, wc=21.28K
[Epoch 0 Batch 9110/15628] loss=2.5865, ppl=13.2831, gnorm=0.4124, throughput=15.91K wps, wc=22.34K
[Epoch 0 Batch 9120/15628] loss=2.4717, ppl=11.8421, gnorm=0.4129, throughput=16.06K wps, wc=21.93K


[Epoch 0 Batch 9850/15628] loss=2.3234, ppl=10.2107, gnorm=0.4023, throughput=15.92K wps, wc=22.18K
[Epoch 0 Batch 9860/15628] loss=2.4645, ppl=11.7570, gnorm=0.4142, throughput=16.05K wps, wc=22.28K
[Epoch 0 Batch 9870/15628] loss=2.2799, ppl=9.7757, gnorm=0.4005, throughput=15.17K wps, wc=22.20K
[Epoch 0 Batch 9880/15628] loss=2.5724, ppl=13.0976, gnorm=0.4136, throughput=15.37K wps, wc=22.17K
[Epoch 0 Batch 9890/15628] loss=2.5420, ppl=12.7047, gnorm=0.4123, throughput=15.39K wps, wc=22.38K
[Epoch 0 Batch 9900/15628] loss=2.3070, ppl=10.0446, gnorm=0.4000, throughput=15.21K wps, wc=21.44K
[Epoch 0 Batch 9910/15628] loss=2.4053, ppl=11.0815, gnorm=0.4009, throughput=15.62K wps, wc=22.87K
[Epoch 0 Batch 9920/15628] loss=2.4285, ppl=11.3418, gnorm=0.4246, throughput=15.44K wps, wc=21.08K
[Epoch 0 Batch 9930/15628] loss=2.4920, ppl=12.0857, gnorm=0.4087, throughput=15.76K wps, wc=22.41K
[Epoch 0 Batch 9940/15628] loss=2.5851, ppl=13.2649, gnorm=0.4124, throughput=15.67K wps, wc=22.38K
[

[Epoch 0 Batch 10670/15628] loss=2.3224, ppl=10.2006, gnorm=0.4008, throughput=15.34K wps, wc=22.10K
[Epoch 0 Batch 10680/15628] loss=2.3562, ppl=10.5503, gnorm=0.4062, throughput=15.03K wps, wc=21.41K
[Epoch 0 Batch 10690/15628] loss=2.4967, ppl=12.1428, gnorm=0.4029, throughput=15.84K wps, wc=22.29K
[Epoch 0 Batch 10700/15628] loss=2.3747, ppl=10.7478, gnorm=0.4096, throughput=15.83K wps, wc=22.30K
[Epoch 0 Batch 10710/15628] loss=2.3624, ppl=10.6164, gnorm=0.3974, throughput=15.76K wps, wc=22.36K
[Epoch 0 Batch 10720/15628] loss=2.3408, ppl=10.3892, gnorm=0.4050, throughput=15.46K wps, wc=21.57K
[Epoch 0 Batch 10730/15628] loss=2.2601, ppl=9.5844, gnorm=0.3927, throughput=16.05K wps, wc=22.37K
[Epoch 0 Batch 10740/15628] loss=2.3414, ppl=10.3954, gnorm=0.3973, throughput=15.87K wps, wc=22.28K
[Epoch 0 Batch 10750/15628] loss=2.4049, ppl=11.0778, gnorm=0.4067, throughput=15.62K wps, wc=22.47K
[Epoch 0 Batch 10760/15628] loss=2.2759, ppl=9.7364, gnorm=0.4051, throughput=15.75K wps, wc

[Epoch 0 Batch 11490/15628] loss=2.3727, ppl=10.7268, gnorm=0.4033, throughput=15.73K wps, wc=22.62K
[Epoch 0 Batch 11500/15628] loss=2.1587, ppl=8.6602, gnorm=0.3867, throughput=15.80K wps, wc=22.49K
[Epoch 0 Batch 11510/15628] loss=2.3833, ppl=10.8401, gnorm=0.4008, throughput=15.81K wps, wc=23.23K
[Epoch 0 Batch 11520/15628] loss=2.4450, ppl=11.5308, gnorm=0.3999, throughput=16.51K wps, wc=22.89K
[Epoch 0 Batch 11530/15628] loss=2.2587, ppl=9.5708, gnorm=0.4003, throughput=15.32K wps, wc=22.58K
[Epoch 0 Batch 11540/15628] loss=2.1610, ppl=8.6798, gnorm=0.3833, throughput=16.43K wps, wc=23.42K
[Epoch 0 Batch 11550/15628] loss=2.1511, ppl=8.5940, gnorm=0.4041, throughput=14.99K wps, wc=22.11K
[Epoch 0 Batch 11560/15628] loss=2.3067, ppl=10.0409, gnorm=0.3993, throughput=16.08K wps, wc=22.35K
[Epoch 0 Batch 11570/15628] loss=2.2115, ppl=9.1295, gnorm=0.3893, throughput=16.79K wps, wc=22.96K
[Epoch 0 Batch 11580/15628] loss=2.4063, ppl=11.0926, gnorm=0.4035, throughput=15.81K wps, wc=22

[Epoch 0 Batch 12310/15628] loss=2.0348, ppl=7.6510, gnorm=0.3818, throughput=15.99K wps, wc=23.26K
[Epoch 0 Batch 12320/15628] loss=2.1434, ppl=8.5287, gnorm=0.3973, throughput=14.80K wps, wc=22.00K
[Epoch 0 Batch 12330/15628] loss=2.1598, ppl=8.6697, gnorm=0.3932, throughput=15.87K wps, wc=22.69K
[Epoch 0 Batch 12340/15628] loss=2.1893, ppl=8.9288, gnorm=0.3948, throughput=15.35K wps, wc=22.50K
[Epoch 0 Batch 12350/15628] loss=2.3734, ppl=10.7338, gnorm=0.4116, throughput=16.43K wps, wc=22.83K
[Epoch 0 Batch 12360/15628] loss=2.0735, ppl=7.9524, gnorm=0.3743, throughput=16.70K wps, wc=23.49K
[Epoch 0 Batch 12370/15628] loss=1.9853, ppl=7.2814, gnorm=0.3814, throughput=16.18K wps, wc=22.84K
[Epoch 0 Batch 12380/15628] loss=2.1424, ppl=8.5201, gnorm=0.3934, throughput=16.15K wps, wc=23.13K
[Epoch 0 Batch 12390/15628] loss=2.0285, ppl=7.6025, gnorm=0.3860, throughput=16.22K wps, wc=22.64K
[Epoch 0 Batch 12400/15628] loss=2.1034, ppl=8.1939, gnorm=0.4050, throughput=15.82K wps, wc=22.41K

[Epoch 0 Batch 13130/15628] loss=2.2005, ppl=9.0298, gnorm=0.3995, throughput=14.81K wps, wc=21.18K
[Epoch 0 Batch 13140/15628] loss=1.9645, ppl=7.1316, gnorm=0.3766, throughput=16.28K wps, wc=23.22K
[Epoch 0 Batch 13150/15628] loss=2.1207, ppl=8.3366, gnorm=0.3986, throughput=16.36K wps, wc=22.75K
[Epoch 0 Batch 13160/15628] loss=2.2520, ppl=9.5069, gnorm=0.3846, throughput=17.24K wps, wc=23.21K
[Epoch 0 Batch 13170/15628] loss=2.1331, ppl=8.4413, gnorm=0.3744, throughput=16.08K wps, wc=22.88K
[Epoch 0 Batch 13180/15628] loss=2.0675, ppl=7.9047, gnorm=0.3860, throughput=16.17K wps, wc=23.20K
[Epoch 0 Batch 13190/15628] loss=2.0618, ppl=7.8602, gnorm=0.3742, throughput=15.53K wps, wc=22.81K
[Epoch 0 Batch 13200/15628] loss=2.2130, ppl=9.1435, gnorm=0.3969, throughput=14.54K wps, wc=21.65K
[Epoch 0 Batch 13210/15628] loss=2.1012, ppl=8.1759, gnorm=0.3906, throughput=15.65K wps, wc=21.99K
[Epoch 0 Batch 13220/15628] loss=1.9988, ppl=7.3803, gnorm=0.3822, throughput=15.87K wps, wc=22.67K


[Epoch 0 Batch 13950/15628] loss=2.0337, ppl=7.6425, gnorm=0.3892, throughput=16.36K wps, wc=22.10K
[Epoch 0 Batch 13960/15628] loss=2.1768, ppl=8.8182, gnorm=0.4136, throughput=16.68K wps, wc=22.62K
[Epoch 0 Batch 13970/15628] loss=2.0621, ppl=7.8623, gnorm=0.3934, throughput=16.03K wps, wc=22.39K
[Epoch 0 Batch 13980/15628] loss=2.0446, ppl=7.7259, gnorm=0.3874, throughput=16.23K wps, wc=22.24K
[Epoch 0 Batch 13990/15628] loss=1.8817, ppl=6.5643, gnorm=0.3850, throughput=16.13K wps, wc=22.21K
[Epoch 0 Batch 14000/15628] loss=2.0484, ppl=7.7555, gnorm=0.3912, throughput=16.16K wps, wc=22.35K
[Epoch 0 Batch 14010/15628] loss=1.9250, ppl=6.8548, gnorm=0.3922, throughput=15.07K wps, wc=21.73K
[Epoch 0 Batch 14020/15628] loss=2.0074, ppl=7.4442, gnorm=0.3906, throughput=16.68K wps, wc=22.40K
[Epoch 0 Batch 14030/15628] loss=1.9992, ppl=7.3833, gnorm=0.4023, throughput=16.37K wps, wc=22.21K
[Epoch 0 Batch 14040/15628] loss=1.9767, ppl=7.2188, gnorm=0.3881, throughput=16.55K wps, wc=22.22K


[Epoch 0 Batch 14770/15628] loss=2.0156, ppl=7.5052, gnorm=0.3913, throughput=16.38K wps, wc=22.48K
[Epoch 0 Batch 14780/15628] loss=2.0455, ppl=7.7329, gnorm=0.3856, throughput=16.27K wps, wc=22.69K
[Epoch 0 Batch 14790/15628] loss=2.0331, ppl=7.6379, gnorm=0.3819, throughput=17.42K wps, wc=23.27K
[Epoch 0 Batch 14800/15628] loss=2.0831, ppl=8.0290, gnorm=0.3931, throughput=14.97K wps, wc=21.92K
[Epoch 0 Batch 14810/15628] loss=1.9286, ppl=6.8797, gnorm=0.3919, throughput=15.68K wps, wc=21.67K
[Epoch 0 Batch 14820/15628] loss=2.1469, ppl=8.5581, gnorm=0.3915, throughput=15.97K wps, wc=23.00K
[Epoch 0 Batch 14830/15628] loss=2.0580, ppl=7.8303, gnorm=0.3957, throughput=15.71K wps, wc=21.80K
[Epoch 0 Batch 14840/15628] loss=2.1603, ppl=8.6739, gnorm=0.3738, throughput=17.00K wps, wc=23.45K
[Epoch 0 Batch 14850/15628] loss=2.0037, ppl=7.4166, gnorm=0.3942, throughput=15.68K wps, wc=22.01K
[Epoch 0 Batch 14860/15628] loss=1.8585, ppl=6.4142, gnorm=0.3792, throughput=15.95K wps, wc=21.69K


[Epoch 0 Batch 15590/15628] loss=1.9158, ppl=6.7923, gnorm=0.3767, throughput=14.94K wps, wc=22.43K
[Epoch 0 Batch 15600/15628] loss=1.9353, ppl=6.9259, gnorm=0.3769, throughput=15.14K wps, wc=22.72K
[Epoch 0 Batch 15610/15628] loss=2.0613, ppl=7.8564, gnorm=0.3840, throughput=15.04K wps, wc=22.36K
[Epoch 0 Batch 15620/15628] loss=2.0846, ppl=8.0413, gnorm=0.3871, throughput=16.09K wps, wc=22.37K


  0%|          | 0/97 [00:00<?, ?it/s]

NameError: name 'val_tgt_sentences' is not defined

In [None]:
from importlib import reload

In [None]:
reload(nmt)
reload(nmt.utils)
reload(nmt.gnmt_hparams)