# Hyperparameter Search

[TOC]

In [1]:
import onmt
from onmt.trainer import Trainer
from onmt.utils.logging import logger
from onmt.utils.loss import build_loss_compute
import onmt.opts as opts
from onmt.inputters.inputter import build_dataset_iter, \
    load_old_vocab, old_style_vocab
from onmt.model_builder import build_model
from onmt.utils.optimizers import Optimizer
from onmt.utils.misc import set_random_seed
from onmt.models import build_model_saver
from onmt.utils.logging import init_logger, logger
from onmt.utils.misc import split_corpus
from onmt.translate.translator import build_translator

import itertools
import configargparse
import os
from itertools import chain
import torch
import subprocess, shlex, re, uuid, hashlib, json, gc, shutil, sys
from pprint import pprint
from copy import deepcopy

## Utils

In [2]:
class ArgumentHelper:
    def build(self, arg_dict):
        arg_total = self.base_arg_dict.copy()
        arg_total.update(arg_dict)
        return arg_total
    
    @staticmethod
    def save_args(arg_dict, filename):
        with open(filename + '.txt', 'w') as f:
            json.dump(arg_dict, f, indent=4)
    
    @staticmethod
    def convert_dict_to_arg_array(arg_dict):
        arr = []
        for k, v in arg_dict.items():
            arr.append('-' + k)
            if v is not None:
                arr.append(str(v))
        return arr

In [3]:
def cleanup():
    torch.cuda.empty_cache()
    gc.collect()

In [4]:
def build_translate_args(model_id):
    return {
        'src': 'data/nmt15/test_en',
        'tgt': 'data/nmt15/test_vi',
        'out': '/mnt/drive-2t/pred/opennmt_' + model_id + '.txt',
        'replace_unk': None,
        'gpu': '0',
        'batch_size': '16'
    }

def build_translate_args_reverse(model_id):
    return {
        'src': 'data/nmt15/test_vi',
        'tgt': 'data/nmt15/test_en',
        'out': '/mnt/drive-2t/pred/opennmt_' + model_id + '.txt',
        'replace_unk': None,
        'gpu': '0',
        'batch_size': '16'
    }

In [5]:
def clean_path_with_no_checkpoint(model_path, log_path):
    useless_file = []
    available_model = set()
    useless_log = []
    for root, dirs, files in os.walk(model_path):
        path = root.split(os.sep)
        for file in files:
            absolute_filename = os.path.join(root, file)
            model_id = re.findall(r'opennmt_(.+)_step_\d+\.pt', file)
            if len(model_id) != 0:
                available_model.add(model_id[0])
    
    for root, dirs, files in os.walk(model_path):
        path = root.split(os.sep)
        for file in files:
            absolute_filename = os.path.join(root, file)
            model_id = re.findall(r'opennmt_(.+).txt', file)
            if len(model_id) != 0 and model_id[0] not in available_model:
                useless_file.append(absolute_filename)
    
    dirs = os.listdir(log_path)
    for d in dirs:
        model_id = re.findall(r'opennmt_(.+)', d)
        if len(model_id) != 0 and model_id[0] not in available_model:
            useless_log.append(os.path.join(log_path, d))
    
#     print(available_model)
#     print(useless_file)
#     print(useless_log)
    for f in useless_file:
        os.remove(f)
    for f in useless_log:
        print(f)
        !rm -rf $f

# clean_path_with_no_checkpoint('/mnt/drive-2t/model', '/mnt/drive-2t/log')

In [6]:
def evaluate_human(model_id):
    model_args = '/mnt/drive-2t/model/opennmt_' + model_id + '.txt'
    !cat $model_args
    pred = '/mnt/drive-2t/pred/opennmt_' + model_id + '.txt'
    ref = 'data/nmt15/test_en'
    print()
    print(bleu_evaluate(pred, ref), bleu_evaluate(pred, ref, '0.25 0.25 0.25 0.25'))
    pred_head = !head $pred
    ref_head = !head $ref
    pairs = list(zip(pred_head, ref_head))
    for pair in pairs:
        print(pair[0])
        print()
        print(pair[1])
        print('-------------')

## BLEU

In [7]:
class BleuHelper:
    
    def __init__(self):
        self.python_interpreter = '/home/ray/.conda/envs/dl/bin/python'
        self.bleu_script = '/home/ray/Smooth_BLEU/bleu.py'
        self.nltk_script = '/home/ray/Smooth_BLEU/nltk_bleu.py'
        self.bleu_pred = '/home/ray/OpenNMT-py/data/nmt15/pred.txt'
        self.bleu_reference = '/home/ray/OpenNMT-py/data/nmt15/test_vi'
        self.bleu_weight = '0.25 0.25 0.25'
    
    def smooth_bleu_evaluate(self, pred, reference, weight='0.25 0.25 0.25'):
        p = subprocess.Popen([self.python_interpreter, self.bleu_script,
                              '-t', pred,
                              '-r', reference,
                              '-w', weight], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        p.wait()
        out, err = p.communicate()
        if p.returncode != 0:
            print(err)
            raise Exception()
        out = out.decode()
        err = err.decode()
        bleu_score = re.findall(r'BLEU = ([1-9\.]+)', err)
        return float(bleu_score[0])

    def nltk_bleu_evaluate(self, pred, reference, weight='0.25 0.25 0.25'):
        p = subprocess.Popen([self.python_interpreter, self.nltk_script,
                              '-t', pred,
                              '-r', reference,
                              '-w', weight], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        p.wait()
        out, err = p.communicate()
        if p.returncode != 0:
            print(err)
            raise Exception()
        out = out.decode()
        err = err.decode()
        bleu_score = re.findall(r'BLEU = ([1-9\.]+)', err)
        return float(bleu_score[0])

bleu_helper = BleuHelper()
def bleu_evaluate(pred, reference, weight='0.25 0.25 0.25'):
    smooth = bleu_helper.smooth_bleu_evaluate(pred, reference, weight)
#     nltk_ = bleu_helper.nltk_bleu_evaluate(pred, reference, weight)
#     return smooth, nltk_
    return smooth

## Searchable Hyperparameters

In [8]:
# searchable hyperparameters
class SearchableHyperparameters:
    
    def __init__(self):
        embedding_size = ['64', '128', '256', '512']
        encoder_type = ['rnn', 'brnn', 'mean', 'transformer']
        decoder_type = encoder_type
        layers = ['1', '2', '3', '4', '5', '6']
        rnn_size = ['64', '128', '256', '512']
        rnn_type = ['LSTM', 'GRU', 'SRU']
        global_attention = ['dot', 'general', 'mlp']
        self_attention_type = ['scaled_dot', 'average']
        heads = [2, 4, 6, 8, 10]
        transformer_ff = [1024, 2048, 4096]

        normalization = ['sents', 'tokens']
        valid_steps = 1000
        train_steps = 10000
        dropout = 0.1

## Custom Trainer Class

Features:
1. Early stop supported.
2. Custom validation evaluation functions.(TODO)
3. Custom callbacks. (TODO)

In [9]:
class MyTrainer(Trainer):
    def train(self,
              train_iter,
              train_steps,
              save_checkpoint_steps=5000,
              valid_iter=None,
              valid_steps=10000,
              early_stop_round=50000,
              early_stop_threshold=1):
        self.last_model_saver = None
        self.last_step = None
        self.last_moving_average = None
        if valid_iter is None:
            logger.info('Start training loop without validation...')
        else:
            logger.info('Start training loop and validate every %d steps...',
                        valid_steps)

        total_stats = onmt.utils.Statistics()
        report_stats = onmt.utils.Statistics()
        self._start_report_manager(start_time=total_stats.start_time)
        
        valid_early_stop_loss = []
        last_accent_valid_acc = float('-inf')
        decent_round = 0
        for i, (batches, normalization) in enumerate(
                self._accum_batches(train_iter)):
            if i == 0:
                print(i, 'Testing GPU memory capability with batch shape:', batches[0].src[0].shape, batches[0].tgt.shape)
            step = self.optim.training_step
            if decent_round == 0:  # 如果没有处于下降过程，则保存前一个model，如果当前step下降，则一直保存这个model
                self.last_model_saver = deepcopy(self.model_saver)
                self.last_step = step
                self.last_moving_average = deepcopy(self.moving_average)

            self._gradient_accumulation(
                batches, normalization, total_stats,
                report_stats)

            if self.average_decay > 0 and i % self.average_every == 0:
                self._update_average(step)
                            
            # 向tensorboard输出loss
            if step % self.report_manager.report_every == 0:
                tensorboard_writer = self.report_manager.tensorboard_writer
                tensorboard_writer.add_scalar('train/loss', report_stats.loss, step)

            report_stats = self._maybe_report_training(
                step, train_steps,
                self.optim.learning_rate(),
                report_stats)

            
            if valid_iter is not None and step % valid_steps == 0:
                print('last accent valid acc:', last_accent_valid_acc)
                if step == valid_steps:
                    valid_stats = self.validate(
                        valid_iter, verbose=1, moving_average=self.moving_average)
                else:
                    valid_stats = self.validate(
                        valid_iter, moving_average=self.moving_average)
                valid_stats = self._maybe_gather_stats(valid_stats)
                tensorboard_writer.add_scalar('valid/loss', valid_stats.loss, step)
#                 valid_acc = valid_stats.accuracy()
                valid_acc = -valid_stats.loss
                
#                 self._report_step(self.optim.learning_rate(),
#                                   step, valid_stats=valid_stats)
#                 if last_accent_valid_acc + early_stop_threshold > valid_acc:
#                     print('meet early stop condition, prev best loss:', last_accent_valid_acc, 'current loss:', valid_acc)
#                     if self.model_saver is not None:
#                         self.model_saver.save(step, moving_average=self.moving_average)
#                     return total_stats
                
#                 last_accent_valid_acc = valid_acc
                
                if valid_acc < last_accent_valid_acc + early_stop_threshold:  # 开始下降
                    if decent_round == 0:  # 保存最后一个上升model
                        print('Save last accent model with acc:', last_accent_valid_acc)
                    decent_round += 1
                    print('Decent round:', decent_round)
                else:
                    decent_round = 0
                    last_accent_valid_acc = valid_acc

                self._report_step(self.optim.learning_rate(),
                                  step, valid_stats=valid_stats)
                if decent_round == early_stop_round:  # 停止
                    if self.last_model_saver is not None:
                        print('meet early stop condition, prev best loss:', last_accent_valid_acc, 'current loss:', valid_acc)
                        self.last_model_saver.save(self.last_step, moving_average=self.last_moving_average)
                    return total_stats

#                 if len(valid_early_stop_loss) == early_stop_round:
#                     # 条件为当前valid loss要超过前n个valid loss最好的那个加threshold (X)
#                     # 应该允许valid loss暂时下降，但不能持续下降超过n个valid round
#                     # 如果持续下降，回到最好的model
#                     if valid_loss - max(valid_early_stop_loss) < early_stop_threshold:
#                         print('meet early stop condition, prev best loss:', max(valid_early_stop_loss), 'current loss:', valid_loss)
#                         if self.model_saver is not None:
#                             self.model_saver.save(step, moving_average=self.moving_average)
#                             self._report_step(self.optim.learning_rate(),
#                                               step, valid_stats=valid_stats)
#                         return total_stats
#                     else:
#                         valid_early_stop_loss.pop(0)
#                         valid_early_stop_loss.append(valid_loss)

            if (self.model_saver is not None
                    and (save_checkpoint_steps != 0
                         and step % save_checkpoint_steps == 0)):
                self.model_saver.save(step, moving_average=self.moving_average)

            if train_steps > 0 and step >= train_steps:
                break

        if self.model_saver is not None:
            self.model_saver.save(step, moving_average=self.moving_average)
        return total_stats
    
    def validate(self, valid_iter, verbose=0, moving_average=None):
        if moving_average:
            valid_model = deepcopy(self.model)
            for avg, param in zip(self.moving_average,
                                  valid_model.parameters()):
                param.data = avg.data.half() if self.model_dtype == "fp16" \
                    else avg.data
        else:
            valid_model = self.model

        # Set model in validating mode.
        valid_model.eval()

        with torch.no_grad():
            stats = onmt.utils.Statistics()

            for batch in valid_iter:
#                 print('Testing GPU memory capability with batch shape:', batch.src[0].shape, batch.tgt.shape)
                src, src_lengths = batch.src if isinstance(batch.src, tuple) \
                                   else (batch.src, None)
                tgt = batch.tgt

                # F-prop through the model.
                outputs, attns = valid_model(src, tgt, src_lengths)

                # Compute loss.
                _, batch_stats = self.valid_loss(batch, outputs, attns)

                # Update statistics.
                stats.update(batch_stats)

        if moving_average:
            del valid_model
        else:
            # Set model back to training mode.
            valid_model.train()

        return stats

In [10]:
def build_trainer(opt, device_id, model, fields, optim, model_saver=None):
    tgt_field = fields['tgt'][0][1].base_field
    train_loss = build_loss_compute(model, tgt_field, opt)
    valid_loss = build_loss_compute(
        model, tgt_field, opt, train=False)

    trunc_size = opt.truncated_decoder  # Badly named...
    shard_size = opt.max_generator_batches if opt.model_dtype == 'fp32' else 0
    norm_method = opt.normalization
    grad_accum_count = opt.accum_count
    n_gpu = opt.world_size
    average_decay = opt.average_decay
    average_every = opt.average_every
    if device_id >= 0:
        gpu_rank = opt.gpu_ranks[device_id]
    else:
        gpu_rank = 0
        n_gpu = 0
    gpu_verbose_level = opt.gpu_verbose_level

    report_manager = onmt.utils.build_report_manager(opt)
    trainer = MyTrainer(model, train_loss, valid_loss, optim, trunc_size,
                        shard_size, norm_method,
                        grad_accum_count, n_gpu, gpu_rank,
                        gpu_verbose_level, report_manager,
                        model_saver=model_saver if gpu_rank == 0 else None,
                        average_decay=average_decay,
                        average_every=average_every,
                        model_dtype=opt.model_dtype)
    return trainer

## Training Entry

In [11]:
def _check_save_model_path(opt):
    save_model_path = os.path.abspath(opt.save_model)
    model_dirname = os.path.dirname(save_model_path)
    if not os.path.exists(model_dirname):
        os.makedirs(model_dirname)


def _tally_parameters(model):
    enc = 0
    dec = 0
    for name, param in model.named_parameters():
        if 'encoder' in name:
            enc += param.nelement()
        else:
            dec += param.nelement()
    return enc + dec, enc, dec


def training_opt_postprocessing(opt, device_id):
    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    if opt.layers != -1:
        opt.enc_layers = opt.layers
        opt.dec_layers = opt.layers

    if opt.rnn_size != -1:
        opt.enc_rnn_size = opt.rnn_size
        opt.dec_rnn_size = opt.rnn_size

        # this check is here because audio allows the encoder and decoder to
        # be different sizes, but other model types do not yet
        same_size = opt.enc_rnn_size == opt.dec_rnn_size
        assert opt.model_type == 'audio' or same_size, \
            "The encoder and decoder rnns must be the same size for now"

    opt.brnn = opt.encoder_type == "brnn"

    assert opt.rnn_type != "SRU" or opt.gpu_ranks, \
        "Using SRU requires -gpu_ranks set."

    if torch.cuda.is_available() and not opt.gpu_ranks:
        logger.info("WARNING: You have a CUDA device, \
                    should run with -gpu_ranks")

    if device_id >= 0:
        torch.cuda.set_device(device_id)
    set_random_seed(opt.seed, device_id >= 0)

    return opt


def train_main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)

        # Load default opts values then overwrite it with opts from
        # the checkpoint. It's usefull in order to re-train a model
        # after adding a new option (not set in checkpoint)
        dummy_parser = configargparse.ArgumentParser()
        opts.model_opts(dummy_parser)
        default_opt = dummy_parser.parse_known_args([])[0]

        model_opt = default_opt
        model_opt.__dict__.update(checkpoint['opt'].__dict__)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        checkpoint = None
        model_opt = opt
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        data_type = opt.model_type
        fields = load_old_vocab(vocab, data_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # Report src and tgt vocab sizes, including for features
    for side in ['src', 'tgt']:
        for name, f in fields[side]:
            try:
                f_iter = iter(f)
            except TypeError:
                f_iter = [(name, f)]
            for sn, sf in f_iter:
                if sf.use_vocab:
                    logger.info(' * %s vocab size = %d' % (sn, len(sf.vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = Optimizer.from_opt(model, opt, checkpoint=checkpoint)

    # Build model saver
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(
        opt, device_id, model, fields, optim, model_saver=model_saver)

    # this line is kind of a temporary kludge because different objects expect
    # fields to have a different structure
    dataset_fields = dict(chain.from_iterable(fields.values()))

    train_iter = build_dataset_iter("train", dataset_fields, opt)
    valid_iter = build_dataset_iter(
        "valid", dataset_fields, opt, is_train=False)

    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')
    train_steps = opt.train_steps
    if opt.single_pass and train_steps > 0:
        logger.warning("Option single_pass is enabled, ignoring train_steps.")
        train_steps = 0
    trainer.train(
        train_iter,
        train_steps,
        save_checkpoint_steps=opt.save_checkpoint_steps,
        valid_iter=valid_iter,
        valid_steps=opt.valid_steps,
        early_stop_round=opt.early_stop_round,
        early_stop_threshold=opt.early_stop_threshold)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()

## Home-made Runner Class

Features:
1. Automatic hyperparameters search. (TODO)
2. Reusable.

In [12]:
class BaseRunner:
    
    def __init__(self, data_path):
        self.data_path = data_path
        self.model_id = None
    
    def run(self, option, model_id=None, train_from=None):
        model_id = model_id if model_id is not None else BaseRunner.generate_id()
        self.model_id = model_id
        self.prepare_save_file()
        arg_dict = {
            'data': self.data_path,
            'save_model': self.model_path,
            'gpu_ranks': 0,
            'tensorboard': None,
            'tensorboard_log_dir': self.log_path + '/log'
        }
        arg_dict.update(option)
        if train_from is not None:
            arg_dict['train_from'] = train_from
        ArgumentHelper.save_args(arg_dict, self.model_path)
        parser = configargparse.ArgumentParser(
            description='train.py',
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter)

        opts.add_md_help_argument(parser)
        opts.model_opts(parser)
        opts.train_opts(parser)
        parser.add_argument('-early_stop_round', type=int, default=10000, help='Early stop round')
        parser.add_argument('-early_stop_threshold', type=float, default=1, help='Early stop threshold')

        opt = parser.parse_args(ArgumentHelper.convert_dict_to_arg_array(arg_dict))
        train_main(opt, 0)

    def translate_main(self, opt, logger, model_id):
        translator = build_translator(opt, report_score=True)
        src_shards = split_corpus(opt.src, opt.shard_size)
        tgt_shards = split_corpus(opt.tgt, opt.shard_size) \
            if opt.tgt is not None else [None]*opt.shard_size
        shard_pairs = zip(src_shards, tgt_shards)

        for i, (src_shard, tgt_shard) in enumerate(shard_pairs):
            logger.info("Translating shard %d." % i)
            translator.translate(
                src=src_shard,
                tgt=tgt_shard,
                src_dir=opt.src_dir,
                batch_size=opt.batch_size,
                attn_debug=opt.attn_debug
            )

    def translate(self, args, model_id=None):
        model_id = model_id if model_id is not None else self.model_id
        if model_id is None:
            raise Exception()
        self.model_id = model_id
        self.prepare_save_file()
        args['model'] = self.model_path + '_step_' + str(BaseRunner.get_latest_model(
                                                        self.model_dir,
                                                        self.model_prefix,
                                                        self.model_id)) + '.pt'
        parser = configargparse.ArgumentParser(
            description='translate.py',
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter)
        opts.config_opts(parser)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        opt = parser.parse_args(ArgumentHelper.convert_dict_to_arg_array(args))
        logger = init_logger(opt.log_file)
        self.translate_main(opt, logger, model_id)

    @staticmethod
    def generate_id():
        return hashlib.md5(str(uuid.uuid4()).encode()).hexdigest()
    
    @staticmethod
    def get_latest_model(model_dir, model_prefix, model_id):
        all_steps = []
        for root, dirs, files in os.walk(model_dir):
            path = root.split(os.sep)
            for file in files:
                absolute_filename = os.path.join(root, file)
                if model_id in absolute_filename:
                    steps = re.findall(model_prefix + model_id + '_step_(\d+)\.pt', absolute_filename)
                    if len(steps) > 0:
                        all_steps.append(int(steps[0]))
        return max(all_steps)

    def prepare_save_file(self):
        model_id = self.model_id
        model_dir = '/mnt/drive-2t/model/'
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        model_prefix = 'opennmt_'
        model_path = model_dir + model_prefix + model_id
        log_dir = '/mnt/drive-2t/log/'
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        log_prefix = model_prefix
        log_path = log_dir + log_prefix + model_id
        if not os.path.exists(log_path):
            os.mkdir(log_path)
        self.model_path = model_path
        self.log_path = log_path
        self.model_prefix = model_prefix
        self.model_dir = model_dir


runner = BaseRunner('data/nmt15')

In [13]:
reverse_runner = BaseRunner('data/nmt15-reverse')

In [14]:
torch.cuda.memory_allocated(0) / 1024 / 1024

0.0

In [15]:
torch.cuda.memory_cached(0) / 1024 / 1024 

0.0

In [16]:
torch.cuda.max_memory_cached(0) / 1024 / 1024

0.0

In [17]:
all_vars = {k: v for k, v in globals().items() if not k.startswith('_') and k not in ['In', 'Out']}

In [18]:
var_type = {k: str(type(v)) for k, v in all_vars.items() if 'function' not in str(type(v)) and
                                                            'module' not in str(type(v))}
var_type

{'get_ipython': "<class 'method'>",
 'exit': "<class 'IPython.core.autocall.ZMQExitAutocall'>",
 'quit': "<class 'IPython.core.autocall.ZMQExitAutocall'>",
 'Trainer': "<class 'type'>",
 'logger': "<class 'logging.RootLogger'>",
 'Optimizer': "<class 'type'>",
 'chain': "<class 'type'>",
 'ArgumentHelper': "<class 'type'>",
 'BleuHelper': "<class 'type'>",
 'bleu_helper': "<class '__main__.BleuHelper'>",
 'SearchableHyperparameters': "<class 'type'>",
 'MyTrainer': "<class 'type'>",
 'BaseRunner': "<class 'type'>",
 'runner': "<class '__main__.BaseRunner'>",
 'reverse_runner': "<class '__main__.BaseRunner'>"}

In [19]:
del reverse_runner
gc.collect()
reverse_runner = BaseRunner('data/nmt15-reverse')

## Example #0: Preprocessing

In [29]:
# %%bash
# python preprocess.py \
#      -train_src data/nmt15/train_vi \
#      -train_tgt data/nmt15/train_en \
#      -valid_src data/nmt15/valid_vi \
#      -valid_tgt data/nmt15/valid_en \
#      -save_data data/nmt15-reverse \
#      --src_seq_length 200 \
#      --tgt_seq_length 200 \
#      --src_words_min_frequency 3 \
#      --tgt_words_min_frequency 3

## Example #1: Train A Basic LSTM Network

In [16]:
model_args = {
    'encoder_type': 'brnn',
    'rnn_type': 'GRU'
}
train_args = {
    'batch_size': 64,
    'train_steps': 20000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 500,
    'valid_batch': 64,
    'early_stop_round': 5,
    'early_stop_threshold': 0.01,
    'report_every': 100
}

total_args = {}
total_args.update(model_args)
total_args.update(train_args)
reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-16 19:26:39,631 INFO]  * src vocab size = 8870
[2019-02-16 19:26:39,632 INFO]  * tgt vocab size = 20071
[2019-02-16 19:26:39,633 INFO] Building model...
[2019-02-16 19:26:43,167 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 500, padding_idx=1)
        )
      )
    )
    (rnn): GRU(500, 250, num_layers=2, dropout=0.3, bidirectional=True)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3)
    (rnn): StackedGRU(
      (dropout): Dropout(p=0.3)
      (layers): ModuleList(
        (0): GRUCell(1000, 500)
        (1): GRUCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (linear_out): L

last accent valid acc: -inf


[2019-02-16 19:29:13,092 INFO] Validation perplexity: 75.9304
[2019-02-16 19:29:13,093 INFO] Validation accuracy: 32.9474
[2019-02-16 19:29:42,160 INFO] Step 600/20000; acc:  31.85; ppl: 88.06; xent: 4.48; lr: 0.00100; 3590/3064 tok/s;    179 sec
[2019-02-16 19:30:11,635 INFO] Step 700/20000; acc:  32.60; ppl: 79.42; xent: 4.37; lr: 0.00100; 5425/4624 tok/s;    208 sec
[2019-02-16 19:30:41,019 INFO] Step 800/20000; acc:  34.85; ppl: 65.90; xent: 4.19; lr: 0.00100; 5312/4527 tok/s;    238 sec
[2019-02-16 19:31:11,013 INFO] Step 900/20000; acc:  35.54; ppl: 60.60; xent: 4.10; lr: 0.00100; 5210/4449 tok/s;    268 sec
[2019-02-16 19:31:41,459 INFO] Step 1000/20000; acc:  35.98; ppl: 57.13; xent: 4.05; lr: 0.00100; 5233/4462 tok/s;    298 sec
[2019-02-16 19:31:41,569 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 32.94736280636874


[2019-02-16 19:31:57,461 INFO] Validation perplexity: 42.9186
[2019-02-16 19:31:57,462 INFO] Validation accuracy: 38.7609
[2019-02-16 19:32:28,021 INFO] Step 1100/20000; acc:  36.74; ppl: 53.38; xent: 3.98; lr: 0.00100; 3430/2899 tok/s;    345 sec
[2019-02-16 19:32:59,141 INFO] Step 1200/20000; acc:  37.88; ppl: 48.85; xent: 3.89; lr: 0.00100; 5121/4354 tok/s;    376 sec
[2019-02-16 19:33:30,448 INFO] Step 1300/20000; acc:  38.31; ppl: 45.87; xent: 3.83; lr: 0.00100; 5035/4287 tok/s;    407 sec
[2019-02-16 19:34:01,939 INFO] Step 1400/20000; acc:  38.50; ppl: 44.79; xent: 3.80; lr: 0.00100; 4983/4254 tok/s;    439 sec
[2019-02-16 19:34:34,977 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-16 19:34:35,735 INFO] Step 1500/20000; acc:  39.53; ppl: 40.56; xent: 3.70; lr: 0.00100; 4622/3948 tok/s;    472 sec
[2019-02-16 19:34:35,833 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 38.76092348449154


[2019-02-16 19:34:51,810 INFO] Validation perplexity: 35.5603
[2019-02-16 19:34:51,811 INFO] Validation accuracy: 40.1882
[2019-02-16 19:35:24,221 INFO] Step 1600/20000; acc:  39.47; ppl: 39.27; xent: 3.67; lr: 0.00100; 3281/2794 tok/s;    521 sec
[2019-02-16 19:35:56,277 INFO] Step 1700/20000; acc:  40.03; ppl: 36.84; xent: 3.61; lr: 0.00100; 4873/4159 tok/s;    553 sec
[2019-02-16 19:36:28,540 INFO] Step 1800/20000; acc:  40.35; ppl: 35.74; xent: 3.58; lr: 0.00100; 4921/4195 tok/s;    585 sec
[2019-02-16 19:37:00,908 INFO] Step 1900/20000; acc:  40.62; ppl: 33.79; xent: 3.52; lr: 0.00100; 4932/4186 tok/s;    618 sec
[2019-02-16 19:37:33,105 INFO] Step 2000/20000; acc:  41.61; ppl: 30.66; xent: 3.42; lr: 0.00100; 4935/4206 tok/s;    650 sec
[2019-02-16 19:37:33,206 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 40.18819952527848


[2019-02-16 19:37:49,352 INFO] Validation perplexity: 27.8196
[2019-02-16 19:37:49,353 INFO] Validation accuracy: 42.5583
[2019-02-16 19:38:21,894 INFO] Step 2100/20000; acc:  41.08; ppl: 31.50; xent: 3.45; lr: 0.00100; 3265/2781 tok/s;    699 sec
[2019-02-16 19:38:54,809 INFO] Step 2200/20000; acc:  41.33; ppl: 30.37; xent: 3.41; lr: 0.00100; 5003/4254 tok/s;    732 sec
[2019-02-16 19:39:26,821 INFO] Step 2300/20000; acc:  42.86; ppl: 26.85; xent: 3.29; lr: 0.00100; 4729/4042 tok/s;    764 sec
[2019-02-16 19:39:59,215 INFO] Step 2400/20000; acc:  42.43; ppl: 27.51; xent: 3.31; lr: 0.00100; 4872/4153 tok/s;    796 sec
[2019-02-16 19:40:31,632 INFO] Step 2500/20000; acc:  42.53; ppl: 26.94; xent: 3.29; lr: 0.00100; 4856/4147 tok/s;    828 sec
[2019-02-16 19:40:31,736 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 42.55825117569142


[2019-02-16 19:40:47,906 INFO] Validation perplexity: 24.2594
[2019-02-16 19:40:47,907 INFO] Validation accuracy: 44.3825
[2019-02-16 19:41:20,243 INFO] Step 2600/20000; acc:  42.58; ppl: 26.75; xent: 3.29; lr: 0.00100; 3291/2779 tok/s;    877 sec
[2019-02-16 19:41:52,519 INFO] Step 2700/20000; acc:  42.85; ppl: 26.17; xent: 3.26; lr: 0.00100; 4923/4189 tok/s;    909 sec
[2019-02-16 19:42:24,653 INFO] Step 2800/20000; acc:  43.14; ppl: 25.44; xent: 3.24; lr: 0.00100; 4968/4234 tok/s;    941 sec
[2019-02-16 19:42:57,047 INFO] Step 2900/20000; acc:  42.78; ppl: 26.04; xent: 3.26; lr: 0.00100; 4921/4181 tok/s;    974 sec
[2019-02-16 19:43:29,383 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-16 19:43:30,719 INFO] Step 3000/20000; acc:  43.80; ppl: 23.84; xent: 3.17; lr: 0.00100; 4489/3853 tok/s;   1007 sec
[2019-02-16 19:43:30,825 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 44.38246197338359


[2019-02-16 19:43:46,838 INFO] Validation perplexity: 23.3733
[2019-02-16 19:43:46,839 INFO] Validation accuracy: 44.1958


Save last accent model with acc: 44.38246197338359
Decent round: 1


[2019-02-16 19:43:57,297 INFO] Step 3100/20000; acc:  43.29; ppl: 24.43; xent: 3.20; lr: 0.00100; 6015/5117 tok/s;   1034 sec
[2019-02-16 19:44:07,500 INFO] Step 3200/20000; acc:  44.24; ppl: 22.65; xent: 3.12; lr: 0.00100; 15308/13106 tok/s;   1044 sec
[2019-02-16 19:44:17,771 INFO] Step 3300/20000; acc:  44.39; ppl: 22.42; xent: 3.11; lr: 0.00100; 15447/13149 tok/s;   1054 sec
[2019-02-16 19:44:28,229 INFO] Step 3400/20000; acc:  43.80; ppl: 23.15; xent: 3.14; lr: 0.00100; 15351/13012 tok/s;   1065 sec
[2019-02-16 19:44:38,503 INFO] Step 3500/20000; acc:  44.46; ppl: 21.53; xent: 3.07; lr: 0.00100; 15546/13248 tok/s;   1075 sec
[2019-02-16 19:44:38,602 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 44.38246197338359


[2019-02-16 19:44:54,706 INFO] Validation perplexity: 21.3142
[2019-02-16 19:44:54,706 INFO] Validation accuracy: 45.4404
[2019-02-16 19:45:27,174 INFO] Step 3600/20000; acc:  44.33; ppl: 21.77; xent: 3.08; lr: 0.00100; 3268/2782 tok/s;   1124 sec
[2019-02-16 19:45:59,961 INFO] Step 3700/20000; acc:  43.94; ppl: 22.30; xent: 3.10; lr: 0.00100; 5043/4294 tok/s;   1157 sec
[2019-02-16 19:46:31,851 INFO] Step 3800/20000; acc:  45.29; ppl: 20.00; xent: 3.00; lr: 0.00100; 4734/4038 tok/s;   1189 sec
[2019-02-16 19:47:04,186 INFO] Step 3900/20000; acc:  44.96; ppl: 20.51; xent: 3.02; lr: 0.00100; 4819/4113 tok/s;   1221 sec
[2019-02-16 19:47:36,747 INFO] Step 4000/20000; acc:  44.64; ppl: 20.66; xent: 3.03; lr: 0.00100; 4888/4170 tok/s;   1253 sec
[2019-02-16 19:47:36,853 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 45.44036199738636


[2019-02-16 19:47:52,896 INFO] Validation perplexity: 20.4828
[2019-02-16 19:47:52,897 INFO] Validation accuracy: 46.056
[2019-02-16 19:48:25,243 INFO] Step 4100/20000; acc:  44.97; ppl: 20.37; xent: 3.01; lr: 0.00100; 3306/2791 tok/s;   1302 sec
[2019-02-16 19:48:57,435 INFO] Step 4200/20000; acc:  44.82; ppl: 20.37; xent: 3.01; lr: 0.00100; 4901/4177 tok/s;   1334 sec
[2019-02-16 19:49:31,888 INFO] Step 4300/20000; acc:  45.10; ppl: 19.87; xent: 2.99; lr: 0.00100; 4628/3942 tok/s;   1369 sec
[2019-02-16 19:50:05,929 INFO] Step 4400/20000; acc:  45.20; ppl: 20.21; xent: 3.01; lr: 0.00100; 4694/3984 tok/s;   1403 sec
[2019-02-16 19:50:39,843 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-16 19:50:41,860 INFO] Step 4500/20000; acc:  46.30; ppl: 18.51; xent: 2.92; lr: 0.00100; 4212/3627 tok/s;   1439 sec
[2019-02-16 19:50:41,960 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 46.05598869202663


[2019-02-16 19:50:58,357 INFO] Validation perplexity: 20.2581
[2019-02-16 19:50:58,358 INFO] Validation accuracy: 46.0093


Save last accent model with acc: 46.05598869202663
Decent round: 1


[2019-02-16 19:51:09,444 INFO] Step 4600/20000; acc:  45.16; ppl: 19.73; xent: 2.98; lr: 0.00100; 5952/5044 tok/s;   1466 sec
[2019-02-16 19:51:19,774 INFO] Step 4700/20000; acc:  46.45; ppl: 17.76; xent: 2.88; lr: 0.00100; 14648/12564 tok/s;   1476 sec
[2019-02-16 19:51:30,803 INFO] Step 4800/20000; acc:  46.08; ppl: 18.27; xent: 2.91; lr: 0.00100; 14344/12228 tok/s;   1488 sec
[2019-02-16 19:51:41,655 INFO] Step 4900/20000; acc:  45.56; ppl: 19.12; xent: 2.95; lr: 0.00100; 14802/12540 tok/s;   1498 sec
[2019-02-16 19:51:52,228 INFO] Step 5000/20000; acc:  46.23; ppl: 17.71; xent: 2.87; lr: 0.00100; 15149/12931 tok/s;   1509 sec
[2019-02-16 19:51:52,326 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 46.05598869202663


[2019-02-16 19:52:09,675 INFO] Validation perplexity: 19.466
[2019-02-16 19:52:09,676 INFO] Validation accuracy: 46.3827
[2019-02-16 19:52:09,677 INFO] Saving checkpoint /mnt/drive-2t/model/opennmt_182b910f202193cc4dad1cd43d74b3ba_step_5000.pt
[2019-02-16 19:52:45,211 INFO] Step 5100/20000; acc:  45.78; ppl: 18.62; xent: 2.92; lr: 0.00100; 3010/2557 tok/s;   1562 sec
[2019-02-16 19:53:18,194 INFO] Step 5200/20000; acc:  45.60; ppl: 18.66; xent: 2.93; lr: 0.00100; 4988/4254 tok/s;   1595 sec
[2019-02-16 19:53:50,068 INFO] Step 5300/20000; acc:  46.69; ppl: 17.04; xent: 2.84; lr: 0.00100; 4778/4071 tok/s;   1627 sec
[2019-02-16 19:54:22,319 INFO] Step 5400/20000; acc:  46.05; ppl: 17.96; xent: 2.89; lr: 0.00100; 4774/4081 tok/s;   1659 sec
[2019-02-16 19:54:56,007 INFO] Step 5500/20000; acc:  46.05; ppl: 17.86; xent: 2.88; lr: 0.00100; 4762/4060 tok/s;   1693 sec
[2019-02-16 19:54:56,109 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 46.38269311120396


[2019-02-16 19:55:12,230 INFO] Validation perplexity: 19.6073
[2019-02-16 19:55:12,231 INFO] Validation accuracy: 46.4263
[2019-02-16 19:55:44,551 INFO] Step 5600/20000; acc:  46.27; ppl: 17.80; xent: 2.88; lr: 0.00100; 3324/2802 tok/s;   1741 sec
[2019-02-16 19:56:16,982 INFO] Step 5700/20000; acc:  46.02; ppl: 17.82; xent: 2.88; lr: 0.00100; 4849/4138 tok/s;   1774 sec
[2019-02-16 19:56:49,088 INFO] Step 5800/20000; acc:  46.46; ppl: 17.10; xent: 2.84; lr: 0.00100; 4948/4207 tok/s;   1806 sec
[2019-02-16 19:57:21,669 INFO] Step 5900/20000; acc:  46.08; ppl: 17.82; xent: 2.88; lr: 0.00100; 4933/4191 tok/s;   1838 sec
[2019-02-16 19:57:52,727 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-16 19:57:55,415 INFO] Step 6000/20000; acc:  47.16; ppl: 16.41; xent: 2.80; lr: 0.00100; 4502/3876 tok/s;   1872 sec
[2019-02-16 19:57:55,515 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 46.42625370042761


[2019-02-16 19:58:11,547 INFO] Validation perplexity: 19.7417
[2019-02-16 19:58:11,548 INFO] Validation accuracy: 46.0991


Save last accent model with acc: 46.42625370042761
Decent round: 1


[2019-02-16 19:58:22,277 INFO] Step 6100/20000; acc:  46.09; ppl: 17.95; xent: 2.89; lr: 0.00100; 6142/5197 tok/s;   1899 sec
[2019-02-16 19:58:32,096 INFO] Step 6200/20000; acc:  47.79; ppl: 15.46; xent: 2.74; lr: 0.00100; 15250/13108 tok/s;   1909 sec
[2019-02-16 19:58:42,382 INFO] Step 6300/20000; acc:  47.25; ppl: 16.27; xent: 2.79; lr: 0.00100; 15329/13077 tok/s;   1919 sec
[2019-02-16 19:58:52,780 INFO] Step 6400/20000; acc:  46.33; ppl: 17.28; xent: 2.85; lr: 0.00100; 15393/13036 tok/s;   1929 sec
[2019-02-16 19:59:03,129 INFO] Step 6500/20000; acc:  47.24; ppl: 16.29; xent: 2.79; lr: 0.00100; 15577/13304 tok/s;   1940 sec
[2019-02-16 19:59:03,228 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 46.42625370042761


[2019-02-16 19:59:19,387 INFO] Validation perplexity: 18.9723
[2019-02-16 19:59:19,387 INFO] Validation accuracy: 46.6885
[2019-02-16 19:59:51,847 INFO] Step 6600/20000; acc:  46.65; ppl: 16.79; xent: 2.82; lr: 0.00100; 3288/2788 tok/s;   1989 sec
[2019-02-16 20:00:24,680 INFO] Step 6700/20000; acc:  46.33; ppl: 17.49; xent: 2.86; lr: 0.00100; 5005/4270 tok/s;   2021 sec
[2019-02-16 20:00:56,631 INFO] Step 6800/20000; acc:  47.79; ppl: 15.31; xent: 2.73; lr: 0.00100; 4755/4059 tok/s;   2053 sec
[2019-02-16 20:01:28,653 INFO] Step 6900/20000; acc:  47.42; ppl: 16.08; xent: 2.78; lr: 0.00100; 4722/4037 tok/s;   2085 sec
[2019-02-16 20:02:01,100 INFO] Step 7000/20000; acc:  46.83; ppl: 16.55; xent: 2.81; lr: 0.00100; 4981/4251 tok/s;   2118 sec
[2019-02-16 20:02:01,208 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 46.68850622738628


[2019-02-16 20:02:17,315 INFO] Validation perplexity: 18.4581
[2019-02-16 20:02:17,316 INFO] Validation accuracy: 47.0503
[2019-02-16 20:02:49,632 INFO] Step 7100/20000; acc:  46.99; ppl: 16.38; xent: 2.80; lr: 0.00100; 3368/2831 tok/s;   2166 sec
[2019-02-16 20:03:21,835 INFO] Step 7200/20000; acc:  47.18; ppl: 16.40; xent: 2.80; lr: 0.00100; 4873/4163 tok/s;   2199 sec
[2019-02-16 20:03:53,980 INFO] Step 7300/20000; acc:  47.07; ppl: 16.25; xent: 2.79; lr: 0.00100; 4893/4158 tok/s;   2231 sec
[2019-02-16 20:04:26,426 INFO] Step 7400/20000; acc:  46.79; ppl: 16.77; xent: 2.82; lr: 0.00100; 4937/4199 tok/s;   2263 sec
[2019-02-16 20:04:56,971 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-16 20:05:00,283 INFO] Step 7500/20000; acc:  47.67; ppl: 15.52; xent: 2.74; lr: 0.00100; 4502/3871 tok/s;   2297 sec
[2019-02-16 20:05:00,392 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.05032581542756


[2019-02-16 20:05:16,329 INFO] Validation perplexity: 18.8753
[2019-02-16 20:05:16,330 INFO] Validation accuracy: 46.8627


Save last accent model with acc: 47.05032581542756
Decent round: 1


[2019-02-16 20:05:27,080 INFO] Step 7600/20000; acc:  47.01; ppl: 16.61; xent: 2.81; lr: 0.00100; 6141/5202 tok/s;   2324 sec
[2019-02-16 20:05:36,924 INFO] Step 7700/20000; acc:  48.47; ppl: 14.51; xent: 2.67; lr: 0.00100; 15240/13088 tok/s;   2334 sec
[2019-02-16 20:05:47,331 INFO] Step 7800/20000; acc:  47.50; ppl: 15.99; xent: 2.77; lr: 0.00100; 15378/13131 tok/s;   2344 sec
[2019-02-16 20:05:57,594 INFO] Step 7900/20000; acc:  47.31; ppl: 16.09; xent: 2.78; lr: 0.00100; 15477/13086 tok/s;   2354 sec
[2019-02-16 20:06:07,830 INFO] Step 8000/20000; acc:  47.79; ppl: 15.35; xent: 2.73; lr: 0.00100; 15611/13345 tok/s;   2365 sec
[2019-02-16 20:06:07,942 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.05032581542756


[2019-02-16 20:06:24,022 INFO] Validation perplexity: 18.673
[2019-02-16 20:06:24,023 INFO] Validation accuracy: 47.1299
[2019-02-16 20:06:56,672 INFO] Step 8100/20000; acc:  47.16; ppl: 16.14; xent: 2.78; lr: 0.00100; 3349/2835 tok/s;   2413 sec
[2019-02-16 20:07:29,401 INFO] Step 8200/20000; acc:  47.22; ppl: 16.23; xent: 2.79; lr: 0.00100; 4927/4220 tok/s;   2446 sec
[2019-02-16 20:08:01,244 INFO] Step 8300/20000; acc:  48.72; ppl: 14.35; xent: 2.66; lr: 0.00100; 4801/4086 tok/s;   2478 sec
[2019-02-16 20:08:33,293 INFO] Step 8400/20000; acc:  48.01; ppl: 15.14; xent: 2.72; lr: 0.00100; 4692/4017 tok/s;   2510 sec
[2019-02-16 20:09:06,415 INFO] Step 8500/20000; acc:  46.62; ppl: 16.76; xent: 2.82; lr: 0.00100; 5057/4295 tok/s;   2543 sec
[2019-02-16 20:09:06,519 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.12989056513197


[2019-02-16 20:09:22,591 INFO] Validation perplexity: 18.828
[2019-02-16 20:09:22,592 INFO] Validation accuracy: 47.1517
[2019-02-16 20:09:54,320 INFO] Step 8600/20000; acc:  47.83; ppl: 15.09; xent: 2.71; lr: 0.00100; 3277/2768 tok/s;   2591 sec
[2019-02-16 20:10:26,588 INFO] Step 8700/20000; acc:  47.60; ppl: 15.63; xent: 2.75; lr: 0.00100; 4894/4174 tok/s;   2623 sec
[2019-02-16 20:10:58,481 INFO] Step 8800/20000; acc:  47.98; ppl: 15.09; xent: 2.71; lr: 0.00100; 4895/4164 tok/s;   2655 sec
[2019-02-16 20:11:31,214 INFO] Step 8900/20000; acc:  46.97; ppl: 16.44; xent: 2.80; lr: 0.00100; 5043/4275 tok/s;   2688 sec
[2019-02-16 20:12:00,867 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-16 20:12:04,867 INFO] Step 9000/20000; acc:  48.87; ppl: 14.19; xent: 2.65; lr: 0.00100; 4485/3863 tok/s;   2722 sec
[2019-02-16 20:12:04,969 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.151670859743795


[2019-02-16 20:12:20,897 INFO] Validation perplexity: 18.7054
[2019-02-16 20:12:20,898 INFO] Validation accuracy: 47.1134


Save last accent model with acc: 47.151670859743795
Decent round: 1


[2019-02-16 20:12:31,720 INFO] Step 9100/20000; acc:  47.56; ppl: 15.50; xent: 2.74; lr: 0.00100; 6162/5218 tok/s;   2748 sec
[2019-02-16 20:12:41,529 INFO] Step 9200/20000; acc:  49.03; ppl: 13.73; xent: 2.62; lr: 0.00100; 15130/13023 tok/s;   2758 sec
[2019-02-16 20:12:51,830 INFO] Step 9300/20000; acc:  48.38; ppl: 14.83; xent: 2.70; lr: 0.00100; 15374/13150 tok/s;   2769 sec
[2019-02-16 20:13:02,122 INFO] Step 9400/20000; acc:  47.80; ppl: 15.51; xent: 2.74; lr: 0.00100; 15452/13065 tok/s;   2779 sec
[2019-02-16 20:13:12,427 INFO] Step 9500/20000; acc:  48.39; ppl: 14.54; xent: 2.68; lr: 0.00100; 15636/13334 tok/s;   2789 sec
[2019-02-16 20:13:12,522 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.151670859743795


[2019-02-16 20:13:28,758 INFO] Validation perplexity: 19.1499
[2019-02-16 20:13:28,759 INFO] Validation accuracy: 47.1992
[2019-02-16 20:14:01,706 INFO] Step 9600/20000; acc:  47.75; ppl: 15.33; xent: 2.73; lr: 0.00100; 3295/2794 tok/s;   2838 sec
[2019-02-16 20:14:34,590 INFO] Step 9700/20000; acc:  47.64; ppl: 15.57; xent: 2.75; lr: 0.00100; 4892/4179 tok/s;   2871 sec
[2019-02-16 20:15:06,548 INFO] Step 9800/20000; acc:  49.31; ppl: 13.65; xent: 2.61; lr: 0.00100; 4757/4060 tok/s;   2903 sec
[2019-02-16 20:15:38,612 INFO] Step 9900/20000; acc:  48.44; ppl: 14.56; xent: 2.68; lr: 0.00100; 4730/4039 tok/s;   2935 sec
[2019-02-16 20:16:11,982 INFO] Step 10000/20000; acc:  47.45; ppl: 15.94; xent: 2.77; lr: 0.00100; 5033/4280 tok/s;   2969 sec
[2019-02-16 20:16:12,086 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.199231911243075


[2019-02-16 20:16:28,177 INFO] Validation perplexity: 18.7275
[2019-02-16 20:16:28,178 INFO] Validation accuracy: 47.3072
[2019-02-16 20:16:28,179 INFO] Saving checkpoint /mnt/drive-2t/model/opennmt_182b910f202193cc4dad1cd43d74b3ba_step_10000.pt
[2019-02-16 20:17:02,166 INFO] Step 10100/20000; acc:  48.27; ppl: 14.52; xent: 2.68; lr: 0.00100; 3114/2630 tok/s;   3019 sec
[2019-02-16 20:17:34,283 INFO] Step 10200/20000; acc:  48.01; ppl: 14.99; xent: 2.71; lr: 0.00100; 4887/4174 tok/s;   3051 sec
[2019-02-16 20:18:06,535 INFO] Step 10300/20000; acc:  47.87; ppl: 14.98; xent: 2.71; lr: 0.00100; 4978/4215 tok/s;   3083 sec
[2019-02-16 20:18:39,101 INFO] Step 10400/20000; acc:  47.67; ppl: 15.57; xent: 2.75; lr: 0.00100; 4998/4248 tok/s;   3116 sec
[2019-02-16 20:19:08,145 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-16 20:19:12,693 INFO] Step 10500/20000; acc:  49.03; ppl: 13.89; xent: 2.63; lr: 0.00100; 4426/3815 tok/s;   3149 sec
[2019-02-1

last accent valid acc: 47.30724439268538


[2019-02-16 20:19:28,738 INFO] Validation perplexity: 19.0263
[2019-02-16 20:19:28,739 INFO] Validation accuracy: 47.2206


Save last accent model with acc: 47.30724439268538
Decent round: 1


[2019-02-16 20:19:39,490 INFO] Step 10600/20000; acc:  47.80; ppl: 15.32; xent: 2.73; lr: 0.00100; 6199/5246 tok/s;   3176 sec
[2019-02-16 20:19:49,328 INFO] Step 10700/20000; acc:  49.44; ppl: 13.35; xent: 2.59; lr: 0.00100; 15081/12990 tok/s;   3186 sec
[2019-02-16 20:19:59,815 INFO] Step 10800/20000; acc:  48.59; ppl: 14.44; xent: 2.67; lr: 0.00100; 15390/13132 tok/s;   3197 sec
[2019-02-16 20:20:10,096 INFO] Step 10900/20000; acc:  48.33; ppl: 14.61; xent: 2.68; lr: 0.00100; 15359/13012 tok/s;   3207 sec
[2019-02-16 20:20:20,346 INFO] Step 11000/20000; acc:  48.53; ppl: 14.25; xent: 2.66; lr: 0.00100; 15591/13301 tok/s;   3217 sec
[2019-02-16 20:20:20,445 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.30724439268538


[2019-02-16 20:20:36,544 INFO] Validation perplexity: 19.2336
[2019-02-16 20:20:36,545 INFO] Validation accuracy: 47.189


Decent round: 2


[2019-02-16 20:20:47,418 INFO] Step 11100/20000; acc:  47.69; ppl: 15.37; xent: 2.73; lr: 0.00100; 6082/5144 tok/s;   3244 sec
[2019-02-16 20:20:58,134 INFO] Step 11200/20000; acc:  47.92; ppl: 14.96; xent: 2.71; lr: 0.00100; 14992/12818 tok/s;   3255 sec
[2019-02-16 20:21:08,759 INFO] Step 11300/20000; acc:  48.55; ppl: 14.43; xent: 2.67; lr: 0.00100; 14724/12523 tok/s;   3265 sec
[2019-02-16 20:21:18,249 INFO] Step 11400/20000; acc:  49.26; ppl: 13.36; xent: 2.59; lr: 0.00100; 15398/13223 tok/s;   3275 sec
[2019-02-16 20:21:29,847 INFO] Step 11500/20000; acc:  46.98; ppl: 16.12; xent: 2.78; lr: 0.00100; 14660/12447 tok/s;   3287 sec
[2019-02-16 20:21:29,947 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.30724439268538


[2019-02-16 20:21:46,172 INFO] Validation perplexity: 19.1337
[2019-02-16 20:21:46,173 INFO] Validation accuracy: 47.2335


Decent round: 3


[2019-02-16 20:21:55,936 INFO] Step 11600/20000; acc:  48.88; ppl: 13.97; xent: 2.64; lr: 0.00100; 5888/4974 tok/s;   3313 sec
[2019-02-16 20:22:06,083 INFO] Step 11700/20000; acc:  48.25; ppl: 14.82; xent: 2.70; lr: 0.00100; 15423/13169 tok/s;   3323 sec
[2019-02-16 20:22:16,364 INFO] Step 11800/20000; acc:  48.01; ppl: 14.87; xent: 2.70; lr: 0.00100; 15561/13198 tok/s;   3333 sec
[2019-02-16 20:22:27,079 INFO] Step 11900/20000; acc:  47.59; ppl: 15.76; xent: 2.76; lr: 0.00100; 15451/13096 tok/s;   3344 sec
[2019-02-16 20:22:36,999 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-16 20:22:38,782 INFO] Step 12000/20000; acc:  49.11; ppl: 13.88; xent: 2.63; lr: 0.00100; 12870/11077 tok/s;   3355 sec
[2019-02-16 20:22:38,896 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.30724439268538


[2019-02-16 20:22:54,767 INFO] Validation perplexity: 19.2466
[2019-02-16 20:22:54,768 INFO] Validation accuracy: 46.9552


Decent round: 4


[2019-02-16 20:23:05,842 INFO] Step 12100/20000; acc:  47.35; ppl: 15.85; xent: 2.76; lr: 0.00100; 6208/5251 tok/s;   3383 sec
[2019-02-16 20:23:15,557 INFO] Step 12200/20000; acc:  49.83; ppl: 12.82; xent: 2.55; lr: 0.00100; 14673/12697 tok/s;   3392 sec
[2019-02-16 20:23:26,640 INFO] Step 12300/20000; acc:  48.20; ppl: 14.50; xent: 2.67; lr: 0.00100; 14648/12510 tok/s;   3403 sec
[2019-02-16 20:23:37,437 INFO] Step 12400/20000; acc:  48.31; ppl: 14.79; xent: 2.69; lr: 0.00100; 14591/12360 tok/s;   3414 sec
[2019-02-16 20:23:48,124 INFO] Step 12500/20000; acc:  48.29; ppl: 14.55; xent: 2.68; lr: 0.00100; 15101/12862 tok/s;   3425 sec
[2019-02-16 20:23:48,223 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: 47.30724439268538


[2019-02-16 20:24:04,205 INFO] Validation perplexity: 19.3363
[2019-02-16 20:24:04,205 INFO] Validation accuracy: 47.2952


Decent round: 5
meet early stop condition, prev best loss: 47.30724439268538 current loss: 47.29524300585845


AttributeError: 'NMTModel' object has no attribute 'optim'

In [15]:
train_args['valid_batch'] = 64
reverse_runner.run(train_args)

[2019-02-15 20:13:46,283 INFO]  * src vocab size = 8870
[2019-02-15 20:13:46,284 INFO]  * tgt vocab size = 20071
[2019-02-15 20:13:46,284 INFO] Building model...
[2019-02-15 20:13:49,837 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 500, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(500, 500, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3)
      (layers): ModuleList(
        (0): LSTMCell(1000, 500)
        (1): LSTMCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (linear_out): Linear(in_feature

meet early stop condition, prev loss: 52.793656155822454 current loss: 53.52796323130673


In [16]:
reverse_runner.model_id

'18afdcd77652c3d69836a58403953bac'

## Example #2: Train A Transformer

In [18]:
model_args = {
    'model_dtype': 'fp32',
    'encoder_type': 'transformer',
    'decoder_type': 'transformer',
    'layers': 6,
    'position_encoding': None,
    'max_generator_batches': 2,
    'dropout': 0.1,
    'enc_rnn_size': 512,
    'dec_rnn_size': 512,
    'src_word_vec_size': 512,
    'tgt_word_vec_size': 512,
    'accum_count': 4
}
train_args = {
    'batch_size': 16,
    'train_steps': 100000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 200,
    'valid_batch': 16,
    'early_stop_round': 10,
    'early_stop_threshold': 0,
    'report_every': 100
}
total_args = {}
total_args.update(model_args)
total_args.update(train_args)

reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-17 13:47:46,400 INFO]  * src vocab size = 8870
[2019-02-17 13:47:46,402 INFO]  * tgt vocab size = 20071
[2019-02-17 13:47:46,403 INFO] Building model...
[2019-02-17 13:47:50,630 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 512, padding_idx=1)
        )
        (pe): PositionalEncoding(
          (dropout): Dropout(p=0.1)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=512, out_features=512, bias=True)
          (linear_values): Linear(in_features=512, out_features=512, bias=True)
          (linear_query): Linear(in_features=512, out_features=512, bias=True)
          (softmax): Softmax()
          (dropout): Dropout(p=0.1)
          (final_linear): Linear(in_features=512, out_features=512, bias=True)
        )
       

0 Testing GPU memory capability with batch shape: torch.Size([200, 16, 1]) torch.Size([190, 16, 1])


[2019-02-17 13:49:28,193 INFO] Step 100/100000; loss: 2621894.360388 acc:   9.87; ppl: 1468.88; xent: 7.29; lr: 0.00100; 4701/3692 tok/s;     97 sec
[2019-02-17 13:50:44,865 INFO] Step 200/100000; loss: 1312841.478624 acc:  19.83; ppl: 207.88; xent: 5.34; lr: 0.00100; 3932/3208 tok/s;    174 sec
[2019-02-17 13:50:44,973 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -inf


[2019-02-17 13:51:07,936 INFO] Validation perplexity: 169.087
[2019-02-17 13:51:07,937 INFO] Validation accuracy: 17.9167
[2019-02-17 13:52:20,121 INFO] Step 300/100000; loss: 1025134.557983 acc:  22.01; ppl: 155.75; xent: 5.05; lr: 0.00100; 2590/2132 tok/s;    269 sec
[2019-02-17 13:53:28,372 INFO] Step 400/100000; loss: 846213.914762 acc:  23.66; ppl: 125.23; xent: 4.83; lr: 0.00100; 3088/2567 tok/s;    338 sec
[2019-02-17 13:53:28,476 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -1154209.2971801758


[2019-02-17 13:53:51,487 INFO] Validation perplexity: 129.13
[2019-02-17 13:53:51,488 INFO] Validation accuracy: 20.3286
[2019-02-17 13:54:57,907 INFO] Step 500/100000; loss: 725905.292447 acc:  25.47; ppl: 105.02; xent: 4.65; lr: 0.00100; 2048/1742 tok/s;    427 sec
[2019-02-17 13:56:02,954 INFO] Step 600/100000; loss: 625419.903428 acc:  27.69; ppl: 90.40; xent: 4.50; lr: 0.00100; 2486/2135 tok/s;    492 sec
[2019-02-17 13:56:03,061 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -1093558.2692260742


[2019-02-17 13:56:25,877 INFO] Validation perplexity: 103.692
[2019-02-17 13:56:25,878 INFO] Validation accuracy: 24.0854
[2019-02-17 13:57:29,123 INFO] Step 700/100000; loss: 524134.457756 acc:  31.63; ppl: 68.98; xent: 4.23; lr: 0.00100; 1663/1437 tok/s;    578 sec
[2019-02-17 13:58:31,544 INFO] Step 800/100000; loss: 438472.353240 acc:  36.38; ppl: 51.59; xent: 3.94; lr: 0.00100; 2041/1781 tok/s;    641 sec
[2019-02-17 13:58:31,649 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -1044200.5935668945


[2019-02-17 13:58:54,671 INFO] Validation perplexity: 81.9583
[2019-02-17 13:58:54,672 INFO] Validation accuracy: 29.1896
[2019-02-17 13:59:55,934 INFO] Step 900/100000; loss: 368895.661441 acc:  40.03; ppl: 39.85; xent: 3.69; lr: 0.00100; 1344/1186 tok/s;    725 sec
[2019-02-17 14:00:56,234 INFO] Step 1000/100000; loss: 312602.137179 acc:  43.20; ppl: 31.77; xent: 3.46; lr: 0.00100; 1670/1499 tok/s;    785 sec
[2019-02-17 14:00:56,338 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -991282.8646850586


[2019-02-17 14:01:19,221 INFO] Validation perplexity: 77.1449
[2019-02-17 14:01:19,222 INFO] Validation accuracy: 30.3808
[2019-02-17 14:02:18,851 INFO] Step 1100/100000; loss: 265746.847168 acc:  45.54; ppl: 26.43; xent: 3.27; lr: 0.00100; 1076/982 tok/s;    868 sec
[2019-02-17 14:03:17,673 INFO] Step 1200/100000; loss: 223866.080187 acc:  48.00; ppl: 21.79; xent: 3.08; lr: 0.00100; 1327/1235 tok/s;    927 sec
[2019-02-17 14:03:17,791 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -977666.166809082


[2019-02-17 14:03:40,701 INFO] Validation perplexity: 96.9517
[2019-02-17 14:03:40,702 INFO] Validation accuracy: 28.7322


Save last accent model with acc: -977666.166809082
Decent round: 1


[2019-02-17 14:04:11,112 INFO] Step 1300/100000; loss: 185233.869574 acc:  51.04; ppl: 17.77; xent: 2.88; lr: 0.00100; 1254/1205 tok/s;    980 sec
[2019-02-17 14:04:40,535 INFO] Step 1400/100000; loss: 150791.138944 acc:  53.29; ppl: 14.73; xent: 2.69; lr: 0.00100; 1866/1905 tok/s;   1010 sec
[2019-02-17 14:04:40,640 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -977666.166809082


[2019-02-17 14:05:03,622 INFO] Validation perplexity: 88.2704
[2019-02-17 14:05:03,622 INFO] Validation accuracy: 27.9317


Decent round: 2


[2019-02-17 14:05:32,770 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844


RuntimeError: CUDA out of memory. Tried to allocate 23.62 MiB (GPU 0; 5.76 GiB total capacity; 3.86 GiB already allocated; 35.06 MiB free; 228.79 MiB cached)

In [17]:
{k: v in globals().items() if k 

{'__name__': '__main__',
 '__doc__': 'Automatically created module for IPython interactive environment',
 '__package__': None,
 '__loader__': None,
 '__spec__': None,
 '__builtin__': <module 'builtins' (built-in)>,
 '__builtins__': <module 'builtins' (built-in)>,
 '_ih': ['',
  'import onmt\nfrom onmt.trainer import Trainer\nfrom onmt.utils.logging import logger\nfrom onmt.utils.loss import build_loss_compute\nimport onmt.opts as opts\nfrom onmt.inputters.inputter import build_dataset_iter, \\\n    load_old_vocab, old_style_vocab\nfrom onmt.model_builder import build_model\nfrom onmt.utils.optimizers import Optimizer\nfrom onmt.utils.misc import set_random_seed\nfrom onmt.models import build_model_saver\nfrom onmt.utils.logging import init_logger, logger\nfrom onmt.utils.misc import split_corpus\nfrom onmt.translate.translator import build_translator\n\nimport itertools\nimport configargparse\nimport os\nfrom itertools import chain\nimport torch\nimport subprocess, shlex, re, uuid, has

## Example #3: Make Prediction

In [29]:
model_id = '832c1d8d66c693bce372445f692b0099'

translate_args = {
    'src': 'data/nmt15/test_en',
    'tgt': 'data/nmt15/test_vi',
    'out': '/mnt/drive-2t/pred/opennmt_' + model_id + '.txt',
    'replace_unk': None,
    'gpu': '0',
    'batch_size': '16'
}

runner.translate(translate_args, model_id)

[2019-02-15 17:15:34,564 INFO] Translating shard 0.
[2019-02-15 17:22:55,574 INFO] Translating shard 1.


PRED AVG SCORE: -0.6200, PRED PPL: 1.8589
GOLD AVG SCORE: -10.4237, GOLD PPL: 33648.8349


[2019-02-15 17:30:52,361 INFO] Translating shard 2.


PRED AVG SCORE: -0.6197, PRED PPL: 1.8584
GOLD AVG SCORE: -10.4420, GOLD PPL: 34270.6062
PRED AVG SCORE: -0.6319, PRED PPL: 1.8812
GOLD AVG SCORE: -10.4421, GOLD PPL: 34270.9004


In [38]:
class MyTrainer(Trainer):
    def train(self,
              train_iter,
              train_steps,
              save_checkpoint_steps=5000,
              valid_iter=None,
              valid_steps=10000,
              early_stop_round=50000,
              early_stop_threshold=1):
        if valid_iter is None:
            logger.info('Start training loop without validation...')
        else:
            logger.info('Start training loop and validate every %d steps...',
                        valid_steps)

        total_stats = onmt.utils.Statistics()
        report_stats = onmt.utils.Statistics()
        self._start_report_manager(start_time=total_stats.start_time)
        
        valid_early_stop_loss = []
        for i, (batches, normalization) in enumerate(
                self._accum_batches(train_iter)):
            step = self.optim.training_step

            self._gradient_accumulation(
                batches, normalization, total_stats,
                report_stats)

            if self.average_decay > 0 and i % self.average_every == 0:
                self._update_average(step)

            report_stats = self._maybe_report_training(
                step, train_steps,
                self.optim.learning_rate(),
                report_stats)

            if valid_iter is not None and step % valid_steps == 0:
                valid_stats = self.validate(
                    valid_iter, moving_average=self.moving_average)
                valid_stats = self._maybe_gather_stats(valid_stats)
                valid_loss = valid_stats.accuracy()
#                 valid_early_stop_loss.append(valid_loss)
                if len(valid_early_stop_loss) == early_stop_round:
                    # 条件为当前valid loss要超过前n个valid loss最好的那个加threshold
                    if valid_loss - max(valid_early_stop_loss) < early_stop_threshold:
                        print('meet early stop condition, prev best loss:', max(valid_early_stop_loss), 'current loss:', valid_loss)
                        if self.model_saver is not None:
                            self.model_saver.save(step, moving_average=self.moving_average)
                        return total_stats
                    else:
                        valid_early_stop_loss.pop(0)
                        valid_early_stop_loss
                self._report_step(self.optim.learning_rate(),
                                  step, valid_stats=valid_stats)

            if (self.model_saver is not None
                    and (save_checkpoint_steps != 0
                         and step % save_checkpoint_steps == 0)):
                self.model_saver.save(step, moving_average=self.moving_average)

            if train_steps > 0 and step >= train_steps:
                break

        if self.model_saver is not None:
            self.model_saver.save(step, moving_average=self.moving_average)
        return total_stats

(36.37, 2.0)

该model(transformer)的valid loss = 51.17859672702835, last_train_acc = 58.17

In [34]:
model_id = '5cb980256df787cdd84b166f65503553'
runner.translate(build_translate_args(model_id), model_id)

[2019-02-15 19:46:30,266 INFO] Translating shard 0.
[2019-02-15 19:48:39,104 INFO] Translating shard 1.


PRED AVG SCORE: -0.4451, PRED PPL: 1.5607
GOLD AVG SCORE: -11.8941, GOLD PPL: 146406.9002


[2019-02-15 19:50:56,516 INFO] Translating shard 2.


PRED AVG SCORE: -0.4401, PRED PPL: 1.5529
GOLD AVG SCORE: -11.8323, GOLD PPL: 137626.8257
PRED AVG SCORE: -0.4524, PRED PPL: 1.5722
GOLD AVG SCORE: -11.7261, GOLD PPL: 123763.2465


In [37]:
pred = '/mnt/drive-2t/pred/opennmt_' + model_id + '.txt'
ref = 'data/nmt15/test_vi'
bleu_evaluate(pred, ref), bleu_evaluate(pred, ref, '0.25 0.25 0.25 0.25')

(4.0, 24.16)

LSTM model valid loss = 53.67019634197286, last_train_acc = 67.98

In [15]:
model_id = '18afdcd77652c3d69836a58403953bac'
args = build_translate_args_reverse(model_id)
args['batch_size'] = 32
reverse_runner.translate(args, model_id)

[2019-02-15 20:47:26,898 INFO] Translating shard 0.
[2019-02-15 20:48:58,846 INFO] Translating shard 1.


PRED AVG SCORE: -0.6594, PRED PPL: 1.9336
GOLD AVG SCORE: -11.0811, GOLD PPL: 64930.4421


[2019-02-15 20:50:33,734 INFO] Translating shard 2.


PRED AVG SCORE: -0.6708, PRED PPL: 1.9557
GOLD AVG SCORE: -11.0883, GOLD PPL: 65401.4528
PRED AVG SCORE: -0.6949, PRED PPL: 2.0034
GOLD AVG SCORE: -11.0109, GOLD PPL: 60531.6975


In [16]:
pred = '/mnt/drive-2t/pred/opennmt_' + model_id + '.txt'
ref = 'data/nmt15/test_en'
bleu_evaluate(pred, ref), bleu_evaluate(pred, ref, '0.25 0.25 0.25 0.25')

(35.73, 21.21)

In [20]:
from pprint import pprint
pred_head = !head $pred
ref_head = !head $ref
pairs = list(zip(pred_head, ref_head))
for pair in pairs:
    print(pair[0])
    print()
    print(pair[1])
    print('-------------')

Now , it &apos;s time that I ended up , and as I mentioned before , of course , I &apos;ve got a lot of other data if you &apos;re interested , but I just want to give this basic idea about communication to its language , and the potential of that possibility .

Now I just want to wrap up , and as I was mentioning earlier of course I have a lot of other data if you &apos;re interested , but I just wanted to give this sort of basic idea of being able to communicate with the brain in its language , and the potential power of being able to do that .
-------------
This is very different from the artificial device that you have to bridge from the brain to a device . Here we have to bridge the world from the external brain to a device .

So it &apos;s different from the motor prosthetics where you &apos;re communicating from the brain to a device . Here we have to communicate from the outside world into the brain and be understood , and be understood by the brain .
-------------
And then the

In [21]:
evaluate_human('18afdcd77652c3d69836a58403953bac')

{
    "data": "data/nmt15-reverse",
    "save_model": "/mnt/drive-2t/model/opennmt_18afdcd77652c3d69836a58403953bac",
    "gpu_ranks": 0,
    "tensorboard": null,
    "tensorboard_log_dir": "/mnt/drive-2t/log/opennmt_18afdcd77652c3d69836a58403953bac/log",
    "batch_size": 128,
    "train_steps": 20000,
    "optim": "adam",
    "learning_rate": 0.001,
    "valid_step": 500,
    "valid_batch": 64,
    "early_stop_round": 5,
    "early_stop_threshold": 1,
    "report_every": 100
}35.73 21.21
Now , it &apos;s time that I ended up , and as I mentioned before , of course , I &apos;ve got a lot of other data if you &apos;re interested , but I just want to give this basic idea about communication to its language , and the potential of that possibility .

Now I just want to wrap up , and as I was mentioning earlier of course I have a lot of other data if you &apos;re interested , but I just wanted to give this sort of basic idea of being able to communicate with the brain in its language , and

In [18]:
model_id = '5384edb1bec7646d9b0008074304d283'
args = build_translate_args_reverse(model_id)
args['batch_size'] = 16
reverse_runner.translate(args, model_id)

[2019-02-16 11:08:55,847 INFO] Translating shard 0.
[2019-02-16 11:17:00,743 INFO] Translating shard 1.


PRED AVG SCORE: -0.8561, PRED PPL: 2.3540
GOLD AVG SCORE: -10.7383, GOLD PPL: 46086.8161


[2019-02-16 11:24:55,976 INFO] Translating shard 2.


PRED AVG SCORE: -0.8727, PRED PPL: 2.3933
GOLD AVG SCORE: -10.7712, GOLD PPL: 47628.2838
PRED AVG SCORE: -0.8991, PRED PPL: 2.4575
GOLD AVG SCORE: -10.7426, GOLD PPL: 46286.4810


In [19]:
pred = '/mnt/drive-2t/pred/opennmt_' + model_id + '.txt'
ref = 'data/nmt15/test_en'
bleu_evaluate(pred, ref), bleu_evaluate(pred, ref, '0.25 0.25 0.25 0.25')

(31.22, 17.0)

In [22]:
from pprint import pprint
pred_head = !head $pred
ref_head = !head $ref
pairs = list(zip(pred_head, ref_head))
for pair in pairs:
    print(pair[0].replace('&apos;', '\''))
    print()
    print(pair[1].replace('&apos;', '\''))
    print('-------------')

Now , at the end of the end of the conclusion , I 've mentioned before , I 've got a lot of other data , but if you want to give me a sense of the idea of the brain , it 's ability to interact with the potential language , and now , it 's right now .

Now I just want to wrap up , and as I was mentioning earlier of course I have a lot of other data if you 're interested , but I just wanted to give this sort of basic idea of being able to communicate with the brain in its language , and the potential power of being able to do that .
-------------
This is very different with artificial biology , and you have to communicate from the brain to the brain . Here we have to be able to communicate north through the world , and understand the brain .

So it 's different from the motor prosthetics where you 're communicating from the brain to a device . Here we have to communicate from the outside world into the brain and be understood , and be understood by the brain .
-------------
And then fina

## Example #4: GRU

In [15]:
model_args = {
    'model_dtype': 'fp32',
    'encoder_type': 'rnn',
    'decoder_type': 'rnn',
    'rnn_type': 'GRU',
    'layers': 3,
    'max_generator_batches': 2,
    'dropout': 0.1,
    'enc_rnn_size': 512,
    'dec_rnn_size': 512,
    'src_word_vec_size': 512,
    'tgt_word_vec_size': 512
}
train_args = {
    'batch_size': 64,
    'train_steps': 20000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 500,
    'valid_batch': 64,
    'early_stop_round': 5,
    'early_stop_threshold': 0.1,
    'report_every': 100
}

total_args = {}
total_args.update(model_args)
total_args.update(train_args)

reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-16 13:22:25,039 INFO]  * src vocab size = 8870
[2019-02-16 13:22:25,042 INFO]  * tgt vocab size = 20071
[2019-02-16 13:22:25,042 INFO] Building model...
[2019-02-16 13:22:28,630 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 512, padding_idx=1)
        )
      )
    )
    (rnn): GRU(512, 512, num_layers=3, dropout=0.1)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 512, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.1)
    (rnn): StackedGRU(
      (dropout): Dropout(p=0.1)
      (layers): ModuleList(
        (0): GRUCell(1024, 512)
        (1): GRUCell(512, 512)
        (2): GRUCell(512, 512)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=512, out_features=512, bias=False)
      (lin

meet early stop condition, prev loss: 36.793585036493106 current loss: 37.37898601616187


'990547e9b5cc3ef3ab8df6728e6f8669'

In [15]:
model_id = '990547e9b5cc3ef3ab8df6728e6f8669'
args = build_translate_args_reverse(model_id)
args['batch_size'] = 32
reverse_runner.translate(args, model_id)

[2019-02-16 13:42:42,908 INFO] Translating shard 0.
[2019-02-16 13:44:19,325 INFO] Translating shard 1.


PRED AVG SCORE: -1.1561, PRED PPL: 3.1776
GOLD AVG SCORE: -10.0213, GOLD PPL: 22499.8106


[2019-02-16 13:45:57,107 INFO] Translating shard 2.


PRED AVG SCORE: -1.1661, PRED PPL: 3.2096
GOLD AVG SCORE: -10.0651, GOLD PPL: 23508.4893
PRED AVG SCORE: -1.1886, PRED PPL: 3.2823
GOLD AVG SCORE: -10.0625, GOLD PPL: 23446.4471


In [23]:
evaluate_human('990547e9b5cc3ef3ab8df6728e6f8669')

{
    "data": "data/nmt15-reverse",
    "save_model": "/mnt/drive-2t/model/opennmt_990547e9b5cc3ef3ab8df6728e6f8669",
    "gpu_ranks": 0,
    "tensorboard": null,
    "tensorboard_log_dir": "/mnt/drive-2t/log/opennmt_990547e9b5cc3ef3ab8df6728e6f8669/log",
    "model_dtype": "fp32",
    "encoder_type": "rnn",
    "decoder_type": "rnn",
    "rnn_type": "GRU",
    "layers": 3,
    "max_generator_batches": 2,
    "dropout": 0.1,
    "enc_rnn_size": 512,
    "dec_rnn_size": 512,
    "src_word_vec_size": 512,
    "tgt_word_vec_size": 512,
    "batch_size": 64,
    "train_steps": 20000,
    "optim": "adam",
    "learning_rate": 0.001,
    "valid_step": 500,
    "valid_batch": 64,
    "early_stop_round": 5,
    "early_stop_threshold": 0.1,
    "report_every": 100
}
23.3 11.89
Now , before I go back to this time , I &apos;m going to show you , but I &apos;m going to talk about this , but it &apos;s a combination of time , and if you &apos;re talking about it , it &apos;s the idea of the human b

In [16]:
model_args = {
    'model_dtype': 'fp32',
    'encoder_type': 'rnn',
    'decoder_type': 'rnn',
    'rnn_type': 'GRU',
#     'layers': 3,
#     'max_generator_batches': 2,
    'dropout': 0.3,
#     'enc_rnn_size': 512,
#     'dec_rnn_size': 512,
#     'src_word_vec_size': 512,
#     'tgt_word_vec_size': 512
}
train_args = {
    'batch_size': 64,
    'train_steps': 20000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 500,
    'valid_batch': 64,
    'early_stop_round': 5,
    'early_stop_threshold': 0.1,
    'report_every': 100
}

total_args = {}
total_args.update(model_args)
total_args.update(train_args)

reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-16 13:57:27,389 INFO]  * src vocab size = 8870
[2019-02-16 13:57:27,390 INFO]  * tgt vocab size = 20071
[2019-02-16 13:57:27,392 INFO] Building model...
[2019-02-16 13:57:30,951 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 500, padding_idx=1)
        )
      )
    )
    (rnn): GRU(500, 500, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3)
    (rnn): StackedGRU(
      (dropout): Dropout(p=0.3)
      (layers): ModuleList(
        (0): GRUCell(1000, 500)
        (1): GRUCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (linear_out): Linear(in_features=10

meet early stop condition, prev best loss: 44.19043978415284 current loss: 40.965622694177995


'f40ce12cd75f5edc1a96a6e26f46ed4e'

In [18]:
model_id = 'f40ce12cd75f5edc1a96a6e26f46ed4e'
args = build_translate_args_reverse(model_id)
args['batch_size'] = 32
reverse_runner.translate(args, model_id)

[2019-02-16 14:09:21,216 INFO] Translating shard 0.
[2019-02-16 14:10:34,134 INFO] Translating shard 1.


PRED AVG SCORE: -1.0952, PRED PPL: 2.9897
GOLD AVG SCORE: -9.4220, GOLD PPL: 12357.4480


[2019-02-16 14:11:48,342 INFO] Translating shard 2.


PRED AVG SCORE: -1.1152, PRED PPL: 3.0503
GOLD AVG SCORE: -9.4861, GOLD PPL: 13175.3264
PRED AVG SCORE: -1.1401, PRED PPL: 3.1272
GOLD AVG SCORE: -9.5160, GOLD PPL: 13575.0565


In [19]:
evaluate_human('f40ce12cd75f5edc1a96a6e26f46ed4e')

{
    "data": "data/nmt15-reverse",
    "save_model": "/mnt/drive-2t/model/opennmt_f40ce12cd75f5edc1a96a6e26f46ed4e",
    "gpu_ranks": 0,
    "tensorboard": null,
    "tensorboard_log_dir": "/mnt/drive-2t/log/opennmt_f40ce12cd75f5edc1a96a6e26f46ed4e/log",
    "model_dtype": "fp32",
    "encoder_type": "rnn",
    "decoder_type": "rnn",
    "rnn_type": "GRU",
    "dropout": 0.3,
    "batch_size": 64,
    "train_steps": 20000,
    "optim": "adam",
    "learning_rate": 0.001,
    "valid_step": 500,
    "valid_batch": 64,
    "early_stop_round": 5,
    "early_stop_threshold": 0.1,
    "report_every": 100
}
21.52 12.44
Now , this time I ended up , and I mentioned earlier , of course , of course , but I wanted to give this idea of this idea about its body .

Now I just want to wrap up , and as I was mentioning earlier of course I have a lot of other data if you &apos;re interested , but I just wanted to give this sort of basic idea of being able to communicate with the brain in its language ,

In [16]:
model_args = {
    'model_dtype': 'fp32',
    'encoder_type': 'rnn',
    'decoder_type': 'rnn',
#     'rnn_type': 'GRU',
#     'layers': 3,
#     'max_generator_batches': 2,
    'dropout': 0.3,
#     'enc_rnn_size': 512,
#     'dec_rnn_size': 512,
#     'src_word_vec_size': 512,
#     'tgt_word_vec_size': 512
}
train_args = {
    'batch_size': 64,
    'train_steps': 20000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 500,
    'valid_batch': 64,
    'early_stop_round': 5,
    'early_stop_threshold': 0.1,
    'report_every': 100
}

total_args = {}
total_args.update(model_args)
total_args.update(train_args)

reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-16 14:23:42,127 INFO]  * src vocab size = 8870
[2019-02-16 14:23:42,128 INFO]  * tgt vocab size = 20071
[2019-02-16 14:23:42,130 INFO] Building model...
[2019-02-16 14:23:45,681 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 500, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(500, 500, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3)
      (layers): ModuleList(
        (0): LSTMCell(1000, 500)
        (1): LSTMCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (linear_out): Linear(in_feature

meet early stop condition, prev best loss: 53.28260154506743 current loss: 53.23859646003538


'22846dcbc32b2f51a395010cde5d7f85'

In [17]:
model_id = '22846dcbc32b2f51a395010cde5d7f85'
args = build_translate_args_reverse(model_id)
args['batch_size'] = 32
reverse_runner.translate(args, model_id)

[2019-02-16 14:44:18,390 INFO] Translating shard 0.
[2019-02-16 14:45:53,548 INFO] Translating shard 1.


PRED AVG SCORE: -0.7247, PRED PPL: 2.0640
GOLD AVG SCORE: -10.3031, GOLD PPL: 29825.0073


[2019-02-16 14:47:31,194 INFO] Translating shard 2.


PRED AVG SCORE: -0.7346, PRED PPL: 2.0846
GOLD AVG SCORE: -10.2928, GOLD PPL: 29518.2979
PRED AVG SCORE: -0.7631, PRED PPL: 2.1449
GOLD AVG SCORE: -10.2362, GOLD PPL: 27895.0392


In [19]:
evaluate_human('22846dcbc32b2f51a395010cde5d7f85')

{
    "data": "data/nmt15-reverse",
    "save_model": "/mnt/drive-2t/model/opennmt_22846dcbc32b2f51a395010cde5d7f85",
    "gpu_ranks": 0,
    "tensorboard": null,
    "tensorboard_log_dir": "/mnt/drive-2t/log/opennmt_22846dcbc32b2f51a395010cde5d7f85/log",
    "model_dtype": "fp32",
    "encoder_type": "rnn",
    "decoder_type": "rnn",
    "dropout": 0.3,
    "batch_size": 64,
    "train_steps": 20000,
    "optim": "adam",
    "learning_rate": 0.001,
    "valid_step": 500,
    "valid_batch": 64,
    "early_stop_round": 5,
    "early_stop_threshold": 0.1,
    "report_every": 100
}
35.56 21.0
Now , it &apos;s time that I ended up , and as I mentioned earlier , I had a lot of other data if you &apos;re interested in , but I just wanted to give this basic idea about communication with the brain of it , and the potential of that ability to be the potential of that ability .

Now I just want to wrap up , and as I was mentioning earlier of course I have a lot of other data if you &apos;re inte

In [16]:
model_args = {
    'model_dtype': 'fp32',
    'encoder_type': 'brnn',
    'decoder_type': 'rnn',
#     'rnn_type': 'GRU',
#     'layers': 3,
#     'max_generator_batches': 2,
    'dropout': 0.3,
#     'enc_rnn_size': 512,
#     'dec_rnn_size': 512,
#     'src_word_vec_size': 512,
#     'tgt_word_vec_size': 512
}
train_args = {
    'batch_size': 128,
    'train_steps': 20000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 500,
    'valid_batch': 64,
    'early_stop_round': 5,
    'early_stop_threshold': 0.1,
    'report_every': 100
}

total_args = {}
total_args.update(model_args)
total_args.update(train_args)

reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-16 14:53:42,839 INFO]  * src vocab size = 8870
[2019-02-16 14:53:42,840 INFO]  * tgt vocab size = 20071
[2019-02-16 14:53:42,842 INFO] Building model...
[2019-02-16 14:53:47,064 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 500, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(500, 250, num_layers=2, dropout=0.3, bidirectional=True)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3)
      (layers): ModuleList(
        (0): LSTMCell(1000, 500)
        (1): LSTMCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (linear_out

meet early stop condition, prev best loss: 53.285713015726266 current loss: 53.31460524327256


'2a82e28e8fdf56498eba4ee34eee9ac0'

In [17]:
model_id = '2a82e28e8fdf56498eba4ee34eee9ac0'
args = build_translate_args_reverse(model_id)
args['batch_size'] = 32
reverse_runner.translate(args, model_id)

[2019-02-16 15:11:53,568 INFO] Translating shard 0.
[2019-02-16 15:13:34,420 INFO] Translating shard 1.


PRED AVG SCORE: -0.7467, PRED PPL: 2.1100
GOLD AVG SCORE: -10.3315, GOLD PPL: 30683.9296


[2019-02-16 15:15:21,707 INFO] Translating shard 2.


PRED AVG SCORE: -0.7608, PRED PPL: 2.1400
GOLD AVG SCORE: -10.3291, GOLD PPL: 30610.4227
PRED AVG SCORE: -0.7843, PRED PPL: 2.1910
GOLD AVG SCORE: -10.2622, GOLD PPL: 28629.7671


In [18]:
evaluate_human('2a82e28e8fdf56498eba4ee34eee9ac0')

{
    "data": "data/nmt15-reverse",
    "save_model": "/mnt/drive-2t/model/opennmt_2a82e28e8fdf56498eba4ee34eee9ac0",
    "gpu_ranks": 0,
    "tensorboard": null,
    "tensorboard_log_dir": "/mnt/drive-2t/log/opennmt_2a82e28e8fdf56498eba4ee34eee9ac0/log",
    "model_dtype": "fp32",
    "encoder_type": "brnn",
    "decoder_type": "rnn",
    "dropout": 0.3,
    "batch_size": 128,
    "train_steps": 20000,
    "optim": "adam",
    "learning_rate": 0.001,
    "valid_step": 500,
    "valid_batch": 64,
    "early_stop_round": 5,
    "early_stop_threshold": 0.1,
    "report_every": 100
}
38.0 22.36
Now , it &apos;s time to end , and as I mentioned before , of course , I have a lot of different data if you &apos;re interested , but I just had a lot of different data if you care , but I just wanted to show this basic idea about communication communication with the brain with its language , and the potential of the potential of it .

Now I just want to wrap up , and as I was mentioning earlier o

In [16]:
model_args = {
    'model_dtype': 'fp32',
    'encoder_type': 'brnn',
    'decoder_type': 'rnn',
#     'rnn_type': 'GRU',
    'layers': 3,
#     'max_generator_batches': 2,
    'dropout': 0.3,
    'enc_rnn_size': 512,
    'dec_rnn_size': 512,
    'src_word_vec_size': 512,
    'tgt_word_vec_size': 512
}
train_args = {
    'batch_size': 128,
    'train_steps': 20000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 500,
    'valid_batch': 64,
    'early_stop_round': 5,
    'early_stop_threshold': 0.01,
    'report_every': 100
}

total_args = {}
total_args.update(model_args)
total_args.update(train_args)

reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-16 15:25:23,776 INFO]  * src vocab size = 8870
[2019-02-16 15:25:23,779 INFO]  * tgt vocab size = 20071
[2019-02-16 15:25:23,779 INFO] Building model...
[2019-02-16 15:25:27,407 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 512, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(512, 256, num_layers=3, dropout=0.3, bidirectional=True)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 512, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3)
      (layers): ModuleList(
        (0): LSTMCell(1024, 512)
        (1): LSTMCell(512, 512)
        (2): LSTMCell(512, 512)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=512, out_features=51

meet early stop condition, prev best loss: 54.596531154711215 current loss: 54.21470925529172


'6082f2c966d9ea8dbb7afb565d719193'

In [17]:
model_id = '6082f2c966d9ea8dbb7afb565d719193'
args = build_translate_args_reverse(model_id)
args['batch_size'] = 32
reverse_runner.translate(args, model_id)

[2019-02-16 15:52:50,313 INFO] Translating shard 0.
[2019-02-16 15:54:43,488 INFO] Translating shard 1.


PRED AVG SCORE: -0.6527, PRED PPL: 1.9208
GOLD AVG SCORE: -10.9694, GOLD PPL: 58069.3216


[2019-02-16 15:56:40,825 INFO] Translating shard 2.


PRED AVG SCORE: -0.6641, PRED PPL: 1.9427
GOLD AVG SCORE: -10.9644, GOLD PPL: 57782.8891
PRED AVG SCORE: -0.6866, PRED PPL: 1.9870
GOLD AVG SCORE: -10.8774, GOLD PPL: 52963.2560


In [21]:
evaluate_human('6082f2c966d9ea8dbb7afb565d719193')

{
    "data": "data/nmt15-reverse",
    "save_model": "/mnt/drive-2t/model/opennmt_6082f2c966d9ea8dbb7afb565d719193",
    "gpu_ranks": 0,
    "tensorboard": null,
    "tensorboard_log_dir": "/mnt/drive-2t/log/opennmt_6082f2c966d9ea8dbb7afb565d719193/log",
    "model_dtype": "fp32",
    "encoder_type": "brnn",
    "decoder_type": "rnn",
    "layers": 3,
    "dropout": 0.3,
    "enc_rnn_size": 512,
    "dec_rnn_size": 512,
    "src_word_vec_size": 512,
    "tgt_word_vec_size": 512,
    "batch_size": 128,
    "train_steps": 20000,
    "optim": "adam",
    "learning_rate": 0.001,
    "valid_step": 500,
    "valid_batch": 64,
    "early_stop_round": 5,
    "early_stop_threshold": 0.01,
    "report_every": 100
}
37.86 22.47
Now I ended up , and as I mentioned earlier before , of course I have a lot of data if you care , but I just want to give you this basic idea of communication to the brain with its language , and the potential power of the possibility .

Now I just want to wrap up , and a

### Change early stop criterion to loss from acc.

In [None]:
model_args = {
    'model_dtype': 'fp32',
    'encoder_type': 'brnn',
    'decoder_type': 'rnn',
#     'rnn_type': 'GRU',
    'layers': 3,
#     'max_generator_batches': 2,
    'dropout': 0.3,
    'enc_rnn_size': 512,
    'dec_rnn_size': 512,
    'src_word_vec_size': 512,
    'tgt_word_vec_size': 512
}
train_args = {
    'batch_size': 128,
    'train_steps': 100000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 500,
    'valid_batch': 64,
    'early_stop_round': 10,
    'early_stop_threshold': 0,
    'report_every': 100
}

total_args = {}
total_args.update(model_args)
total_args.update(train_args)

reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-17 14:32:11,800 INFO]  * src vocab size = 8870
[2019-02-17 14:32:11,803 INFO]  * tgt vocab size = 20071
[2019-02-17 14:32:11,803 INFO] Building model...
[2019-02-17 14:32:15,534 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 512, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(512, 256, num_layers=3, dropout=0.3, bidirectional=True)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 512, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3)
      (layers): ModuleList(
        (0): LSTMCell(1024, 512)
        (1): LSTMCell(512, 512)
        (2): LSTMCell(512, 512)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=512, out_features=51

0 Testing GPU memory capability with batch shape: torch.Size([200, 128, 1]) torch.Size([190, 128, 1])


[2019-02-17 14:33:19,592 INFO] Step 100/100000; loss: 4036334.778355 acc:   8.13; ppl: 785.17; xent: 6.67; lr: 0.00100; 11879/9473 tok/s;     64 sec
[2019-02-17 14:34:06,330 INFO] Step 200/100000; loss: 2039835.030886 acc:  19.86; ppl: 219.52; xent: 5.39; lr: 0.00100; 9789/8096 tok/s;    111 sec
[2019-02-17 14:34:49,150 INFO] Step 300/100000; loss: 1385664.103316 acc:  28.66; ppl: 110.14; xent: 4.70; lr: 0.00100; 8058/6883 tok/s;    153 sec
[2019-02-17 14:35:28,084 INFO] Step 400/100000; loss: 991060.433813 acc:  34.69; ppl: 67.62; xent: 4.21; lr: 0.00100; 6953/6041 tok/s;    192 sec


In [17]:
model_id = '091ebe12a299ced931600c799f0937b5'
args = build_translate_args_reverse(model_id)
args['batch_size'] = 32
reverse_runner.translate(args, model_id)

[2019-02-17 11:47:52,429 INFO] Translating shard 0.
[2019-02-17 11:49:32,224 INFO] Translating shard 1.


PRED AVG SCORE: -0.5925, PRED PPL: 1.8085
GOLD AVG SCORE: -11.1411, GOLD PPL: 68947.5836


[2019-02-17 11:51:14,967 INFO] Translating shard 2.


PRED AVG SCORE: -0.6029, PRED PPL: 1.8274
GOLD AVG SCORE: -11.1182, GOLD PPL: 67386.9716
PRED AVG SCORE: -0.6267, PRED PPL: 1.8714
GOLD AVG SCORE: -11.0101, GOLD PPL: 60480.0534


In [18]:
model_id = '091ebe12a299ced931600c799f0937b5'
evaluate_human(model_id)

{
    "data": "data/nmt15-reverse",
    "save_model": "/mnt/drive-2t/model/opennmt_091ebe12a299ced931600c799f0937b5",
    "gpu_ranks": 0,
    "tensorboard": null,
    "tensorboard_log_dir": "/mnt/drive-2t/log/opennmt_091ebe12a299ced931600c799f0937b5/log",
    "model_dtype": "fp32",
    "encoder_type": "brnn",
    "decoder_type": "rnn",
    "layers": 3,
    "dropout": 0.3,
    "enc_rnn_size": 512,
    "dec_rnn_size": 512,
    "src_word_vec_size": 512,
    "tgt_word_vec_size": 512,
    "batch_size": 64,
    "train_steps": 100000,
    "optim": "adam",
    "learning_rate": 0.001,
    "valid_step": 500,
    "valid_batch": 32,
    "early_stop_round": 5,
    "early_stop_threshold": 0.01,
    "report_every": 100
}
37.51 22.73
Now , it &apos;s time that I end up , and as I mentioned earlier , of course , I have a lot of data if you care , but I just want to show this fundamental idea of communication to the brain in its language , and the potential power of that capacity .

Now I just want to w

In [16]:
model_args = {
    'model_dtype': 'fp32',
    'encoder_type': 'rnn',
    'decoder_type': 'rnn',
#     'rnn_type': 'GRU',
#     'layers': 3,
#     'max_generator_batches': 2,
    'dropout': 0.3,
#     'enc_rnn_size': 512,
#     'dec_rnn_size': 512,
#     'src_word_vec_size': 512,
#     'tgt_word_vec_size': 512
}
train_args = {
    'batch_size': 64,
    'train_steps': 100000,
    'optim': 'adam',
    'learning_rate': 0.001,
    'valid_step': 500,
    'valid_batch': 32,
    'early_stop_round': 5,
    'early_stop_threshold': 0.1,
    'report_every': 100
}

total_args = {}
total_args.update(model_args)
total_args.update(train_args)

reverse_runner.run(total_args)
reverse_runner.model_id

[2019-02-17 00:21:24,263 INFO]  * src vocab size = 8870
[2019-02-17 00:21:24,264 INFO]  * tgt vocab size = 20071
[2019-02-17 00:21:24,266 INFO] Building model...
[2019-02-17 00:21:28,526 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(8870, 500, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(500, 500, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(20071, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3)
      (layers): ModuleList(
        (0): LSTMCell(1000, 500)
        (1): LSTMCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (linear_out): Linear(in_feature

last accent valid acc: -inf


[2019-02-17 00:23:47,013 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654
[2019-02-17 00:24:08,988 INFO] Validation perplexity: 103.056
[2019-02-17 00:24:08,989 INFO] Validation accuracy: 28.6024
[2019-02-17 00:24:38,637 INFO] Step 600/100000; loss: 630023.420390 acc:  29.17; ppl: 107.71; xent: 4.68; lr: 0.00100; 3046/2589 tok/s;    190 sec
[2019-02-17 00:25:08,394 INFO] Step 700/100000; loss: 602951.637411 acc:  31.57; ppl: 88.65; xent: 4.48; lr: 0.00100; 5284/4518 tok/s;    220 sec
[2019-02-17 00:25:38,678 INFO] Step 800/100000; loss: 591457.869202 acc:  33.23; ppl: 77.78; xent: 4.35; lr: 0.00100; 5279/4486 tok/s;    250 sec
[2019-02-17 00:26:09,433 INFO] Step 900/100000; loss: 565543.698792 acc:  35.44; ppl: 64.40; xent: 4.17; lr: 0.00100; 5207/4415 tok/s;    281 sec
[2019-02-17 00:26:40,538 INFO] Step 1000/100000; loss: 542394.963114 acc:  36.82; ppl: 56.35; xent: 4.03; lr: 0.00100; 5059/4325 tok/s;    312 sec
[2019-02-17 00:26:40,646 INFO] Loadin

last accent valid acc: -1042816.4230957031


[2019-02-17 00:27:03,315 INFO] Validation perplexity: 45.7972
[2019-02-17 00:27:03,315 INFO] Validation accuracy: 38.9023
[2019-02-17 00:27:34,912 INFO] Step 1100/100000; loss: 535783.966767 acc:  37.60; ppl: 52.39; xent: 3.96; lr: 0.00100; 2914/2489 tok/s;    366 sec
[2019-02-17 00:28:06,741 INFO] Step 1200/100000; loss: 520141.352391 acc:  39.06; ppl: 46.32; xent: 3.84; lr: 0.00100; 5013/4261 tok/s;    398 sec
[2019-02-17 00:28:38,868 INFO] Step 1300/100000; loss: 502548.408494 acc:  40.11; ppl: 41.99; xent: 3.74; lr: 0.00100; 4902/4185 tok/s;    430 sec
[2019-02-17 00:29:11,669 INFO] Step 1400/100000; loss: 501804.743335 acc:  40.71; ppl: 39.98; xent: 3.69; lr: 0.00100; 4883/4148 tok/s;    463 sec
[2019-02-17 00:29:45,750 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-17 00:29:46,645 INFO] Step 1500/100000; loss: 482832.172116 acc:  41.61; ppl: 36.18; xent: 3.59; lr: 0.00100; 4517/3847 tok/s;    498 sec
[2019-02-17 00:29:46,749 INFO] Loa

last accent valid acc: -860350.5795898438


[2019-02-17 00:30:09,541 INFO] Validation perplexity: 29.6313
[2019-02-17 00:30:09,542 INFO] Validation accuracy: 43.9113
[2019-02-17 00:30:42,452 INFO] Step 1600/100000; loss: 465076.911014 acc:  42.50; ppl: 33.32; xent: 3.51; lr: 0.00100; 2781/2377 tok/s;    554 sec
[2019-02-17 00:31:15,285 INFO] Step 1700/100000; loss: 464986.962854 acc:  42.82; ppl: 31.78; xent: 3.46; lr: 0.00100; 4811/4095 tok/s;    587 sec
[2019-02-17 00:31:48,501 INFO] Step 1800/100000; loss: 460733.670228 acc:  43.17; ppl: 30.54; xent: 3.42; lr: 0.00100; 4747/4057 tok/s;    620 sec
[2019-02-17 00:32:21,704 INFO] Step 1900/100000; loss: 451177.222448 acc:  43.84; ppl: 28.60; xent: 3.35; lr: 0.00100; 4762/4052 tok/s;    653 sec
[2019-02-17 00:32:54,753 INFO] Step 2000/100000; loss: 444609.743636 acc:  44.78; ppl: 26.80; xent: 3.29; lr: 0.00100; 4821/4091 tok/s;    686 sec
[2019-02-17 00:32:54,857 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -762398.6516113281


[2019-02-17 00:33:17,766 INFO] Validation perplexity: 22.7443
[2019-02-17 00:33:17,767 INFO] Validation accuracy: 46.3507
[2019-02-17 00:33:50,921 INFO] Step 2100/100000; loss: 436440.474475 acc:  45.03; ppl: 25.55; xent: 3.24; lr: 0.00100; 2824/2398 tok/s;    742 sec
[2019-02-17 00:34:23,520 INFO] Step 2200/100000; loss: 421839.985645 acc:  45.35; ppl: 24.35; xent: 3.19; lr: 0.00100; 4743/4053 tok/s;    775 sec
[2019-02-17 00:34:56,999 INFO] Step 2300/100000; loss: 445731.467810 acc:  45.40; ppl: 24.18; xent: 3.19; lr: 0.00100; 4921/4180 tok/s;    808 sec
[2019-02-17 00:35:29,979 INFO] Step 2400/100000; loss: 406463.014549 acc:  46.82; ppl: 21.31; xent: 3.06; lr: 0.00100; 4747/4029 tok/s;    841 sec
[2019-02-17 00:36:03,009 INFO] Step 2500/100000; loss: 409024.372836 acc:  46.75; ppl: 21.04; xent: 3.05; lr: 0.00100; 4757/4065 tok/s;    874 sec
[2019-02-17 00:36:03,117 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -702889.7547607422


[2019-02-17 00:36:26,093 INFO] Validation perplexity: 19.6206
[2019-02-17 00:36:26,094 INFO] Validation accuracy: 48.1447
[2019-02-17 00:36:59,606 INFO] Step 2600/100000; loss: 419114.404120 acc:  46.64; ppl: 21.01; xent: 3.04; lr: 0.00100; 2850/2432 tok/s;    931 sec
[2019-02-17 00:37:32,438 INFO] Step 2700/100000; loss: 393030.335198 acc:  47.93; ppl: 19.27; xent: 2.96; lr: 0.00100; 4749/4046 tok/s;    964 sec
[2019-02-17 00:38:05,350 INFO] Step 2800/100000; loss: 398077.013215 acc:  47.94; ppl: 18.92; xent: 2.94; lr: 0.00100; 4808/4114 tok/s;    997 sec
[2019-02-17 00:38:38,580 INFO] Step 2900/100000; loss: 396421.135048 acc:  48.18; ppl: 18.70; xent: 2.93; lr: 0.00100; 4804/4074 tok/s;   1030 sec
[2019-02-17 00:39:12,310 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-17 00:39:13,893 INFO] Step 3000/100000; loss: 391482.034637 acc:  48.54; ppl: 17.83; xent: 2.88; lr: 0.00100; 4514/3848 tok/s;   1065 sec
[2019-02-17 00:39:13,995 INFO] Loa

last accent valid acc: -669653.4422607422


[2019-02-17 00:39:36,689 INFO] Validation perplexity: 17.2655
[2019-02-17 00:39:36,690 INFO] Validation accuracy: 49.5066
[2019-02-17 00:40:09,931 INFO] Step 3100/100000; loss: 375768.247467 acc:  49.15; ppl: 17.00; xent: 2.83; lr: 0.00100; 2777/2367 tok/s;   1121 sec
[2019-02-17 00:40:42,986 INFO] Step 3200/100000; loss: 375364.015007 acc:  49.08; ppl: 16.86; xent: 2.83; lr: 0.00100; 4718/4020 tok/s;   1154 sec
[2019-02-17 00:41:16,263 INFO] Step 3300/100000; loss: 378911.260649 acc:  49.14; ppl: 16.68; xent: 2.81; lr: 0.00100; 4739/4046 tok/s;   1188 sec
[2019-02-17 00:41:49,655 INFO] Step 3400/100000; loss: 376604.460515 acc:  49.37; ppl: 16.20; xent: 2.79; lr: 0.00100; 4755/4049 tok/s;   1221 sec
[2019-02-17 00:42:22,640 INFO] Step 3500/100000; loss: 375680.314608 acc:  50.05; ppl: 15.66; xent: 2.75; lr: 0.00100; 4878/4140 tok/s;   1254 sec


last accent valid acc: -640885.4837036133


[2019-02-17 00:42:22,858 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654
[2019-02-17 00:42:45,820 INFO] Validation perplexity: 15.8476
[2019-02-17 00:42:45,821 INFO] Validation accuracy: 50.3151
[2019-02-17 00:43:19,215 INFO] Step 3600/100000; loss: 365947.096657 acc:  50.05; ppl: 15.32; xent: 2.73; lr: 0.00100; 2795/2370 tok/s;   1310 sec
[2019-02-17 00:43:51,809 INFO] Step 3700/100000; loss: 353436.281603 acc:  50.30; ppl: 14.89; xent: 2.70; lr: 0.00100; 4683/4016 tok/s;   1343 sec
[2019-02-17 00:44:25,255 INFO] Step 3800/100000; loss: 387970.563111 acc:  49.90; ppl: 15.34; xent: 2.73; lr: 0.00100; 5020/4248 tok/s;   1377 sec
[2019-02-17 00:44:58,098 INFO] Step 3900/100000; loss: 345625.314829 acc:  51.33; ppl: 13.60; xent: 2.61; lr: 0.00100; 4744/4033 tok/s;   1409 sec
[2019-02-17 00:45:31,001 INFO] Step 4000/100000; loss: 347168.978542 acc:  51.25; ppl: 13.63; xent: 2.61; lr: 0.00100; 4718/4039 tok/s;   1442 sec
[2019-02-17 00:45:31,103 INFO] Loa

last accent valid acc: -621606.5955810547


[2019-02-17 00:45:54,066 INFO] Validation perplexity: 15.208
[2019-02-17 00:45:54,067 INFO] Validation accuracy: 50.8299
[2019-02-17 00:46:27,202 INFO] Step 4100/100000; loss: 362597.603580 acc:  50.80; ppl: 13.97; xent: 2.64; lr: 0.00100; 2869/2447 tok/s;   1498 sec
[2019-02-17 00:47:00,427 INFO] Step 4200/100000; loss: 349466.582035 acc:  51.85; ppl: 13.11; xent: 2.57; lr: 0.00100; 4803/4087 tok/s;   1532 sec
[2019-02-17 00:47:33,568 INFO] Step 4300/100000; loss: 345397.428776 acc:  51.76; ppl: 12.93; xent: 2.56; lr: 0.00100; 4762/4072 tok/s;   1565 sec
[2019-02-17 00:48:07,035 INFO] Step 4400/100000; loss: 341799.550614 acc:  52.07; ppl: 12.82; xent: 2.55; lr: 0.00100; 4722/4004 tok/s;   1598 sec
[2019-02-17 00:48:39,876 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-17 00:48:42,212 INFO] Step 4500/100000; loss: 342140.221690 acc:  52.11; ppl: 12.62; xent: 2.54; lr: 0.00100; 4500/3836 tok/s;   1633 sec
[2019-02-17 00:48:42,312 INFO] Load

last accent valid acc: -612339.8005981445


[2019-02-17 00:49:04,894 INFO] Validation perplexity: 14.131
[2019-02-17 00:49:04,895 INFO] Validation accuracy: 51.5882
[2019-02-17 00:49:37,834 INFO] Step 4600/100000; loss: 333581.438071 acc:  52.40; ppl: 12.28; xent: 2.51; lr: 0.00100; 2806/2392 tok/s;   1689 sec
[2019-02-17 00:50:10,543 INFO] Step 4700/100000; loss: 327371.396634 acc:  52.61; ppl: 12.12; xent: 2.49; lr: 0.00100; 4696/4012 tok/s;   1722 sec
[2019-02-17 00:50:43,957 INFO] Step 4800/100000; loss: 338522.389313 acc:  52.44; ppl: 12.24; xent: 2.50; lr: 0.00100; 4738/4045 tok/s;   1755 sec
[2019-02-17 00:51:17,382 INFO] Step 4900/100000; loss: 345938.031763 acc:  52.23; ppl: 12.13; xent: 2.50; lr: 0.00100; 4889/4147 tok/s;   1789 sec
[2019-02-17 00:51:50,287 INFO] Step 5000/100000; loss: 328542.505356 acc:  53.26; ppl: 11.48; xent: 2.44; lr: 0.00100; 4809/4091 tok/s;   1822 sec
[2019-02-17 00:51:50,390 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -595814.4460449219


[2019-02-17 00:52:13,359 INFO] Validation perplexity: 13.6505
[2019-02-17 00:52:13,360 INFO] Validation accuracy: 52.0994
[2019-02-17 00:52:13,360 INFO] Saving checkpoint /mnt/drive-2t/model/opennmt_153a72b0efdd37e8921bffa8bd6cc110_step_5000.pt
[2019-02-17 00:52:49,001 INFO] Step 5100/100000; loss: 327113.239679 acc:  53.16; ppl: 11.58; xent: 2.45; lr: 0.00100; 2684/2275 tok/s;   1880 sec
[2019-02-17 00:53:21,465 INFO] Step 5200/100000; loss: 318204.220895 acc:  53.35; ppl: 11.26; xent: 2.42; lr: 0.00100; 4706/4049 tok/s;   1913 sec
[2019-02-17 00:53:54,926 INFO] Step 5300/100000; loss: 345108.844725 acc:  53.06; ppl: 11.58; xent: 2.45; lr: 0.00100; 4993/4211 tok/s;   1946 sec
[2019-02-17 00:54:27,990 INFO] Step 5400/100000; loss: 314860.199137 acc:  54.16; ppl: 10.54; xent: 2.36; lr: 0.00100; 4757/4044 tok/s;   1979 sec
[2019-02-17 00:55:01,090 INFO] Step 5500/100000; loss: 315849.714545 acc:  53.84; ppl: 10.71; xent: 2.37; lr: 0.00100; 4693/4025 tok/s;   2012 sec
[2019-02-17 00:55:01

last accent valid acc: -588030.954284668


[2019-02-17 00:55:24,158 INFO] Validation perplexity: 13.4929
[2019-02-17 00:55:24,159 INFO] Validation accuracy: 52.2745
[2019-02-17 00:55:57,309 INFO] Step 5600/100000; loss: 325547.151342 acc:  53.73; ppl: 10.91; xent: 2.39; lr: 0.00100; 2842/2423 tok/s;   2069 sec
[2019-02-17 00:56:30,633 INFO] Step 5700/100000; loss: 321352.835857 acc:  54.41; ppl: 10.40; xent: 2.34; lr: 0.00100; 4848/4118 tok/s;   2102 sec
[2019-02-17 00:57:03,612 INFO] Step 5800/100000; loss: 320245.878460 acc:  54.24; ppl: 10.38; xent: 2.34; lr: 0.00100; 4862/4150 tok/s;   2135 sec
[2019-02-17 00:57:36,718 INFO] Step 5900/100000; loss: 301996.138234 acc:  54.89; ppl: 10.02; xent: 2.30; lr: 0.00100; 4656/3959 tok/s;   2168 sec
[2019-02-17 00:58:09,216 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-17 00:58:12,168 INFO] Step 6000/100000; loss: 314026.741962 acc:  54.50; ppl: 10.14; xent: 2.32; lr: 0.00100; 4482/3824 tok/s;   2203 sec
[2019-02-17 00:58:12,265 INFO] Loa

last accent valid acc: -585419.2310180664


[2019-02-17 00:58:34,962 INFO] Validation perplexity: 12.9807
[2019-02-17 00:58:34,963 INFO] Validation accuracy: 52.5647
[2019-02-17 00:59:07,671 INFO] Step 6100/100000; loss: 304596.544668 acc:  54.85; ppl:  9.95; xent: 2.30; lr: 0.00100; 2802/2389 tok/s;   2259 sec
[2019-02-17 00:59:40,413 INFO] Step 6200/100000; loss: 297574.771949 acc:  55.06; ppl:  9.81; xent: 2.28; lr: 0.00100; 4654/3980 tok/s;   2292 sec
[2019-02-17 01:00:13,625 INFO] Step 6300/100000; loss: 309120.098794 acc:  54.74; ppl:  9.93; xent: 2.30; lr: 0.00100; 4759/4055 tok/s;   2325 sec
[2019-02-17 01:00:47,117 INFO] Step 6400/100000; loss: 321156.282347 acc:  54.63; ppl:  9.94; xent: 2.30; lr: 0.00100; 4925/4176 tok/s;   2358 sec
[2019-02-17 01:01:20,278 INFO] Step 6500/100000; loss: 306497.550754 acc:  55.31; ppl:  9.57; xent: 2.26; lr: 0.00100; 4806/4093 tok/s;   2392 sec
[2019-02-17 01:01:20,380 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -576712.4891967773


[2019-02-17 01:01:43,264 INFO] Validation perplexity: 12.7693
[2019-02-17 01:01:43,265 INFO] Validation accuracy: 52.8901
[2019-02-17 01:02:16,117 INFO] Step 6600/100000; loss: 296164.481762 acc:  55.37; ppl:  9.50; xent: 2.25; lr: 0.00100; 2783/2356 tok/s;   2447 sec
[2019-02-17 01:02:48,662 INFO] Step 6700/100000; loss: 293655.583576 acc:  55.61; ppl:  9.29; xent: 2.23; lr: 0.00100; 4686/4047 tok/s;   2480 sec
[2019-02-17 01:03:22,634 INFO] Step 6800/100000; loss: 331008.482047 acc:  54.93; ppl:  9.80; xent: 2.28; lr: 0.00100; 5088/4268 tok/s;   2514 sec
[2019-02-17 01:03:55,289 INFO] Step 6900/100000; loss: 279343.524548 acc:  56.71; ppl:  8.56; xent: 2.15; lr: 0.00100; 4673/3985 tok/s;   2547 sec
[2019-02-17 01:04:28,188 INFO] Step 7000/100000; loss: 289996.461967 acc:  56.03; ppl:  8.92; xent: 2.19; lr: 0.00100; 4691/4029 tok/s;   2579 sec
[2019-02-17 01:04:28,291 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -573019.4006347656


[2019-02-17 01:04:51,102 INFO] Validation perplexity: 12.673
[2019-02-17 01:04:51,103 INFO] Validation accuracy: 53.0826
[2019-02-17 01:05:24,516 INFO] Step 7100/100000; loss: 307436.566276 acc:  55.69; ppl:  9.30; xent: 2.23; lr: 0.00100; 2874/2448 tok/s;   2636 sec
[2019-02-17 01:05:58,101 INFO] Step 7200/100000; loss: 307484.128592 acc:  56.02; ppl:  9.05; xent: 2.20; lr: 0.00100; 4912/4156 tok/s;   2669 sec
[2019-02-17 01:06:30,852 INFO] Step 7300/100000; loss: 286787.657295 acc:  56.57; ppl:  8.57; xent: 2.15; lr: 0.00100; 4752/4075 tok/s;   2702 sec
[2019-02-17 01:07:03,768 INFO] Step 7400/100000; loss: 283663.926278 acc:  56.61; ppl:  8.58; xent: 2.15; lr: 0.00100; 4717/4008 tok/s;   2735 sec
[2019-02-17 01:07:35,564 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-17 01:07:38,990 INFO] Step 7500/100000; loss: 289599.594447 acc:  56.40; ppl:  8.61; xent: 2.15; lr: 0.00100; 4479/3820 tok/s;   2770 sec
[2019-02-17 01:07:39,097 INFO] Load

last accent valid acc: -571315.2623291016


[2019-02-17 01:08:01,765 INFO] Validation perplexity: 12.3772
[2019-02-17 01:08:01,766 INFO] Validation accuracy: 53.2853
[2019-02-17 01:08:34,547 INFO] Step 7600/100000; loss: 282157.102832 acc:  56.64; ppl:  8.48; xent: 2.14; lr: 0.00100; 2781/2376 tok/s;   2826 sec
[2019-02-17 01:09:07,442 INFO] Step 7700/100000; loss: 279370.579670 acc:  56.88; ppl:  8.43; xent: 2.13; lr: 0.00100; 4652/3983 tok/s;   2859 sec
[2019-02-17 01:09:41,048 INFO] Step 7800/100000; loss: 292370.379862 acc:  56.46; ppl:  8.58; xent: 2.15; lr: 0.00100; 4762/4049 tok/s;   2892 sec
[2019-02-17 01:10:14,312 INFO] Step 7900/100000; loss: 296181.659850 acc:  56.54; ppl:  8.48; xent: 2.14; lr: 0.00100; 4915/4165 tok/s;   2926 sec
[2019-02-17 01:10:47,330 INFO] Step 8000/100000; loss: 289491.562200 acc:  56.91; ppl:  8.35; xent: 2.12; lr: 0.00100; 4848/4132 tok/s;   2959 sec
[2019-02-17 01:10:47,433 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -566002.4651489258


[2019-02-17 01:11:10,133 INFO] Validation perplexity: 12.4682
[2019-02-17 01:11:10,134 INFO] Validation accuracy: 53.3511


Save last accent model with acc: -566002.4651489258
Decent round: 1


[2019-02-17 01:11:21,350 INFO] Step 8100/100000; loss: 288987.637007 acc:  56.59; ppl:  8.55; xent: 2.15; lr: 0.00100; 4699/3959 tok/s;   2993 sec
[2019-02-17 01:11:31,153 INFO] Step 8200/100000; loss: 262200.664243 acc:  57.97; ppl:  7.80; xent: 2.05; lr: 0.00100; 14998/13025 tok/s;   3002 sec
[2019-02-17 01:11:42,756 INFO] Step 8300/100000; loss: 312992.622728 acc:  56.45; ppl:  8.59; xent: 2.15; lr: 0.00100; 14946/12544 tok/s;   3014 sec
[2019-02-17 01:11:53,163 INFO] Step 8400/100000; loss: 262507.030419 acc:  58.24; ppl:  7.56; xent: 2.02; lr: 0.00100; 14613/12470 tok/s;   3024 sec
[2019-02-17 01:12:03,951 INFO] Step 8500/100000; loss: 275160.309348 acc:  57.70; ppl:  7.86; xent: 2.06; lr: 0.00100; 14455/12369 tok/s;   3035 sec
[2019-02-17 01:12:04,051 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -566002.4651489258


[2019-02-17 01:12:26,940 INFO] Validation perplexity: 12.3995
[2019-02-17 01:12:26,940 INFO] Validation accuracy: 53.5142


Decent round: 2


[2019-02-17 01:12:37,872 INFO] Step 8600/100000; loss: 286133.668957 acc:  57.21; ppl:  8.15; xent: 2.10; lr: 0.00100; 4713/4022 tok/s;   3069 sec
[2019-02-17 01:12:49,157 INFO] Step 8700/100000; loss: 287386.635457 acc:  57.76; ppl:  7.90; xent: 2.07; lr: 0.00100; 14567/12325 tok/s;   3080 sec
[2019-02-17 01:12:59,988 INFO] Step 8800/100000; loss: 280330.546803 acc:  57.69; ppl:  7.74; xent: 2.05; lr: 0.00100; 14807/12654 tok/s;   3091 sec
[2019-02-17 01:13:10,566 INFO] Step 8900/100000; loss: 261443.608377 acc:  58.40; ppl:  7.48; xent: 2.01; lr: 0.00100; 14418/12280 tok/s;   3102 sec
[2019-02-17 01:13:22,023 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-17 01:13:23,375 INFO] Step 9000/100000; loss: 271088.367757 acc:  58.13; ppl:  7.59; xent: 2.03; lr: 0.00100; 12242/10442 tok/s;   3115 sec
[2019-02-17 01:13:23,480 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -566002.4651489258


[2019-02-17 01:13:46,473 INFO] Validation perplexity: 12.2041
[2019-02-17 01:13:46,473 INFO] Validation accuracy: 53.7275
[2019-02-17 01:14:19,380 INFO] Step 9100/100000; loss: 268392.538209 acc:  58.41; ppl:  7.54; xent: 2.02; lr: 0.00100; 2774/2372 tok/s;   3171 sec
[2019-02-17 01:14:52,308 INFO] Step 9200/100000; loss: 272233.236470 acc:  58.11; ppl:  7.67; xent: 2.04; lr: 0.00100; 4743/4058 tok/s;   3204 sec
[2019-02-17 01:15:25,659 INFO] Step 9300/100000; loss: 273201.406143 acc:  58.10; ppl:  7.50; xent: 2.01; lr: 0.00100; 4786/4067 tok/s;   3237 sec
[2019-02-17 01:15:59,064 INFO] Step 9400/100000; loss: 277196.518365 acc:  58.04; ppl:  7.56; xent: 2.02; lr: 0.00100; 4843/4103 tok/s;   3270 sec
[2019-02-17 01:16:31,962 INFO] Step 9500/100000; loss: 271221.367023 acc:  58.50; ppl:  7.45; xent: 2.01; lr: 0.00100; 4808/4104 tok/s;   3303 sec
[2019-02-17 01:16:32,068 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -562833.7909545898


[2019-02-17 01:16:55,036 INFO] Validation perplexity: 12.2124
[2019-02-17 01:16:55,037 INFO] Validation accuracy: 53.788


Save last accent model with acc: -562833.7909545898
Decent round: 1


[2019-02-17 01:17:06,252 INFO] Step 9600/100000; loss: 275911.765401 acc:  58.16; ppl:  7.66; xent: 2.04; lr: 0.00100; 4692/3952 tok/s;   3338 sec
[2019-02-17 01:17:16,154 INFO] Step 9700/100000; loss: 250403.971301 acc:  59.51; ppl:  6.97; xent: 1.94; lr: 0.00100; 15042/13027 tok/s;   3347 sec
[2019-02-17 01:17:27,547 INFO] Step 9800/100000; loss: 289771.920270 acc:  58.11; ppl:  7.60; xent: 2.03; lr: 0.00100; 14907/12540 tok/s;   3359 sec
[2019-02-17 01:17:38,073 INFO] Step 9900/100000; loss: 251920.432582 acc:  59.51; ppl:  6.82; xent: 1.92; lr: 0.00100; 14609/12466 tok/s;   3369 sec
[2019-02-17 01:17:48,876 INFO] Step 10000/100000; loss: 258893.054366 acc:  59.16; ppl:  7.04; xent: 1.95; lr: 0.00100; 14362/12281 tok/s;   3380 sec
[2019-02-17 01:17:49,017 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -562833.7909545898


[2019-02-17 01:18:11,974 INFO] Validation perplexity: 12.3844
[2019-02-17 01:18:11,975 INFO] Validation accuracy: 53.6671
[2019-02-17 01:18:11,976 INFO] Saving checkpoint /mnt/drive-2t/model/opennmt_153a72b0efdd37e8921bffa8bd6cc110_step_10000.pt


Decent round: 2


[2019-02-17 01:18:25,317 INFO] Step 10100/100000; loss: 270220.985155 acc:  58.73; ppl:  7.31; xent: 1.99; lr: 0.00100; 4354/3728 tok/s;   3417 sec
[2019-02-17 01:18:36,674 INFO] Step 10200/100000; loss: 276800.884822 acc:  58.99; ppl:  7.16; xent: 1.97; lr: 0.00100; 14686/12383 tok/s;   3428 sec
[2019-02-17 01:18:47,530 INFO] Step 10300/100000; loss: 265399.246075 acc:  59.13; ppl:  7.01; xent: 1.95; lr: 0.00100; 14671/12556 tok/s;   3439 sec
[2019-02-17 01:18:58,235 INFO] Step 10400/100000; loss: 255566.289446 acc:  59.47; ppl:  6.87; xent: 1.93; lr: 0.00100; 14545/12386 tok/s;   3450 sec
[2019-02-17 01:19:09,323 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-17 01:19:10,826 INFO] Step 10500/100000; loss: 250030.983415 acc:  59.76; ppl:  6.76; xent: 1.91; lr: 0.00100; 12181/10396 tok/s;   3462 sec
[2019-02-17 01:19:10,970 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -562833.7909545898


[2019-02-17 01:19:33,790 INFO] Validation perplexity: 12.0941
[2019-02-17 01:19:33,791 INFO] Validation accuracy: 54.0071
[2019-02-17 01:20:06,761 INFO] Step 10600/100000; loss: 256697.860239 acc:  59.59; ppl:  6.85; xent: 1.92; lr: 0.00100; 2791/2385 tok/s;   3518 sec
[2019-02-17 01:20:39,604 INFO] Step 10700/100000; loss: 257913.425544 acc:  59.41; ppl:  6.94; xent: 1.94; lr: 0.00100; 4746/4055 tok/s;   3551 sec
[2019-02-17 01:21:13,613 INFO] Step 10800/100000; loss: 271729.811259 acc:  59.18; ppl:  7.01; xent: 1.95; lr: 0.00100; 4838/4104 tok/s;   3585 sec
[2019-02-17 01:21:46,601 INFO] Step 10900/100000; loss: 255714.754768 acc:  59.60; ppl:  6.73; xent: 1.91; lr: 0.00100; 4790/4067 tok/s;   3618 sec
[2019-02-17 01:22:19,575 INFO] Step 11000/100000; loss: 254901.545631 acc:  59.93; ppl:  6.72; xent: 1.90; lr: 0.00100; 4748/4059 tok/s;   3651 sec
[2019-02-17 01:22:19,680 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -560795.9261474609


[2019-02-17 01:22:42,557 INFO] Validation perplexity: 12.1283
[2019-02-17 01:22:42,557 INFO] Validation accuracy: 54.0516


Save last accent model with acc: -560795.9261474609
Decent round: 1


[2019-02-17 01:22:53,792 INFO] Step 11100/100000; loss: 267352.532779 acc:  59.22; ppl:  7.05; xent: 1.95; lr: 0.00100; 4753/4001 tok/s;   3685 sec
[2019-02-17 01:23:03,609 INFO] Step 11200/100000; loss: 234678.988677 acc:  60.88; ppl:  6.31; xent: 1.84; lr: 0.00100; 14992/12975 tok/s;   3695 sec
[2019-02-17 01:23:15,108 INFO] Step 11300/100000; loss: 278737.842412 acc:  59.38; ppl:  6.97; xent: 1.94; lr: 0.00100; 14809/12489 tok/s;   3706 sec
[2019-02-17 01:23:25,566 INFO] Step 11400/100000; loss: 237502.039271 acc:  61.09; ppl:  6.20; xent: 1.82; lr: 0.00100; 14592/12447 tok/s;   3717 sec
[2019-02-17 01:23:36,245 INFO] Step 11500/100000; loss: 248149.248441 acc:  60.45; ppl:  6.45; xent: 1.86; lr: 0.00100; 14595/12471 tok/s;   3728 sec
[2019-02-17 01:23:36,342 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -560795.9261474609


[2019-02-17 01:23:59,215 INFO] Validation perplexity: 12.3651
[2019-02-17 01:23:59,217 INFO] Validation accuracy: 53.9147


Decent round: 2


[2019-02-17 01:24:10,213 INFO] Step 11600/100000; loss: 260379.355594 acc:  59.81; ppl:  6.76; xent: 1.91; lr: 0.00100; 4679/4012 tok/s;   3761 sec
[2019-02-17 01:24:21,666 INFO] Step 11700/100000; loss: 267374.583434 acc:  60.19; ppl:  6.60; xent: 1.89; lr: 0.00100; 14740/12373 tok/s;   3773 sec
[2019-02-17 01:24:32,373 INFO] Step 11800/100000; loss: 252282.734117 acc:  60.48; ppl:  6.41; xent: 1.86; lr: 0.00100; 14805/12687 tok/s;   3784 sec
[2019-02-17 01:24:43,644 INFO] Step 11900/100000; loss: 248375.922256 acc:  60.42; ppl:  6.40; xent: 1.86; lr: 0.00100; 13968/11871 tok/s;   3795 sec
[2019-02-17 01:24:54,130 INFO] Loading dataset from data/nmt15-reverse.train.0.pt, number of examples: 95844
[2019-02-17 01:24:55,846 INFO] Step 12000/100000; loss: 233175.226304 acc:  61.08; ppl:  6.08; xent: 1.81; lr: 0.00100; 12391/10588 tok/s;   3807 sec
[2019-02-17 01:24:55,943 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -560795.9261474609


[2019-02-17 01:25:18,758 INFO] Validation perplexity: 12.2206
[2019-02-17 01:25:18,758 INFO] Validation accuracy: 54.2076


Decent round: 3


[2019-02-17 01:25:29,622 INFO] Step 12100/100000; loss: 249227.554929 acc:  60.80; ppl:  6.34; xent: 1.85; lr: 0.00100; 4666/3996 tok/s;   3841 sec
[2019-02-17 01:25:40,125 INFO] Step 12200/100000; loss: 240656.699547 acc:  60.68; ppl:  6.33; xent: 1.85; lr: 0.00100; 14523/12420 tok/s;   3851 sec
[2019-02-17 01:25:51,685 INFO] Step 12300/100000; loss: 262568.810959 acc:  60.23; ppl:  6.50; xent: 1.87; lr: 0.00100; 14340/12136 tok/s;   3863 sec
[2019-02-17 01:26:02,543 INFO] Step 12400/100000; loss: 244385.998502 acc:  60.93; ppl:  6.20; xent: 1.82; lr: 0.00100; 14521/12342 tok/s;   3874 sec
[2019-02-17 01:26:13,489 INFO] Step 12500/100000; loss: 248908.065883 acc:  60.99; ppl:  6.27; xent: 1.84; lr: 0.00100; 14507/12383 tok/s;   3885 sec
[2019-02-17 01:26:13,585 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -560795.9261474609


[2019-02-17 01:26:36,559 INFO] Validation perplexity: 12.1092
[2019-02-17 01:26:36,560 INFO] Validation accuracy: 54.3783


Decent round: 4


[2019-02-17 01:26:47,803 INFO] Step 12600/100000; loss: 252385.248033 acc:  60.57; ppl:  6.46; xent: 1.87; lr: 0.00100; 4683/3944 tok/s;   3919 sec
[2019-02-17 01:26:58,081 INFO] Step 12700/100000; loss: 237192.034684 acc:  61.37; ppl:  6.09; xent: 1.81; lr: 0.00100; 14829/12778 tok/s;   3929 sec
[2019-02-17 01:27:09,279 INFO] Step 12800/100000; loss: 257974.215482 acc:  60.85; ppl:  6.23; xent: 1.83; lr: 0.00100; 14897/12595 tok/s;   3941 sec
[2019-02-17 01:27:19,766 INFO] Step 12900/100000; loss: 224642.598003 acc:  62.25; ppl:  5.74; xent: 1.75; lr: 0.00100; 14339/12264 tok/s;   3951 sec
[2019-02-17 01:27:30,481 INFO] Step 13000/100000; loss: 236886.305157 acc:  61.52; ppl:  5.98; xent: 1.79; lr: 0.00100; 14452/12361 tok/s;   3962 sec
[2019-02-17 01:27:30,578 INFO] Loading dataset from data/nmt15-reverse.valid.0.pt, number of examples: 10654


last accent valid acc: -560795.9261474609


[2019-02-17 01:27:53,652 INFO] Validation perplexity: 12.3892
[2019-02-17 01:27:53,653 INFO] Validation accuracy: 54.1907
[2019-02-17 01:27:53,654 INFO] Saving checkpoint /mnt/drive-2t/model/opennmt_153a72b0efdd37e8921bffa8bd6cc110_step_11000.pt


Decent round: 5
meet early stop condition, prev best loss: -560795.9261474609 current loss: -566220.6053466797


'153a72b0efdd37e8921bffa8bd6cc110'