In [1]:
import numpy as np
import tensorflow as tf
import json
import os

In [2]:
from seq2seq.rnn_seq2seq import create_seq2seq_model
from seq2seq.input.generator_io import generator_input_fn

In [3]:
from train_parallel_corpora import load_vocab, file_data_generator_py

In [4]:
# Id substituted for tokens that are not in the vocabulary.
unk_id = 2
# Offset handed to load_vocab as `ids_bias`.
# NOTE(review): presumably ids 0..unk_id are reserved and real tokens start at
# vocab_ids_bias -- confirm against load_vocab.
vocab_ids_bias = unk_id + 1
vocab2id, id2vocab = load_vocab("./data/tatoeba_en_ru/vocab.txt", ids_bias=vocab_ids_bias)


def encode(line):
    """Split `line` on single spaces and map every token to its id (`unk_id` if unknown)."""
    return [vocab2id.get(token, unk_id) for token in line.split(" ")]


def decode(line):
    """Turn an iterable of ids back into text; ids missing from `id2vocab` become a space."""
    tokens = [id2vocab.get(token_id, " ") for token_id in line]
    return " ".join(tokens).strip()


vocab_size = len(vocab2id) + vocab_ids_bias

In [5]:
# Peek at the second tab-separated column of the first 10 lines of the test set.
! head -10 ./data/tatoeba_en_ru/test.txt | awk -F '\t' '{print $2}'

том сказал что больше этого делать не будет
tom said he wouldnt do that again
у тома добро@@ е сердце
tom has a good heart
мне не@@ где остановиться
i have nowhere to stay
пожалуйста объяс@@ ни мне правила футбо@@ ла
please explain the rules of soccer to me
он ушёл
he went away


In [6]:
# Validation input pipeline over the test file: batches of one example,
# shuffling disabled, a single epoch.
# NOTE(review): this cell passes pad_value=0 while the later input cell passes
# pad_data=True -- confirm which keyword generator_input_fn actually expects.
val_data_generator = file_data_generator_py(
    "./data/tatoeba_en_ru/test.txt",
    line_encode_fn=encode,
)
val_input_fn = generator_input_fn(
    x=val_data_generator,
    target_key=["targets", "targets_length"],
    batch_size=1,
    shuffle=False,
    num_epochs=1,
    queue_capacity=1024,
    num_threads=1,
    pad_value=0,
)

In [7]:
# Run directory: used as the estimator's model_dir and as the location of hparams.json.
log_dir = "./logs_170620_tatoeba_en_ru"

In [8]:
# Limit this process to half of the GPU memory and use log_dir as the model directory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
session_config = tf.ConfigProto(gpu_options=gpu_options)
run_config = tf.contrib.learn.RunConfig(
    model_dir=log_dir,
    session_config=session_config,
)

In [9]:
# Load the hyper-parameters stored as JSON inside log_dir.
# @TODO: for some reason tf.parse_values does not work :(
with open("{}/hparams.json".format(log_dir)) as hparams_file:
    values = json.load(hparams_file)

# Register every loaded entry on an initially empty HParams object.
hparams = tf.contrib.training.HParams()
for name in values:
    hparams.add_hparam(name, values[name])

In [10]:
# Build the seq2seq model from the run configuration and the loaded hyper-parameters.
model = create_seq2seq_model(config=run_config, hparams=hparams)

INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_num_worker_replicas': 0, '_master': '', '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f246b5ca9e8>, '_tf_random_seed': None, '_num_ps_replicas': 0, '_is_chief': True, '_keep_checkpoint_max': 5, '_task_type': None, '_environment': 'local', '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.5
}
, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_task_id': 0, '_model_dir': './logs_170620_tatoeba_en_ru', '_keep_checkpoint_every_n_hours': 10000, '_save_summary_steps': 100, '_save_checkpoints_secs': 600}


In [11]:
# Returns an iterable of per-example predictions; the checkpoint-restore log only
# shows up once the result is iterated in the next cell.
predictions = model.predict(val_input_fn)

In [12]:
# Print the decoded prediction for the first 10 examples, one separator per example.
separator = "-" * 80
for i, pred in enumerate(predictions):
    if i < 10:
        # Flatten the prediction into a 1-D id sequence before decoding.
        token_ids = pred["prediction"].reshape(-1)
        print(decode(token_ids))
        print(separator)
    else:
        break

INFO:tensorflow:Restoring parameters from ./logs_170620_tatoeba_en_ru/model.ckpt-758201
том сказал что больше не сделает этого
--------------------------------------------------------------------------------
tom said that he would do not to do that
--------------------------------------------------------------------------------
у тома хороший сердце
--------------------------------------------------------------------------------
tom has a heart heart
--------------------------------------------------------------------------------
мне нужно по@@ си@@ ё@@ ваться
--------------------------------------------------------------------------------
i have no stop anywhere
--------------------------------------------------------------------------------
объяс@@ ните правила мне в этой футбо@@
--------------------------------------------------------------------------------
please explain me the rules of the football
--------------------------------------------------------------------------------
о

In [13]:
# Display the loaded hyper-parameters as a dict (inference_mode is 'greedy' here).
hparams.values()

{'attention': 'bahdanau',
 'beam_width': 3,
 'bidirectional': False,
 'cell': 'LSTMCell',
 'cell_num': 1,
 'embedding_size': 128,
 'gradient_clip': 10.0,
 'inference_mode': 'greedy',
 'learning_rate': 0.0001,
 'lr_decay_koef': 0.99,
 'lr_decay_steps': 100000,
 'num_layers': 2,
 'num_units': 128,
 'residual_connections': True,
 'residual_dense': True,
 'scheduled_sampling_probability': 0.2,
 'training_mode': 'scheduled_sampling_embedding',
 'vocab_size': 9775}

In [14]:
# Switch inference from 'greedy' to 'beam'; the model is rebuilt in a later cell
# with these modified hparams.
hparams.inference_mode =  "beam"

In [15]:
# Display the hyper-parameters again to confirm inference_mode is now 'beam'.
hparams.values()

{'attention': 'bahdanau',
 'beam_width': 3,
 'bidirectional': False,
 'cell': 'LSTMCell',
 'cell_num': 1,
 'embedding_size': 128,
 'gradient_clip': 10.0,
 'inference_mode': 'beam',
 'learning_rate': 0.0001,
 'lr_decay_koef': 0.99,
 'lr_decay_steps': 100000,
 'num_layers': 2,
 'num_units': 128,
 'residual_connections': True,
 'residual_dense': True,
 'scheduled_sampling_probability': 0.2,
 'training_mode': 'scheduled_sampling_embedding',
 'vocab_size': 9775}

In [16]:
# Recreate the validation input pipeline for the beam-search run: batches of one
# example, shuffling disabled, a single epoch.
# NOTE(review): presumably needed because the earlier pipeline's generator was
# already consumed -- confirm.
# NOTE(review): this cell passes pad_data=True while the earlier input cell passes
# pad_value=0 -- confirm which keyword generator_input_fn actually expects.
val_data_generator = file_data_generator_py(
    "./data/tatoeba_en_ru/test.txt",
    line_encode_fn=encode,
)
val_input_fn = generator_input_fn(
    x=val_data_generator,
    target_key=["targets", "targets_length"],
    batch_size=1,
    shuffle=False,
    num_epochs=1,
    queue_capacity=1024,
    num_threads=1,
    pad_data=True,
)

In [17]:
# Rebuild the model with the same run configuration so the modified hparams
# (inference_mode='beam') are used.
model = create_seq2seq_model(config=run_config, hparams=hparams)

INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_num_worker_replicas': 0, '_master': '', '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f246b5ca9e8>, '_tf_random_seed': None, '_num_ps_replicas': 0, '_is_chief': True, '_keep_checkpoint_max': 5, '_task_type': None, '_environment': 'local', '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.5
}
, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_task_id': 0, '_model_dir': './logs_170620_tatoeba_en_ru', '_keep_checkpoint_every_n_hours': 10000, '_save_summary_steps': 100, '_save_checkpoints_secs': 600}


In [18]:
# Returns an iterable of per-example predictions from the rebuilt model; it is
# iterated in the next cell.
predictions = model.predict(val_input_fn)

In [19]:
# Print every decoded beam hypothesis for the first 10 examples, one separator per example.
separator = "-" * 80
for i, pred in enumerate(predictions):
    if i < 10:
        # Swap the first two axes so that iterating yields one id sequence per beam.
        # NOTE(review): assumes the prediction is laid out as (time, beam) -- confirm.
        beams = pred["prediction"].swapaxes(0, 1)
        for beam_pred in beams:
            print(decode(beam_pred))
        print(separator)
    else:
        break

INFO:tensorflow:Restoring parameters from ./logs_170620_tatoeba_en_ru/model.ckpt-758201
том сказал что больше этого больше не сделает
том сказал что больше не сделает этого больше
том сказал что больше этого не сделает этого
--------------------------------------------------------------------------------
did said he would be done that
did said he would be done that
did said he would do not do that
--------------------------------------------------------------------------------
у тома добро@@ е сердце
у тома добро@@ е сердце
у него добро@@ серде@@ чный хороший
--------------------------------------------------------------------------------
tom have a heart heart
tom has a heart heart
tom have a heart
--------------------------------------------------------------------------------
мне не нужно остаться
мне мне остаться
мне мне остаться остаться
--------------------------------------------------------------------------------
should no longer to stop
should no longer to stop anywhere
shoul