In [1]:
import sys
import time
import tensorflow as tf
import numpy as np
sys.path.insert(0, '../')
%load_ext autoreload
%autoreload 2

In [2]:
from seqmodel.bunch import Bunch
from seqmodel.experiment.policy_agent import ActorCriticAgent
from seqmodel import model
from seqmodel import data

In [3]:
vocab = data.Vocabulary.from_vocab_file('../data/tiny_copy/vocab.txt')
valid_iter = data.Seq2SeqIterator(vocab, vocab)
valid_iter.initialize('../data/tiny_copy/valid.txt')
train_iter = data.Seq2SeqIterator(vocab, vocab)
train_iter.initialize('../data/tiny_copy/train.txt')

In [4]:
tf.reset_default_graph()
agent_opt = ActorCriticAgent.default_opt()
emb_opt = agent_opt.policy_model.model_opt.embedding
dec_opt = agent_opt.policy_model.model_opt.decoder
enc_opt = agent_opt.policy_model.model_opt.encoder
optim_opt = agent_opt.optim

agent_opt.discount_factor = 0.7

emb_opt.decoder_dim = 32
emb_opt.encoder_dim = 32

dec_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32
enc_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32

optim_opt.learning_rate = 0.3
optim_opt.name = 'GradientDescentOptimizer'
optim_opt.max_epochs = 5

emb_opt = agent_opt.value_model.model_opt.embedding
dec_opt = agent_opt.value_model.model_opt.decoder
enc_opt = agent_opt.value_model.model_opt.encoder

emb_opt.decoder_dim = 32
emb_opt.encoder_dim = 32

dec_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32
enc_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32

sess_config = tf.ConfigProto(device_count={'GPU': 0})
sess = tf.Session(config = sess_config)

agent = ActorCriticAgent(agent_opt, sess)
agent.initialize_model(with_training=True)
agent.initialize_optim()
for v in tf.trainable_variables():
    print('{}, {}'.format(v.name, v.get_shape()))
sess.run(tf.global_variables_initializer())

policy_agent/policy/model/encoder_embedding:0, (15, 32)
policy_agent/policy/model/decoder_embedding:0, (15, 32)
policy_agent/policy/model/encoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/policy/model/encoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/policy/model/decoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/policy/model/decoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/policy/model/decoder_rnn/logit_w:0, (15, 32)
policy_agent/policy/model/decoder_rnn/logit_b:0, (15,)
policy_agent/value/model/encoder_embedding:0, (15, 32)
policy_agent/value/model/decoder_embedding:0, (15, 32)
policy_agent/value/model/encoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/value/model/encoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/value/model/decoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/value/model/decoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/value/model/decoder_rnn/regression_w:0, 

In [5]:
agent.train(train_iter, 20, valid_iter, 20, verbose=True)

[36m[INFO ][0mep: 0, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 8.47738, eval_loss: 1.28891 (3.62883), wps: 15076.5
[36m[INFO ][0mvalid: @49 eval_loss: 0.51180 (1.66830), wps: 32778.7
[36m[INFO ][0mep: 1, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 1.98665, eval_loss: 0.29930 (1.34891), wps: 15189.4
[36m[INFO ][0mvalid: @49 eval_loss: 0.17578 (1.19218), wps: 37301.0
[36m[INFO ][0mep: 2, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 0.99541, eval_loss: 0.15021 (1.16208), wps: 15362.9
[36m[INFO ][0mvalid: @49 eval_loss: 0.13434 (1.14378), wps: 37125.1
[36m[INFO ][0mep: 3, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 0.68252, eval_loss: 0.10282 (1.10830), wps: 15279.1
[36m[INFO ][0mvalid: @49 eval_loss: 0.08431 (1.08796), wps: 29797.8
[36m[INFO ][0mep: 4, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 0.51694, eval_loss: 0.07806 (1.08119), wps: 15338.3
[36m[INFO ][0mvalid: @49 eval_loss: 0.08713 (1.09104), wps: 37472.1


<seqmodel.experiment.run_info.TrainingState at 0x7fe3212bc910>

In [7]:
info = agent.evaluate(valid_iter, 20)
print("PPL: {}".format(
    info.eval_cost/info.num_tokens))

valid_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
valid_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_env)
print('Each match: {}'.format(-1 * info.eval_loss))

valid_hard_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.ALL_MATCH)
valid_hard_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_hard_env)
print('Exact match: {}'.format(-1 * info.eval_loss))

valid_bleu_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.SEN_BLEU)
valid_bleu_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_bleu_env)
print('BLEU: {}'.format(-1 * info.eval_loss))

PPL: 0.0871275737847
Each match: 0.958608730159
Exact match: 0.852
BLEU: 0.921656665336


In [8]:
agent.reset_training_state()
train_env = data.env.CopyEnv(train_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
# train_env = data.env.CopyEnv(train_iter, re_init=False, reward_mode=data.env.ToyRewardMode.ALL_MATCH)
info = agent.policy_gradient(train_env, 100, valid_hard_env, 20, max_steps=20)

[36m[INFO ][0mep: 0, lr: 0.300000
[36m[INFO ][0mtrain: @100 tr_loss: -0.11119, base_loss: 0.02524, avg_return: 0.93516, wps: 9942.5
[36m[INFO ][0mvalid: @10 avg_return: 0.92200, wps: 26810.6
[36m[INFO ][0mep: 1, lr: 0.300000
[36m[INFO ][0mtrain: @100 tr_loss: -0.09094, base_loss: 0.00691, avg_return: 0.94783, wps: 10502.8
[36m[INFO ][0mvalid: @10 avg_return: 0.94000, wps: 23781.7
[36m[INFO ][0mep: 2, lr: 0.300000
[36m[INFO ][0mtrain: @100 tr_loss: -0.07769, base_loss: 0.00506, avg_return: 0.95311, wps: 10626.9
[36m[INFO ][0mvalid: @10 avg_return: 0.93900, wps: 27211.6
[36m[INFO ][0mep: 3, lr: 0.300000
[36m[INFO ][0mtrain: @100 tr_loss: -0.07511, base_loss: 0.00444, avg_return: 0.95557, wps: 10365.3
[36m[INFO ][0mvalid: @10 avg_return: 0.94200, wps: 25807.0
[36m[INFO ][0mep: 4, lr: 0.300000
[36m[INFO ][0mtrain: @100 tr_loss: -0.07230, base_loss: 0.00416, avg_return: 0.95655, wps: 10111.3
[36m[INFO ][0mvalid: @10 avg_return: 0.94200, wps: 27234.3


In [9]:
info = agent.evaluate(valid_iter, 20)
print("PPL: {}".format(
    info.eval_cost/info.num_tokens))

valid_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
valid_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_env)
print('Each match: {}'.format(-1 * info.eval_loss))

valid_hard_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.ALL_MATCH)
valid_hard_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_hard_env)
print('Exact match: {}'.format(-1 * info.eval_loss))

valid_bleu_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.SEN_BLEU)
valid_bleu_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_bleu_env)
print('BLEU: {}'.format(-1 * info.eval_loss))

PPL: 0.0422084542905
Each match: 0.984306349206
Exact match: 0.942
BLEU: 0.956616780586


In [None]:
test_data = ([['a a b c a d', 'a a b c a d'], ['a h a j b c d a b', 'a h a j b c d a b']])
test_iter = data.Seq2SeqIterator(vocab, vocab)
test_iter.initialize(test_data)
test_iter.init_batch(2)
env = data.env.CopyEnv(test_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
# env = data.env.Seq2SeqEnv(test_iter, re_init=False)
transitions, states, rewards = agent.rollout(env, greedy=True)
rewards = np.array(rewards)
returns, targets = agent._compute_return(states, rewards)
print('Return: ') 
print(returns.T)
print('Target: ')
print(targets.T)
pg_data = env.create_transition_return(states, returns)
val_data = env.create_transition_value(states, targets)

In [None]:
pg_data.features.encoder_input.T

In [None]:
pg_data.labels.decoder_label.T

In [None]:
pg_data.labels.decoder_label_weight.T

In [None]:
(targets - returns).T

In [None]:
targets.T