In [1]:
import sys
import time
import tensorflow as tf
import numpy as np
sys.path.insert(0, '../')
%load_ext autoreload
%autoreload 2

In [2]:
from seqmodel.bunch import Bunch
from seqmodel.experiment.policy_agent import PolicyAgent
from seqmodel import model
from seqmodel import data

In [3]:
vocab = data.Vocabulary.from_vocab_file('../data/tiny_copy/vocab.txt')
valid_iter = data.Seq2SeqIterator(vocab, vocab)
valid_iter.initialize('../data/tiny_copy/valid.txt')
train_iter = data.Seq2SeqIterator(vocab, vocab)
train_iter.initialize('../data/tiny_copy/train.txt')

In [4]:
tf.reset_default_graph()
agent_opt = PolicyAgent.default_opt()
emb_opt = agent_opt.policy_model.model_opt.embedding
dec_opt = agent_opt.policy_model.model_opt.decoder
enc_opt = agent_opt.policy_model.model_opt.encoder
optim_opt = agent_opt.optim

emb_opt.decoder_dim = 32
emb_opt.encoder_dim = 32

dec_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32
enc_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32

optim_opt.learning_rate = 0.3
optim_opt.name = 'GradientDescentOptimizer'
optim_opt.max_epochs = 5

sess_config = tf.ConfigProto(device_count={'GPU': 0})
sess = tf.Session(config = sess_config)

agent = PolicyAgent(agent_opt, sess)
agent.initialize_model(with_training=True)
agent.initialize_optim()
for v in tf.trainable_variables():
    print('{}, {}'.format(v.name, v.get_shape()))
sess.run(tf.global_variables_initializer())
agent.train(train_iter, 20, valid_iter, 20, verbose=True)
info = agent.evaluate(valid_iter, 20)
print("PPL: {}, time: {}".format(
    info.eval_cost/info.num_tokens, info.end_time - info.start_time))

[36m[INFO ][0mep: 0, lr: 0.300000


policy_agent/policy/model/encoder_embedding:0, (15, 32)
policy_agent/policy/model/decoder_embedding:0, (15, 32)
policy_agent/policy/model/encoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/policy/model/encoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/policy/model/decoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/policy/model/decoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/policy/model/decoder_rnn/logit_w:0, (15, 32)
policy_agent/policy/model/decoder_rnn/logit_b:0, (15,)


[36m[INFO ][0mtrain: @499 tr_loss: 8.50951, eval_loss: 1.29804, wps: 15466.5
[36m[INFO ][0mvalid: @49 tr_loss: 0.00000, eval_loss: 0.40551, wps: 33733.9
[36m[INFO ][0mep: 1, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 1.90082, eval_loss: 0.28631, wps: 15322.5
[36m[INFO ][0mvalid: @49 tr_loss: 0.00000, eval_loss: 0.19064, wps: 38384.1
[36m[INFO ][0mep: 2, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 1.07683, eval_loss: 0.16209, wps: 15187.0
[36m[INFO ][0mvalid: @49 tr_loss: 0.00000, eval_loss: 0.14583, wps: 37978.7
[36m[INFO ][0mep: 3, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 0.73735, eval_loss: 0.11099, wps: 15359.9
[36m[INFO ][0mvalid: @49 tr_loss: 0.00000, eval_loss: 0.10547, wps: 37980.8
[36m[INFO ][0mep: 4, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 0.59668, eval_loss: 0.08988, wps: 15544.2
[36m[INFO ][0mvalid: @49 tr_loss: 0.00000, eval_loss: 0.08302, wps: 38254.2


PPL: 0.0830161876572, time: 0.171453952789


In [5]:
valid_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
valid_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_env)
print(info.eval_loss)

valid_hard_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.ALL_MATCH)
valid_hard_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_hard_env)
print(info.eval_loss)

0.957633730159
0.853


In [6]:
agent.reset_training_state()
train_env = data.env.CopyEnv(train_iter, re_init=False, reward_mode=data.env.ToyRewardMode.ALL_MATCH)
info = agent.policy_gradient(train_env, 20, valid_hard_env, 20)

[36m[INFO ][0mep: 0, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: 0.10995, avg_return: 0.80790, wps: 899.2
[36m[INFO ][0mvalid: @50 tr_loss: 0.00000, avg_return: 0.88600, wps: 1416.0
[36m[INFO ][0mep: 1, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: 0.09764, avg_return: 0.82930, wps: 891.9
[36m[INFO ][0mvalid: @50 tr_loss: 0.00000, avg_return: 0.86300, wps: 1433.0
[36m[INFO ][0mep: 2, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: 0.09650, avg_return: 0.83450, wps: 882.0
[36m[INFO ][0mvalid: @50 tr_loss: 0.00000, avg_return: 0.86200, wps: 1403.6
[36m[INFO ][0mep: 3, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: 0.08349, avg_return: 0.84340, wps: 903.1
[36m[INFO ][0mvalid: @50 tr_loss: 0.00000, avg_return: 0.89700, wps: 1414.7
[36m[INFO ][0mep: 4, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: 0.08454, avg_return: 0.83790, wps: 902.8
[36m[INFO ][0mvalid: @50 tr_loss: 0.00000, avg_return: 0.87700, wps: 1336.9


In [7]:
valid_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
valid_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_env)
print(info.eval_loss)

valid_hard_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.ALL_MATCH)
valid_hard_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_hard_env)
print(info.eval_loss)

0.966419047619
0.877
