In [1]:
import sys
import time
import tensorflow as tf
import numpy as np
sys.path.insert(0, '../')
%load_ext autoreload
%autoreload 2

In [2]:
from seqmodel.bunch import Bunch
from seqmodel.experiment.policy_agent import ActorCriticAgent
from seqmodel import model
from seqmodel import data

In [3]:
vocab = data.Vocabulary.from_vocab_file('../data/tiny_copy/vocab.txt')
valid_iter = data.Seq2SeqIterator(vocab, vocab)
valid_iter.initialize('../data/tiny_copy/valid.txt')
train_iter = data.Seq2SeqIterator(vocab, vocab)
train_iter.initialize('../data/tiny_copy/train.txt')

In [4]:
tf.reset_default_graph()
agent_opt = ActorCriticAgent.default_opt()
emb_opt = agent_opt.policy_model.model_opt.embedding
dec_opt = agent_opt.policy_model.model_opt.decoder
enc_opt = agent_opt.policy_model.model_opt.encoder
optim_opt = agent_opt.optim

emb_opt.decoder_dim = 32
emb_opt.encoder_dim = 32

dec_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32
enc_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32

optim_opt.learning_rate = 0.3
optim_opt.name = 'GradientDescentOptimizer'
optim_opt.max_epochs = 5

emb_opt = agent_opt.value_model.model_opt.embedding
dec_opt = agent_opt.value_model.model_opt.decoder
enc_opt = agent_opt.value_model.model_opt.encoder

emb_opt.decoder_dim = 32
emb_opt.encoder_dim = 32

dec_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32
enc_opt.rnn_opt.rnn_cell.cell_opt.num_units = 32

sess_config = tf.ConfigProto(device_count={'GPU': 0})
sess = tf.Session(config = sess_config)

agent = ActorCriticAgent(agent_opt, sess)
agent.initialize_model(with_training=True)
agent.initialize_optim()
for v in tf.trainable_variables():
    print('{}, {}'.format(v.name, v.get_shape()))
sess.run(tf.global_variables_initializer())
agent.train(train_iter, 20, valid_iter, 20, verbose=True)
info = agent.evaluate(valid_iter, 20)
print("PPL: {}, time: {}".format(
    info.eval_cost/info.num_tokens, info.end_time - info.start_time))

[36m[INFO ][0mep: 0, lr: 0.300000


policy_agent/policy/model/encoder_embedding:0, (15, 32)
policy_agent/policy/model/decoder_embedding:0, (15, 32)
policy_agent/policy/model/encoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/policy/model/encoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/policy/model/decoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/policy/model/decoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/policy/model/decoder_rnn/logit_w:0, (15, 32)
policy_agent/policy/model/decoder_rnn/logit_b:0, (15,)
policy_agent/value/model/encoder_embedding:0, (15, 32)
policy_agent/value/model/decoder_embedding:0, (15, 32)
policy_agent/value/model/encoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/value/model/encoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/value/model/decoder_rnn/rnn/basic_lstm_cell/weights:0, (64, 128)
policy_agent/value/model/decoder_rnn/rnn/basic_lstm_cell/biases:0, (128,)
policy_agent/value/model/decoder_rnn/regression_w:0, 

[36m[INFO ][0mtrain: @499 tr_loss: 8.38486, eval_loss: 1.27545, wps: 15508.2
[36m[INFO ][0mvalid: @49 eval_loss: 0.42847, wps: 33578.6
[36m[INFO ][0mep: 1, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 1.81709, eval_loss: 0.27445, wps: 15583.6
[36m[INFO ][0mvalid: @49 eval_loss: 0.20787, wps: 36936.7
[36m[INFO ][0mep: 2, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 1.04972, eval_loss: 0.15826, wps: 15484.7
[36m[INFO ][0mvalid: @49 eval_loss: 0.13676, wps: 35501.7
[36m[INFO ][0mep: 3, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 0.68783, eval_loss: 0.10351, wps: 15293.4
[36m[INFO ][0mvalid: @49 eval_loss: 0.09881, wps: 37862.8
[36m[INFO ][0mep: 4, lr: 0.300000
[36m[INFO ][0mtrain: @499 tr_loss: 0.52774, eval_loss: 0.07911, wps: 15263.5
[36m[INFO ][0mvalid: @49 eval_loss: 0.07291, wps: 36484.4


PPL: 0.0729132937909, time: 0.172223806381


In [5]:
valid_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
valid_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_env)
print(info.eval_loss)

valid_hard_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.ALL_MATCH)
valid_hard_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_hard_env)
print(info.eval_loss)

0.96374484127
0.874


In [16]:
agent.reset_training_state()
train_env = data.env.CopyEnv(train_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
info = agent.policy_gradient(train_env, 20, valid_hard_env, 20)

[36m[INFO ][0mep: 0, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: -0.07616, base_loss: 0.00769, avg_return: 0.95990, wps: 597.1
[36m[INFO ][0mvalid: @50 avg_return: 0.89800, wps: 1376.1
[36m[INFO ][0mep: 1, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: -0.07266, base_loss: 0.00727, avg_return: 0.96191, wps: 604.3
[36m[INFO ][0mvalid: @50 avg_return: 0.89800, wps: 1334.5
[36m[INFO ][0mep: 2, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: -0.07046, base_loss: 0.00691, avg_return: 0.96244, wps: 604.9
[36m[INFO ][0mvalid: @50 avg_return: 0.90800, wps: 1392.0
[36m[INFO ][0mep: 3, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: -0.06713, base_loss: 0.00649, avg_return: 0.96404, wps: 605.5
[36m[INFO ][0mvalid: @50 avg_return: 0.92800, wps: 1384.8
[36m[INFO ][0mep: 4, lr: 0.300000
[36m[INFO ][0mtrain: @500 tr_loss: -0.06964, base_loss: 0.00683, avg_return: 0.96396, wps: 603.1
[36m[INFO ][0mvalid: @50 avg_return: 0.90100, wps: 1370.2


In [17]:
valid_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
valid_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_env)
print(info.eval_loss)

valid_hard_env = data.env.CopyEnv(valid_iter, re_init=False, reward_mode=data.env.ToyRewardMode.ALL_MATCH)
valid_hard_env.restart(batch_size=20)
info = agent.evaluate_policy(valid_hard_env)
print(info.eval_loss)

info = agent.evaluate(valid_iter, 20)
print("PPL: {}, time: {}".format(
    info.eval_cost/info.num_tokens, info.end_time - info.start_time))

0.975562698413
0.901
PPL: 0.054776666954, time: 0.16993188858


In [18]:
test_data = ([['a a b c a d a f a', 'a a b c a d a f a'], ['a b c d', 'a b c d']])
test_iter = data.Seq2SeqIterator(vocab, vocab)
test_iter.initialize(test_data)
test_iter.init_batch(2)
env = data.env.CopyEnv(test_iter, re_init=False, reward_mode=data.env.ToyRewardMode.EACH_MATCH)
transitions, states, rewards = agent.rollout(env, greedy=True)
rewards = np.array(rewards)
returns, rewards, baseline = agent.compute_return(states, rewards)
print('Reward: ') 
print(rewards.T)
print('Baseline: ')
print(baseline.T)
print('Advantage: ')
print(returns.T)
pg_data = env.create_transition_return(states, returns)
val_data = env.create_transition_value(states, rewards)

Reward: 
[[ 0.66886488  0.57461099  0.47940504  0.48424752  0.4891389   0.3930696
   0.2960299   0.19801     0.099       0.1       ]
 [ 0.980199    0.7880798   0.59402     0.398       0.2         0.          0.
   0.          0.          0.        ]]
Baseline: 
[[ 0.79030454  0.69538051  0.61195886  0.53802836  0.44017869  0.35918486
   0.26809672  0.18161324  0.10470365  0.03600962]
 [ 0.96629143  0.76740676  0.56396669  0.36221999  0.19967744  0.0968501
   0.0968501   0.0968501   0.0968501   0.0968501 ]]
Advantage: 
[[-0.12143966 -0.12076952 -0.13255382 -0.05378084  0.04896021  0.03388474
   0.02793318  0.01639676 -0.00570365  0.06399038]
 [ 0.01390757  0.02067304  0.03005331  0.03578001  0.00032256 -0.0968501
  -0.0968501  -0.0968501  -0.0968501  -0.0968501 ]]


In [19]:
pg_data.features.encoder_input.T

array([[ 1,  5,  5,  6,  7,  5,  8,  5, 10,  5,  3],
       [ 1,  5,  6,  7,  8,  3,  3,  3,  3,  3,  3]], dtype=int32)

In [20]:
pg_data.labels.decoder_label.T

array([[ 5,  5,  7,  6,  5,  8,  5, 10, 14,  0],
       [ 5,  6,  7,  8,  0,  0,  0,  0,  0,  0]])