In [1]:
import time

import numpy as np
import tensorflow as tf
import rnndatasets.warandpeace as data

In [2]:
class fRNNCell(tf.nn.rnn_cell.RNNCell):
    """Plain recurrent cell whose initialisers and nonlinearity are configurable.

    One step computes ``nonlin(state * W_hh + inputs * W_xh + bias)`` and
    hands that activation back as both the output and the next state.
    """

    def __init__(self, num_units, input_size=None,
                 Whh_init=tf.random_normal_initializer(stddev=0.01),
                 Wxh_init=tf.random_normal_initializer(stddev=0.01),
                 bias_init=tf.constant_initializer(1),
                 nonlin=tf.nn.tanh):
        """Remember the configuration; variables are only created in __call__.

        Args:
            num_units: number of hidden units in this recurrent layer.
            input_size: width of the input; ``num_units`` is used when this
                is None (or otherwise falsy).
            Whh_init: initialiser for the hidden-to-hidden weights.
            Wxh_init: initialiser for the input-to-hidden weights.
            bias_init: initialiser for the bias vector.
            nonlin: nonlinearity applied to the pre-activation.
        """
        self._num_units = num_units
        self._input_size = input_size if input_size else num_units
        self._Whh_init = Whh_init
        self._Wxh_init = Wxh_init
        self._bias_init = bias_init
        self._nonlin = nonlin

    @property
    def input_size(self):
        """Width of the input this cell expects."""
        return self._input_size

    @property
    def output_size(self):
        """Width of the output, i.e. the number of hidden units."""
        return self._num_units

    @property
    def state_size(self):
        """Width of the recurrent state (same as the output)."""
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        """Advance the cell by one step.

        Args:
            inputs: input for this step.
            state: previous hidden state.
            scope: optional variable scope; the class name is used otherwise.

        Returns:
            A pair ``(output, new_state)``; both are the same tensor.
        """
        hidden_width = self.output_size
        input_width = self.input_size
        with tf.variable_scope(scope or self.__class__.__name__):
            # variables are fetched with get_variable so a reusing scope
            # shares them between unrolled steps / models
            w_hidden = tf.get_variable('hidden_weights',
                                       [hidden_width, hidden_width],
                                       initializer=self._Whh_init)
            w_input = tf.get_variable('input_weights',
                                      [input_width, hidden_width],
                                      initializer=self._Wxh_init)
            b = tf.get_variable('bias', [hidden_width],
                                initializer=self._bias_init)
            recurrent_term = tf.matmul(state, w_hidden)
            input_term = tf.matmul(inputs, w_input)
            activation = self._nonlin(recurrent_term + input_term + b)
        return activation, activation
    
def identity_initializer():
    """Return an initializer that produces identity-like constants.

    The returned callable takes ``(shape, dtype=tf.float32)`` and builds a
    constant tensor of the requested dtype:

    * rank 1: all zeros;
    * rank 2 with equal sides: the identity matrix;
    * rank 4 with ``shape[2] == shape[3]``: zeros everywhere except a 1 at
      the centre position of the first two axes for each ``[i, i]`` pair of
      the last two axes.

    Raises:
        ValueError: raised by the returned callable for any other shape.
    """
    def _initializer(shape, dtype=tf.float32):
        # honour the requested dtype rather than always emitting float32
        np_dtype = tf.as_dtype(dtype).as_numpy_dtype
        rank = len(shape)
        if rank == 1:
            return tf.constant(0, dtype=dtype, shape=shape)
        if rank == 2 and shape[0] == shape[1]:
            return tf.constant(np.identity(shape[0], np_dtype), dtype=dtype)
        if rank == 4 and shape[2] == shape[3]:
            array = np.zeros(shape, np_dtype)
            # floor division: true division yields floats under Python 3,
            # which cannot be used as array indices
            cx, cy = shape[0] // 2, shape[1] // 2
            for i in range(shape[2]):
                array[cx, cy, i, i] = 1
            return tf.constant(array, dtype=dtype)
        # a bare `raise` here had no active exception to re-raise
        raise ValueError(
            'identity_initializer does not support shape {!r}'.format(shape))
    return _initializer

In [3]:
# stacked peephole-LSTM language model
def get_lstm_model(input_data, shape, num_outputs, sequence_length, batch_size):
    """Build a stacked LSTM with an input embedding and an output projection.

    Args:
        input_data: integer ids to embed; it is split into
            ``sequence_length`` pieces along axis 1 (presumably shaped
            [batch_size, sequence_length] -- confirm against the caller).
        shape: list with the number of units of each layer, bottom first.
        num_outputs: number of output classes; also the number of rows of
            the embedding table.
        sequence_length: number of steps the network is unrolled for.
        batch_size: batch size used to build the zero initial state.

    Returns:
        ``(initial_state, logits, final_state)``; ``logits`` has
        ``num_outputs`` columns.
    """
    # layer k is fed by layer k-1; the bottom layer is fed embeddings that
    # are shape[0] wide
    fan_ins = [shape[0]] + list(shape[:-1])
    layers = [tf.nn.rnn_cell.LSTMCell(width,
                                      fan_in,
                                      use_peepholes=True,
                                      cell_clip=1.0)
              for fan_in, width in zip(fan_ins, shape)]

    stacked = tf.nn.rnn_cell.MultiRNNCell(layers)
    initial_state = stacked.zero_state(batch_size, tf.float32)

    with tf.device('/cpu:0'):
        # the embedding lookup is always placed on the cpu
        embedding = tf.get_variable('embedding', [num_outputs, shape[0]])
        embedded = tf.nn.embedding_lookup(embedding, input_data)

    # one tensor per time step, with the singleton time axis removed
    step_inputs = []
    for piece in tf.split(1, sequence_length, embedded):
        step_inputs.append(tf.squeeze(piece, [1]))

    step_outputs, final_state = tf.nn.rnn(stacked, step_inputs,
                                          initial_state=initial_state)
    # glue the per-step outputs together, one row per (example, step)
    top_width = shape[-1]
    flat_outputs = tf.reshape(tf.concat(1, step_outputs), [-1, top_width])

    # output projection onto the classes
    softmax_w = tf.get_variable('softmax_w', [top_width, num_outputs])
    softmax_b = tf.get_variable('softmax_b', [num_outputs])
    logits = tf.matmul(flat_outputs, softmax_w) + softmax_b
    return initial_state, logits, final_state

# stacked ReLU RNN language model (identity-initialised recurrent weights)
def get_rnn_model(input_data, shape, num_outputs, sequence_length, batch_size,
                  input_keep_prob=0.5):
    """Build a stacked ReLU RNN with an input embedding and output projection.

    Every layer is an ``fRNNCell`` with identity-initialised hidden-to-hidden
    weights, zero biases and a ReLU nonlinearity, wrapped in a
    ``DropoutWrapper`` that applies dropout to the layer's inputs.

    Args:
        input_data: integer ids to embed; it is split into
            ``sequence_length`` pieces along axis 1 (presumably shaped
            [batch_size, sequence_length] -- confirm against the caller).
        shape: list with the number of units of each layer, bottom first.
        num_outputs: number of output classes; also the number of rows of
            the embedding table.
        sequence_length: number of steps the network is unrolled for.
        batch_size: batch size used to build the zero initial state.
        input_keep_prob: keep probability for the dropout on each layer's
            inputs. Ignored (no dropout) when ``sequence_length == 1``,
            which is how the one-step sampling model is built.

    Returns:
        ``(initial_state, logits, final_state)``; ``logits`` has
        ``num_outputs`` columns.
    """
    # the single-step model is used for sampling, so it gets no dropout
    keep_prob = input_keep_prob if sequence_length != 1 else 1

    cells = [tf.nn.rnn_cell.DropoutWrapper(
                fRNNCell(layer,
                         Whh_init=identity_initializer(),
                         bias_init=tf.constant_initializer(0),
                         nonlin=tf.nn.relu),
                input_keep_prob=keep_prob)
             for layer in shape]
    cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device('/cpu:0'):
        # the embedding lookup is always placed on the cpu
        embedding = tf.get_variable('embedding', [num_outputs, shape[0]])
        inputs = tf.nn.embedding_lookup(embedding, input_data)

    # one tensor per time step, with the singleton time axis removed
    inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, sequence_length, inputs)]
    # every example in the batch runs for the full sequence_length steps
    slen = tf.pack([tf.constant(sequence_length, dtype=tf.int32)]*batch_size,)
    outputs, state = tf.nn.rnn(cell, inputs, sequence_length=slen, initial_state=initial_state)
    # glue the per-step outputs together, one row per (example, step)
    outputs = tf.reshape(tf.concat(1, outputs), [-1, shape[-1]])

    # output projection onto the classes
    softmax_w = tf.get_variable('softmax_w', [shape[-1], num_outputs])
    softmax_b = tf.get_variable('softmax_b', [num_outputs], initializer=tf.constant_initializer(0))
    logits = tf.matmul(outputs, softmax_w) + softmax_b
    return initial_state, logits, state

In [4]:
# cross-entropy loss of the logits against the targets
def get_loss(logits, targets, batch_size, sequence_length):
    """Sequence cross entropy, summed over everything and divided by batch_size.

    Also registers a scalar summary named 'perplexity (character)' holding
    ``exp(loss / sequence_length)``.

    Args:
        logits: unnormalised class scores, one row per prediction.
        targets: target ids; flattened to one dimension before use.
        batch_size: number of sequences in a batch.
        sequence_length: number of steps per sequence.

    Returns:
        The scalar loss tensor.
    """
    flat_targets = tf.reshape(targets, [-1])
    # every prediction carries the same weight
    weights = tf.ones([batch_size * sequence_length])
    per_prediction = tf.nn.seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [weights])
    loss = tf.reduce_sum(per_prediction) / batch_size
    # summary so perplexity can be monitored while training
    perplexity = tf.exp(loss/sequence_length)
    tf.scalar_summary('perplexity (character)', perplexity)
    return loss

In [5]:
# op performing one optimisation step over every trainable variable
def train_op(loss, learning_rate, momentum, global_step, max_grad_norm=1):
    """Build an Adam update with gradients clipped by their global norm.

    Args:
        loss: scalar tensor to minimise.
        learning_rate: learning rate handed to the optimiser.
        momentum: accepted but not used by the Adam optimiser chosen here
            (MomentumOptimizer / RMSPropOptimizer were earlier alternatives).
        global_step: variable passed to ``apply_gradients`` as the step
            counter.
        max_grad_norm: global-norm threshold used for clipping.

    Returns:
        The op that applies the clipped gradients.
    """
    params = tf.trainable_variables()
    raw_grads = tf.gradients(loss, params)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, max_grad_norm)
    optimiser = tf.train.AdamOptimizer(learning_rate, epsilon=1e-15)
    grads_and_params = zip(clipped_grads, params)
    return optimiser.apply_gradients(grads_and_params, global_step=global_step)

In [10]:
def run_epoch(session, inputs, targets, initial_state, final_state, loss, data_iter, eval_op, summariser, global_step):
    """Run the model over every batch yielded by ``data_iter``.

    The recurrent state returned by one batch is fed in as the initial state
    of the next. Progress is printed on a single, continuously rewritten
    line and a summary is written after every batch.

    Args:
        session: session used for every run call.
        inputs: placeholder fed with the batch.
        targets: placeholder fed with the batch rotated by one position.
        initial_state: tensor giving (and fed with) the recurrent state.
        final_state: tensor holding the state after the batch.
        loss: scalar loss tensor; also fed when computing summaries.
        data_iter: iterable of ``(progress, batch)`` pairs where ``batch``
            is a list.
        eval_op: extra op run with every batch (e.g. the training op).
        summariser: summary writer providing ``add_summary``.
        global_step: variable whose value tags each summary.
    """
    # use the session we were given rather than an ambient default session
    state = initial_state.eval(session=session)
    costs = 0
    steps = 0
    batches = 0
    start = time.time()
    summ_op = tf.merge_all_summaries()
    for progress, batch in data_iter:
        # targets are the inputs rotated by one; build new lists so the
        # caller's batch is not modified
        in_vals = list(batch)
        target_vals = in_vals[1:] + in_vals[:1]
        cost, state, _ = session.run(
            [loss, final_state, eval_op],
            {inputs: in_vals,
             targets: target_vals,
             initial_state: state})
        costs += cost
        steps += len(in_vals)
        batches += 1

        # guard against a zero time delta on coarse clocks
        elapsed = max(time.time() - start, 1e-9)
        print('\r({:.3f}) perplexity: {:.3f} (xent: {:.3f}), ({:.0f} cps)'.format(
            progress, np.exp(costs/steps), costs/steps, steps * len(batch[0]) / elapsed),
             end='')
        # Summaries are computed from the loss tensor, so it has to be fed a
        # value in the loss's own units: the running mean loss. Feeding an
        # already exponentiated perplexity would be transformed a second time
        # by any summary derived from the loss.
        summariser.add_summary(session.run(summ_op,
                                           {loss: costs / batches}),
                              global_step=tf.train.global_step(session, global_step))
        

In [42]:
# (very slowly) make some samples
# this is really dumb,
# should just rip out tf.nn.rnn and set it up so the input is the last output
# then we could unroll it and get the sequence all at once
import random
def print_sample(session, inputs, initial_state, output, final_state, length, vocab):
    """Sample ``length`` characters from the one-step model and print them.

    Starting from a randomly chosen vocabulary entry, the model is run one
    step at a time: the logits are turned into probabilities, a character
    is drawn from them, and both that character and the resulting recurrent
    state are fed back in for the next step.

    Args:
        session: session used for every run call.
        inputs: placeholder fed with a (1, 1) array holding the current id.
        initial_state: tensor giving (and fed with) the recurrent state.
        output: logits tensor of the one-step model.
        final_state: tensor holding the state after the step.
        length: number of characters to generate.
        vocab: mapping from character to integer id.

    Returns:
        The generated string.
    """
    # use the session argument, not whatever global happens to exist
    state = initial_state.eval(session=session)
    seed_char = np.random.choice(list(vocab.keys()))
    in_data = np.array(vocab[seed_char]).reshape((1, 1))
    int_to_char = {int(idx): char for char, idx in vocab.items()}
    print('making a sample')
    sample = []
    for _ in range(length):
        # fetch the new state as well, so the network keeps its memory of
        # what it has generated so far
        logits, state = session.run([output, final_state],
                                    {initial_state: state,
                                     inputs: in_data})
        # softmax in numpy: avoids adding a new op to the graph per call
        logits = np.asarray(logits, dtype=np.float64).flatten()
        probs = np.exp(logits - logits.max())
        # the small epsilon keeps the total strictly below one, as
        # np.random.multinomial requires
        probs = probs / (probs.sum() + 1e-7)
        try:
            char_idx = np.argmax(np.random.multinomial(1, probs))
        except ValueError:
            # invalid probabilities (e.g. NaN): fall back to the most
            # likely character
            char_idx = np.argmax(probs)
        in_data = np.array(char_idx).reshape((1, 1))
        sample.append(int_to_char[int(char_idx)])
    result = ''.join(sample)
    print(result)
    return result

In [24]:
# Build the whole graph from scratch: placeholders, the training model, a
# one-step sampling model sharing the same variables, the loss, the update
# op, a session and a summary writer.
tf.reset_default_graph()

# hyper-parameters of the run
batch_size = 100
sequence_length = 100
shape = [180, 180, 180]  # units per recurrent layer

# inputs and targets of the training model
in_var = tf.placeholder(tf.int32, [batch_size, sequence_length])
target_var = tf.placeholder(tf.int32, [batch_size, sequence_length])

# input of the one-step sampling model: a single id
sample_in_var = tf.placeholder(tf.int32, [1, 1])

# len(vocab) is used below as the number of output classes
vocab = data.get_vocab('char')

# scalar variables holding the schedule values; they are assigned from the
# python-side values at the start of every epoch in the training cell
# NOTE(review): these are created without trainable=False, so they presumably
# end up in tf.trainable_variables() (and hence in train_op's variable list
# and the saver) -- confirm and consider marking them non-trainable
lr_val = tf.get_variable('learning_rate', [])
mo_val = tf.get_variable('momentum', [])
global_step = tf.Variable(0, name='global_step')

print('getting model...', end='')
with tf.variable_scope(
    'rnn_model',
    initializer=tf.truncated_normal_initializer(stddev=0.05)) as scope:
    #initial_state, outputs, final_state = get_lstm_model(in_var, shape, len(vocab), sequence_length, batch_size)
    initial_state, outputs, final_state = get_rnn_model(in_var, shape, len(vocab), sequence_length, batch_size)
    # get a one-step-at-a-time model to generate some samples
    print('...', end='')
    # from here on get_variable returns the variables created above, so the
    # sampling model shares its weights with the training model
    scope.reuse_variables()
    #initial_state, output_1, final_state_1 = get_lstm_model(sample_in_var, shape, len(vocab), 1, 1)
    initial_state_1, output_1, final_state_1 = get_rnn_model(sample_in_var, shape, len(vocab), 1, 1)

print('...', end='')
loss_op = get_loss(outputs, target_var, batch_size, sequence_length)
print('...', end='')
update_weights = train_op(loss_op, lr_val, mo_val, global_step)
print('\r{:~^30}'.format('got model'))
sess = tf.Session()
print('initialising', end='')
sess.run(tf.initialize_all_variables())
print('\r{:~^30}'.format('initialised'))
# python-side schedule values (the training cell resets learning_rate
# before using it)
learning_rate = 2e-3
momentum = 0.99
# NOTE(review): `summaries` is not referenced by the other visible cells;
# run_epoch builds its own merged summary op
summaries = tf.merge_all_summaries()
summ_writer = tf.train.SummaryWriter('summaries', graph_def=sess.graph_def)

~~~~~~~~~~got model~~~~~~~~~~~
~~~~~~~~~initialised~~~~~~~~~~


In [None]:
# Training driver: up to 100 epochs, printing a 250-character sample after
# each one. Interrupting the kernel prints a longer 2000-character sample
# and stops the loop.
learning_rate = 0.0001
with sess.as_default():
    for i in range(100):
        try:
            print('~~~~~~~~~~Epoch {:>3}:~~~~~~~~~~'.format(i))
            # push the current schedule values into the graph variables; the
            # python names are rebound to the values the session returns
            # NOTE(review): .assign() is called inside the loop, so a new
            # pair of assign ops is presumably added to the graph every
            # epoch -- consider building them once outside the loop
            learning_rate, momentum = sess.run([lr_val.assign(learning_rate),
                                                mo_val.assign(momentum)])
            print('~~~~lr: {}'.format(learning_rate))
            print('~~~mom: {}'.format(momentum))
            # fresh iterator over the data for this epoch (argument meanings
            # are defined by rnndatasets -- TODO confirm)
            data_iter = data.get_char_iter(sequence_length, batch_size, True, 50000, overlap=True)
            run_epoch(sess,
                      in_var, 
                      target_var, 
                      initial_state, 
                      final_state, 
                      loss_op, 
                      data_iter, 
                      update_weights,
                      summ_writer,
                      global_step)
            print()
            # could be fun to use on the final states to initialise for sampling
            
            print_sample(sess,
                         sample_in_var,
                         initial_state_1,
                         output_1,
                         final_state_1,
                         250,
                         vocab)
            # decay the learning rate by 2% per epoch, starting after the
            # second epoch
            if i >= 1:
                learning_rate = learning_rate * .98
            #momentum = 1-learning_rate-1e-6
            summ_writer.flush()  # make sure we can see some stuff
        except KeyboardInterrupt:
            # manual stop: show a long sample from the current weights, then
            # leave the loop
            print_sample(sess,
                         sample_in_var,
                         initial_state_1,
                         output_1,
                         final_state_1,
                         2000,
                         vocab)
            break
        

~~~~~~~~~~Epoch   0:~~~~~~~~~~
~~~~lr: 9.999999747378752e-05
~~~mom: 0.9900000095367432
(0.246) perplexity: 21.087 (xent: 3.049), (18187 cps)

In [41]:
# Checkpoint the trainable variables only (optimiser state and other
# non-trainable variables are not included); up to 100 checkpoints are kept.
saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=100) # this should be all we care about
# global_step is passed so the step count is appended to the file name
saver.save(sess, 'model_180x180x180', global_step=global_step)

'model_180x180x180-9200'