In [3]:
import numpy as np
import collections
import os
import sys
import tensorflow as tf

In [91]:
# sess = tf.InteractiveSession()

Adapted from https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb

## Read data

In [48]:
# Directory that holds ptb.train.txt, ptb.valid.txt and ptb.test.txt
data_path = 'data/'

In [5]:
def _read_words(filename):
    """
    Read a text file and split it into a flat list of word tokens.

    Every newline is replaced by the "<eos>" token before the text is split
    on whitespace.

    Args:
        filename: path of the text file to read.

    Returns:
        list of word strings, with "<eos>" standing in for each newline.
    """
    with tf.gfile.GFile(filename, "r") as f:
        text = f.read()

    # On Python 2 `read()` yields a byte string that has to be decoded; on
    # Python 3 text mode already yields `str`, which has no `.decode()`.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    return text.replace("\n", "<eos>").split()

In [26]:
def _build_vocab(filename):
    """
    Build the word <-> id lookup tables from a text file.

    Words are ranked by descending frequency, with ties broken
    alphabetically; a word's rank is its id, so id 0 is the most frequent
    word.

    Args:
        filename: path of the text file to build the vocabulary from.

    Returns:
        tuple (word_to_id, id_to_word) of dicts that are inverses of each
        other. Both are empty when the file contains no words.
    """
    data = _read_words(filename)

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Iterate the (word, count) pairs directly instead of unpacking
    # `zip(*count_pairs)`, which raises ValueError when there are no pairs.
    words = [word for word, _ in count_pairs]
    word_to_id = {word: idx for idx, word in enumerate(words)}
    id_to_word = dict(enumerate(words))

    return word_to_id, id_to_word

In [7]:
def _file_to_word_ids(filename, word_to_id):
    """
    Convert the words of a text file into their integer ids.

    Words that have no entry in `word_to_id` are skipped.

    Args:
        filename: path of the text file to convert.
        word_to_id: dict mapping word -> integer id.

    Returns:
        list of integer ids, in the order the known words appear in the file.
    """
    ids = []
    for token in _read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids

In [129]:
def ptb_raw_data(data_path=None):
    """
    Load PTB raw data from data directory "data_path".

    Reads the PTB train/valid/test text files and converts their words to
    integer ids. The vocabulary is built from the training file only; words
    of any split that are not in that vocabulary are skipped. No batching is
    done here.

    The PTB dataset comes from Tomas Mikolov's webpage:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
        data_path: string path to the directory that contains ptb.train.txt,
        ptb.valid.txt and ptb.test.txt.

    Returns:
        tuple (train_data, valid_data, test_data, word_to_id, id_to_word,
        vocabulary) where the three data objects are lists of word ids,
        word_to_id / id_to_word are the lookup dicts, and vocabulary is the
        number of distinct words.

    Raises:
        ValueError: if data_path is None.
    """
    # Fail with a clear message instead of an obscure error from os.path.join.
    if data_path is None:
        raise ValueError("data_path must be the directory holding the PTB text files")

    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    # The vocabulary comes from the training split only.
    word_to_id, id_to_word = _build_vocab(train_path)

    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)

    vocabulary = len(word_to_id)

    return train_data, valid_data, test_data, word_to_id, id_to_word, vocabulary

In [130]:
# Load the three PTB splits together with the vocabulary lookups
raw_data = ptb_raw_data(data_path)
(train_data, valid_data, test_data,
 word_to_id, id_to_word, vocab_size) = raw_data

In [131]:
# Sanity check: number of word ids in each split, and the vocabulary size
len(train_data), len(valid_data), len(test_data), vocab_size

(929589, 73760, 82430, 10000)

In [132]:
# Decode the first 50 ids of each split back into words as a sanity check on
# the id <-> word mappings. Calling print(...) with a single argument prints
# the same text on Python 2 and Python 3.
print('The beginning of training data:')
print(' '.join([id_to_word[e] for e in train_data[:50]]))

print('\nThe beginning of valid data:')
print(' '.join([id_to_word[e] for e in valid_data[:50]]))

print('\nThe beginning of test data:')
print(' '.join([id_to_word[e] for e in test_data[:50]]))

The beginning of training data:
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter <eos> pierre <unk> N years old will join the board as a nonexecutive director nov. N <eos> mr. <unk> is chairman of <unk> n.v. the dutch

The beginning of valid data:
consumers may want to move their telephones a little closer to the tv set <eos> <unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk> <eos> two weeks ago viewers of several nbc

The beginning of test data:
no it was n't black monday <eos> but while the new york stock exchange did n't fall apart friday as the dow jones industrial average plunged N points most of it in the final hour it barely managed to stay this side of chaos <eos> some circuit breakers installed after


## Generate batches

In [51]:
batch_size = 20  # the batch size (number of rows in each x / y batch)
num_steps = 20  # the number of unrolls

In [52]:
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """
    Iterate on the raw PTB data.
    
    This chunks up raw_data into batches of examples and returns Tensors that
    are drawn from these batches.
    
    Args:
        raw_data: one of the raw data outputs from ptb_raw_data.
        batch_size: int, the batch size.
        num_steps: int, the number of unrolls.
        name: the name of this operation (optional).
    Returns:
        A pair of Tensors (x, y), each shaped [batch_size, num_steps]. y is
        the same data as x taken one position later along the second axis.
    Raises:
        tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
    """
    
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(
            raw_data, name="raw_data", dtype=tf.int32)

        # Keep only the first batch_size * batch_len ids and lay them out as
        # batch_size rows of batch_len ids each.
        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size
        data = tf.reshape(raw_data[0:batch_size * batch_len],
                          [batch_size, batch_len])

        # Number of num_steps-wide slices per row; the "- 1" leaves one extra
        # column for y, which is read one position to the right of x.
        epoch_size = (batch_len - 1) // num_steps
        assertion = tf.assert_positive(
            epoch_size,
            message="epoch_size == 0, decrease batch_size or num_steps")
        # Route epoch_size through the assertion so the check runs whenever
        # epoch_size is used.
        with tf.control_dependencies([assertion]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")

        # Slice index dequeued from an unshuffled range producer.
        # NOTE(review): this is queue-based, so evaluating x / y presumably
        # requires queue runners to be started - confirm against the caller.
        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        
        # x: columns [i * num_steps, (i + 1) * num_steps) of every row
        x = tf.strided_slice(data, [0, i * num_steps], [batch_size, (i + 1) * num_steps])
        x.set_shape([batch_size, num_steps])
        
        # y: the same window moved one column to the right
        y = tf.strided_slice(data, [0, i * num_steps + 1], [batch_size, (i + 1) * num_steps + 1])
        y.set_shape([batch_size, num_steps])
        
        return x, y

The code above is hard to follow without running it in a session, so I'm going to break it down with simple NumPy operations to see exactly what it's doing:

In [59]:
# Walk through ptb_producer by hand, using the training ids as its raw_data
raw_data = train_data

data_len = len(raw_data)
batch_len = data_len // batch_size  # whole number of ids per batch row

data_len, batch_len

(929589, 46479)

In [61]:
# Keep only the first batch_size * batch_len ids, then arrange them as
# batch_size rows of batch_len ids each
data = np.array(raw_data[:batch_size * batch_len]).reshape((batch_size, batch_len))
data.shape

(20, 46479)

In [62]:
# Number of num_steps-wide slices that fit in one row; the "- 1" leaves room
# for y, which is read one position to the right of x
epoch_size = (batch_len - 1) // num_steps
epoch_size

2323

In [115]:
i = 0  # index of the slice to extract

x_op = tf.strided_slice(data, [0, i * num_steps], [batch_size, (i + 1) * num_steps])
y_op = tf.strided_slice(data, [0, i * num_steps + 1], [batch_size, (i + 1) * num_steps + 1])

# Evaluate inside an explicit session so this cell does not depend on a
# default session having been installed beforehand (Tensor.eval() fails
# without one).
with tf.Session() as sess:
    x, y = sess.run([x_op, y_op])

x.shape, y.shape

((20, 20), (20, 20))

In [116]:
# tf.strided_slice is equivalent to slicing in NumPy:
# https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
# Check that the evaluated x and y match the begin:end:stride NumPy slices.

assert np.array_equal(x, data[0:batch_size:1, i * num_steps:(i + 1) * num_steps:1])
assert np.array_equal(y, data[0:batch_size:1, i * num_steps + 1:(i + 1) * num_steps + 1:1])

In [178]:
# print out some x and y as an example
# print out some x and y as an example
# (the loop variable is `row`, not `i`, so the slice index `i` used by the
# slicing cells is not overwritten; the formatted print gives the same text
# on Python 2 and Python 3)
for row in range(0, 5):
    print('Batch %d %s' % (row + 1, ' '.join([id_to_word[wid] for wid in x[row]])))

Batch 1 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim
Batch 2 that would so load a carrier up with debt that it would <unk> safety or a carrier 's ability to
Batch 3 N shares of its common stock for each of <unk> deposit 's N shares outstanding <eos> liberty national a bank
Batch 4 and white paper <eos> that means goods could be manufactured closer to customers saving shipping costs he said <eos> moreover
Batch 5 at least in part because of buy programs generated by stock-index arbitrage a form of program trading involving futures contracts


In [179]:
# Same rows of y: each one is the matching x row moved on by one word.
# (the loop variable is `row`, not `i`, so the slice index `i` used by the
# slicing cells is not overwritten; the formatted print gives the same text
# on Python 2 and Python 3)
for row in range(0, 5):
    print('Batch %d %s' % (row + 1, ' '.join([id_to_word[wid] for wid in y[row]])))

Batch 1 banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food
Batch 2 would so load a carrier up with debt that it would <unk> safety or a carrier 's ability to compete
Batch 3 shares of its common stock for each of <unk> deposit 's N shares outstanding <eos> liberty national a bank holding
Batch 4 white paper <eos> that means goods could be manufactured closer to customers saving shipping costs he said <eos> moreover production
Batch 5 least in part because of buy programs generated by stock-index arbitrage a form of program trading involving futures contracts <eos>


Generate batches and put them in a class (for easy access later):

In [135]:
class PTBInput(object):
    """Bundles the batching settings with the input and target tensors of one data split."""

    def __init__(self, data, name=None):
        """
        Args:
            data: sequence of word ids (one of the splits from ptb_raw_data).
            name: optional name passed on to ptb_producer.
        """
        # batch_size and num_steps are the notebook-level settings
        self.batch_size = batch_size
        self.num_steps = num_steps

        batch_len = len(data) // batch_size
        self.epoch_size = (batch_len - 1) // num_steps

        # Inputs and labels for training; the labels are the inputs
        # offset by one word
        inputs, targets = ptb_producer(data, batch_size, num_steps, name=name)
        self.input_data = inputs
        self.targets = targets

## Model training

In [127]:
hidden_size = 200  # number of hidden units in LSTM; also embedding size
keep_prob = 0.5  # probability of keeping a unit, i.e. 1 - dropout rate
num_layers = 2  # number of LSTM layers

In [None]:
class PTBModel(object):
    """The PTB model."""

    def __init__(self, is_training, config, input_):
        """
        Build the LSTM language-model graph.

        The model sizes (`hidden_size`, `num_layers`, `keep_prob`,
        `batch_size`, `num_steps`, `vocab_size`) are read from notebook-level
        variables; `config` is only consulted for `max_grad_norm`.

        Args:
            is_training: bool. When True, dropout is applied (if
                keep_prob < 1) and the training ops are built; when False
                construction stops after the cost and final state.
            config: object with a `max_grad_norm` attribute, used to clip
                gradients. Only read when is_training is True.
            input_: object exposing `input_data` and `targets` tensors
                (e.g. a PTBInput).
        """
        self._input = input_  # Note `input_` includes both `input_data` and `targets`

        def lstm_cell():
            # The cell must be returned: the factories below call
            # lstm_cell() and use the result.
            return tf.contrib.rnn.BasicLSTMCell(
                hidden_size,
                forget_bias=0.0,
                state_is_tuple=True,
                reuse=tf.get_variable_scope().reuse)

        # Factory used for every layer: a plain LSTM cell by default, so the
        # name is always defined even when dropout is off.
        attn_cell = lstm_cell

        # Implement dropout on the cell outputs (for training only)
        if is_training and keep_prob < 1:

            def attn_cell():
                # Same keep_prob as the guard above and the input dropout
                # below.
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=keep_prob)

        # Stacking multiple LSTMs
        attn_cells = [attn_cell() for _ in range(num_layers)]
        stacked_lstm = tf.contrib.rnn.MultiRNNCell(attn_cells, state_is_tuple=True)

        # Initialize states with zeros
        self._initial_state = stacked_lstm.zero_state(batch_size, tf.float32)

        # The word IDs will be embedded into a dense representation before feeding to the LSTM.
        # This allows the model to efficiently represent the knowledge about particular words.
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [vocab_size, hidden_size], dtype=tf.float32)
            input_embeddings = tf.nn.embedding_lookup(embedding, input_.input_data)
            # The shape of `input_embeddings` is [batch_size, num_steps, hidden_size]

        # Implement dropout on the embedded inputs (for training only)
        if is_training and keep_prob < 1:
            input_embeddings = tf.nn.dropout(input_embeddings, keep_prob)

        # Simplified version of models/tutorials/rnn/rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #
        # The alternative version of the code below is:
        #
        # inputs = tf.unstack(input_embeddings, num=num_steps, axis=1)
        # outputs, state = tf.contrib.rnn.static_rnn(
        #     stacked_lstm, inputs, initial_state=self._initial_state)

        # Unroll LSTM loop
        outputs = []
        state = self._initial_state

        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                # Share the LSTM weights across time steps: only the first
                # step creates variables, later steps reuse them.
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()

                (cell_output, state) = stacked_lstm(input_embeddings[:, time_step, :], state)
                outputs.append(cell_output)
                # `outputs` is a list of `num_steps` tensors, each shaped [batch_size, hidden_size]

        # Reshape the output into a [batch_size * num_steps, hidden_size] matrix.
        # Note axis=1 because we want to keep the words of each sequence together in their
        # original order, so they line up with `targets` when the loss is computed later.
        output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, hidden_size])

        # Compute logits
        softmax_w = tf.get_variable(
            "softmax_w", [hidden_size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable(
            "softmax_b", [vocab_size], dtype=tf.float32)

        logits = tf.matmul(output, softmax_w) + softmax_b
        # The shape of `logits` =
        # [batch_size * num_steps, hidden_size] x [hidden_size, vocab_size] + [vocab_size] =
        # [batch_size * num_steps, vocab_size]

        # Reshape logits to be 3-D tensor for sequence loss
        logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])

        # use the contrib sequence loss and average over the batches
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            input_.targets,
            tf.ones([batch_size, num_steps], dtype=tf.float32),
            average_across_timesteps=False,
            average_across_batch=True)

        # update the cost variables
        self._cost = cost = tf.reduce_sum(loss)
        self._final_state = state

        # Evaluation-only models need no optimizer or learning-rate ops.
        if not is_training:
            return

        # The learning rate lives in a non-trainable variable so it can be
        # changed between runs through the assign op built below.
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars), config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

    def assign_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})