In [66]:
import numpy as np
import collections
import os
import sys
import inspect
import time
from datetime import datetime
import random
import pickle
import tensorflow as tf

In [91]:
# sess = tf.InteractiveSession()

Adapted from https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb

## Read data

In [8]:
# Directory that ptb_raw_data() searches for ptb.train.txt, ptb.valid.txt and ptb.test.txt
data_path = 'data/'

In [9]:
def _read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline character is replaced with the literal "<eos>" before the
    text is split on whitespace.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of token strings.
    """
    with tf.gfile.GFile(filename, "r") as f:
        text = f.read()

    # Under Python 2 the file content comes back as a byte string and has to be
    # decoded; where read() already returns text there is nothing to decode.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    return text.replace("\n", "<eos>").split()

In [10]:
def _build_vocab(filename):
    """Build the word <-> id mappings from the tokens of one file.

    Ids are assigned by decreasing frequency; ties are broken by the
    lexicographic order of the words, so the mapping is deterministic.

    Args:
        filename: path of the text file whose tokens define the vocabulary.

    Returns:
        A pair (word_to_id, id_to_word) of dictionaries. Both are empty when
        the file contains no tokens.
    """
    data = _read_words(filename)

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Enumerating the sorted words (instead of unpacking zip(*count_pairs))
    # also works for an empty corpus.
    words = [word for word, _ in count_pairs]
    word_to_id = {word: idx for idx, word in enumerate(words)}
    id_to_word = dict(enumerate(words))

    return word_to_id, id_to_word

In [11]:
def _file_to_word_ids(filename, word_to_id):
    """Convert the tokens of a file into vocabulary ids.

    Tokens that are not keys of `word_to_id` are silently dropped.

    Args:
        filename: path of the text file to convert.
        word_to_id: dictionary mapping a token to its integer id.

    Returns:
        A list with the ids of the known tokens, in file order.
    """
    word_ids = []
    for token in _read_words(filename):
        if token in word_to_id:
            word_ids.append(word_to_id[token])
    return word_ids

In [12]:
def ptb_raw_data(data_path=None):
    """
    Load PTB raw data from data directory "data_path".

    Reads the PTB text files and converts their words to integer ids.
    The vocabulary is built from the training file only; words of the
    validation and test files that are not in it are dropped.
    No batching happens here (see ptb_producer for that).

    The PTB dataset comes from Tomas Mikolov's webpage:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
        data_path: string path to the directory that contains
        ptb.train.txt, ptb.valid.txt and ptb.test.txt. Despite the None
        default it must be provided: it is passed to os.path.join as-is.

    Returns:
        tuple (train_data, valid_data, test_data, word_to_id, id_to_word,
        vocabulary) where the three data objects are lists of word ids,
        word_to_id / id_to_word are the vocabulary dictionaries and
        vocabulary is the number of distinct words.
    """

    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    # The vocabulary is defined by the training split alone.
    word_to_id, id_to_word = _build_vocab(train_path)

    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)

    vocabulary = len(word_to_id)

    return train_data, valid_data, test_data, word_to_id, id_to_word, vocabulary

In [13]:
# Load the three PTB splits as lists of word ids, plus the vocabulary mappings and size
raw_data = ptb_raw_data(data_path)
train_data, valid_data, test_data, word_to_id, id_to_word, vocab_size = raw_data

In [14]:
# Number of word ids in each split, followed by the vocabulary size
len(train_data), len(valid_data), len(test_data), vocab_size

(929589, 73760, 82430, 10000)

In [132]:
# Map the first 50 ids of each split back to words as a sanity check of the id mappings
print 'The beginning of training data:'
print ' '.join([id_to_word[e] for e in train_data[:50]])

print '\nThe beginning of valid data:'
print ' '.join([id_to_word[e] for e in valid_data[:50]])

print '\nThe beginning of test data:'
print ' '.join([id_to_word[e] for e in test_data[:50]])

The beginning of training data:
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter <eos> pierre <unk> N years old will join the board as a nonexecutive director nov. N <eos> mr. <unk> is chairman of <unk> n.v. the dutch

The beginning of valid data:
consumers may want to move their telephones a little closer to the tv set <eos> <unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk> <eos> two weeks ago viewers of several nbc

The beginning of test data:
no it was n't black monday <eos> but while the new york stock exchange did n't fall apart friday as the dow jones industrial average plunged N points most of it in the final hour it barely managed to stay this side of chaos <eos> some circuit breakers installed after


In [67]:
# Persist both vocabulary mappings, one pickle file per dictionary
for pickle_name, mapping in (('word_to_id.pickle', word_to_id),
                             ('id_to_word.pickle', id_to_word)):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(mapping, handle)

## Define model configurations

In [15]:
class config(object):
    """Hyperparameters read by the input pipeline, the model and the training loop."""
    vocab_size = vocab_size  # taken from the notebook-level `vocab_size` computed when loading the data
    batch_size = 20
    num_steps = 20  # sequence length; the number of unrolls
    hidden_size = 200  # number of hidden units in LSTM; also embedding size
    keep_prob = 0.5  # probability of keeping a unit, i.e. 1 - dropout rate
    num_layers = 2  # number of LSTM layers
    max_grad_norm = 5  # global-norm threshold used to clip gradients (guards against exploding gradients)
    init_scale = 0.1  # weights are initialized uniformly in [-init_scale, init_scale]
    max_epoch = 4  # the number of epochs trained with the initial learning rate
    max_max_epoch = 13  # the total number of epochs for training
    learning_rate = 1.0  # the initial value of the learning rate
    lr_decay = 0.5  # the decay of the learning rate for each epoch after "max_epoch"

## Generate batches

In [16]:
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """
    Produce (input, target) tensors that walk over the raw PTB data.

    The id sequence is cut into `batch_size` contiguous rows; every dequeue
    of the internal step counter selects the next window of `num_steps`
    columns from all rows at once.

    Args:
        raw_data: one of the raw data outputs from ptb_raw_data.
        batch_size: int, the batch size.
        num_steps: int, the number of unrolls.
        name: the name of this operation (optional).
    Returns:
        A pair of Tensors, each shaped [batch_size, num_steps]. The second
        one covers the same window moved forward by one position, i.e. it
        holds the next word for every word of the first.
    Raises:
        tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
    """

    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        token_ids = tf.convert_to_tensor(
            raw_data, name="raw_data", dtype=tf.int32)

        # Drop the tail that does not fill a complete row, then lay the ids
        # out as one long row per batch entry.
        total_len = tf.size(token_ids)
        row_len = total_len // batch_size
        usable_ids = token_ids[0:batch_size * row_len]
        grid = tf.reshape(usable_ids, [batch_size, row_len])

        # One column is reserved because targets are read one position ahead.
        steps_per_epoch = (row_len - 1) // num_steps
        positive_check = tf.assert_positive(
            steps_per_epoch,
            message="epoch_size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([positive_check]):
            steps_per_epoch = tf.identity(steps_per_epoch, name="epoch_size")

        # Queue that yields the window index 0, 1, ..., in order.
        window = tf.train.range_input_producer(
            steps_per_epoch, shuffle=False).dequeue()

        x_begin = [0, window * num_steps]
        x_end = [batch_size, (window + 1) * num_steps]
        x = tf.strided_slice(grid, x_begin, x_end)
        x.set_shape([batch_size, num_steps])

        y_begin = [0, window * num_steps + 1]
        y_end = [batch_size, (window + 1) * num_steps + 1]
        y = tf.strided_slice(grid, y_begin, y_end)
        y.set_shape([batch_size, num_steps])

        return x, y

The code above is hard to follow without running it in a session, so I'm going to reproduce it step by step with simple NumPy operations to see exactly what it does:

In [51]:
# Use the same batching parameters as the model configuration
batch_size = config.batch_size
num_steps = config.num_steps

In [59]:
# NOTE: from here on `raw_data` is the list of training ids, no longer the tuple returned by ptb_raw_data
raw_data = train_data

data_len = len(raw_data)
# Number of ids each of the `batch_size` rows receives (the remainder is discarded below)
batch_len = data_len // batch_size

data_len, batch_len

(929589, 46479)

In [61]:
# Keep only the ids that fill complete rows and lay them out as [batch_size, batch_len]
data = np.array(raw_data[0:batch_size * batch_len]).reshape(batch_size, batch_len)
data.shape

(20, 46479)

In [62]:
# Number of num_steps-wide windows per row; one column is held back because targets are shifted by one
epoch_size = (batch_len - 1) // num_steps
epoch_size

2323

In [115]:
# Index of the window to extract (the first one)
i = 0

x = tf.strided_slice(data, [0, i * num_steps], [batch_size, (i + 1) * num_steps])
y = tf.strided_slice(data, [0, i * num_steps + 1], [batch_size, (i + 1) * num_steps + 1])

# `Tensor.eval()` needs a default session. Open one just for this evaluation so
# the cell does not depend on a session left over from an earlier kernel state.
with tf.Session():
    x = x.eval()
    y = y.eval()

x.shape, y.shape

((20, 20), (20, 20))

In [116]:
# tf.strided_slice is equivalent to slicing in Numpy:
# https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
# x is window `i` of every row; y is the same window moved forward by one column.

assert np.array_equal(x, data[0:batch_size:1, i * num_steps:(i + 1) * num_steps:1])
assert np.array_equal(y, data[0:batch_size:1, i * num_steps + 1:(i + 1) * num_steps + 1:1])

In [178]:
# print out some x and y as an example
# Each printed line is one row of `x`, i.e. one of the parallel sequences inside
# the same batch (the 'Batch' label numbers rows, not separate batches).
for i in range(0, 5):
    print 'Batch', i+1, ' '.join([id_to_word[wid] for wid in x[i]])

Batch 1 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim
Batch 2 that would so load a carrier up with debt that it would <unk> safety or a carrier 's ability to
Batch 3 N shares of its common stock for each of <unk> deposit 's N shares outstanding <eos> liberty national a bank
Batch 4 and white paper <eos> that means goods could be manufactured closer to customers saving shipping costs he said <eos> moreover
Batch 5 at least in part because of buy programs generated by stock-index arbitrage a form of program trading involving futures contracts


In [179]:
# Same rows taken from `y`: every line is the matching `x` line moved forward by one word
for i in range(0, 5):
    print 'Batch', i+1, ' '.join([id_to_word[wid] for wid in y[i]])

Batch 1 banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food
Batch 2 would so load a carrier up with debt that it would <unk> safety or a carrier 's ability to compete
Batch 3 shares of its common stock for each of <unk> deposit 's N shares outstanding <eos> liberty national a bank holding
Batch 4 white paper <eos> that means goods could be manufactured closer to customers saving shipping costs he said <eos> moreover production
Batch 5 least in part because of buy programs generated by stock-index arbitrage a form of program trading involving futures contracts <eos>


Generate the batches and wrap them in a class (for easy access later):

In [17]:
class PTBInput(object):
    """Batching parameters plus the input/target tensors of one data split."""

    def __init__(self, config, data, name=None):
        """Create the producer tensors for `data`.

        Args:
            config: object exposing `batch_size` and `num_steps`.
            data: sequence of word ids (its length is used to size an epoch).
            name: optional name forwarded to ptb_producer.
        """
        batch_size = config.batch_size
        num_steps = config.num_steps
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Number of batches that make up one pass over `data`
        ids_per_row = len(data) // batch_size
        self.epoch_size = (ids_per_row - 1) // num_steps

        # Inputs and their labels; the labels are the inputs moved forward by one word
        inputs, labels = ptb_producer(data, batch_size, num_steps, name=name)
        self.input_data = inputs
        self.targets = labels

## Define model

In [33]:
class PTBModel(object):
    """The PTB model: embedding -> stacked LSTM -> softmax over the vocabulary.

    The graph is unrolled for `config.num_steps` time steps. The training
    related attributes (`lr`, `train_op`, `assign_lr`) are only available on
    models built with `is_training=True`.
    """

    def __init__(self, is_training, config, input_=None):
        """Build the model graph.

        Args:
            is_training: when true, dropout is applied (if
                `config.keep_prob < 1`) and the gradient-clipped SGD ops are
                created.
            config: object exposing batch_size, num_steps, hidden_size,
                vocab_size, keep_prob, num_layers and max_grad_norm.
            input_: optional object exposing `input_data` and `targets`
                tensors. When None, int32 placeholders of shape
                [batch_size, num_steps] are created instead and `input`
                is None.
        """
        batch_size = config.batch_size
        num_steps = config.num_steps
        hidden_size = config.hidden_size
        vocab_size = config.vocab_size

        if input_ is not None:
            # For normal training and validation
            self._input = input_
            self._input_data = input_.input_data
            self._targets = input_.targets

        else:
            # For text generation: data is fed through placeholders.
            # `_input` is still defined so that the `input` property never
            # fails on a missing attribute.
            self._input = None
            self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
            self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        def lstm_cell():
            # With the latest TensorFlow source code (as of Mar 27, 2017),
            # the BasicLSTMCell will need a reuse parameter which is unfortunately not
            # defined in TensorFlow 1.0. To maintain backwards compatibility, we add
            # an argument check here.
            #
            # Note because we set `state_is_tuple=True`, the states are 2-tuples of the `c_state` and `h_state`
            # `c_state` is the cell state
            # `h_state` is the hidden state
            # See this SO thread: https://stackoverflow.com/questions/41789133/c-state-and-m-state-in-tensorflow-lstm

            if 'reuse' in inspect.getargspec(
                    tf.contrib.rnn.BasicLSTMCell.__init__).args:
                return tf.contrib.rnn.BasicLSTMCell(
                    hidden_size,
                    forget_bias=0.0,
                    state_is_tuple=True,
                    reuse=tf.get_variable_scope().reuse)
            else:
                return tf.contrib.rnn.BasicLSTMCell(
                    hidden_size,
                    forget_bias=0.0,
                    state_is_tuple=True)

        attn_cell = lstm_cell

        # Apply dropout to the cell outputs (for training only)
        if is_training and config.keep_prob < 1:

            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=config.keep_prob)

        # Stacking multiple LSTMs
        attn_cells = [attn_cell() for _ in range(config.num_layers)]
        stacked_lstm = tf.contrib.rnn.MultiRNNCell(attn_cells, state_is_tuple=True)

        # Initialize states with zeros
        # `_initial_state` holds one entry per layer (`num_layers` in total)
        # Each is a tuple of (`c_state`, `h_state`),
        # and both `c_state` and `h_state` are shaped [batch_size, hidden_size]
        self._initial_state = stacked_lstm.zero_state(batch_size, tf.float32)

        # The word IDs will be embedded into a dense representation before feeding to the LSTM.
        # This allows the model to efficiently represent the knowledge about particular words.
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [vocab_size, hidden_size], dtype=tf.float32)
            input_embeddings = tf.nn.embedding_lookup(embedding, self.input_data)
            # The shape of `input_embeddings` is [batch_size, num_steps, hidden_size]

        # Apply dropout to the embedded inputs (for training only)
        if is_training and config.keep_prob < 1:
            input_embeddings = tf.nn.dropout(input_embeddings, config.keep_prob)

        # Simplified version of models/tutorials/rnn/rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #
        # The alternative version of the code below is:
        #
        # inputs = tf.unstack(inputs, num=num_steps, axis=1)
        # outputs, state = tf.contrib.rnn.static_rnn(
        #     cell, inputs, initial_state=self._initial_state)

        # Unroll LSTM loop
        outputs = []
        state = self._initial_state

        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                # The first step creates the LSTM variables; later steps reuse them
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()

                (cell_output, state) = stacked_lstm(input_embeddings[:, time_step, :], state)
                outputs.append(cell_output)
                # `outputs` is a list of `num_steps` tensors, each shaped [batch_size, hidden_size]

        # Resize the output into a [batch_size * num_steps, hidden_size] matrix.
        # Note axis=1 because we want to group words together according to their original sequence
        # in order to compare with `targets` to compute loss later.
        output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, hidden_size])

        # Compute logits
        softmax_w = tf.get_variable(
            "softmax_w", [hidden_size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable(
            "softmax_b", [vocab_size], dtype=tf.float32)

        self._logits = logits = tf.matmul(output, softmax_w) + softmax_b
        # The shape of `logits` =
        # [batch_size * num_steps, hidden_size] x [hidden_size, vocab_size] + [vocab_size] =
        # [batch_size * num_steps, vocab_size]

        # Draw one word id per row from the distribution defined by the logits (used for text generation)
        self._logits_sample = tf.multinomial(logits, 1)

        # Instead of sampling, return the position with the largest logit
        self._logits_max = tf.argmax(logits, 1)

        # Reshape logits to be 3-D tensor for sequence loss
        logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])

        # Use the contrib sequence loss and average over the batches
        # Source code: https://github.com/tensorflow/tensorflow/blob/r1.2/tensorflow/contrib/seq2seq/python/ops/loss.py#L30
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,  # shape: [batch_size, num_steps, vocab_size]
            self._targets,  # shape: [batch_size, num_steps]
            tf.ones([batch_size, num_steps], dtype=tf.float32),
            average_across_timesteps=False,
            average_across_batch=True)

        # Update the cost variables
        # `cost` is the batch-averaged loss summed over the time steps
        self._cost = cost = tf.reduce_sum(loss)
        self._final_state = state

        # Evaluation-only models stop here: no optimizer ops are created
        if not is_training:
            return

        # Optimizer
        self._lr = tf.Variable(0.0, trainable=False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars), config.max_grad_norm)

        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        # The learning rate is changed by feeding a new value into this placeholder
        self._new_lr = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)


    # To update learning rate
    def assign_lr(self, session, lr_value):
        """Set the learning rate variable to `lr_value` (training models only)."""
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    @property
    def input(self):
        """The `input_` object given at construction, or None."""
        return self._input

    @property
    def input_data(self):
        """Tensor (or placeholder) with the input word ids."""
        return self._input_data

    @property
    def targets(self):
        """Tensor (or placeholder) with the target word ids."""
        return self._targets

    @property
    def initial_state(self):
        """Zero state of the stacked LSTM."""
        return self._initial_state

    @property
    def cost(self):
        """Scalar loss tensor of one batch."""
        return self._cost

    @property
    def final_state(self):
        """LSTM state after the last unrolled step."""
        return self._final_state

    @property
    def lr(self):
        """Learning rate variable (training models only)."""
        return self._lr

    @property
    def train_op(self):
        """Op applying one clipped gradient-descent update (training models only)."""
        return self._train_op

    @property
    def logits_sample(self):
        """Word ids sampled from the logits."""
        return self._logits_sample

    @property
    def logits_max(self):
        """Word ids with the largest logit."""
        return self._logits_max

    @property
    def logits(self):
        """Logits shaped [batch_size * num_steps, vocab_size]."""
        return self._logits

### Model running

In [35]:
def run_epoch(session, model, eval_op=None, verbose=False):
    """
    Runs the model on the given data for one epoch.

    The LSTM state returned by one step is fed back as the initial state of
    the next step.

    Args:
        session: object whose `run(fetches, feed_dict)` evaluates the model.
        model: model exposing `initial_state`, `final_state`, `cost` and
            `input` (with `epoch_size`, `num_steps` and `batch_size`).
        eval_op: optional extra op to run at every step (e.g. the train op).
        verbose: when true, print progress roughly ten times per epoch.

    Returns:
        The perplexity over the epoch: exp(total cost / number of time steps).

    Raises:
        ZeroDivisionError: if `model.input.epoch_size` is 0 (no step is run).
    """
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = session.run(model.initial_state)
    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }

    if eval_op is not None:
        fetches["eval_op"] = eval_op

    # Recall that epoch_size = (batch_len - 1) // num_steps.
    epoch_size = model.input.epoch_size

    # Progress is reported about every tenth of an epoch. Long epochs keep the
    # original "10 steps into each tenth" schedule; short epochs (fewer than
    # 110 steps) would otherwise never log, or divide by zero, so they report
    # at the start of every interval instead.
    log_interval = max(epoch_size // 10, 1)
    log_offset = 10 if log_interval > 10 else 0

    for step in range(epoch_size):
        feed_dict = {}

        # Recall that `_initial_state` holds one (`c_state`, `h_state`)
        # tuple per LSTM layer
        for i, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[i].c
            feed_dict[h] = state[i].h

        vals = session.run(fetches, feed_dict)

        # Extract cost and final_state after the current step,
        # which become the new cost and state for the next step
        cost = vals["cost"]
        state = vals["final_state"]

        # Accumulate the totals used for the running perplexity
        costs += cost
        iters += model.input.num_steps

        if verbose and step % log_interval == log_offset:
            # Guard against a zero elapsed time on coarse clocks
            elapsed = max(time.time() - start_time, 1e-9)
            print("%.3f (raw step: %.0f) perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size,
                   step,
                   np.exp(costs / iters),
                   iters * model.input.batch_size / elapsed))

    return np.exp(costs / iters)

### Text generation

In [24]:
# Use "it" as the start of the sentence
# Shaped (1, 1): a single word id as a one-row, one-step batch
feed = np.array(word_to_id['it']).reshape(1, 1)

In [27]:
# Define sentence length: the number of words generated after the seed word
text_length = 200

In [28]:
def generate_text(session, model, feed, text_length):
    """Generate `text_length` items by repeatedly feeding the model its own sample.

    Args:
        session: object whose `run(fetches, feed_dict)` evaluates the model.
        model: model exposing `initial_state`, `final_state`, `input_data`
            and `logits_sample`.
        feed: value fed as `model.input_data` on the first step (the seed).
        text_length: number of generation steps to run.

    Returns:
        A list of `text_length + 1` entries: the seed followed by the value
        sampled at every step.
    """
    state = session.run(model.initial_state)
    fetches = dict(final_state=model.final_state, logits=model.logits_sample)

    generated_text = [feed]
    current = feed

    for _ in range(text_length):
        feed_dict = {model.input_data: current}

        # Carry the recurrent state of every layer over from the previous step
        for layer, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[layer].c
            feed_dict[h] = state[layer].h

        vals = session.run(fetches, feed_dict)

        # The returned state and sample drive the next step
        state = vals["final_state"]
        current = vals["logits"]

        generated_text.append(current)

    return generated_text

## Run all the things!

In [53]:
# Output directory stamped with the current date and time (minute resolution);
# it receives the per-epoch log files and is used as the Supervisor logdir.
save_path = 'model_output' + '_' + datetime.now().strftime('%Y-%m-%d-%H-%M')
if not os.path.exists(save_path):
    os.makedirs(save_path)

In [54]:
# Load configurations for training and validation
train_valid_config = config()

# Modify config for test data
# The overrides are set on this instance only, so the class defaults used for
# training and validation stay untouched. One word is processed at a time.
eval_config = config()
eval_config.batch_size = 1
eval_config.num_steps = 1

In [None]:
# Redirect all output to a file
# First, save the default output so it can be restored after every log file,
# even when training is interrupted by an exception.
orig_stdout = sys.stdout

with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

    # Every model is built from the same config object as its input pipeline,
    # so their batch_size / num_steps cannot drift apart.
    with tf.name_scope("Train"):
        train_input = PTBInput(config=train_valid_config, data=train_data, name="TrainInput")
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=train_valid_config, input_=train_input)
        tf.summary.scalar("Training_Loss", m.cost)
        tf.summary.scalar("Learning_Rate", m.lr)

    # The remaining models reuse the variables created by the training model
    with tf.name_scope("Valid"):
        valid_input = PTBInput(config=train_valid_config, data=valid_data, name="ValidInput")
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=train_valid_config, input_=valid_input)
        tf.summary.scalar("Validation_Loss", mvalid.cost)

    with tf.name_scope("Test"):
        test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            mtest = PTBModel(is_training=False, config=eval_config, input_=test_input)

    # For text generation (fed through placeholders, one word at a time)
    with tf.name_scope("Feed"):
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            mfeed = PTBModel(is_training=False, config=eval_config)

    sv = tf.train.Supervisor(logdir=save_path)
    with sv.managed_session() as session:
        for i in range(config.max_max_epoch):
            # Redirect output to a per-epoch file
            log_file_path = 'log_file_' + str(i) + '.txt'
            with open(os.path.join(save_path, log_file_path), 'w') as f:
                sys.stdout = f
                try:
                    # Keep the initial learning rate for the first `max_epoch`
                    # epochs, then decay it by `lr_decay` every epoch
                    lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)
                    print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))

                    # Compute train and valid perplexity
                    train_perplexity = run_epoch(session, m, eval_op=m.train_op, verbose=True)
                    print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))

                    valid_perplexity = run_epoch(session, mvalid)
                    print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

                    # Generate text
                    generated_text = generate_text(session, mfeed, np.array(feed).reshape(1, 1), text_length)
                    generated_text = ' '.join([id_to_word[text[0, 0]] for text in generated_text])
                    # A single formatted string prints the same way whether
                    # `print` is a statement or a function
                    print("Sample text generation: %s" % generated_text)
                finally:
                    sys.stdout = orig_stdout

        # Finally, compute test perplexity
        log_file_path = 'log_file_test_perplexity.txt'
        with open(os.path.join(save_path, log_file_path), 'w') as f:
            sys.stdout = f
            try:
                test_perplexity = run_epoch(session, mtest)
                print("Test Perplexity: %.3f" % test_perplexity)

                print("Saving model to %s." % save_path)
                # NOTE(review): `save_path` is the log directory itself, so it is
                # used here as the checkpoint file prefix - confirm that this is
                # the intended checkpoint location.
                sv.saver.save(session, save_path, global_step=sv.global_step)
            finally:
                sys.stdout = orig_stdout

# Restore default output
sys.stdout = orig_stdout

In [57]:
# Print out final perplexities
with sv.managed_session() as session:
    print 'Train perplexity:', train_perplexity
    print 'Valid perplexity:', valid_perplexity
    print 'Test perplexity:', test_perplexity

Train perplexity: 126.468876915
Valid perplexity: 122.219147167
Test perplexity: 117.980653469


In [58]:
# Print out final text generation
with sv.managed_session() as session:
    generated_text = generate_text(session, mtest, np.array(feed).reshape(1, 1), text_length)
    generated_text = ' '.join([id_to_word[text[0, 0]] for text in generated_text])
    
    print generated_text

it was persuaded by the democratic deal that alliance that get that mr. barre issued an offer to help N <eos> the company boosted two corporate demand and the magazine <eos> complex companies <unk> proper members of mlx such concerns <eos> britain 's office likely to buy in a promotion against age N to ensure about profits in the u.s. unless the <unk> 's mr. <unk> could give the sale of <unk> hours <unk> throwing barriers after technical transactions <eos> mr. <unk> publisher & nikko corp. has cleared intel which has stalled with the offer <eos> mr. i. conceded its stake is a key version of sales that deals a <unk> product at the company <eos> without my shift by being <unk> the company could bring the authority this month to set about of the buy-out newspaper 's pilot toy director that mr. abramson sold <unk> the departures wave account well during apple 's elimination of the nation 's <unk> newspaper <eos> ge 's debt benefits appears to help <unk> the afternoon rehabilitation industr