In [107]:
import pandas as pd
import numpy as np
import collections
import os
import sys
import inspect
import time
from datetime import datetime
import random
import re
import tensorflow as tf

Adapted from https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb

## Read data

In [28]:
# Load the tokenised corpus (one row per token).
# `keep_default_na=False` keeps every token as a literal string: with the pandas
# defaults, words such as "null" or "nan" are silently parsed as float NaN, which
# would put a non-string key into the vocabulary built from the `word` column.
data = pd.read_csv('works_words.csv', keep_default_na=False)
data.head()

Unnamed: 0,gutenberg_id,title,space_cnt,rn,n,grp,word
0,105,Persuasion,112,1,1002,train,sir
1,105,Persuasion,112,1,1002,train,walter
2,105,Persuasion,112,1,1002,train,elliot
3,105,Persuasion,112,1,1002,train,_comma_
4,105,Persuasion,112,1,1002,train,of


In [36]:
# Build the vocabulary: the most frequent word gets ID 0, ties are broken alphabetically
counter = collections.Counter(data['word'])
count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

# Split the (word, count) pairs into two parallel tuples; only the words are needed
words, _ = tuple(zip(*count_pairs))

# Forward and reverse lookup tables
word_to_id = {word: idx for idx, word in enumerate(words)}
id_to_word = {idx: word for idx, word in enumerate(words)}

vocab_size = len(word_to_id)
vocab_size

38115

In [37]:
# Encode each split (selected by the `grp` column) as a list of word IDs
train_data, valid_data, test_data = [
    [word_to_id[token] for token in data.loc[data['grp'] == split_name, 'word']]
    for split_name in ('train', 'valid', 'test')
]

In [38]:
# Number of tokens in the (train, valid, test) splits
tuple(map(len, (train_data, valid_data, test_data)))

(1842988, 206542, 218819)

In [39]:
# Show the first 50 tokens of every split.
# Single-argument print(...) calls produce the same output under Python 2 and
# Python 3, matching the print(...) style used by the training cells below.
preview_sections = (
    ('The beginning of training data:', train_data),
    ('\nThe beginning of valid data:', valid_data),
    ('\nThe beginning of test data:', test_data),
)

for header, split_ids in preview_sections:
    print(header)
    print(' '.join([id_to_word[e] for e in split_ids[:50]]))

The beginning of training data:
sir walter elliot _comma_ of kellynch hall _comma_ in somersetshire _comma_ was a man who _comma_ for his own amusement _comma_ never took up any book but the baronetage _semicolon_ there he found occupation for an idle hour _comma_ and consolation in a distressed one _semicolon_ there his faculties were

The beginning of valid data:
_quote_ indeed _comma_ my dear mrs smith _comma_ i want none _comma_ _quote_ cried anne _period_ _quote_ you have asserted nothing contradictory to what mr elliot appeared to be some years ago _period_ this is all in confirmation _comma_ rather _comma_ of what we used to hear and believe _period_

The beginning of test data:
the interruption had been short _comma_ though severe _comma_ and ease and animation returned to most of those they left as the door shut them out _comma_ but not to anne _period_ she could think only of the invitation she had with such astonishment witnessed _comma_ and of the manner


## Define model configurations

In [134]:
class config(object):
    """Hyperparameters read by the batching code, the model and the training loop."""

    # --- Data / batching ---
    vocab_size = vocab_size  # taken from the notebook-level `vocab_size` computed above
    batch_size = 20  # sequences processed in parallel
    num_steps = 35  # sequence length, i.e. how many time steps the LSTM is unrolled

    # --- Network ---
    hidden_size = 300  # LSTM hidden units; the embedding uses the same width
    num_layers = 2  # stacked LSTM layers
    keep_prob = 0.5  # probability of keeping a unit (1 - dropout rate)
    init_scale = 0.05  # weights are initialised uniformly in [-init_scale, init_scale]

    # --- Optimisation ---
    learning_rate = 1.0  # initial learning rate
    max_epoch = 4  # epochs trained at the initial learning rate
    max_max_epoch = 20  # total number of training epochs
    lr_decay = 0.5  # per-epoch learning-rate multiplier once `max_epoch` is passed
    max_grad_norm = 5  # global-norm clipping threshold (guards against exploding gradients)

## Generate batches

In [41]:
def batch_producer(raw_data, batch_size, num_steps, name=None):
    """
    Turn a flat sequence of word IDs into a stream of (input, target) windows.

    The sequence is folded into `batch_size` rows; every dequeue then yields the
    next `num_steps`-wide window of those rows together with the same window
    moved one position further along the row (the next-word targets).

    Args:
        raw_data: one of train_data, valid_data, and test_data.
        batch_size: int, the batch size.
        num_steps: int, the number of unrolls.
        name: the name of this operation (optional).
    Returns:
        A pair of Tensors, each shaped [batch_size, num_steps]. The second one
        covers the same rows, offset by one position.
    Raises:
        tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
    """

    with tf.name_scope(name, "BatchProducer", [raw_data, batch_size, num_steps]):
        token_ids = tf.convert_to_tensor(
            raw_data, name="raw_data", dtype=tf.int32)

        # Fold the stream into `batch_size` rows; tokens that do not fill a row are dropped.
        row_len = tf.size(token_ids) // batch_size
        rows = tf.reshape(token_ids[0:batch_size * row_len],
                          [batch_size, row_len])

        # "- 1": the target window is offset by one, so one position per row is held back.
        windows_per_epoch = (row_len - 1) // num_steps
        positive_check = tf.assert_positive(
            windows_per_epoch,
            message="epoch_size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([positive_check]):
            windows_per_epoch = tf.identity(windows_per_epoch, name="epoch_size")

        # Index of the current window, produced in order (shuffle=False).
        window = tf.train.range_input_producer(windows_per_epoch, shuffle=False).dequeue()
        first_col = window * num_steps
        last_col = (window + 1) * num_steps

        x = tf.strided_slice(rows, [0, first_col], [batch_size, last_col])
        x.set_shape([batch_size, num_steps])

        y = tf.strided_slice(rows, [0, first_col + 1], [batch_size, last_col + 1])
        y.set_shape([batch_size, num_steps])

        return x, y

The code above is hard to follow without running it in a session, so the next few cells reproduce each step with plain NumPy operations to show exactly what it does:

In [42]:
# Reuse the batching hyperparameters from `config` for the walkthrough below
batch_size, num_steps = config.batch_size, config.num_steps

In [43]:
# Walk through the batching arithmetic with the training split
raw_data = train_data

# Total token count, and the row length once the tokens are folded into `batch_size` rows
data_len, batch_len = len(raw_data), len(raw_data) // batch_size

(data_len, batch_len)

(1842988, 92149)

In [44]:
# Fold the token stream into `batch_size` rows, dropping the leftover tail.
# Note: this rebinds `data`, which held the DataFrame loaded at the top of the notebook.
data = np.reshape(raw_data[:batch_size * batch_len], (batch_size, batch_len))
data.shape

(20, 92149)

In [45]:
# Number of `num_steps`-wide windows per row. One position is held back ("- 1")
# because the target slice is offset by one and must not run past the end of a row.
epoch_size = (batch_len - 1) // num_steps
epoch_size

2632

In [47]:
i = 0

# Build the same slices that batch_producer builds, for window `i`
x_slice = tf.strided_slice(data, [0, i * num_steps], [batch_size, (i + 1) * num_steps])
y_slice = tf.strided_slice(data, [0, i * num_steps + 1], [batch_size, (i + 1) * num_steps + 1])

# Evaluate inside a context-managed session so it is closed as soon as the values
# are fetched. An InteractiveSession that is never closed stays installed as the
# default session and keeps its resources for the rest of the notebook,
# including the training run.
with tf.Session() as sess:
    x, y = sess.run([x_slice, y_slice])

x.shape, y.shape

((20, 35), (20, 35))

In [48]:
# tf.strided_slice is equivalent to slicing in Numpy:
# https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html

# `x` starts at the window boundary, `y` one position later; both are num_steps wide
for sliced, offset in ((x, 0), (y, 1)):
    start = i * num_steps + offset
    assert np.array_equal(sliced, data[0:batch_size:1, start:start + num_steps:1])

In [50]:
# Print the first five rows of `x` as examples.
# Each line is one row (sequence) of the single batch evaluated above.
# The single-argument print(...) form emits the same bytes as the former
# Python 2 print statement and also runs under Python 3.
for i in range(0, 5):
    print('Batch %d %s \n' % (i + 1, ' '.join([id_to_word[wid] for wid in x[i]])))

Batch 1 sir walter elliot _comma_ of kellynch hall _comma_ in somersetshire _comma_ was a man who _comma_ for his own amusement _comma_ never took up any book but the baronetage _semicolon_ there he found occupation for 

Batch 2 being also dancing _comma_ catherine was left to the mercy of mrs _period_ thorpe and mrs _period_ allen _comma_ between whom she now remained _period_ she could not help being vexed at the non _dash_ 

Batch 3 _semicolon_ for i assure you mr _period_ wingfield told me _comma_ that he did not believe he had ever sent us off altogether _comma_ in such good case _period_ i trust _comma_ at least _comma_ 

Batch 4 was wishing to get the better of his attachment to herself _comma_ she just recovering from her mania for mr _period_ elton _period_ it seemed as if every thing united to promise the most interesting 

Batch 5 that knowledge _period_ mrs _period_ jennings left them earlier than usual _semicolon_ for she could not be easy till the middletons and palmers 

In [55]:
# Print the first five rows of `y` (the targets: the same rows as `x`, one token later).
# The single-argument print(...) form emits the same bytes as the former
# Python 2 print statement and also runs under Python 3.
for i in range(0, 5):
    print('Batch %d %s \n' % (i + 1, ' '.join([id_to_word[wid] for wid in y[i]])))

Batch 1 walter elliot _comma_ of kellynch hall _comma_ in somersetshire _comma_ was a man who _comma_ for his own amusement _comma_ never took up any book but the baronetage _semicolon_ there he found occupation for an 

Batch 2 also dancing _comma_ catherine was left to the mercy of mrs _period_ thorpe and mrs _period_ allen _comma_ between whom she now remained _period_ she could not help being vexed at the non _dash_ appearance 

Batch 3 for i assure you mr _period_ wingfield told me _comma_ that he did not believe he had ever sent us off altogether _comma_ in such good case _period_ i trust _comma_ at least _comma_ that 

Batch 4 wishing to get the better of his attachment to herself _comma_ she just recovering from her mania for mr _period_ elton _period_ it seemed as if every thing united to promise the most interesting consequences 

Batch 5 knowledge _period_ mrs _period_ jennings left them earlier than usual _semicolon_ for she could not be easy till the middletons and palmers

Generate the batches and wrap them in a class (for easy access later):

In [92]:
class ModelInput(object):
    """Holds the batching parameters and the input/target tensors for one data split."""

    def __init__(self, config, data, name=None):
        """
        Args:
            config: object providing `batch_size` and `num_steps`.
            data: sequence of word IDs for the split.
            name: optional name forwarded to `batch_producer`.
        """
        batch_size, num_steps = config.batch_size, config.num_steps
        row_len = len(data) // batch_size

        self.batch_size = batch_size
        self.num_steps = num_steps
        # Same formula as inside batch_producer, computed here as a plain Python int
        self.epoch_size = (row_len - 1) // num_steps

        inputs_and_targets = batch_producer(data, batch_size, num_steps, name=name)
        self.input_data, self.targets = inputs_and_targets

## Define model

In [59]:
class Model(object):
    """
    Stacked-LSTM language model: embedding -> `num_layers` LSTM cells unrolled for
    `num_steps` -> softmax projection, with a sequence loss over the targets.

    When built with `is_training=True` it also owns the learning-rate variable and
    the gradient-clipped SGD train op.
    """
    def __init__(self, is_training, config, input_=None):
        """
        Build the graph.

        Args:
            is_training: when true, dropout is applied (if `config.keep_prob < 1`) and the
                learning-rate variable, gradient clipping and train op are created.
            config: object providing `batch_size`, `num_steps`, `hidden_size`, `vocab_size`,
                `keep_prob`, `num_layers` and `max_grad_norm`.
            input_: optional object exposing `input_data` and `targets` (e.g. `ModelInput`).
                When omitted, int32 placeholders shaped [batch_size, num_steps] are created.
        """
        batch_size = config.batch_size
        num_steps = config.num_steps
        hidden_size = config.hidden_size
        vocab_size = config.vocab_size
        
        if input_ is not None:
            # For normal training and validation, input data is pre-defined by class `ModelInput`
            self._input = input_
            self._input_data = input_.input_data
            self._targets = input_.targets
            
        else:
            # For text generation, input data is generated and fed on the fly.
            # Note: `self._input` is not set on this path, so the `input` property
            # raises AttributeError for such a model.
            self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
            self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        def lstm_cell():
            # With the latest TensorFlow source code (as of Mar 27, 2017),
            # the BasicLSTMCell will need a reuse parameter which is unfortunately not
            # defined in TensorFlow 1.0. To maintain backwards compatibility, we add
            # an argument check here:
            # NOTE(review): `inspect.getargspec` is deprecated in Python 3; revisit if
            # this notebook is ever ported.
            
            if 'reuse' in inspect.getargspec(
                    tf.contrib.rnn.BasicLSTMCell.__init__).args:
                return tf.contrib.rnn.BasicLSTMCell(
                    hidden_size,
                    forget_bias=0.0,
                    state_is_tuple=True,
                    reuse=tf.get_variable_scope().reuse)
            else:
                return tf.contrib.rnn.BasicLSTMCell(
                    hidden_size,
                    forget_bias=0.0,
                    state_is_tuple=True)
            
            # Note because we set `state_is_tuple=True`, the states are 2-tuples of the `c_state` and `h_state`
            # `c_state` is the cell state
            # `h_state` is the hidden state
            # See this SO thread: https://stackoverflow.com/questions/41789133/c-state-and-m-state-in-tensorflow-lstm
    
        # Default cell factory: a plain LSTM cell (replaced below when dropout is active)
        attn_cell = lstm_cell

        # Implement dropout on the cell outputs (for training only)
        if is_training and config.keep_prob < 1:

            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=config.keep_prob)

        # Stacking multiple LSTMs
        attn_cells = [attn_cell() for _ in range(config.num_layers)]
        stacked_lstm = tf.contrib.rnn.MultiRNNCell(attn_cells, state_is_tuple=True)
        
        # Initialize states with zeros
        # `_initial_state` holds one entry per layer (`num_layers` in total).
        # Each is a tuple of (`c_state`, `h_state`),
        # and both `c_state` and `h_state` are shaped [batch_size, hidden_size]
        self._initial_state = stacked_lstm.zero_state(batch_size, tf.float32)
        
        # The word IDs will be embedded into a dense representation before feeding to the LSTM.
        # This allows the model to efficiently represent the knowledge about particular words.
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [vocab_size, hidden_size], dtype=tf.float32)
            input_embeddings = tf.nn.embedding_lookup(embedding, self.input_data)
            # The shape of `input_embeddings` is [batch_size, num_steps, hidden_size]
        
        # Implement dropout on the embedded inputs (for training only)
        if is_training and config.keep_prob < 1:
            input_embeddings = tf.nn.dropout(input_embeddings, config.keep_prob)

        # Simplified version of models/tutorials/rnn/rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #
        # The alternative version of the code below is:
        #
        # inputs = tf.unstack(inputs, num=num_steps, axis=1)
        # outputs, state = tf.contrib.rnn.static_rnn(
        #     cell, inputs, initial_state=self._initial_state)
        
        # Unroll LSTM loop
        outputs = []
        state = self._initial_state
        
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                # Variables are created on the first step and reused on every later step
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                
                (cell_output, state) = stacked_lstm(input_embeddings[:, time_step, :], state)
                outputs.append(cell_output)
                # `outputs` is a list of `num_steps` tensors, each shaped [batch_size, hidden_size]
        
        # Resize the output into a [batch_size * num_steps, hidden_size] matrix.
        # Note axis=1 in `tf.stack` below: the time steps are stacked along axis 1 so that, after the
        # reshape, rows keep each sequence's words in their original order and line up with `targets`
        # when the loss is computed later.
        output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, hidden_size])
        
        # Compute logits
        softmax_w = tf.get_variable(
            "softmax_w", [hidden_size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable(
            "softmax_b", [vocab_size], dtype=tf.float32)
        
        logits = tf.matmul(output, softmax_w) + softmax_b
        # The shape of `logits` =
        # [batch_size * num_steps, hidden_size] x [hidden_size, vocab_size] + [vocab_size] =
        # [batch_size * num_steps, vocab_size]
        
        # Draw one word ID per row of `logits` (used for text generation)
        self._logits_sample = tf.multinomial(logits, 1)
        
        # Reshape logits to be 3-D tensor for sequence loss
        logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])

        # Use the contrib sequence loss, averaged across the batch but not across time steps
        # Source code: https://github.com/tensorflow/tensorflow/blob/r1.2/tensorflow/contrib/seq2seq/python/ops/loss.py#L30
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,  # shape: [batch_size, num_steps, vocab_size]
            self._targets,  # shape: [batch_size, num_steps]
            tf.ones([batch_size, num_steps], dtype=tf.float32),  # weights (all set to 1 here)
            average_across_timesteps=False,
            average_across_batch=True)

        # Total cost is the sum of the per-time-step losses
        self._cost = cost = tf.reduce_sum(loss)
        self._final_state = state

        # Evaluation-only models stop here: `_lr`, `_train_op`, `_new_lr` and
        # `_lr_update` exist only on models built with is_training=True.
        if not is_training:
            return

        # Optimizer
        # The learning rate is a non-trainable variable so it can be changed between epochs
        self._lr = tf.Variable(0.0, trainable=False)
        
        # Clip gradients by their global norm before applying them
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars), config.max_grad_norm)
        
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        # Placeholder + assign op used by `assign_lr` to set a new learning rate
        self._new_lr = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
        
        
    # Update learning rate (only valid on a model built with is_training=True)
    def assign_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
    
    # Read-only accessors used by the training, evaluation and text-generation code
    @property
    def input(self):
        # Only set when the model was built with `input_`
        return self._input
    
    @property
    def input_data(self):
        return self._input_data
    
    @property
    def targets(self):
        return self._targets

    @property
    def initial_state(self):
        return self._initial_state

    @property
    def cost(self):
        return self._cost

    @property
    def final_state(self):
        return self._final_state

    @property
    def lr(self):
        # Only set when the model was built with is_training=True
        return self._lr

    @property
    def train_op(self):
        # Only set when the model was built with is_training=True
        return self._train_op
    
    @property
    def logits_sample(self):
        return self._logits_sample

### Model running

In [61]:
def run_epoch(session, model, eval_op=None, verbose=False):
    """
    Run the model once over its input and return the perplexity.

    The LSTM state produced by each step is fed back in as the initial state of
    the next step, so state is carried across the whole epoch.

    Args:
        session: session used to run the graph.
        model: a `Model` built with an input object (its `input` attribute must
            provide `epoch_size`, `num_steps` and `batch_size`).
        eval_op: optional extra op to run at every step (e.g. the train op).
        verbose: when true, print progress, perplexity and speed roughly ten
            times per epoch.
    Returns:
        exp(total cost / number of time steps processed).
    Raises:
        ValueError: if the model's input has no steps (`epoch_size` <= 0).
    """
    epoch_size = model.input.epoch_size
    if epoch_size <= 0:
        # Without this guard the final division fails with an opaque ZeroDivisionError.
        raise ValueError(
            "epoch_size must be positive (got %r); decrease batch_size or num_steps"
            % (epoch_size,))

    start_time = time.time()
    costs = 0.0
    iters = 0
    state = session.run(model.initial_state)
    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }

    if eval_op is not None:
        fetches["eval_op"] = eval_op

    # Log about ten times per epoch. `max(..., 1)` avoids a modulo by zero when the
    # epoch has fewer than 10 steps. The offset equals 10 for long epochs (the
    # original behaviour) and wraps into range for short ones, which previously
    # never logged at all.
    log_every = max(epoch_size // 10, 1)
    log_offset = 10 % log_every

    # Recall that epoch_size = (batch_len - 1) // num_steps.
    for step in range(epoch_size):
        feed_dict = {}

        # `initial_state` has one (`c_state`, `h_state`) pair per LSTM layer
        for layer, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[layer].c
            feed_dict[h] = state[layer].h

        vals = session.run(fetches, feed_dict)

        # The final state of this step becomes the initial state of the next one
        cost = vals["cost"]
        state = vals["final_state"]

        # Accumulate cost and the number of time steps seen so far
        costs += cost
        iters += model.input.num_steps

        if verbose and step % log_every == log_offset:
            # Guard against a zero reading from a coarse clock
            elapsed = max(time.time() - start_time, 1e-9)
            print("%.3f (raw step: %.0f) perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size,
                   step,
                   np.exp(costs / iters),
                   iters * model.input.batch_size / elapsed))

    return np.exp(costs / iters)

### Text generation

In [71]:
# Seed words for text generation (pronouns and titles), stored as word IDs
feeds = [word_to_id[w] for w in ('he', 'she', 'it', 'mr', 'mrs', 'miss')]

In [72]:
# Number of words to sample after the seed word
# (generate_text returns text_length + 1 entries, including the seed)
text_length = 500

In [73]:
def generate_text(session, model, feed, text_length):
    """
    Sample `text_length` words from the model, starting from `feed`.

    Mirrors `run_epoch`: the state returned by each step is fed back as the
    initial state of the next step, and the sampled word becomes the next input.

    Args:
        session: session used to run the graph.
        model: model exposing `initial_state`, `final_state`, `logits_sample`
            and an `input_data` tensor that can be fed.
        feed: the seed value fed to `model.input_data` on the first step.
        text_length: number of sampling steps.
    Returns:
        A list holding the seed followed by one sampled value per step.
    """
    fetches = {
        "final_state": model.final_state,
        "logit_sample": model.logits_sample
    }

    state = session.run(model.initial_state)
    generated_text = [feed]

    for _ in range(text_length):
        feed_dict = {model.input_data: feed}

        # One (`c_state`, `h_state`) pair per LSTM layer
        for layer, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[layer].c
            feed_dict[h] = state[layer].h

        step_result = session.run(fetches, feed_dict)

        # Carry the state forward and use the sampled word as the next input
        state = step_result["final_state"]
        feed = step_result["logit_sample"]

        generated_text.append(feed)

    return generated_text

In [124]:
def convert_punctuations(t):
    """
    Replace the corpus' punctuation placeholder tokens (e.g. `_comma_`) in `t`
    with the punctuation marks they stand for and return the resulting string.

    Where a pattern includes a neighbouring space, that space is removed together
    with the token so the mark attaches to the adjacent word.
    """
    # (pattern, replacement) pairs, applied in this order
    substitutions = (
        (" _comma_", ","),
        (" _period_", "."),
        ("_quote_", "\""),
        ("_dash_", "-"),
        (" _semicolon_", ";"),
        (" _exclamation_", "!"),
        (" _question_", "?"),
        (" _colon_", ":"),
        ("_leftparenthesis_ ", "("),
        (" _rightparenthesis_", ")"),
        ("_leftbracket_ ", "["),
        (" _rightbracket_", "]"),
    )

    for pattern, replacement in substitutions:
        t = re.sub(pattern, replacement, t)

    return t

## Run all the things!

In [136]:
# One output directory per run, named after the minute the run started
run_timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M')
save_path = 'model_output' + '_' + run_timestamp

if not os.path.exists(save_path):
    os.makedirs(save_path)

In [137]:
# One configuration for training/validation, one for testing and text generation
train_valid_config, eval_config = config(), config()

# Evaluation and generation process a single word at a time. These assignments
# set instance attributes, so the class-level defaults on `config` are unchanged.
eval_config.batch_size = eval_config.num_steps = 1

In [138]:
# Collects every generated sample: the training loop appends one entry
# per seed word per epoch
generated_texts = []

In [None]:
# Train, validate, generate sample text after every epoch, then test.
# All printed output is redirected to log files inside `save_path`.
# First, save the default output
orig_stdout = sys.stdout


class _StdoutToFile(object):
    """Context manager that sends printed output to a file.

    On exit, including when an exception is raised, the previous `sys.stdout` is
    restored and the file is closed, so a failed run neither swallows later
    notebook output nor leaks the log file handle.
    """

    def __init__(self, path):
        self._path = path
        self._file = None
        self._previous = None

    def __enter__(self):
        self._file = open(self._path, 'w')
        self._previous = sys.stdout
        sys.stdout = self._file
        return self._file

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout = self._previous
        self._file.close()
        return False  # never suppress exceptions


with tf.Graph().as_default():
    # `train_valid_config` is used consistently for the training/validation models
    # (it carries the same values as the `config` class it was created from).
    initializer = tf.random_uniform_initializer(
        -train_valid_config.init_scale, train_valid_config.init_scale)

    with tf.name_scope("Train"):
        train_input = ModelInput(config=train_valid_config, data=train_data, name="TrainInput")
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            m = Model(is_training=True, config=train_valid_config, input_=train_input)
        tf.summary.scalar("Training_Loss", m.cost)
        tf.summary.scalar("Learning_Rate", m.lr)

    with tf.name_scope("Valid"):
        valid_input = ModelInput(config=train_valid_config, data=valid_data, name="ValidInput")
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            mvalid = Model(is_training=False, config=train_valid_config, input_=valid_input)
        tf.summary.scalar("Validation_Loss", mvalid.cost)

    with tf.name_scope("Test"):
        test_input = ModelInput(config=eval_config, data=test_data, name="TestInput")
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            mtest = Model(is_training=False, config=eval_config, input_=test_input)

    # For text generations (fed through placeholders, one word at a time)
    with tf.name_scope("Feed"):
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            mfeed = Model(is_training=False, config=eval_config)

    sv = tf.train.Supervisor(logdir=save_path)
    with sv.managed_session() as session:
        for i in range(train_valid_config.max_max_epoch):
            # Redirect this epoch's output to its own log file
            log_file_path = 'log_file_' + str(i) + '.txt'
            with _StdoutToFile(os.path.join(save_path, log_file_path)):
                # The learning rate stays at its initial value for `max_epoch` epochs,
                # then is multiplied by `lr_decay` once per epoch
                lr_decay = train_valid_config.lr_decay**max(
                    i + 1 - train_valid_config.max_epoch, 0.0)
                m.assign_lr(session, train_valid_config.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))

                # Compute train and valid perplexity
                train_perplexity = run_epoch(session, m, eval_op=m.train_op, verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))

                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

                # Generate text from every seed word
                print("Sample text generation:\n")
                for feed in feeds:
                    generated_text = generate_text(
                        session, mfeed, np.array(feed).reshape(1, 1), text_length)
                    generated_text = ' '.join([id_to_word[text[0, 0]] for text in generated_text])
                    generated_text = convert_punctuations(generated_text)
                    # A single formatted string: under Python 2, print(a, b) would
                    # write the repr of a tuple instead of the text itself.
                    print("%s \n" % generated_text)

                    generated_texts.append(generated_text)

        # Finally, compute test perplexity
        log_file_path = 'log_file_test_perplexity.txt'
        with _StdoutToFile(os.path.join(save_path, log_file_path)):
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)

            print("Saving model to %s." % save_path)
            # NOTE(review): `save_path` is passed as the checkpoint name prefix, so the
            # checkpoint files are presumably written next to, not inside, the log
            # directory. Confirm this is intended.
            sv.saver.save(session, save_path, global_step=sv.global_step)

# Restore default output (already restored by _StdoutToFile; kept as a safeguard)
sys.stdout = orig_stdout