<p><strong>Based on <a href="https://github.com/jiqizhixin/ML-Tutorial-Experiment/blob/master/Experiments/LSTM_PTB.ipynb">HoratioJSY</a></strong></p>

In [1]:
import tensorflow as tf
import os
import sys
import collections
import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
def _read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline is replaced by the end-of-sentence marker ``<eos>``.
    The marker is padded with spaces so that it always becomes a token of
    its own; without the padding, a file whose lines are not surrounded by
    spaces would fuse the last word of a line, the marker and the first
    word of the next line into a single token. For files that already pad
    each line with spaces the result is unchanged, because ``split()``
    collapses runs of whitespace.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of string tokens in file order.
    """
    with tf.gfile.GFile(filename, 'r') as f:
        return f.read().replace('\n', ' <eos> ').split()
    
def _build_vocab(filename):
    """Build a word -> integer id mapping from the tokens of a file.

    Ids are assigned in order of descending frequency, with ties broken
    alphabetically, so the most frequent token receives id 0.

    Args:
        filename: path of the text file used to build the vocabulary.

    Returns:
        A dict mapping each distinct token to its id. An empty dict is
        returned when the file contains no tokens.
    """
    counter = collections.Counter(_read_words(filename))
    # Sort key (-count, word): most frequent first, alphabetical among ties.
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

    # enumerate() yields the same ids as zipping the words with a range, but
    # does not fail on an empty file (unpacking zip(*[]) raises ValueError).
    return {word: idx for idx, (word, _count) in enumerate(count_pairs)}

def _file_to_word_ids(filename,word_to_id):
    """Convert the tokens of a file into their vocabulary ids.

    Tokens that are not keys of `word_to_id` are skipped, so the result can
    be shorter than the number of tokens in the file.

    Args:
        filename: path of the text file to convert.
        word_to_id: mapping from token to integer id.

    Returns:
        A list with the id of every known token, in file order.
    """
    ids = []
    for token in _read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids

In [3]:
def ptb_raw_data(data_path):
    """Load the train / validation / test splits as lists of word ids.

    The vocabulary is built from the training file only and then applied to
    all three files.

    Args:
        data_path: directory containing ptb.train.txt, ptb.valid.txt and
            ptb.test.txt.

    Returns:
        A tuple (train_data, valid_data, test_data, word_len) where the
        first three items are lists of ids and word_len is the number of
        entries in the vocabulary.
    """
    split_files = ('ptb.train.txt', 'ptb.valid.txt', 'ptb.test.txt')
    split_paths = [os.path.join(data_path, file_name) for file_name in split_files]

    # The first path is the training file; it alone defines the vocabulary.
    vocab = _build_vocab(split_paths[0])
    train_ids, valid_ids, test_ids = [
        _file_to_word_ids(path, vocab) for path in split_paths
    ]
    return train_ids, valid_ids, test_ids, len(vocab)

def ptb_producer(raw_data,batch_size,num_steps,name=None):
    """Create graph ops that walk over `raw_data` in (input, target) batches.

    The data is cut into `batch_size` rows of equal length (any remainder
    at the end is dropped) and each evaluation of the returned tensors
    yields the next window of `num_steps` columns. The batch index comes
    from a non-shuffled range queue.

    Args:
        raw_data: sequence of integer word ids.
        batch_size: number of rows in each batch.
        num_steps: number of consecutive ids per row in each batch.
        name: optional name scope for the created ops.

    Returns:
        A pair (x, y) of int32 tensors with static shape
        [batch_size, num_steps]; y holds the same window as x shifted one
        position forward in the data.
    """
    with tf.name_scope(name,'PTBProducer',[raw_data,batch_size,num_steps]):
        token_ids = tf.convert_to_tensor(raw_data, name='raw_data', dtype=tf.int32)
        row_len = tf.size(token_ids) // batch_size
        grid = tf.reshape(token_ids[0:batch_size * row_len],
                          [batch_size, row_len])

        # One column is reserved because targets are shifted by one position.
        batches_per_epoch = (row_len - 1) // num_steps
        positive_check = tf.assert_positive(
            batches_per_epoch,
            message='epoch_size==0,decrease batch_size or num_steps')
        with tf.control_dependencies([positive_check]):
            batches_per_epoch = tf.identity(batches_per_epoch, name='epoch_size')

        batch_index = tf.train.range_input_producer(
            batches_per_epoch, shuffle=False).dequeue()
        first_col = batch_index * num_steps
        last_col = (batch_index + 1) * num_steps

        x = tf.strided_slice(grid, [0, first_col], [batch_size, last_col])
        x.set_shape([batch_size, num_steps])
        y = tf.strided_slice(grid, [0, first_col + 1], [batch_size, last_col + 1])
        y.set_shape([batch_size, num_steps])
        return x, y

In [4]:
# Directory that holds ptb.train.txt, ptb.valid.txt and ptb.test.txt.
data_path = './PTB'

# Number of hidden units per LSTM cell (also used as the embedding width)
# and number of stacked LSTM layers.
hidden_size = 200
num_layers = 2
# Vocabulary size: number of rows in the embedding matrix and width of the
# output logits.
vocab_size = 10000

# Step size passed to the gradient descent optimizer.
learning_rate = 1.0
train_batch_size = 16
# Truncation length: number of time steps the LSTM is unrolled during training.
train_num_step = 32

# Evaluation feeds one token at a time.
eval_batch_size = 1
eval_num_step = 1
# Number of passes over the training data.
num_epoch = 3
# Probability of KEEPING a unit in the dropout layers (not the drop rate).
keep_prob = 0.5

# Upper bound on the global gradient norm, used to control exploding gradients.
max_grad_norm = 5

In [5]:
class PTBModel(object):
    """Multi-layer LSTM language model unrolled for a fixed number of steps.

    The graph is built in the constructor. Hyperparameters that are not
    constructor arguments (hidden_size, num_layers, vocab_size, keep_prob,
    max_grad_norm, learning_rate) are read from module-level globals.

    Attributes:
        batch_size: number of sequences per batch.
        num_steps: number of time steps the LSTM is unrolled.
        input_data: int32 placeholder [batch_size, num_steps] of input ids.
        targets: int32 placeholder [batch_size, num_steps] of target ids.
        initial_state: zero state of the stacked cell; can be fed to carry
            state across batches.
        final_state: state of the stacked cell after the last time step.
        cost: summed per-token loss divided by batch_size.
        train_op: gradient update op; only defined when is_training is True.
    """

    def __init__(self, is_training, batch_size, num_steps):
        """Build the model graph.

        Args:
            is_training: when True, dropout is applied and `train_op` is
                created; when False the graph stops after `final_state`.
            batch_size: number of sequences per batch.
            num_steps: number of time steps to unroll.
        """
        # Batch size and truncation length.
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Input ids and the ids to predict.
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        def make_cell():
            # One LSTM cell, wrapped with output dropout while training.
            single_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            if is_training:
                single_cell = tf.nn.rnn_cell.DropoutWrapper(
                    single_cell, output_keep_prob=keep_prob)
            return single_cell

        # Every layer needs its own cell instance. Repeating one instance
        # ([cell] * num_layers) makes all layers use the same cell object,
        # and therefore the same weights.
        cell = tf.nn.rnn_cell.MultiRNNCell(
            [make_cell() for _ in range(num_layers)])

        # Initialize the recurrent state to zeros.
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # Embedding matrix: one row of width hidden_size per vocabulary id.
        embedding = tf.get_variable('embedding', [vocab_size, hidden_size])
        # Look up the embedding of every input id.
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # Dropout on the embedded inputs, only while training.
        if is_training: inputs = tf.nn.dropout(inputs, keep_prob)

        # Outputs of the top LSTM layer, one entry per time step.
        outputs = []
        # Recurrent state, threaded through the unrolled steps.
        state = self.initial_state
        with tf.variable_scope('RNN'):
            for time_step in range(num_steps):
                # The cell variables are created at step 0 and reused afterwards.
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                # Feed the current input slice and the previous state to the cell.
                cell_output, state = cell(inputs[:, time_step, :], state)

                outputs.append(cell_output)

        # Concatenate to [batch, hidden * num_steps], then flatten to
        # [batch * num_steps, hidden].
        output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size])

        # Fully connected layer producing vocab_size logits for every position.
        weight = tf.get_variable('weight', [hidden_size, vocab_size])
        bias = tf.get_variable('bias', [vocab_size])
        logits = tf.matmul(output, weight) + bias

        # Per-position loss of the logits against the flattened targets,
        # with every position weighted equally (weight 1).
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])

        # Total loss of the batch divided by the number of sequences.
        self.cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state

        # Backpropagation is only needed while training.
        if not is_training: return
        trainable_variable = tf.trainable_variables()

        # Clip the global gradient norm to control exploding gradients.
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, trainable_variable), max_grad_norm)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        # Training step.
        self.train_op = optimizer.apply_gradients(zip(grads, trainable_variable))

In [6]:
def run_epoch(session, model, data, train_op, output_log, epoch_size):
    """Run `epoch_size` batches through `model` and return the perplexity.

    The recurrent state returned by one batch is fed as the initial state
    of the next one.

    Args:
        session: object whose run() evaluates fetches, optionally with a feed.
        model: object exposing cost, initial_state, final_state, input_data,
            targets and num_steps.
        data: fetch that yields an (inputs, targets) pair when run.
        train_op: extra op evaluated with every batch.
        output_log: when true, print the running perplexity every 100 steps.
        epoch_size: number of batches to process.

    Returns:
        exp(accumulated cost / accumulated number of time steps).
    """
    fetches = [model.cost, model.final_state, train_op]
    cost_sum = 0.0
    steps_seen = 0
    current_state = session.run(model.initial_state)

    for step in range(epoch_size):
        batch_inputs, batch_targets = session.run(data)
        feed = {
            model.input_data: batch_inputs,
            model.targets: batch_targets,
            model.initial_state: current_state,
        }
        batch_cost, current_state, _ = session.run(fetches, feed)

        cost_sum += batch_cost
        steps_seen += model.num_steps

        # Progress is reported only when the caller asks for it.
        if output_log and step % 100 == 0:
            running_perplexity = np.exp(cost_sum / steps_seen)
            print("After %d steps, perplexity is %.3f" % (step, running_perplexity))

    return np.exp(cost_sum / steps_seen)

In [7]:
def _epoch_size(data, batch_size, num_steps):
    """Return how many batches one pass over `data` yields.

    Uses the same arithmetic as ptb_producer: the data is split into
    batch_size rows, and one column is reserved for the shifted targets.
    """
    batch_len = len(data) // batch_size
    return (batch_len - 1) // num_steps


def main():
    """Train the language model and report validation and test perplexity.

    Builds a training model and a weight-sharing evaluation model, trains
    for `num_epoch` epochs (evaluating on the validation split after each
    one) and finally evaluates on the test split. All settings are read
    from the module-level configuration values.
    """
    train_data, valid_data, test_data, _vocab_len = ptb_raw_data(data_path)

    # Number of batches in one pass over each split.
    train_epoch_size = _epoch_size(train_data, train_batch_size, train_num_step)
    valid_epoch_size = _epoch_size(valid_data, eval_batch_size, eval_num_step)
    test_epoch_size = _epoch_size(test_data, eval_batch_size, eval_num_step)

    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, train_batch_size, train_num_step)

    # Same variable scope with reuse=True: the evaluation model shares the
    # variables created by the training model.
    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, eval_batch_size, eval_num_step)

    # Build every graph op before the session starts, so that nothing is
    # added to the graph while it is being executed.
    train_queue = ptb_producer(train_data, train_model.batch_size, train_model.num_steps)
    eval_queue = ptb_producer(valid_data, eval_model.batch_size, eval_model.num_steps)
    test_queue = ptb_producer(test_data, eval_model.batch_size, eval_model.num_steps)
    # A single no-op stands in for the train op during evaluation; creating
    # a new one per epoch would add a node to the graph every iteration.
    eval_op = tf.no_op()

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=session, coord=coord)
        try:
            for i in range(num_epoch):
                print("In iteration: %d" % (i + 1))
                run_epoch(session, train_model, train_queue, train_model.train_op, True, train_epoch_size)

                valid_perplexity = run_epoch(session, eval_model, eval_queue, eval_op, False, valid_epoch_size)
                print("Epoch: %d Validation Perplexity: %.3f" % (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, eval_model, test_queue, eval_op, False, test_epoch_size)
            print("Test Perplexity: %.3f" % test_perplexity)
        finally:
            # Stop the queue-runner threads even when training fails.
            coord.request_stop()
            coord.join(threads)

In [None]:
# Entry point: run training, validation and testing when this cell (or the
# exported script) is executed directly rather than imported.
if __name__ == "__main__":
    main()

In iteration: 1
After 0 steps, perplexity is 10040.366
After 100 steps, perplexity is 1415.210
After 200 steps, perplexity is 1019.100
After 300 steps, perplexity is 886.449
After 400 steps, perplexity is 775.639
After 500 steps, perplexity is 690.577
After 600 steps, perplexity is 632.798
After 700 steps, perplexity is 590.683
After 800 steps, perplexity is 557.514
After 900 steps, perplexity is 529.996
After 1000 steps, perplexity is 505.546
After 1100 steps, perplexity is 485.353
After 1200 steps, perplexity is 467.655
After 1300 steps, perplexity is 453.902
After 1400 steps, perplexity is 441.095
After 1500 steps, perplexity is 430.351
After 1600 steps, perplexity is 416.922
After 1700 steps, perplexity is 407.194
After 1800 steps, perplexity is 399.936
Epoch: 1 Validation Perplexity: 247.690
In iteration: 2
After 0 steps, perplexity is 434.120
After 100 steps, perplexity is 288.719
After 200 steps, perplexity is 269.692
After 300 steps, perplexity is 276.311
After 400 steps, perpl