In [1]:
import time
import numpy as np
import tensorflow as tf

In [2]:
print(tf.__version__)

1.10.0


In [3]:
# step 1: data preprocessing
# step 2: build model
# step 3: train model
# step 4: apply model - generate text

## Step 1: Data Preprocessing

In [4]:
# load data

with open('/Users/pliu/Downloads/anna.txt', 'r') as f:
    text = f.read()

# get vocabulary
vocab = set(text)

# vocab-int mapping dict
vocab_to_int = {c:i for i, c in enumerate(vocab)}

# int-vocab mapping dict
int_to_vocab = dict(enumerate(vocab))

# encode text
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

In [5]:
text[:20]

'Chapter 1\n\n\nHappy fa'

In [6]:
len(vocab)

83

In [7]:
# split dataset via mini batch
# each batch is a N*M array, batch_size = n_seqs*n_steps
# n_seqs with len of N, how many samples in each batch
# n_steps iwth len of M, n_steps, 
def get_batches(arr, n_seqs, n_steps):
    """
    split array (original data) into batches (mini-batch)
    args:
        arr: the array to split
        n_seqs: num of sequence in a batch
        n_steps: length of each sequence
    """
    batch_size = n_seqs * n_steps
    n_batches = int(len(arr) / batch_size)
    
    arr = arr[:batch_size * n_batches]
    
    arr = arr.reshape((n_seqs,-1))
    
    # generator
    for n in range(0, arr.shape[1], n_steps):
        # inputs
        x = arr[:, n:n+n_steps]
        # targets
        y = np.zeros_like(x)
        y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
        yield x, y

In [8]:
batches = get_batches(encoded, n_seqs=10, n_steps=50)
x, y = next(batches)

In [9]:
print('x\n', x[:10,:10])
print('\ny\n', y[:10,:10])

x
 [[57 22 60 51  1 68 64 66 18 48]
 [66 60 14 66 24 80  1 66 21 80]
 [67 26 24 30 48 48 39 40 68 50]
 [24 66 73 12 64 26 24 21 66 22]
 [66 26  1 66 26 50 65 66 50 26]
 [66  5  1 66 20 60 50 48 80 24]
 [22 68 24 66 62 80 14 68 66 56]
 [43 66 15 12  1 66 24 80 20 66]
 [ 1 66 26 50 24 59  1 30 66 27]
 [66 50 60 26 73 66  1 80 66 22]]

y
 [[22 60 51  1 68 64 66 18 48 48]
 [60 14 66 24 80  1 66 21 80 26]
 [26 24 30 48 48 39 40 68 50 65]
 [66 73 12 64 26 24 21 66 22 26]
 [26  1 66 26 50 65 66 50 26 64]
 [ 5  1 66 20 60 50 48 80 24 17]
 [68 24 66 62 80 14 68 66 56 80]
 [66 15 12  1 66 24 80 20 66 50]
 [66 26 50 24 59  1 30 66 27 22]
 [50 60 26 73 66  1 80 66 22 68]]


## Step 2: Build Model

### Build Layers: input, lstm, output

In [10]:
# input layer
# size of input layer depends on the batch size
def build_inputs(n_seqs, n_steps):
    """
    build input layer
    args:
        n_seqs: num of sequence in each batch
        n_steps: length of char in each sequence
    """
    inputs = tf.placeholder(tf.int32, shape=(n_seqs, n_steps), name='inputs')
    targets = tf.placeholder(tf.int32, shape=(n_seqs, n_steps), name='targets')
    
    # keep probability, for dropout
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    return inputs, targets, keep_prob

In [11]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    """
    build lstm layer
    args:
        lstm_size: number of nodes in hidden layer
        num_layers: number of lstm layers
        batch_size: n_seq * n_steps
        keep_prob: parameter for dropout
    """
    # build a lstm cell
    def get_lstm_cell():
        lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
        # add dropout regularization
        return tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=keep_prob)  
    
    # add mutiple lstm cell
    cell = tf.nn.rnn_cell.MultiRNNCell([get_lstm_cell() for _ in range(num_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    return cell, initial_state

In [12]:
def build_out(lstm_output, in_size, out_size):
    """
    build output layer
    args:
        lstm_output: output of lstm layer, a 3D array
        in_size: size of reshaped lstm
        out_size: size of softmax
    return:
        logits
        probability distribution after softmax
    """
    # reshape lstm output
    # concat output of lstm by column, 2D => 1D
    # [[1,2,3], [4,5,6]] => [1,2,3,4,5,6]
    seq_output = tf.concat(1, lstm_output)
    # reshape
    x = tf.reshape(seq_output, [-1, in_size])
    
    # connect lstm to softmax layer
    with tf.variable_score('softmax'):
        softmax_w = tf.Variable(tf.truncated_normal([in_size, out_size],stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(out_size))
        
    # calculate logits
    logits = tf.matmul(x, softmax_w) + softmax_b
    
    # get probability distribution from softmax layer
    out = tf.nn.softmax(logits, name='predictions')
    
    return out, logits   

In [13]:
def build_loss(logits, targets, lsmt_size, num_classes):
    """
    calculate training loss based on logits and targets
    args:
        logits: output of fully-connected layer, without softmax
        targets
        lsmt_size: number of nodes in lsmt layer
        num_classes: vocab_size, number of classes
    return:
        loss, softmax reduce entropy with logits
    """
    # one-hot encoding to targets
    y_one_hot = tf.one_hot(targets, num_classes)
    y_reshaped = tf.reshape(y_one_hot, logits.get_shape())
    
    # soft corss entropy between logits and labels
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    loss = tr.reduce_mean(loss)
    
    return loss

In [14]:
# optimizer
# RNN#s problems: gradients exploding and gradients disappering
# LSTM solves the problem of gradient disapppering
# use gradient clipping to deal with gradient exploding
# gradient clipping limits value of a gradient to a threshold

def build_optimizer(loss, learning_rate, grad_clip):
    """
    build optimizer 
    loss: loss
    learning_rate: learning rate
    """
    # use gradient clipping
    t_vars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, t_vars), grad_clip)
    # Adam optimizer
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer

In [15]:
class CharRNN:
    
    def __init__(self, num_classes, batch_size=64, num_steps=50,
                 lstm_size=128, num_layers=2, learning_rate=0.001,
                 grad_clips=5, sampling=False):
        # if sampling ture, use SGD
        if sampling:
            batch_size, num_steps = 1, 1
        else:
            batch_size, num_steps = batch_size, num_steps
        
        tf.reset_default_graph()
        
        # input layer
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)
        
        # lstm layer
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)
        
        # encode inputs: one hot encoding
        x_one_hot = tf.one_hot(self.inputs, num_classes)
        
        # run RNN, use tf.nn.dynamic_rnn
        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
        self.final_state = state
        
        # predict
        self.prediction, self.logits = build_out(outputs, lstm_size, num_classes)
        
        # loss and optimizer
        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clips)
        
        

## Step 3: Train Model

In [16]:
batch_size = 100
num_steps = 100
lstm_size = 512
num_layers = 2
learning_rate = 0.001
keep_prob = 0.5

In [17]:
epochs = 20
save_every_n = 200 # num of rounds to save the variables
model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers,
                learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    counter = 0
    
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss,
                                                 model.final_state,
                                                 model.optimizer],
                                                 feed_dict = feed)
            
            
            end = time.time()
            #control the print lines
            # for each 100 times
            if counter % 100 == 0:
                print('round: {}/{}...'.format(e+1, epochs),
                      'training steps (batch): {}...'.format(counter),
                      'training error: {:.4f}...'.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)))
                
                if (counter % save_every_n == 0):
                    saver.save(sess, 'checkpoints/i{}_1{}.ckpt'.format(counter, lsmt_size))
                    
    saver.save(sess, 'checkpoints/i{}_l{}.ckpt'.format(counter, lstm_size))
        

ValueError: Tensor conversion requested dtype int32 for Tensor with dtype float32: 'Tensor("rnn/transpose_1:0", shape=(100, 100, 512), dtype=float32)'