In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import urllib.request

import tensorflow as tf
from tensorflow.models.rnn.ptb import reader

In [2]:
file_url = 'https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt'
file_name = 'tinyshakespeare.txt'
if not os.path.exists(file_name):
    urllib.request.urlretrieve(file_url, file_name)
    
with open(file_name,'r') as f:
    raw_data = f.read()
    print("Data length:", len(raw_data))

vocab = set(raw_data)
vocab_size = len(vocab)
idx_to_vocab = dict(enumerate(vocab))
vocab_to_idx = dict(zip(idx_to_vocab.values(), idx_to_vocab.keys()))

data = [vocab_to_idx[c] for c in raw_data]
print('The Vocab Size is: ', vocab_size)
print('The vocab_to_idx  is: ', vocab_to_idx)
print ('The data lenght is: ', len(data))

Data length: 1115394
The Vocab Size is:  65
The vocab_to_idx  is:  {'j': 38, 'a': 0, 'p': 1, 'q': 49, 'X': 2, 'B': 3, 'U': 4, 'f': 6, 'G': 13, 'o': 7, 'u': 8, 'O': 9, '-': 10, 'e': 11, '!': 12, 'l': 14, 'y': 15, 't': 16, 'N': 17, ';': 18, 'm': 19, 'h': 21, '\n': 48, 'S': 23, 'Q': 58, 'C': 25, 'D': 27, ',': 26, 's': 29, 'K': 31, 'A': 32, 'T': 33, 'F': 34, 'Y': 35, 'M': 36, 'n': 20, 'I': 37, '.': 39, 'V': 40, 'Z': 28, '?': 22, '3': 41, 'W': 42, 'c': 43, 'P': 45, 'b': 44, 'J': 46, 'i': 62, 'E': 63, 'x': 47, 'r': 5, 'H': 30, 'g': 50, ':': 51, ' ': 52, 'd': 53, 'L': 54, '$': 55, 'z': 56, 'w': 57, 'R': 59, 'v': 60, "'": 61, 'k': 24, '&': 64}
The data lenght is:  1115394


In [3]:
def reset_graph():  # Reset the graph
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

In [4]:
def dynamic_RNN_model(
    batch_size = 2,
    num_hid_units = 3,
    num_classes = 6,
    num_sequences = 4,
    momentum = 0.9,
    learning_rate = 1e-4):

    vocab_size = num_classes
    
    reset_graph()
    
    x = tf.placeholder(tf.int32, shape = [batch_size, num_sequences], name='input_placeholder')
    y = tf.placeholder(tf.int32, shape = [batch_size, num_sequences], name='output_placeholder')

    # ENBEDDING(Input) LAYER TO HIDDEN LAYER OPERATION
    # Creating an Embedding matrix with a random weight for all vacab to hidden_matrix
    embed_to_hid_wghts = tf.get_variable('embedding_matrix', [vocab_size, num_hid_units])
    # Normally we convert the input index into a one hot matrix and then multiply it to the embedded weights, When we do so, we get the same embed weight corresponding to 1's in the one-hot vector but in a different shape. The below operation does all that in a single shot.
    embed_to_hid_layer = tf.nn.embedding_lookup(embed_to_hid_wghts, x)

    # HIDDEN LAYER OPERATION
    rnn_cell = tf.nn.rnn_cell.LSTMCell(num_hid_units, state_is_tuple=True)
    init_state = rnn_cell.zero_state(batch_size, tf.float32)  # Each sequence will hava a state that it passes to its next sequence
    rnn_outputs, new_state = tf.nn.dynamic_rnn(
                                        cell=rnn_cell,
                                        # sequence_length=X_lengths,
                                        initial_state=init_state,
                                        inputs=embed_to_hid_layer)
    
    # Initialize the weight and biases for the output layer. We use variable scope because we would like to share the weights 
    with tf.variable_scope('output_layer'):
        hid_to_output_wght = tf.get_variable('hid_to_output_wght', 
                                                 [num_hid_units, num_classes], 
                                                 initializer = tf.random_normal_initializer())
        output_bias = tf.get_variable('output_bias',
                                      [num_classes],
                                      initializer = tf.random_normal_initializer())
    

    # OUTPUT LAYER OPERATION
    # The variable rnn_output is a Tensor of shape of [Batch_size x num_sequence x num_hid_units] and,
    # The hid_to_output_wght is in the shape of [num_hid_units x num_classes]
    # And We want an output with shape [Batch_size x num_sequence x num_classes]
    # We horizontlly stack all the batches to form a matrix of [(Batch_size x num_sequence]) x num_classes]
    rnn_outputs = tf.reshape(rnn_outputs, [-1, num_hid_units])  
    hid_to_ouptut_layer = tf.matmul(rnn_outputs, hid_to_output_wght) +  output_bias
    output_state = tf.nn.softmax(hid_to_ouptut_layer, name=None)
 
    
    # CALCULATING LOSS and OPTIMIZING THE COST FUNCTION
    loss_CE = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(hid_to_ouptut_layer, tf.reshape(y, [-1])))
    # sparse_softmax_cross_entropy_with_logits automatically converts the y's into on hot vectors and perform the softmax operation
    # When using softmax_cross_entropy_with_logits, we have to first convert the y's into one-hot vector
    # The sparse_softmax uses dtype as int32 or int64
    optimizer = tf.train.MomentumOptimizer(learning_rate, 
                                            momentum, 
                                            use_locking=False, 
                                            name='Momentum', 
                                            use_nesterov=True).minimize(loss_CE)
    
    # Returns a graph object
    return dict(
        x=x,
        y=y,
        embed_to_hid_wghts = embed_to_hid_wghts,
        embed_to_hid_layer = embed_to_hid_layer,
        init_state = init_state,
        rnn_outputs = rnn_outputs,
        new_state = new_state,
        hid_to_output_wght = hid_to_output_wght,
        hid_to_ouptut_layer = hid_to_ouptut_layer,
        output_bias = output_bias,
        output_state = output_state,
        loss_CE = loss_CE,
        optimizer = optimizer
    )

In [5]:
def train_network(graph_dict):
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        
#         x_new = np.array([[1,4,2,2],[1,4,3,0]])
#         y_new = np.array([[4,2,2,5],[4,3,0,5]])
        
        training_data = np.array([[1,2,3,4], [1,3,4,0], [1,4,2,2], [1,4,3,0]])
        training_labels = np.array([[2,3,4,5], [3,4,0,5], [4,2,2,5], [4,3,0,5]])
        epochs = 10
        for epoch in np.arange(epochs):
            new_hid_layer_state = None
            for i in [2,4]:
    #         print (training_data[0:2,:])
                batch_data = training_data[i-2:i,:]
                batch_labels = training_labels[i-2:i,:]

                batch_size=len(batch_data)
                num_hidden_layer = 3
                num_classes = 6, 
                num_sequences = 4

    #             print (batch_data)
    #             print (batch_labels)

    #             print (batch_size)
                if not new_hid_layer_state: 
                    feed_dict= {graph_dict['x']: batch_data, 
                                graph_dict['y']: batch_labels}
                else:
                    print ('Using the new RNN State')
                    feed_dict= {graph_dict['x']: batch_data, 
                                graph_dict['y']: batch_labels, 
                                graph_dict['init_state'] : new_hid_layer_state}

                a, b, c, d, e, f, g, h, i, j, k = sess.run([graph_dict['embed_to_hid_wghts'],
                                         graph_dict['embed_to_hid_layer'],
                                         graph_dict['init_state'],
                                         graph_dict['rnn_outputs'],
                                         graph_dict['new_state'],
                                         graph_dict['hid_to_output_wght'],
                                         graph_dict['output_bias'],
                                         graph_dict['hid_to_ouptut_layer'],
                                         graph_dict['output_state'],
                                         graph_dict['loss_CE'],
                                         graph_dict['optimizer']], feed_dict=feed_dict)
                new_hid_layer_state = e

                print ('embed_to_hid_wghts \n', a)
                print ('')
                print ('embed_to_hid_layer \n', b)
                print ('')
                print ('init_state \n', c)
                print ('')
                print ('rnn_outputs \n', d)
                print ('')
                print ('new_state \n', e)
                print ('')
                print ('hid_to_output_wght \n', f)
                print ('')
                print ('output_bias \n', g)
                print ('')
                print ('hid_to_ouptut_layer \n', h)
                print ('')
                print ('output_state \n', i)
                print ('')
                print ('loss_CE \n', j)
                print ('')
                print ('optimizer \n', k)
                print ('')
                print ('')
                print ('popopopopopopopoop')
                print ('')
                print ('')

        
graph_dict = dynamic_RNN_model()
train_network(graph_dict)


embed_to_hid_wghts 
 [[ 0.4789122   0.39062405 -0.36072388]
 [ 0.0159585   0.24110568  0.27572352]
 [ 0.43530509 -0.30481893 -0.45537844]
 [ 0.57793838  0.23446894  0.62802935]
 [-0.04043166  0.030019   -0.18177924]
 [ 0.68515283 -0.07943249 -0.56528735]]

embed_to_hid_layer 
 [[[ 0.0159477   0.24109209  0.27570611]
  [ 0.43529826 -0.30482858 -0.45538843]
  [ 0.57792586  0.23445821  0.62801403]
  [-0.04043669  0.03001481 -0.18179166]]

 [[ 0.0159477   0.24109209  0.27570611]
  [ 0.57792586  0.23445821  0.62801403]
  [-0.04043669  0.03001481 -0.18179166]
  [ 0.47890872  0.39062101 -0.36073133]]]

init_state 
 LSTMStateTuple(c=array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]], dtype=float32), h=array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]], dtype=float32))

rnn_outputs 
 [[ 0.00877824  0.05682949  0.04386874]
 [-0.02553974 -0.02316239 -0.01572303]
 [ 0.02469682  0.12786092  0.05894761]
 [-0.01773707  0.06262558  0.02878092]
 [ 0.00877824  0.05682949  0.04386874]
 [ 0.04479747  0.20766123