## Params

In [1]:
# --- Training hyper-parameters ---
NUM_EPOCHS = 500  # Upper bound of the epoch loop
LEARNING_RATE = 0.001  # Step size handed to the optimizer
BATCH_SIZE = 128  # Number of sequences per batch
CHECKPOINT_PATH_DIR = './datasetslib/model_dir'  # Directory for intermediate checkpoints
RESTORE_TRAINING = False  # True: resume from the checkpoint found in CHECKPOINT_PATH_DIR
SAVE_DIR = './datasetslib/save'  # Path prefix under which the final model is saved

# --- Network hyper-parameters ---
RNN_SIZE = 128  # Units per LSTM cell; also used as the embedding width
SEQ_LENGTH = 32  # Words per training sequence

# --- Data location ---
# NOTE: despite its name this is the path of the input text FILE, not a directory
TEXT_SAVE_DIR = "./datasetslib/data/postgre_book.txt"

## Data preprocessing

In [2]:
import os
import pickle
import tensorflow as tf
import numpy as np

def load_data():
    """
    Read the raw book text from disk
    :return: Whole content of the file at TEXT_SAVE_DIR as a single string
    """
    with open(os.path.join(TEXT_SAVE_DIR), "r") as book_file:
        return book_file.read()

def preprocess_and_save_data():
    """
    Preprocess the Book Scripts Dataset and cache the result in 'processed_text.p'

    Punctuation is replaced by space-padded tokens, the text is lower-cased and
    split on whitespace, every word is mapped to an integer id, and the tuple
    (int_text, vocab_to_int, int_to_vocab, token_dict) is pickled.
    """
    text = load_data()
    token_dict = define_tokens()
    for key, token in token_dict.items():
        # Pad with spaces so that split() isolates each token as its own word
        text = text.replace(key, ' {} '.format(token))

    # Lower-casing happens after the substitution, so the tokens themselves
    # end up lower-cased in the vocabulary
    words = text.lower().split()

    vocab_to_int, int_to_vocab = create_map(words)
    int_text = [vocab_to_int[word] for word in words]

    # The context manager guarantees the pickle is flushed and the handle closed
    with open('processed_text.p', 'wb') as out_file:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), out_file)


def load_preprocess_file():
    """
    Loading the processed Book Scripts Data
    :return: The object pickled in 'processed_text.p' (written by the
             preprocessing step as (int_text, vocab_to_int, int_to_vocab, token_dict))
    """
    # NOTE: pickle executes arbitrary code on load; only read files this
    # program wrote itself.
    # The context manager closes the handle instead of leaking it.
    with open('processed_text.p', mode='rb') as in_file:
        return pickle.load(in_file)


def save_params(params):
    """
    Saving parameters to file
    :param params: Any picklable object; it is written to 'parameters.p'
    """
    # Closing the handle explicitly guarantees the data is flushed to disk
    # before anybody tries to read it back
    with open('parameters.p', 'wb') as out_file:
        pickle.dump(params, out_file)


def load_params():
    """
    Loading parameters from file
    :return: The object pickled in 'parameters.p'
    """
    # NOTE: pickle executes arbitrary code on load; only read files this
    # program wrote itself.
    # The context manager closes the handle instead of leaking it.
    with open('parameters.p', mode='rb') as in_file:
        return pickle.load(in_file)

def create_map(input_text):
    """
    Map words in vocab to int and vice versa for easy lookup
    :param input_text: Book Script data split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # Sort the unique words so that the id assignment is reproducible: the
    # iteration order of a set of strings can differ between interpreter runs,
    # which would silently change every id if the data were re-processed.
    vocab = sorted(set(input_text))
    vocab_to_int = {word: index for index, word in enumerate(vocab)}
    int_to_vocab = dict(enumerate(vocab))
    return vocab_to_int, int_to_vocab

def define_tokens():
    """
    Generate a dict to turn punctuation into a token. Note that Sym before each text denotes Symbol
    :return: Tokenize dictionary where the key is the punctuation and the value is the token
    """
    # Returned directly instead of through a local called `dict`, which would
    # shadow the builtin. Tokens contain only letters and underscores, so no
    # replacement can touch a previously inserted token.
    return {
        '.': '_Sym_Period_',
        ',': '_Sym_Comma_',
        '"': '_Sym_Quote_',
        ';': '_Sym_Semicolon_',
        '!': '_Sym_Exclamation_',
        '?': '_Sym_Question_',
        '(': '_Sym_Left_Parentheses_',
        ')': '_Sym_Right_Parentheses_',
        '--': '_Sym_Dash_',
        '\n': '_Sym_Return_',
    }

def generate_batch_data(int_text, batch_size=None, seq_length=None):
    """
    Generate batch data of x (inputs) and y (targets)
    :param int_text: Text with the words replaced by their ids
    :param batch_size: Sequences per batch; defaults to the module-level BATCH_SIZE
    :param seq_length: Words per sequence; defaults to the module-level SEQ_LENGTH
    :return: Batches as a Numpy array of shape
             (num_batches, 2, batch_size, seq_length); index 0 of the second
             axis holds the inputs, index 1 the targets
    :raises ValueError: If int_text is too short to fill a single batch
    """
    batch_size = BATCH_SIZE if batch_size is None else batch_size
    seq_length = SEQ_LENGTH if seq_length is None else seq_length

    words_per_batch = batch_size * seq_length
    num_batches = len(int_text) // words_per_batch
    if num_batches == 0:
        raise ValueError(
            "int_text has {} words but at least {} are needed for one batch".format(
                len(int_text), words_per_batch))

    usable = num_batches * words_per_batch
    ids = np.asarray(int_text)

    x = ids[:usable]
    # Targets are the inputs shifted one word to the left
    y = ids[1:usable + 1]
    if len(y) < usable:
        # The text length is an exact multiple of the batch size, so there is
        # no word after the last input: wrap around to the first word instead
        # of failing in reshape.
        y = np.append(y, ids[0])

    # Row i of the reshaped matrix is one contiguous stretch of text; cutting
    # it column-wise keeps row i of consecutive batches contiguous as well.
    x_batches = np.split(x.reshape(batch_size, -1), num_batches, 1)
    y_batches = np.split(y.reshape(batch_size, -1), num_batches, 1)
    return np.array(list(zip(x_batches, y_batches)))

def extract_tensors(tf_graph):
    """
    Look up the input, initial state, final state, and probabilities tensors by name
    :param tf_graph: Graph object providing get_tensor_by_name (e.g. a TensorFlow graph loaded from file)
    :return: Tuple (tensor_input, tensor_initial_state, tensor_final_state, tensor_probs)
    """
    tensor_names = (
        "Input/input:0",
        "Network/initial_state:0",
        "Network/final_state:0",
        "Network/probs:0",
    )
    return tuple(tf_graph.get_tensor_by_name(tensor_name) for tensor_name in tensor_names)

def select_next_word(probs, int_to_vocab):
    """
    Pick the next word of the generated text: the one with the highest probability
    :param probs: Probabilities of all the words in vocab which can be selected as next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: Predicted next word (the first one on ties, as np.argmax does)
    """
    return int_to_vocab[np.argmax(probs)]


def predict_book_script(first_word='postgresql', script_length=250):
    """
    Generate a book script with the saved model, write it to the file 'book_script' and print it
    :param first_word: Word that starts the script; must be present in the saved vocabulary
    :param script_length: Number of words to generate after first_word
    :raises ValueError: If first_word is not in the vocabulary
    """
    _, vocab_to_int, int_to_vocab, token_dict = load_preprocess_file()
    seq_length, load_dir = load_params()

    # Fail early with a readable message instead of a bare KeyError in the
    # middle of generation
    if first_word not in vocab_to_int:
        raise ValueError("first_word {!r} is not in the vocabulary".format(first_word))

    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:
        # Load saved model
        loader = tf.train.import_meta_graph(load_dir + '.meta')
        loader.restore(sess, load_dir)

        # Get Tensors from loaded model
        input_text, initial_state, final_state, probs = extract_tensors(loaded_graph)

        # Sentences generation setup: evaluate the initial state for a batch
        # that contains a single one-word sequence
        sentences = [first_word]
        previous_state = sess.run(initial_state, {input_text: np.array([[1]])})

        # Generate sentences
        for _ in range(script_length):
            # Dynamic Input: at most the last seq_length words, as a batch of one
            dynamic_input = [[vocab_to_int[word] for word in sentences[-seq_length:]]]
            dynamic_seq_length = len(dynamic_input[0])

            # Get Prediction
            probabilities, previous_state = sess.run(
                [probs, final_state],
                {input_text: dynamic_input, initial_state: previous_state})

            # Index the single batch entry explicitly. np.squeeze would also
            # drop the time axis while only one word is present, so indexing
            # the result would yield one scalar and argmax would always be 0.
            # Assumes probs is laid out as (batch, time, vocab).
            last_step_probs = probabilities[0][dynamic_seq_length - 1]

            pred_word = select_next_word(last_step_probs, int_to_vocab)
            sentences.append(pred_word)

        # Scraping out tokens from the words (they were lower-cased together
        # with the text during preprocessing, hence token.lower())
        book_script = ' '.join(sentences)
        for key, token in token_dict.items():
            book_script = book_script.replace(' ' + token.lower(), key)
        book_script = book_script.replace('\n ', '\n')
        book_script = book_script.replace('( ', '(')

        # Write the generated script to a file
        with open("book_script", "w") as text_file:
            text_file.write(book_script)

        print(book_script)


## Build the LSTM model

In [3]:
import tensorflow as tf
from tensorflow.contrib import seq2seq

class Model():
    """
    Word-level language model: embedding -> two stacked LSTM layers -> dense
    projection onto the vocabulary, trained with a sequence loss and Adam.

    The tensors named 'Input/input', 'Network/initial_state',
    'Network/final_state' and 'Network/probs' are created so that they can be
    looked up by name after the graph has been saved and re-imported.
    """

    def __init__(self, int_to_vocab):
        """
        Build the complete training graph in the current default graph
        :param int_to_vocab: Dictionary of word ids to words; only its length (vocab size) is used
        """
        self.vocab_size = len(int_to_vocab)

        with tf.variable_scope('Input'):
            # Both placeholders accept any batch size and sequence length
            self.X = tf.placeholder(tf.int32, [None, None], name='input')
            self.Y = tf.placeholder(tf.int32, [None, None], name='target')
            self.input_shape = tf.shape(self.X)

        self.define_network()
        self.define_loss()
        self.define_optimizer()

    def define_network(self):
        """
        Create the embedding, the stacked LSTM and the output projection
        Sets self.initial_state, self.final_state and self.predictions (logits)
        """
        with tf.variable_scope("Network"):
            # One cell object per layer. Repeating a single instance
            # ([lstm] * 2) hands the SAME cell to both layers, so they would
            # not get parameters of their own.
            # NOTE: this changes the variable set of the graph; checkpoints
            # written by the shared-cell version cannot be restored into it.
            lstm_layers = [tf.contrib.rnn.BasicLSTMCell(RNN_SIZE) for _ in range(2)]
            cell = tf.contrib.rnn.MultiRNNCell(lstm_layers)  # Two LSTM layers for this case
            self.initial_state = cell.zero_state(self.input_shape[0], tf.float32)
            self.initial_state = tf.identity(self.initial_state, name="initial_state")

            embedding = tf.Variable(tf.random_uniform((self.vocab_size, RNN_SIZE), -1, 1))
            embed = tf.nn.embedding_lookup(embedding, self.X)

            # NOTE(review): dynamic_rnn is called with initial_state=None, so
            # self.initial_state is not connected to the recurrence; feeding it
            # presumably has no effect on the outputs - confirm whether state
            # carry-over between batches is intended.
            outputs, self.final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=None, dtype=tf.float32)
            self.final_state = tf.identity(self.final_state, name='final_state')
            self.predictions = tf.contrib.layers.fully_connected(outputs, self.vocab_size, activation_fn=None)
            # Probabilities for generating words. The Python name is unused,
            # but the op must exist so that 'Network/probs' can be fetched by name.
            probs = tf.nn.softmax(self.predictions, name='probs')

    def define_loss(self):
        """
        Define self.loss as the sequence loss between the logits and the targets
        """
        with tf.variable_scope('Sequence_Loss'):
            # All-ones weights: every position of every sequence counts equally
            self.loss = seq2seq.sequence_loss(self.predictions, self.Y,
                                              tf.ones([self.input_shape[0], self.input_shape[1]]))

    def define_optimizer(self):
        """
        Define self.train_op: one Adam step on self.loss with element-wise gradient clipping
        """
        with tf.variable_scope("Optimizer"):
            optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
            # Gradient Clipping to [-1, 1]. compute_gradients reports None for
            # variables the loss does not depend on; those are skipped because
            # clip_by_value cannot handle None.
            gradients = optimizer.compute_gradients(self.loss)
            capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var)
                                for grad, var in gradients if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_gradients)
            

## Train

In [4]:
def train(model, int_text):
    """
    Train the model, checkpointing along the way, then save the final model and its parameters
    :param model: Model instance whose graph lives in the current default graph
    :param int_text: Text with the words replaced by their ids
    :raises RuntimeError: If RESTORE_TRAINING is set but no checkpoint is found in CHECKPOINT_PATH_DIR
    """
    # Creating the checkpoint directory
    if not os.path.exists(CHECKPOINT_PATH_DIR):
        os.makedirs(CHECKPOINT_PATH_DIR)

    batches = generate_batch_data(int_text)

    with tf.Session() as sess:
        # Build the Saver exactly once. Constructing a new one per epoch adds
        # another set of save/restore ops to the graph each time, and left
        # `saver` undefined for the final save whenever the epoch loop did not run.
        # NOTE(review): with a single Saver its checkpoint rotation (max_to_keep)
        # now applies across the whole run - confirm that is acceptable.
        saver = tf.train.Saver()

        if RESTORE_TRAINING:
            ckpt = tf.train.get_checkpoint_state(CHECKPOINT_PATH_DIR)
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise RuntimeError(
                    'RESTORE_TRAINING is set but no checkpoint was found in {}'.format(
                        CHECKPOINT_PATH_DIR))
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('Model Loaded')
            # Checkpoints are written with global_step=epoch + 1, so the number
            # after the last '-' is the epoch to continue with
            start_epoch = int(str(ckpt.model_checkpoint_path).split('-')[-1])
        else:
            start_epoch = 0
            tf.global_variables_initializer().run()
            print('All variables initialized')

        for epoch in range(start_epoch, NUM_EPOCHS):
            # Every epoch starts again from the initial state
            state = sess.run(model.initial_state, {model.X: batches[0][0]})

            for batch, (x, y) in enumerate(batches):
                feed = {
                    model.X: x,
                    model.Y: y,
                    model.initial_state: state}
                train_loss, state, _ = sess.run([model.loss, model.final_state, model.train_op], feed)

                # Report and checkpoint every 200 training steps overall
                if (epoch * len(batches) + batch) % 200 == 0:
                    print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                        epoch,
                        batch,
                        len(batches),
                        train_loss))
                    # Save Checkpoint for restoring if required
                    saver.save(sess, CHECKPOINT_PATH_DIR + '/model.tfmodel', global_step=epoch + 1)

        # Save Model
        saver.save(sess, SAVE_DIR)
        print('Model Trained and Saved')
        # Persist what is needed to re-import the model for text generation
        save_params((SEQ_LENGTH, SAVE_DIR))



def main():
    """
    Run the whole pipeline: preprocess (unless a cached result exists), train, generate
    """
    if not os.path.exists("./processed_text.p"):
        print("Preprocessing the data")
        preprocess_and_save_data()
    else:
        print("Processed File Already Present. Proceeding with that")

    print("Loading the preprocessed data")
    processed = load_preprocess_file()
    # The cached tuple is (int_text, vocab_to_int, int_to_vocab, token_dict);
    # only the id sequence and the id -> word mapping are needed here
    int_text, _, int_to_vocab, _ = processed

    print("Training the model")
    train(Model(int_to_vocab), int_text)

    print("Generating the Book Script")
    predict_book_script()


# Run the pipeline only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Preprocessing the data
Loading the preprocessed data
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Training the model
All variables initialized
Epoch   0 Batch    0/12   train_loss = 8.183
Epoch  16 Batch    8/12   train_loss = 6.058
Epoch  33 Batch    4/12   train_loss = 5.952
Epoch  50 Batch    0/12   train_loss = 5.853
Epoch  66 Batch    8/12   train_loss = 5.604
Epoch  83 Batch    4/12   train_loss = 5.155
Epoch 100 Batch    0/12   train_loss = 4.903
Epoch 116 Batch    8/12   train_loss = 4.700
Epoch 133 Batch    4/12   train_loss = 4.352
Epoch 150 Batch 