# LSTM - Text Generator
This notebook builds a character-wise RNN trained on "Gjakftohtesia" by the Albanian author Ismail Kadare. It will be used to generate a new chapter. The helper class and many parts of the code are from https://github.com/udacity/deep-learning/tree/master/tv-script-generation

In [3]:
import helper

data_dir = './data/Ismail Kadare - Gjakftohtesia.txt'
# Load the novel and drop its first 90 characters, which contain general information
text = helper.load_data(data_dir)[90:]

## Explore the Data

In [48]:
import numpy as np

# Half-open (start, end) range of lines of the text to preview at the end of this cell.
# Defined here so the cell runs on a fresh kernel.
view_sentence_range = (0, 30)

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len(set(text.split()))))

# A "scene" is a block of text delimited by a blank line
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

# Every line inside a scene is counted as one "sentence"
sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
print('The sentences {} to {}:'.format(*view_sentence_range))
# Preview exactly the range announced above
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 25655
Number of scenes: 2803
Average number of sentences in each scene: 5.716018551551909
Number of lines: 18825
Average number of words in each line: 8.2532802124834

The sentences 0 to 30:
 

Silva e kaloi me shpejtesi sheshin e ministrive, megjithate, 
kur mberriti te hyrja e ministrise se saj, ishte 
pak vone. 

Pa e ngadalesuar ecjen, pershendeti nepunesin e 
sherbimit, fytyra e te cilit mezi dallohej pas xhamt te 
portinerise dhe ashtu, gjysme me vrap, nisi te ngjiste 
shkallet. 

Ne korridorin e katit te dyte desh u perplas me te 
njohurin e saj te dikurshem Viktor Hilen, te cilin s'e kishte 
takuar prej kohesh. 


 O, si jeni?tha ajo, me frymemarrje akoma te 
shpeshuar nga ngjitja e shkalleve.C'ju ka sjelle kendej? 
Ai e veshtroi me ca sy te hutuar dhe vetem tani Silva 
vuri re ne fytyren e tij te parruar nje ndjenje lodhje-
je dhe inerzie. 

Nje ngaterrese,ia beri ai neper dhembe, duke i 
shoqeruar fjalet me nje levizje te do

## Preprocessing

In [73]:
import numpy as np
import problem_unittests as tests

def create_lookup_tables(text):
    """
    Build the two vocabulary lookup tables for <text>.
    Ids are assigned in sorted order of the distinct items, starting at 0.
    :param text: Iterable of hashable, sortable items (e.g. a list of words or a string)
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    vocab_to_int = {}
    int_to_vocab = {}
    for index, token in enumerate(sorted(set(text))):
        vocab_to_int[token] = index
        int_to_vocab[index] = token

    return vocab_to_int, int_to_vocab

### Tokenize Punctuation

In [50]:
def token_lookup():
    """
    Build the mapping from punctuation symbols to their replacement tokens.
    Each token has the form '||Name||'.
    :return: Dict where the key is the punctuation symbol and the value is the token
    """
    symbol_names = (
        ('.', 'Period'),
        (',', 'Comma'),
        ('"', 'Quotation_Mark'),
        (';', 'Semicolon'),
        ('!', 'Exclamation_Mark'),
        ('?', 'Question_Mark'),
        ('(', 'Left_Parentheses'),
        (')', 'Right_Parentheses'),
        ('--', 'Dash'),
        ('\n', 'Return'),
    )
    return {symbol: '||{}||'.format(name) for symbol, name in symbol_names}

## Preprocess all the data and save it

In [51]:
# Preprocess the text file at data_dir and save the result for the cells below.
# NOTE(review): presumably the helper applies token_lookup to the punctuation and builds the
# vocabulary with create_lookup_tables -- confirm against helper.py.
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

# Check Point

In [52]:
import helper
import numpy as np
import problem_unittests as tests

# Reload the saved preprocessing results: the text as ids, both lookup tables and the
# punctuation token dict (names taken from the unpacking below).
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

## Building the Neural Network
The following functions are going to be implemented:
- get_inputs
- get_init_cell
- get_embed
- build_rnn
- build_nn
- get_batches

### Check the Version of TensorFlow and Access to GPU

In [53]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Require TensorFlow 1.3 or newer
assert LooseVersion(tf.__version__) >= LooseVersion('1.3'), 'Please use TensorFlow version 1.3 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Report the default GPU device, or warn when there is none
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')

TensorFlow Version: 1.8.0




### Input
Create a function that returns the placeholders used to feed the input.

In [54]:
def get_inputs():
    """
    Create the TF placeholders that are fed during training and generation.
    Input and targets are int32 placeholders of rank 2 with both dimensions left open;
    the learning rate is a float32 placeholder without a fixed shape.
    :return: Tuple (input, targets, learning rate)
    """
    # Local names avoid shadowing the builtin `input`; the graph names are unchanged
    input_ph = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input')
    targets_ph = tf.placeholder(dtype=tf.int32, shape=[None, None], name='targets')
    learning_rate_ph = tf.placeholder(dtype=tf.float32, name='learning_rate')

    return input_ph, targets_ph, learning_rate_ph

### RNN Cell

In [60]:
def get_init_cell(batch_size, rnn_size, num_layers):
    """
    Create a stacked LSTM cell and its zero initial state.
    :param batch_size: Size of batches
    :param rnn_size: Number of units in each LSTM layer
    :param num_layers: Number of LSTM layers to stack
    :return: Tuple (cell, initialize state)
    """
    # Build a separate BasicLSTMCell for every layer. Repeating one instance would make
    # all layers refer to the same cell object instead of each having its own.
    layers = [tf.contrib.rnn.BasicLSTMCell(rnn_size) for _ in range(num_layers)]

    # Stack the layers into a single multi-layer cell
    cell = tf.contrib.rnn.MultiRNNCell(layers)

    # Zero initial state, named so it can be looked up again in a loaded graph
    initial_state = cell.zero_state(batch_size, tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')

    return cell, initial_state

### Word Embedding

In [61]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Look up an embedding vector for every id in <input_data>.
    The embedding table is a new variable of shape (vocab_size, embed_dim),
    initialised uniformly at random between -1 and 1.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    initial_table = tf.random_uniform(shape=(vocab_size, embed_dim), minval=-1, maxval=1)
    embedding_table = tf.Variable(initial_table)
    return tf.nn.embedding_lookup(params=embedding_table, ids=input_data)

### Building the RNN

In [59]:
def build_rnn(cell, inputs):
    """
    Unroll <cell> over <inputs> with tf.nn.dynamic_rnn.
    The final state is wrapped in an identity op named 'final_state' so that it can be
    fetched by name later.
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    # No initial_state is passed here, so only the dtype is specified
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell=cell, inputs=inputs, dtype=tf.float32)

    return rnn_outputs, tf.identity(last_state, name='final_state')

### Add a fully connected layer on top of the RNN

In [62]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Assemble embedding -> RNN -> linear output layer.
    :param cell: RNN cell
    :param rnn_size: Size of rnns (not used inside this function)
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :param embed_dim: Number of embedding dimensions
    :return: Tuple (Logits, FinalState)
    """
    rnn_outputs, final_state = build_rnn(cell, get_embed(input_data, vocab_size, embed_dim))

    # Linear layer (no activation) producing one logit per vocabulary entry
    logits = tf.contrib.layers.fully_connected(
        inputs=rnn_outputs,
        num_outputs=vocab_size,
        activation_fn=None)

    return logits, final_state

### Batches

In [63]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target.
    The targets are the inputs shifted one position to the left. When the text length is
    an exact multiple of batch_size * seq_length there is no following element for the
    very last target, so it wraps around to the first input id.
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array of shape
             (number of batches, 2, batch_size, seq_length); index 0 of the second axis
             holds the inputs, index 1 the targets
    :raises ValueError: If int_text is too short to fill a single batch
    """
    chars_per_batch = batch_size * seq_length
    number_of_batches = len(int_text) // chars_per_batch
    if number_of_batches == 0:
        raise ValueError(
            'int_text has {} elements, but at least {} are needed for one batch'.format(
                len(int_text), chars_per_batch))

    # Leftover elements that do not fill a complete batch are dropped
    usable_length = number_of_batches * chars_per_batch

    xdata = np.array(int_text[:usable_length])
    ydata = np.array(int_text[1:usable_length + 1])

    # The shifted slice is one element short when nothing follows the last input
    if len(ydata) < usable_length:
        ydata = np.append(ydata, xdata[0])

    # One row per sequence slot in the batch; consecutive batches continue each row
    xdata = xdata.reshape(batch_size, -1)
    ydata = ydata.reshape(batch_size, -1)

    x_batches = np.split(xdata, number_of_batches, axis=1)
    y_batches = np.split(ydata, number_of_batches, axis=1)

    return np.array(list(zip(x_batches, y_batches)))

## Neural Network Training
### Hyperparameters

In [64]:
# Number of passes over the full list of batches
num_epochs = 200
# Number of sequences in each batch
batch_size = 128
# Number of units in each LSTM layer
rnn_size = 512
# Number of stacked LSTM layers
num_layers = 2
# Size of each embedding vector
embed_dim = 512
# Number of ids in each training sequence
seq_length = 20
# Learning rate fed to the optimizer
learning_rate = 0.001
# Print the training loss every n batches
show_every_n_batches = 26
# Checkpoint path passed to the saver when the model is saved
save_dir = './save'

### Build the Graph

In [66]:
from tensorflow.contrib import seq2seq

# Build the whole training graph inside its own tf.Graph
train_graph = tf.Graph()
with train_graph.as_default():
    # One embedding row and one output logit per vocabulary entry
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    # Dynamic shape of the fed input; element 0 is used as the batch size below
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size, num_layers)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Probabilities for generating words; named so the op can be fetched from a loaded graph
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function: sequence loss with a weight of 1 for every position of the input
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer; the learning rate is fed through the `lr` placeholder
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping: clip every gradient element to [-1, 1], skipping variables without a gradient
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

## Train

In [67]:
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    # Create the saver before training so a checkpoint can be written after every epoch.
    # Saving only once at the very end loses all progress when a long run is interrupted.
    saver = tf.train.Saver()

    for epoch_i in range(num_epochs):
        # Start every epoch from the initial state evaluated for the first input batch
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

        # Checkpoint at the end of each epoch (overwrites the previous one)
        saver.save(sess, save_dir)

    # Save Model (final write; also covers the case of zero epochs)
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/79   train_loss = 9.692


KeyboardInterrupt: 

## Save Parameters

In [68]:
# Save the sequence length and the checkpoint path so they can be read back
# with helper.load_params() after the checkpoint below
helper.save_params((seq_length, save_dir))

# Checkpoint

In [69]:
import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

# Reload the lookup tables and punctuation token dict; the first value returned by
# load_preprocess() is not needed from here on and is discarded
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
# Reload the sequence length and the path the model was saved under
seq_length, load_dir = helper.load_params()

## Generate Functions
### Get Tensors

In [70]:
def get_tensors(loaded_graph):
    """
    Fetch the tensors needed for generation from <loaded_graph> by name.
    The names looked up are 'input:0', 'initial_state:0', 'final_state:0' and 'probs:0'.
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    tensor_names = ('input:0', 'initial_state:0', 'final_state:0', 'probs:0')
    return tuple(loaded_graph.get_tensor_by_name(tensor_name) for tensor_name in tensor_names)

### Choose Word

In [71]:
def pick_word(probabilities, int_to_vocab):
    """
    Sample the next word of the generated text.
    An id is drawn at random, weighted by <probabilities>, and mapped to its word.
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    candidate_count = len(probabilities)
    (sampled_id,) = np.random.choice(candidate_count, size=1, p=probabilities)
    return int_to_vocab[sampled_id]

## Generate new text

In [74]:
# Number of words to generate
gen_length = 200
# NOTE(review): the seed below is prime_word + ':' and is looked up in vocab_to_int as is;
# confirm that exact token exists in the vocabulary, otherwise the lookup raises KeyError.
prime_word = 'Megjithese'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input: at most the last seq_length generated words, as ids
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})

        # Sample from the distribution predicted at the last input position
        pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)

    # Remove tokens: turn the punctuation tokens back into the original symbols
    text = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        text = text.replace(' ' + token.lower(), key)
    text = text.replace('\n ', '\n')
    text = text.replace('( ', '(')

    print(text)

OSError: File ./save.meta does not exist.