# Character-based LSTM text generation using data generators 

In [0]:
import numpy as np
import os, sys, gc
import pandas as pd
from urllib.request import urlopen
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import string

import time

In [0]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM

## Introduction

I recommend that you run this notebook in the cloud using [Google Colab](https://colab.research.google.com/notebooks/welcome.ipynb). This lets you train on a GPU and see for yourself how much faster neural networks train on GPUs than on CPUs.

To do so, follow the steps below.

- Go to [Google Colab](https://colab.research.google.com/).
- Make an account if you don't have one.
- Select "UPLOAD" and upload this notebook itself.
- In the menu, choose Runtime --> Change runtime type, and select GPU as the hardware accelerator.

In [0]:
class DLTextGenerator():
    """
    Character-level text generator backed by a stacked LSTM (Keras).

    Typical usage: construct with a corpus, call `prepare_data` to obtain
    one-hot encoded training arrays, `build_LSTM` to create the network,
    `fit` to train it (weights are checkpointed to disk) and `generate`
    to sample new text from the latest checkpoint.
    """

    def __init__(self, 
                 text, 
                 seq_length = 100,
                 vocab_size = 100):
        """
        Init method of the class DLTextGenerator
 
        Parameters
        -------------
        text : str
            the corpus as plain text 
        seq_length : int
            sequence length for the RNN 
        vocab_size : int
            maximum vocabulary size to be considered        
        """
        
        self.seq_length = seq_length        
        self.vocab_size = vocab_size        
        char_count = Counter(text)
        # The vocabulary is the `vocab_size` most frequent characters,
        # ordered from most to least common.
        self.vocab = [letter for letter, _ in 
                      char_count.most_common(self.vocab_size)]
        # Characters not in the vocabulary are removed from the training 
        # string. A set keeps the membership test O(1) per character.
        kept = set(self.vocab)
        self.text = text.translate(
            {ord(c): None for c in char_count if c not in kept}) 
        self.n_vocab = len(self.vocab)
        # Every window of `seq_length` characters that is followed by at
        # least one more character yields one training example.
        self.n_examples = len(self.text) - self.seq_length        
        print ('{} total characters in text'.format(len(self.text)))
        print ('{} unique characters'.format(self.n_vocab))
        print ('{} sequence length'.format(self.seq_length))       
        
        self.char2idx = {u:i for i, u in enumerate(self.vocab)}
        self.idx2char = np.array(self.vocab)
        self.model = None

    def prepare_data(self):
        """
        Create X and y for text generation using text and 
        sequence length. 
        
        Parameters
        -------------
        None
        
        Returns
        -------------
        data_X : np.ndarray of bool, 
                 shape (n_examples, seq_length, n_vocab)
            one-hot encoded input windows
        data_y : np.ndarray of float, shape (n_examples, n_vocab)
            one-hot encoding of the character following each window

        Raises
        -------------
        ValueError
            if the (filtered) text is shorter than the sequence length
        """
        n_examples = self.n_examples
        if n_examples < 0:
            raise ValueError(
                'Text has {} characters, fewer than seq_length={}.'.format(
                    len(self.text), self.seq_length))

        # Encode the whole corpus once as vocabulary indices.
        encoded = np.fromiter((self.char2idx[c] for c in self.text),
                              dtype=np.intp, count=len(self.text))

        data_X = np.zeros((n_examples, self.seq_length, 
                          self.n_vocab),dtype=bool)
        data_y = np.zeros((n_examples, self.n_vocab))

        # Example i holds text[i:i + seq_length], so position j of every
        # example is filled by the slice encoded[j:j + n_examples]. This
        # does one vectorized write per window position instead of one
        # Python-level assignment per character.
        rows = np.arange(n_examples)
        for j in range(self.seq_length):
            data_X[rows, j, encoded[j:j + n_examples]] = True
        # The target of example i is the character right after its window.
        targets = encoded[self.seq_length:self.seq_length + n_examples]
        data_y[rows, targets] = 1.0

        print('Total examples: %d'%(len(data_X)))
        print('Total examples: %d'%(len(data_y)))            
        return data_X, data_y

    def build_LSTM(self, layer_size=256, 
                   dropout_amount=0.5):
        """
        Given layer_size and dropout_amount, build an LSTM network
        using Keras and tensorflow and print summary of the model. 

        The network is three stacked LSTM layers, each followed by 
        Dropout, and a softmax Dense layer over the vocabulary. The 
        compiled model is stored in `self.model`.

        Parameters
        -----------
        layer_size : int
          The number of units to be passed in the LSTM layer
        dropout_amount : float
          the dropout amount to be passed in the Dropout layer. 

        Return
        -----------
          None
          print the summary of the model
        """

        print('Building model...')
        self.model = Sequential()
        self.model.add(LSTM(layer_size, input_shape=(self.seq_length, 
                                                     self.n_vocab), 
                                                     return_sequences=True))
        self.model.add(Dropout(dropout_amount))
        self.model.add(LSTM(layer_size, return_sequences=True))
        self.model.add(Dropout(dropout_amount))
        # The last LSTM returns only its final output, which feeds the 
        # softmax classifier.
        self.model.add(LSTM(layer_size))
        self.model.add(Dropout(dropout_amount))
        self.model.add(Dense(self.n_vocab, activation='softmax'))
        self.model.compile(loss='categorical_crossentropy', 
                           optimizer='adam', 
                           metrics=['accuracy'])    
        # summary() prints the table itself and returns None, so it must 
        # not be wrapped in print().
        self.model.summary()

    def data_generator(self, X, y, num_features, batch_size = 128):
        """        
        Generates batches of vectorized texts for training/validation.

        The generator never terminates: after the last batch it starts
        again from the first one. The final batch of each pass may be 
        smaller than `batch_size`.
 
        Parameters
        -------------
            X: np.ndarray, feature matrix.
            y: np.ndarray, labels.
            num_features: int, number of features (currently unused; 
                kept for interface compatibility).
            batch_size: int, number of samples per batch.

        Returns
        ----------
            Yields feature and label data in batches.

        Raises
        ----------
            ValueError: on first iteration, if `X` is empty or 
                `batch_size` is not positive.

        Attribution: The code below is heavily based on the following code. 
        https://developers.google.com/machine-learning/guides/text-classification/appendix
        """
        num_samples = X.shape[0]
        # Without these guards an empty X would loop forever without 
        # yielding anything.
        if num_samples == 0:
            raise ValueError('Cannot generate batches from empty data.')
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer.')

        while True:
            for start_idx in range(0, num_samples, batch_size):
                end_idx = min(start_idx + batch_size, num_samples)
                yield X[start_idx:end_idx], y[start_idx:end_idx]
                                
    def fit(self, 
              X, y, 
              batch_size = 128, 
              epochs = 10, 
              checkpoint_dir='./training_checkpoints'): # Directory where the checkpoints will be saved
        """        
        Given the parameters, train a deep learning model and save it.  
        
        Parameters
        -------------
        X : (np.ndarray) 
          the X values
        y : (np.ndarray) 
          the y values
        batch_size : (int) 
          the batch_size for the training
        epochs : (int) 
          the number of epochs for training 
        checkpoint_dir : (str) the path to save the model        

        Returns
        -------------
        The history object returned by the Keras `fit` call.

        Raises
        -------------
        RuntimeError
            if `build_LSTM` has not been called yet
        """        
        if self.model is None:
            raise RuntimeError(
                'No model to train: call build_LSTM() before fit().')

        # Name of the checkpoint files
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

        checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_prefix,
            save_weights_only=True)
        
        training_generator = self.data_generator(X, y, self.n_vocab, batch_size)      
        print(training_generator)

        # The model is fed from the generator rather than with 
        # `self.model.fit(X, y, batch_size=...)`. To convince yourself why 
        # data generators are needed, try passing `X`, `y` directly.

        # One step per batch, counting the final partial batch.
        steps_per_epoch = (X.shape[0] + batch_size - 1) // batch_size
        print('Steps per epoch: ', steps_per_epoch)
        # `Model.fit` accepts Python generators; `fit_generator` is 
        # deprecated.
        history = self.model.fit(
                                    training_generator,
                                    steps_per_epoch=steps_per_epoch,
                                    callbacks=[checkpoint_callback],
                                    epochs=epochs,
                                    verbose=2)  # Logs once per epoch.   
        return history

    @staticmethod
    def _apply_temperature(probabilities, temperature):
        """
        Rescale a probability vector by a sampling temperature.

        Computes p**(1/temperature) renormalized to sum to one, which for 
        softmax outputs equals dividing the logits by the temperature. The 
        computation is done in log space and in float64 so that low 
        temperatures do not underflow to an all-zero vector.

        Parameters
        -------------
        probabilities : array-like of float
            non-negative weights (e.g. a softmax output)
        temperature : float
            must be > 0; values below 1 sharpen the distribution, 
            values above 1 flatten it

        Returns
        -------------
        np.ndarray of float64 summing to one

        Raises
        -------------
        ValueError
            if temperature is not positive
        """
        if temperature <= 0:
            raise ValueError('temperature must be positive.')
        probabilities = np.asarray(probabilities, dtype=np.float64)
        if temperature != 1:
            # Clamp zeros so the logarithm stays finite.
            tiny = np.finfo(np.float64).tiny
            log_p = np.log(np.maximum(probabilities, tiny)) / temperature
            # Shifting by the maximum keeps exp() from underflowing for 
            # the most likely entries.
            probabilities = np.exp(log_p - log_p.max())
        # Always renormalize: float32 model outputs may not sum to one 
        # within the tolerance np.random.choice requires.
        return probabilities / probabilities.sum()
                                         
    def generate(self, seed, temperature=0.5, num_gen=1000, 
                 checkpoint_dir='./training_checkpoints'):
        """        
        Given the parameters and saved path for the model, generate text.  

        The seed and the generated characters are written to stdout as 
        they are produced; nothing is returned.
        
        Parameters
        -------------
        seed : (np.ndarray) 
            one-hot encoded seed sequence, one row per character 
            (e.g. a row of the X returned by `prepare_data`)
        temperature : (float) 
            the temperature to generate the text; must be > 0
        num_gen : (int) 
            the number of characters to generate
        checkpoint_dir : (str) 
            the path where the model is saved

        Raises
        -------------
        RuntimeError
            if `build_LSTM` has not been called yet
        ValueError
            if temperature is not positive
        FileNotFoundError
            if no checkpoint is found in `checkpoint_dir`
        """       
        if self.model is None:
            raise RuntimeError(
                'No model to load weights into: call build_LSTM() first.')
        if temperature <= 0:
            raise ValueError('temperature must be positive.')

        # latest_checkpoint returns None when the directory holds no 
        # checkpoint; fail with a clear message in that case.
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
        if checkpoint_path is None:
            raise FileNotFoundError(
                'No checkpoint found in {!r}.'.format(checkpoint_dir))
        print('Loading model: ', checkpoint_path)
        self.model.load_weights(checkpoint_path)        

        print ("Seed:")
        for p in seed:
            sys.stdout.write(self.idx2char[np.argmax(p)])
        print("\n--------------------------------------------\n")

        # Work on a float copy so the rolling window keeps one dtype.
        pattern = np.asarray(seed, dtype=np.float32)

        # generate characters
        for _ in range(num_gen):
            prediction = self.model.predict(pattern[None], verbose=0)
            probabilities = self._apply_temperature(prediction.flatten(), 
                                                    temperature)
            index = np.random.choice(self.n_vocab, p=probabilities)
                
            next_char = self.idx2char[index]
            sys.stdout.write(next_char)
            sys.stdout.flush()

            # Slide the window: drop the oldest character and append the 
            # one just sampled.
            new_char_one_hot = np.zeros(self.n_vocab, dtype=np.float32)
            new_char_one_hot[index] = 1.0
            pattern = np.append(pattern[1:], new_char_one_hot[None], axis=0)

        print("\nDone.")
        gc.collect() # http://stackoverflow.com/questions/40560795/tensorflow-attributeerror-nonetype-object-has-no-attribute-tf-deletestatus

In [0]:
# This is the data we used in the lab 
# NOTE(review): the URL's query string carries a `token` parameter. Avoid
# committing credentials to a shared notebook; presumably this token is
# short-lived -- confirm and regenerate if needed.
data_url = 'https://raw.github.ubc.ca/MDS-2019-20/datasets/master/data/wiki1MB.txt?token=AAAANP3GW2AJKUV4F77Z6VS6UZG26'
# The context manager closes the HTTP response once the body is read.
with urlopen(data_url) as response:
    text = response.read().decode("utf-8")

In [0]:
# Instantiate the character-level generator on the downloaded corpus
# (defaults spelled out: 100-character windows, 100-symbol vocabulary).
lstm_text_generator = DLTextGenerator(text, seq_length=100, vocab_size=100)

# One-hot encode the corpus into (input window, next character) pairs.
X, y = lstm_text_generator.prepare_data()

# Assemble and compile the stacked LSTM network; its summary is printed.
lstm_text_generator.build_LSTM(layer_size=256, dropout_amount=0.5)

993199 total characters in text
100 unique characters
100 sequence length
Total examples: 993099
Total examples: 993099
Building model...
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_33 (LSTM)               (None, 100, 256)          365568    
_________________________________________________________________
dropout_33 (Dropout)         (None, 100, 256)          0         
_________________________________________________________________
lstm_34 (LSTM)               (None, 100, 256)          525312    
_________________________________________________________________
dropout_34 (Dropout)         (None, 100, 256)          0         
_________________________________________________________________
lstm_35 (LSTM)               (None, 256)               525312    
_________________________________________________________________
dropout_35 (Dropout)         (None, 256)       

In [0]:
# Show the container types of the prepared training arrays.
tuple(type(arr) for arr in (X, y))

(numpy.ndarray, numpy.ndarray)

In [0]:
# Train the network for ten epochs with the default batch size;
# checkpoints go to the default ./training_checkpoints directory.
lstm_text_generator.fit(X, y, batch_size=128, epochs=10)

<generator object DLTextGenerator.data_generator at 0x7f2ec5349f10>
Steps per epoch:  7759
Epoch 1/10
7759/7759 - 924s - loss: 2.7020 - accuracy: 0.2733
Epoch 2/10
7759/7759 - 922s - loss: 2.0574 - accuracy: 0.4224
Epoch 3/10


## Generate text 
Now we can load the model and start generating text. You may also want to try other text of your choice to train the model.  

In [0]:
# Generate text 
# Pick a random training window to use as the seed. `randint` excludes its
# upper bound, so passing `len(X)` makes every example -- including the
# last one -- eligible.
start = np.random.randint(0, len(X))
seed = X[start]
lstm_text_generator.generate(seed, temperature=0.5, num_gen=500)

Loading model:  ./training_checkpoints/ckpt_10
Seed:
l/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.
--------------------------------------------

l6P.PvvUI8X-9X7PXS689I174TPmk9xbI1_4PP9-L8X_0_7s05P1P78r:P934_P_gT7P6U-:3iXi7_8lXggP1__T_v043XvPv69P_1f7PxX66PXo/7P486ff<x"X5.h99X=5X_lfP_9g20rX-PIIt4T62UggPP2bT5Xf6558cXPoW_v4:S:SPbX7v97LgX_XT06Tv09_62IxSSMS0X4Xf02hiLX8MTf>x54o4 IWLi82xSf9ix8_Pv3wLP.LXPM_X5P9v.SX9PI_XI2996-SbI 99P4P20T48TS4v9S21f99h_/2209UyM7W>wr29
owr>bTg959P6_9ioLf4XXX4bg9xlT6Thf ox6"l.xs_"Xr5b91Uxl4o82T3g>_2"x3WcS4PPIX5ybPf07_2M7-7PW=Xx65PvxWoTIXx8Sc=_88SPPSPPlS6PI3vb9v83Imvv1_8_.P88x10oiXi
S882v_6b5l9vv:57Um676X236SPlTX__b3
Done.
