<a href="https://colab.research.google.com/github/SeekingNirvaana/LSTM_Projects/blob/main/Text_Generation_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generator Using Keras (LSTM in RNN)

## Importing necessary libraries

In [9]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow
from IPython.display import clear_output

from keras.layers import Dense, LSTM, Input, Embedding, Dropout
from keras.utils import np_utils
from keras.models import Model, load_model
# from keras.optimizers import Adam, RMSprop
from tensorflow.keras.optimizers import Adam, RMSprop # - Works
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback

In [24]:
# Set to True when running on Google Colab with the dataset stored on Drive.
working_on_colab = False
if working_on_colab:
    from google.colab import drive
    drive.mount('/content/drive')

    # Drive folder that holds the dataset and later receives the saved models.
    dir_to_drive_Colab = "/content/drive/MyDrive/DATASETS/"
    # Listing the folder raises immediately if the Drive path is not available.
    os.listdir(dir_to_drive_Colab)

In [25]:
# Create the local data folder; exist_ok keeps this cell safe to re-run.
os.makedirs("./data/", exist_ok=True)
if working_on_colab:
    # Copy the dataset files from the mounted Drive folder into ./data/.
    os.system(f"cp -r {dir_to_drive_Colab}data/* ./data/")

In [11]:
# Tokenization granularity: 'word' selects a word-level tokenizer,
# any other value selects a character-level tokenizer.
token_type = 'word'

In [12]:
# Step 1. Load the raw text of "Alice in Wonderland".
filename = "/content/drive/MyDrive/DATASETS/data.txt"

# 'utf-8-sig' drops a leading byte-order mark if the file starts with one.
with open(filename, encoding='utf-8-sig') as f:
    text = f.read()


# Step 2. Keep only the story itself, dropping the text before and after it.
start_marker = "CHAPTER I\n\n                      Down the Rabbit-Hole\n\n\n"
end_marker = "\n\n                             THE END"
start = text.find(start_marker)
end = text.find(end_marker)
# str.find returns -1 when a marker is missing; slicing with -1 would silently
# keep the wrong part of the file, so fail loudly instead.
if start == -1 or end == -1:
    raise ValueError(f"Could not locate the story boundaries in {filename!r}")
text = text[start:end]

# Number of characters kept.
print(len(text))

148329


## Decide if we are training the model from scratch and/or loading the pre-trained model

In [17]:
# When True, the model-building cell loads a previously saved .h5 model
# instead of building and compiling a new one.
load_saved_model = False
# Whether the model should be trained in this run.
train_model = True

## Cleaning up

In [13]:
# Step 3. Separating every chapter:
seq_length = 20 
start_story = '~ ' * seq_length
# adding the first chapter name back:
text = start_story + text

# Step 4. lowering the case
text = text.lower()
# separating every chapter with distinguished symbols for a better training
text = text.replace('\n\n\n\n', start_story)
text = text.replace('\n', ' ')
text = re.sub('  +', '. ', text).strip()
text = text.replace('..', '.')

text = re.sub('([!"#$%&()*+,-./:;<=>?@[\]^_`{|}~])', r' \1 ', text)
text = re.sub('\s{2,}', ' ', text)


## Tokenizing text
Tokenization splits the text into individual units, such as words or characters, and assigns an integer label to each distinct unit. The labeled text can then be used for supervised learning.

In [14]:
# Choose the tokenizer settings for the configured granularity. The empty
# `filters` string means no characters are stripped by the tokenizer.
if token_type == 'word':
    tokenizer_options = {'char_level': False, 'filters': ''}
else:
    tokenizer_options = {'char_level': True, 'filters': '', 'lower': False}
tokenizer = Tokenizer(**tokenizer_options)

# Learn the vocabulary from the cleaned text, then encode it as integer ids.
tokenizer.fit_on_texts([text])
vocabulary = tokenizer.word_index
# One more than the number of distinct tokens -- presumably because index 0
# is never assigned to a token (TODO confirm against the Tokenizer docs).
total_words = len(vocabulary) + 1
token_list = tokenizer.texts_to_sequences([text])[0]

# Report the vocabulary size, the token -> index mapping, and the encoded text.
print(f"Number of tokenized words: {total_words}")
print(vocabulary)
print(token_list)

Number of tokenized words: 2656
[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 320, 15, 2, 47, 3, 116, 8, 618, 2, 16, 19, 268, 7, 117, 36, 490, 12, 366, 85, 23, 436, 28, 3, 860, 1, 6, 12, 367, 144, 7, 62, 24, 2, 145, 65, 619, 10, 31, 861, 73, 3, 339, 23, 436, 19, 862, 1, 32, 11, 31, 53, 712, 65, 1506, 17, 11, 1, 4, 6, 38, 45, 3, 226, 12, 9, 339, 1, 5, 68, 16, 4, 183, 712, 65, 368, 26, 5, 2, 34, 10, 19, 863, 17, 23, 369, 340, 92, 21, 76, 21, 10, 66, 1, 33, 3, 491, 167, 161, 23, 437, 36, 620, 6, 556, 93, 1, 341, 3, 1094, 12, 438, 9, 1507, 8, 1508, 58, 35, 713, 3, 557, 12, 203, 48, 6, 1095, 3, 1509, 1, 64, 291, 9, 162, 116, 30, 1510, 168, 244, 292, 85, 23, 2, 67, 19, 144, 34, 36, 1096, 17, 20, 27, 864, 77, 16, 100, 11, 34, 36, 103, 43, 12, 3, 94, 7, 269, 3, 116, 104, 7, 270, 1, 4, 119, 169, 14, 2, 119, 169, 14, 2, 15, 187, 35, 558, 14, 5, 2, 92, 64, 10, 68, 11, 128, 1097, 1, 11, 1098, 7, 23, 20, 10, 271, 7, 63, 1511, 25, 39, 1, 32, 25, 3, 70, 11, 29, 176,



### The LSTM network is trained to predict the next word in a sequence, given the words that precede it.

### The input (X) for each training example is a window of `seq_length` consecutive tokens (20 here).

### The response variable (Y) for each window is the token that immediately follows it.

### Each response is one-hot encoded into a vector of length 2656 (`total_words`: the number of distinct tokens in the vocabulary plus one).

In [15]:
def generate_sequences(token_list, step, seq_length=None, num_classes=None):
    """Build (input window, next token) training pairs from a token sequence.

    Args:
        token_list: Sequence of integer token ids.
        step: Stride between the starts of consecutive windows.
        seq_length: Number of tokens per input window. When omitted, the
            notebook-level ``seq_length`` is used.
        num_classes: Width of the one-hot target vectors. When omitted, the
            notebook-level ``total_words`` is used.

    Returns:
        A tuple ``(X, y, num_seq)``. ``X`` is a list of windows (slices of
        ``token_list``), ``y`` is a float32 array of shape
        ``(num_seq, num_classes)`` whose rows are one-hot encodings of the
        token following each window, and ``num_seq`` is the number of windows.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if seq_length is None:
        seq_length = globals()['seq_length']
    if num_classes is None:
        num_classes = globals()['total_words']

    # A window starting at i needs token i + seq_length to exist as its target.
    starts = range(0, len(token_list) - seq_length, step)
    X = [token_list[i: i + seq_length] for i in starts]
    targets = np.asarray([token_list[i + seq_length] for i in starts], dtype=np.int64)

    # One-hot encode the targets with NumPy: one row per window, a single 1.0
    # in the column of the target token id.
    y = np.zeros((len(targets), num_classes), dtype='float32')
    y[np.arange(len(targets)), targets] = 1.0

    num_seq = len(X)
    print('Number of sequences:', num_seq, "\n")

    return X, y, num_seq

step = 1
# Window length; the clean-up cell uses the same value to build `start_story`.
seq_length = 20

X, y, num_seq = generate_sequences(token_list, step)

# The windows come back as a list of lists; convert them to a 2-D array.
X = np.array(X)
# `y` is already an array, so asarray avoids copying the large one-hot matrix.
y = np.asarray(y)

# Report the array shapes.
print(f"Input shape: {X.shape}")
print(f"Output shape: {y.shape}")


Number of sequences: 35564 

Inout shape: (35564, 20)
Output shape: (35564, 2656)


## Building LSTM model

In [18]:
# Defined before the branch so the later save cells can build file names from
# it even when a saved model is loaded instead of being built here.
learning_rate = 0.001

if load_saved_model:
    model = load_model(f'./saved_models/lr={learning_rate}.h5')

else:

    n_units = 256
    embedding_size = 100

    # shape=(None,) leaves the sequence length unspecified.
    text_in = Input(shape = (None,))
    embedding = Embedding(total_words, embedding_size)
    x = embedding(text_in)
    x = LSTM(n_units)(x)
    # x = Dropout(0.2)(x)
    # Softmax over the vocabulary: one probability per token id.
    text_out = Dense(total_words, activation = 'softmax')(x)

    model = Model(text_in, text_out)
    opti = RMSprop(learning_rate = learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=opti)

In [19]:
# Print the layers, their output shapes and their parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         265600    
                                                                 
 lstm (LSTM)                 (None, 256)               365568    
                                                                 
 dense (Dense)               (None, 2656)              682592    
                                                                 
Total params: 1,313,760
Trainable params: 1,313,760
Non-trainable params: 0
_________________________________________________________________


## Fitting the model

In [21]:
epochs = 100
batch_size = 32
# Honour the `train_model` switch so a loaded model can be used without
# being retrained.
if train_model:
    history = model.fit(X, y, epochs=epochs, batch_size=batch_size, shuffle = True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fba3c453810>

In [26]:
# Save the model in .h5 format.
# exist_ok keeps this cell safe to re-run once the folder exists.
os.makedirs("./saved_models/", exist_ok=True)
h5_path = f'./saved_models/lr={learning_rate}.h5'
model.save(h5_path)

if working_on_colab:
    # Copy the .h5 file to the mounted Drive folder.
    os.system(f"cp -r {h5_path} {dir_to_drive_Colab}/")


# Save the model in TensorFlow format directly (written next to the notebook,
# not inside ./saved_models/).
tf_path = f'./lr={learning_rate}_tf'
model.save(tf_path, save_format='tf')
if working_on_colab:
    os.system(f"cp -r {tf_path} {dir_to_drive_Colab}/")

# Load the saved model back and show its summary to confirm it loads.
loaded_model = load_model(tf_path)

# retraining the model if needed, etc. ...
loaded_model.summary()



INFO:tensorflow:Assets written to: ./lr=0.001_tf/assets


INFO:tensorflow:Assets written to: ./lr=0.001_tf/assets


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         265600    
                                                                 
 lstm (LSTM)                 (None, 256)               365568    
                                                                 
 dense (Dense)               (None, 2656)              682592    
                                                                 
Total params: 1,313,760
Trainable params: 1,313,760
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Create the saved_models folder; exist_ok keeps this cell safe to re-run.
os.makedirs("./saved_models/", exist_ok=True)

# Save the model in .h5 format.
h5_path = f'./saved_models/lr={learning_rate}.h5'
model.save(h5_path)

# Load the model back from the same file and check its summary.
new_model = load_model(h5_path)
new_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         265600    
                                                                 
 lstm (LSTM)                 (None, 256)               365568    
                                                                 
 dense (Dense)               (None, 2656)              682592    
                                                                 
Total params: 1,313,760
Trainable params: 1,313,760
Non-trainable params: 0
_________________________________________________________________


# Generating text

In [28]:
def sample_with_temp(preds, temperature=1.0):
    """Sample an index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Divides the log-probabilities before they are
            re-normalised. Values below 1 sharpen the distribution and values
            above 1 flatten it. A temperature of 0 or less returns the index
            of the largest probability without sampling.

    Returns:
        The index of the sampled entry.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a non-positive temperature is undefined; pick the most
    # likely index instead.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) = -inf is harmless here: exp() maps it back to probability 0.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift by the maximum so the largest term is exp(0) = 1. Without this,
    # small temperatures underflow every term to 0 and the division yields NaN.
    exp_preds = np.exp(scaled - np.max(scaled))
    preds = exp_preds / np.sum(exp_preds)

    # Draw a single one-hot sample and return the position of its 1.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)



def generate_text(seed_text, next_words, model, max_sequence_len, temp):
    """Generate up to ``next_words`` tokens that continue ``seed_text``.

    Uses the notebook-level ``tokenizer`` and ``start_story`` and samples each
    token with ``sample_with_temp``.

    Args:
        seed_text: Text to continue; may be empty.
        next_words: Maximum number of tokens to generate.
        model: Model whose ``predict`` returns one probability per token id.
        max_sequence_len: Maximum number of most recent tokens fed to the model.
        temp: Sampling temperature passed to ``sample_with_temp``.

    Returns:
        ``seed_text`` followed by the generated tokens, each followed by a
        space. Generation stops early when the '~' token is sampled.
    """
    output_text = seed_text
    # Prefix the start-of-story marker to the seed as context.
    seed_text = start_story + seed_text

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = token_list[-max_sequence_len:]
        # (1, -1) rather than (1, max_sequence_len): a context shorter than
        # the window is passed through instead of raising on reshape.
        token_list = np.reshape(token_list, (1, -1))

        probs = model.predict(token_list, verbose=0)[0]
        y_class = sample_with_temp(probs, temperature = temp)

        # Index 0 is treated as "no word" rather than looked up.
        output_word = '' if y_class == 0 else tokenizer.index_word[y_class]

        # '~' is the start-of-story marker: stop at the next story boundary.
        if output_word == "~":
            break

        # The word-level and character-level branches appended the same text,
        # so a single path covers both token types.
        output_text += output_word + " "
        seed_text += output_word + " "

    return output_text


In [29]:
seed_text = ""
gen_words = 50

# Generate one sample per temperature, from the lowest to the highest.
for label, temperature in (('Temp 0.2', 0.2), ('Temp 0.33', 0.33), ('Temp 0.5', 0.5)):
    print(label)
    print(generate_text(seed_text, gen_words, model, seq_length, temp = temperature))

Temp 0.2
. chapter . the didn't see how to see she was here about this - tone was long ; and the great little was in the same tone , and she thought it to be came ! . ` it as great long ! . but you perhaps they don't 
Temp 0.33
. chapter the very queen . so very long and she began be a gryphon in a large queen found this way do get they out of . i replied , if you . ' . ` i don't tell off , ' the mock turtle replied : . ` 
Temp 0.5
. chapter hatter . caterpillar . ` and are me ! ' he out . ` yet , so he should ! ' the hatter went on in a - out . ` which have never to do that you ? ' so we then looked at it . ` 


In [30]:
# Generate one more sample, this time with a temperature of 1.
print('Temp 1.0')
print (generate_text(seed_text, gen_words, model, seq_length, temp = 1))

Temp 1.0
. chapter his from the mock turtle to see a other know they called . as they sat getting off however it . ) . ` come on into a your , ' said the gryphon , ` i know ' . ) . ` that's the right talking , 
