In [None]:
# Mount Google Drive (Colab only) so files under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Generating Quotes using LSTM**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import layers
from keras.models import Sequential
import keras.utils as ku
from keras.callbacks import EarlyStopping

In [None]:
# Loading the dataset
# The CSV is read from the Google Drive folder mounted at /content/drive.
data = pd.read_csv('/content/drive/MyDrive/NLP PROJECT/eng/eng_quotes.csv')
print(data.shape)
# Last expression of the cell: displays the first five rows.
data.head()

(3001, 5)


Unnamed: 0,index,quote,author,tags,likes
0,0,Be yourself; everyone else is already taken.,Oscar Wilde,attributed-no-source;be-yourself;honesty;inspi...,149270
1,1,You've gotta dance like there's nobody watching,William W. Purkey,dance;heaven;hurt;inspirational;life;love;sing,118888
2,2,Be the change that you wish to see in the world.,Mahatma Gandhi,action;change;inspirational;philosophy;wish,106749
3,3,No one can make you feel inferior without your...,"Eleanor Roosevelt,",confidence;inspirational;wisdom,85854
4,4,Live as if you were to die tomorrow. Learn as ...,Mahatma Gandhi,carpe-diem;education;inspirational;learning,73033


In [None]:
# Collect the quote texts as a plain list of strings.
# Missing values are dropped and everything is cast to str so the tokenizer
# never receives a float NaN (which has no .lower() and would raise).
quotes = data['quote'].dropna().astype(str).tolist()

In [None]:
# Tokenization
# Module-level tokenizer shared by generate_sequences (fitting) and generate_quote (encoding / word lookup).
tokenizer = Tokenizer()

# Function to create the sequences
def generate_sequences(corpus):
    """Fit the shared tokenizer on ``corpus`` and build n-gram prefix sequences.

    For every line, each prefix of its token-id sequence with at least two
    tokens becomes one training sequence.

    Args:
        corpus: iterable of text lines.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences`` is
        a list of token-id lists and ``total_words`` is the size of the fitted
        word index plus one.
    """
    tokenizer.fit_on_texts(corpus)
    total_words = 1 + len(tokenizer.word_index)
    print(f"Total unique words in the text corpus: {total_words}")

    input_sequences = []
    for text in corpus:
        token_ids = tokenizer.texts_to_sequences([text])[0]
        # Prefixes of length 2..len(token_ids), shortest first.
        input_sequences.extend(
            token_ids[:end] for end in range(2, len(token_ids) + 1)
        )

    return input_sequences, total_words

# Generating sequences
input_sequences, total_words = generate_sequences(quotes)
input_sequences[:5]

Total unique words in the text corpus: 1362


[[12, 67],
 [12, 67, 153],
 [12, 67, 153, 216],
 [12, 67, 153, 216, 4],
 [12, 67, 153, 216, 4, 381]]

The data is now in the required format, but the sequences have different lengths. So, before feeding them into the model, we will first pad the sequences to the same length.

Also, we need to create predictors and labels from the prepared sequences by taking all the tokens except the last one as the predictors and the last token as the label. (For example, for the text "Don't cry because", "Don't cry" is the predictor and "because" is the label.)

In [None]:
# Generating predictors and labels from the padded sequences
def generate_input_sequence(input_sequences, num_classes=None):
    """Pad the sequences and split them into predictors and one-hot labels.

    Args:
        input_sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot label vectors. When omitted, the
            module-level ``total_words`` is used, which keeps existing
            single-argument calls working.

    Returns:
        A tuple ``(predictors, label, maxlen)``: all tokens but the last of
        each padded sequence, the one-hot encoded last token, and the padded
        sequence length.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level vocabulary size.
        num_classes = total_words

    maxlen = max(len(x) for x in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=maxlen)
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, maxlen

predictors, label, maxlen = generate_input_sequence(input_sequences)
predictors[:1], label[:1]

(array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0, 12]]),
 array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32))

Finally, we are done with the preprocessing part of the task. Now, we will start building our LSTM model for text generation. You can think of this model as a multiclass text classification task: given the previous words, the model predicts the next word with the highest probability.

**Model Architecture:**
* Embedding layer with the embedding dimension of 64
* LSTM Layer with 128 units with dropout
* A dense layer with the number of units equal to the total number of words in the vocabulary, with **softmax** activation since it is a multiclass classification task.
* The optimizer we use here is **Adam**, the loss is **categorical_crossentropy**, and the model is trained for 50 epochs.

In [None]:
maxlen

198

In [None]:
# Building the model
embedding_dim = 64

def create_model(maxlen, embedding_dim, total_words):
    """Build and compile the next-word prediction network.

    Args:
        maxlen: length of each input sequence fed to the embedding layer.
        embedding_dim: size of the embedding vectors.
        total_words: vocabulary size; used for both the embedding input
            dimension and the width of the softmax output layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        layers.Embedding(total_words, embedding_dim, input_length=maxlen),
        layers.LSTM(128, dropout=0.2),
        layers.Dense(total_words, activation='softmax'),
    ])

    # Labels are one-hot vectors, hence categorical cross-entropy.
    network.compile(optimizer='adam', loss='categorical_crossentropy')
    return network

model = create_model(maxlen-1, embedding_dim, total_words)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 197, 64)           87168     
                                                                 
 lstm_3 (LSTM)               (None, 128)               98816     
                                                                 
 dense_3 (Dense)             (None, 1362)              175698    
                                                                 
Total params: 361,682
Trainable params: 361,682
Non-trainable params: 0
_________________________________________________________________


In [None]:
predictors.shape , label.shape, maxlen

((6034, 197), (6034, 1362), 198)

In [None]:
# Training the model
# NOTE(review): no validation data or callbacks are passed, so the EarlyStopping import is unused here.
model.fit(predictors, label, epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1b5baaccd10>

Training the model for just 50 epochs took almost two hours. So, we will save the model to avoid retraining every time we want to generate a piece of text.

In [None]:
# Save the model for later use
# model.save("Quotes_generator.h5")

In [None]:
# Loading the model
# NOTE(review): this relative path differs from the /content/drive paths used for the dataset,
# and the model.save call in the previous cell is commented out — confirm the file exists here.
from keras.models import load_model

Quotes_gen = load_model("../input/quote-generator-trained-model/Quotes_generator.h5")

In [None]:
Quotes_gen.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 82, 64)            629504    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 9836)              1268844   
Total params: 1,997,164
Trainable params: 1,997,164
Non-trainable params: 0
_________________________________________________________________


Now that we have our trained model, we will create a function to generate text.

The function takes in the trained model, the input words (also called the seed text), how many words to generate, and the maximum sequence length. The function then tokenizes the text, pads it, and predicts the next word using our trained model.

The model predicts one word at a time. So after every prediction, we look up the word for the predicted label and append it to the seed_text. This process continues for the specified number of words you want to generate. Once it is done, the resulting text is returned.


In [None]:
# Text generating function
def generate_quote(seed_text, num_words, model, maxlen):
    """Extend ``seed_text`` with up to ``num_words`` words predicted by ``model``.

    Each step encodes the current text with the shared tokenizer, left-pads it
    to ``maxlen``, takes the most probable next-word index from the model and
    appends the corresponding word.

    Args:
        seed_text: starting text.
        num_words: maximum number of words to append.
        model: trained model exposing ``predict``.
        maxlen: sequence length the model expects as input.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Generation stops early if the predicted index has no word in
        the tokenizer (e.g. the padding index 0).
    """
    for _ in range(num_words):
        tokens = tokenizer.texts_to_sequences([seed_text])[0]
        tokens = pad_sequences([tokens], maxlen=maxlen, padding='pre')

        # Single-sample batch: take the arg-max over its vocabulary axis.
        probabilities = model.predict(tokens, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # index_word is the tokenizer's reverse mapping; avoids scanning word_index.
        output_word = tokenizer.index_word.get(predicted, '')
        if not output_word:
            # The text would not change, so every further step would repeat this prediction.
            break

        # No "\n" prefix: it previously added one blank line per generated word.
        seed_text = seed_text + " " + output_word

    return seed_text

In [None]:
# Let's try to generate some quotes
print(generate_quote("Passion", num_words = 10, model= model, maxlen=maxlen-1))











Passion women seldom make a heart should be so hidden in


In [None]:
# Pad to the sequence length Quotes_gen was built with, not this notebook's maxlen.
# NOTE(review): the Quotes_gen summary shows a 9836-unit output layer while the tokenizer
# fitted here has 1362 words — confirm the tokenizer matching that model is in use.
print(generate_quote("legend", num_words = 15, model= Quotes_gen, maxlen=Quotes_gen.input_shape[1]))

legend he be a response it's finest books you love a good work of live and


In [None]:
# Pad to the sequence length Quotes_gen was built with, not this notebook's maxlen.
# NOTE(review): the Quotes_gen summary shows a 9836-unit output layer while the tokenizer
# fitted here has 1362 words — confirm the tokenizer matching that model is in use.
print(generate_quote("consistency matters", num_words = 15, model= Quotes_gen, maxlen=Quotes_gen.input_shape[1]))

consistency matters not the some of self world's jump know in one not the seeing of the
