<a href="https://colab.research.google.com/github/Sara-Esm/Natural-Language-Processing-NLP-and-generative-AI/blob/main/3_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Text Generation Using LSTM

This project trains an LSTM model on Shakespeare's plays to generate new text. It involves text preprocessing, tokenization, sequence padding, and training a bidirectional LSTM network to predict and generate text based on a seed phrase.
**NOTE:** Switch to GPU Runtime

In [1]:
## Downloading the Dataset

import kagglehub
import os

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

path = kagglehub.dataset_download("kingburrito666/shakespeare-plays")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kingburrito666/shakespeare-plays?dataset_version_number=4...


100%|██████████| 4.55M/4.55M [00:00<00:00, 78.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/kingburrito666/shakespeare-plays/versions/4


In [2]:
# Load the dataset
import pandas as pd
reader = pd.read_csv(f"/root/.cache/kagglehub/datasets/kingburrito666/shakespeare-plays/versions/4/Shakespeare_data.csv", delimiter=',')
corpus = []

# Correct the indentation here
for row in reader.iterrows():  # Use .iterrows() to iterate through the rows
    corpus.append(row[1][5])  # Access the text column properly

print(f"Number of lines in the corpus: {len(corpus)}")
print("Sample text:", corpus[:3])

  corpus.append(row[1][5])  # Access the text column properly


Number of lines in the corpus: 111396
Sample text: ['ACT I', 'SCENE I. London. The palace.', 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others']


In [3]:
# Clean text
import string
def text_cleaner(text):
    text = "".join(car for car in text if car not in string.punctuation).lower()
    text = text.encode("utf8").decode("ascii", 'ignore')
    return text

corpus = [text_cleaner(line) for line in corpus]
print("Cleaned text sample:", corpus[:10])

Cleaned text sample: ['act i', 'scene i london the palace', 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others', 'so shaken as we are so wan with care', 'find we a time for frighted peace to pant', 'and breathe shortwinded accents of new broils', 'to be commenced in strands afar remote', 'no more the thirsty entrance of this soil', 'shall daub her lips with her own childrens blood', 'nor more shall trenching war channel her fields']


In [8]:
## Tokenizing and Preparing Input Sequences

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

corpus = corpus[:5000]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

word_index = tokenizer.word_index
total_words = len(word_index) + 1
print(f"Total unique words: {total_words}")

input_sequences = []

for sentence in corpus:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad input sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Split predictors and labels
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode labels
label = tf.keras.utils.to_categorical(label, num_classes=total_words)
print(f"Shape of predictors: {predictors.shape}")
print(f"Shape of labels: {label.shape}")

Total unique words: 5411
Shape of predictors: (32802, 33)
Shape of labels: (32802, 5411)


In [10]:
## Building and Training the LSTM Model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(512)))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.build(input_shape=(None, 50))
print(model.summary())

None


In [16]:
# Train
model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 16ms/step - loss: 5.6840
Epoch 2/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 16ms/step - loss: 5.3932
Epoch 3/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 16ms/step - loss: 5.0409
Epoch 4/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - loss: 4.6838
Epoch 5/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - loss: 4.3480
Epoch 6/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 16ms/step - loss: 4.0158
Epoch 7/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 16ms/step - loss: 3.7299
Epoch 8/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - loss: 3.4603
Epoch 9/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - loss: 3.2274
Epoch 10/10
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x780e484da830>

In [17]:
## Generating Text

seed_text = "To be or not to be"
next_words = 20

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predict_x = model.predict(token_list, verbose=0)
    predicted = np.argmax(predict_x, axis=1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print("Generated Text:", seed_text)

Generated Text: To be or not to be undone too somerset and somerset and exeter and basset and others to his soldiers talbot them and put and put
