**Text generation model using LSTM**

# Dataset
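Download the plain-text corpus (Project Gutenberg, *The Complete Works of William Shakespeare*) from:
https://www.gutenberg.org/cache/epub/100/pg100.txt

Save the file as `shakespeare.txt`. The loading cell below reads it from `/content/shakespeare.txt`.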

In [1]:
#Import Libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
import re


In [2]:
# Read the raw corpus from disk
with open("/content/shakespeare.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Lowercase, then drop every character that is not a-z, whitespace,
# or one of the punctuation marks . , ; ! ? (those are kept on purpose)
text = re.sub(r'[^a-z\s.,;!?]', '', raw_text.lower())


print("Total characters:", len(text))


Total characters: 5319235


In [3]:
#Character-Level Tokenization
# Vocabulary: every distinct character of the cleaned corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Two lookup tables: index -> character and its inverse character -> index
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: idx for idx, ch in idx_to_char.items()}

# Replace each character of the corpus by its integer index
encoded_text = np.array([char_to_idx[ch] for ch in text])

print("Vocabulary size:", vocab_size)


Vocabulary size: 34


In [4]:
#Create Input–Output Sequences
# Each training sample is a window of `seq_length` consecutive character
# indices (X) paired with the index of the character that follows it (y).
seq_length = 80

if len(encoded_text) <= seq_length:
    raise ValueError(
        "Corpus is too short: need more than %d characters, got %d"
        % (seq_length, len(encoded_text))
    )

# sliding_window_view yields every length-`seq_length` window as a strided
# view, so no Python-level loop or list of millions of small arrays is needed.
# The final window has no following character, hence the [:-1].
windows = np.lib.stride_tricks.sliding_window_view(encoded_text, seq_length)

# Materialise into ordinary, writable arrays (same shape, dtype and values
# as stacking the windows one by one).
X = np.ascontiguousarray(windows[:-1])
y = encoded_text[seq_length:].copy()

print("Input shape:", X.shape)


Input shape: (5319155, 80)


In [None]:
#Build the LSTM Model
# Architecture: character embedding -> two stacked LSTMs -> softmax over the
# vocabulary (next-character prediction).
# The fixed window length is declared with an explicit Input layer; the
# `input_length` argument of Embedding is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(seq_length,)),
    Embedding(vocab_size, 100),
    LSTM(256, return_sequences=True),
    LSTM(256),
    Dense(vocab_size, activation='softmax')
])

# Targets are integer class indices, so use the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam'
)

# Train model
# Stop early if the training loss has not improved for 3 consecutive epochs.
early_stop = EarlyStopping(monitor='loss', patience=3)

# Write the model to disk at the end of every epoch so that an interrupted
# run still leaves a usable "lstm_text_generator.h5" for later loading.
checkpoint = tf.keras.callbacks.ModelCheckpoint("lstm_text_generator.h5")

model.fit(
    X,
    y,
    epochs=5,
    batch_size=128,
    callbacks=[early_stop, checkpoint]
)





Epoch 1/5
[1m41556/41556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1156s[0m 28ms/step - loss: 1.6372
Epoch 2/5
[1m41556/41556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1149s[0m 28ms/step - loss: 1.3063
Epoch 3/5
[1m41556/41556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1149s[0m 28ms/step - loss: 1.2733
Epoch 4/5
[1m41556/41556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1149s[0m 28ms/step - loss: 1.2622
Epoch 5/5
[1m 7541/41556[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m15:40[0m 28ms/step - loss: 1.2506

In [10]:
# Load the trained model from disk.
# "lstm_text_generator.h5" must already exist in the working directory.
from tensorflow.keras.models import load_model

model = load_model("lstm_text_generator.h5")




In [11]:
def sample_with_temperature(preds, temperature=0.8):
    """Sample an index from a probability vector after temperature scaling.

    Args:
        preds: 1-D sequence of probabilities (e.g. one softmax output row).
        temperature: Scaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten
            it. A temperature of exactly 0 selects the most likely index
            deterministically.

    Returns:
        int: The sampled index, in ``range(len(preds))``.
    """
    preds = np.asarray(preds).astype('float64')
    # The small epsilon keeps log() finite for zero-probability entries.
    logits = np.log(preds + 1e-8)

    if temperature == 0:
        # Limit of temperature -> 0 is greedy decoding; avoids dividing by zero.
        return int(np.argmax(logits))

    logits = logits / temperature
    # Subtract the maximum before exponentiating: this leaves the softmax
    # unchanged but prevents every term from underflowing to 0 (which would
    # produce NaN probabilities) when the temperature is very small.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    return int(np.random.choice(len(probs), p=probs))


In [12]:
# Generator function
def generate_text(seed_text, length=300, temperature=0.8):
    """Generate text one character at a time, starting from a seed string.

    Relies on the notebook-level names ``model``, ``char_to_idx``,
    ``idx_to_char``, ``seq_length`` and ``sample_with_temperature``.

    Args:
        seed_text: Starting text; it is lowercased before use.
        length: Number of characters to generate after the seed.
        temperature: Sampling temperature passed to
            ``sample_with_temperature``.

    Returns:
        str: The lowercased seed followed by ``length`` generated characters.
    """
    seed_text = seed_text.lower()
    pieces = [seed_text]

    # Rolling context fed to the model: at most the last `seq_length` chars.
    context = seed_text[-seq_length:]

    for _ in range(length):
        # Characters missing from the vocabulary fall back to index 0.
        encoded_seed = [char_to_idx.get(c, 0) for c in context]
        # Left-pad short contexts with 0 up to the model's input length.
        # Note that index 0 is also a real vocabulary character, because no
        # index was reserved for padding when the vocabulary was built.
        encoded_seed = tf.keras.preprocessing.sequence.pad_sequences(
            [encoded_seed], maxlen=seq_length
        )

        prediction = model.predict(encoded_seed, verbose=0)
        next_index = sample_with_temperature(prediction[0], temperature=temperature)
        next_char = idx_to_char[next_index]

        pieces.append(next_char)
        # Let the context grow until it fills the window, then slide it.
        # Dropping the first character on every step would pin the context
        # to the seed's original length.
        context = (context + next_char)[-seq_length:]

    return "".join(pieces)




In [13]:
# Generate a sample continuation of a seed phrase and show it
sample_output = generate_text("to be or not to be")
print(sample_output)


to be or not to beliet not a vanish of rands, soothmens the self elspio.

reachthum.
good masters woodgess.
a fellow are pity of greating.

gaoler.
assounce, gentleman.

costard.
such a business in the foolock was this bastard.

thaisabinbure.

paris.
gives the earth and servants, cordelia.

titus.
he shall be sooner
