In [1]:
import os
import random
import re
from pathlib import Path

import numpy as np
import requests
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
# --- Data source -------------------------------------------------------------
DATAFILE = "shakespeare.txt"      # local copy of the corpus
# Alternative location of the same e-book (disabled):
#GUTENBERG_URL = "https://www.gutenberg.org/files/100/100-0.txt"
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/100/pg100.txt"

# --- Model / training hyper-parameters ---------------------------------------
SEQ_LENGTH = 30                   # number of context words fed to the model per sample
EMBED_DIM = 100                   # output dimension of the Embedding layer
LSTM_UNITS = 256                  # units in the LSTM layer
BATCH_SIZE = 128                  # batch size passed to model.fit
EPOCHS = 40                       # upper bound on epochs (EarlyStopping may stop sooner)
VALIDATION_SPLIT = 0.1            # fraction of samples held out for validation
MODEL_CHECKPOINT = 'best_lstm.h5' # file written by the ModelCheckpoint callback
RANDOM_SEED = 42

# --- Reproducibility ---------------------------------------------------------
# Seed every RNG the notebook relies on: Python's `random`, NumPy (used for the
# train/val shuffle and for sampling during generation) and TensorFlow (weight
# initialisation, dropout). Without the TensorFlow seed two runs of the same
# notebook train different models.
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reproducibility is required.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [3]:
def download_dataset_if_missing():
    """Make sure ``DATAFILE`` exists locally, downloading it if necessary.

    If the file is already present nothing is downloaded. Otherwise the body of
    ``GUTENBERG_URL`` is fetched and stored as ``DATAFILE``. Failures are
    reported on stdout (including the underlying error) instead of being
    raised, so the notebook can tell the user to place the file manually.

    Returns:
        bool: ``True`` if ``DATAFILE`` exists when the function returns,
        ``False`` if the download failed.
    """
    if os.path.exists(DATAFILE):
        print(f"Found {DATAFILE} - using existing file.")
        return True

    print(f"{DATAFILE} not found. Attempting to download from Gutenberg...")
    # Download into a temporary sibling file first: an interrupted write must
    # not leave a truncated DATAFILE behind that later passes the exists() check.
    partial_path = DATAFILE + ".part"
    try:
        response = requests.get(GUTENBERG_URL, timeout=30)
        response.raise_for_status()
        # Store the raw bytes exactly as served. Going through `response.text`
        # depends on the charset requests guesses from the HTTP headers, and
        # text-mode writing would also translate newlines on some platforms.
        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, DATAFILE)
    except Exception as e:  # best-effort: report and let the caller decide
        print("Download failed! Please place the file manually as", DATAFILE)
        print("Reason:", repr(e))
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass  # nothing more we can do about a stale temp file
        return False

    print(f"Saved to {DATAFILE}.")
    return True

# Fetch the corpus if it is not on disk yet (no-op when the file already exists).
download_dataset_if_missing()

# Fail here with a clear message instead of letting a later cell crash with a
# NameError on `raw_text`.
if not os.path.exists(DATAFILE):
    raise FileNotFoundError(
        "Dataset not available. Please download and place 'shakespeare.txt' in this folder."
    )

# errors="ignore" drops any bytes that are not valid UTF-8 rather than failing.
with open(DATAFILE, "r", encoding="utf-8", errors="ignore") as f:
    raw_text = f.read()
print("Raw text length (chars):", len(raw_text))

Raw text length (chars): 5359443


In [None]:
import re

def clean_text(s: str) -> str:
    """Normalise raw Project Gutenberg text into a single line of plain words.

    Steps:
      1. Lower-case everything.
      2. Drop the Gutenberg header (everything up to and including the
         ``*** START OF THE/THIS PROJECT GUTENBERG EBOOK ...`` line) and the
         footer (from the ``*** END OF THE/THIS PROJECT GUTENBERG EBOOK`` marker
         to the end). Text without these markers is left untouched.
      3. Convert typographic apostrophes (U+2019) to ``'`` so contractions
         survive the character filter below.
      4. Replace every character other than ``a-z``, ``0-9``, ``'`` and
         whitespace with a space, then collapse whitespace runs.

    Args:
        s: Raw text.

    Returns:
        The cleaned text, words separated by single spaces, no leading or
        trailing whitespace.
    """
    s = s.lower()

    # Gutenberg files use either "THE" or "THIS" in the marker line (and some
    # omit the space after the asterisks); accept all of these variants.
    s = re.sub(
        r"\A.*?\*\*\* ?start of (?:the|this) project gutenberg ebook[^\n]*\n?",
        '',
        s,
        count=1,
        flags=re.DOTALL,
    )
    s = re.sub(
        r"\*\*\* ?end of (?:the|this) project gutenberg ebook.*",
        '',
        s,
        count=1,
        flags=re.DOTALL,
    )

    # Typographic apostrophe -> ASCII apostrophe, otherwise "summer’s" would
    # be split into "summer" and a stray "s" by the filter below.
    s = s.replace("\u2019", "'")

    # keep only letters, numbers, apostrophes, and whitespace
    s = re.sub(r"[^a-z0-9'\s]", ' ', s)
    s = re.sub(r"\s+", ' ', s)
    return s.strip()

# Clean the text
text = clean_text(raw_text)
print('Clean text length (chars):', len(text))

# --- Tokenization -------------------------------------------------------------
# Tokenizer, to_categorical and numpy come from the import cell at the top of
# the notebook; SEQ_LENGTH comes from the configuration cell (it is no longer
# re-defined here, so changing the config actually takes effect).
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 because index 0 is reserved (used as padding at generation time)
print("Vocabulary size:", vocab_size)

words = text.split()
print("Total words:", len(words))

# --- Sequence preparation -----------------------------------------------------
# Encode the whole corpus once, then slide a window of SEQ_LENGTH + 1 ids over
# it. This yields the same rows as joining every 31-word slice back into a
# string and re-tokenising it, without building ~len(words) temporary strings.
encoded_words = np.array(tokenizer.texts_to_sequences([text])[0])
assert len(encoded_words) == len(words), "tokenizer and text.split() disagree on word boundaries"

if len(encoded_words) <= SEQ_LENGTH:
    raise ValueError(
        f"Corpus has only {len(encoded_words)} words; need more than SEQ_LENGTH={SEQ_LENGTH}."
    )

# Shape: (len(words) - SEQ_LENGTH, SEQ_LENGTH + 1). This is a read-only view
# onto `encoded_words`, so it costs no extra memory.
sequences_encoded = np.lib.stride_tricks.sliding_window_view(encoded_words, SEQ_LENGTH + 1)
print("Total sequences created:", len(sequences_encoded))

# First SEQ_LENGTH ids are the model input, the last id is the word to predict.
X = sequences_encoded[:, :-1]
y = sequences_encoded[:, -1]

print("X shape:", X.shape)
print("y shape:", y.shape)

# NOTE(review): the one-hot target has len(y) * vocab_size entries, which is very
# large for the full corpus. Keeping integer targets would avoid this, but that
# requires switching the loss in the compile cell to
# "sparse_categorical_crossentropy" at the same time.
y = to_categorical(y, num_classes=vocab_size)
print("One-hot encoded y shape:", y.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out VALIDATION_SPLIT of the samples for validation; RANDOM_SEED makes the
# shuffle repeatable.
# NOTE(review): consecutive windows overlap in all but one word, so a shuffled
# split puts near-duplicates of training windows into the validation set and
# val_loss will look optimistic - consider a contiguous split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VALIDATION_SPLIT, random_state=RANDOM_SEED
)
print("Train samples: ", X_train.shape[0])
print("Val samples: ", X_val.shape[0])

In [None]:
# Next-word predictor: word ids -> embeddings -> single LSTM -> dropout ->
# softmax over the whole vocabulary.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=EMBED_DIM, input_length=SEQ_LENGTH),
    LSTM(LSTM_UNITS, return_sequences=False),  # only the final state feeds the classifier
    Dropout(0.2),
    Dense(vocab_size, activation="softmax"),
])

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()


In [None]:
# Stop when validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

# Write the model to MODEL_CHECKPOINT whenever validation loss improves.
checkpoint = ModelCheckpoint(
    MODEL_CHECKPOINT,
    save_best_only=True,
    monitor="val_loss",
)

callbacks = [early_stopping, checkpoint]

In [None]:
# Train for at most EPOCHS epochs; the callbacks may end training earlier.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
)

In [None]:
def sample_with_temperature(preds, temperature = 1.0):
    """Draw one class index from a probability vector, reshaped by temperature.

    The probabilities are converted to log-space, divided by ``temperature``,
    re-normalised with a softmax and a single index is drawn from the result
    using NumPy's global RNG.

    Args:
        preds: 1-D array-like of class probabilities.
        temperature: Positive float. Values below 1 sharpen the distribution
            (more conservative choices), values above 1 flatten it.

    Returns:
        int: The sampled class index.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")

    probs = np.asarray(preds).astype("float64")
    # The small epsilon keeps log() finite for zero probabilities.
    logits = np.log(probs + 1e-10) / temperature
    # Subtracting the maximum leaves the softmax unchanged mathematically but
    # prevents every exp() from underflowing to 0 at very low temperatures.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    draw = np.random.multinomial(1, probs, 1)
    return int(np.argmax(draw))


# Backward-compatible alias for the original (mis-capitalised) name.
sample_With_temperature = sample_with_temperature


def generate_text(seed_text, next_words = 50, temperature = 1.0):
    """Extend ``seed_text`` word by word using the trained model.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``SEQ_LENGTH``.
    At each step the last ``SEQ_LENGTH`` words are encoded (left-padded with 0
    when the text is shorter), the model predicts a distribution over the
    vocabulary, and the next word is sampled with ``sample_with_temperature``.

    Args:
        seed_text: Starting text; it is split on whitespace.
        next_words: Maximum number of words to append.
        temperature: Sampling temperature forwarded to
            ``sample_with_temperature``.

    Returns:
        str: The seed words followed by the generated words, space-separated.
    """
    result = seed_text.split()
    for _ in range(next_words):
        context = ' '.join(result[-SEQ_LENGTH:])
        # Keep at most SEQ_LENGTH ids so the input always matches the model's
        # expected sequence length.
        token_list = tokenizer.texts_to_sequences([context])[0][-SEQ_LENGTH:]
        if len(token_list) < SEQ_LENGTH:
            token_list = [0] * (SEQ_LENGTH - len(token_list)) + token_list
        token_array = np.array(token_list).reshape(1, -1)
        preds = model.predict(token_array, verbose = 0)[0]
        next_index = sample_with_temperature(preds, temperature = temperature)
        if next_index == 0:
            # Index 0 is the padding slot and maps to no word: stop generating.
            break
        next_word = tokenizer.index_word.get(next_index, '')
        if not next_word:
            # Unknown index (tokenizer/model mismatch): skip rather than
            # appending an empty string.
            continue
        result.append(next_word)
    return ' '.join(result)

    

In [None]:
# Prompts used to eyeball the trained model. They are written lower-case and
# without punctuation.
seeds = [
    "to be or not to be that is the question",
    "o romeo romeo wherefore art thou romeo",
    "shall i compare thee to a summer s day",
]

print("\n=== Sample generations ===")
for seed in seeds:
    print("\n--- Seed:", seed)
    try:
        # 40 extra words per prompt, sampled at temperature 0.8.
        print(generate_text(seed, next_words=40, temperature=0.8))
    except Exception as err:
        # Keep going with the remaining prompts; just report what went wrong.
        print("Generation failed (maybe tokenizer vocab mismatch):", err)

In [None]:
import pickle
# Persist the fitted tokenizer next to the model checkpoint so the same
# word <-> id mapping can be reloaded for generation later.
Path('tokenizer.pkl').write_bytes(pickle.dumps(tokenizer))

print("\nDone. Model saved to", MODEL_CHECKPOINT, "and tokenizer to tokenizer.pkl")