# Importing Necessary Libraries

In [50]:
# -------------------------------
# Imports and library setup
# -------------------------------
import os
import re
import shutil

import kagglehub
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

In [51]:
# -------------------------------
# Download dataset using KaggleHub
# -------------------------------

In [52]:
# Fetch the latest version of the corpus through KaggleHub and keep the
# location it reports for the downloaded files.
path = kagglehub.dataset_download(
    "ashishpandey2062/next-word-predictor-text-generator-dataset"
)

# Echo the location so it can be compared with the paths used further down.
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'next-word-predictor-text-generator-dataset' dataset.
Path to dataset files: /kaggle/input/next-word-predictor-text-generator-dataset


In [53]:
# -------------------------------
# List input files and set up local directory
# -------------------------------
# List Kaggle input files and prepare destination directory for local copy

In [54]:
# Listing the Kaggle input root raises immediately if it is not available
# (the listing itself is not used).
os.listdir("/kaggle/input")

# Where the dataset lives under the Kaggle input root, and the local
# working directory it will be copied into.
Source_Dir = "/kaggle/input/next-word-predictor-text-generator-dataset"
Dest_Dir = "/content/data"

# Create the working directory; an already existing one is not an error.
os.makedirs(name=Dest_Dir, exist_ok=True)

In [55]:
# -------------------------------
# Copy dataset to local working directory
# -------------------------------
# Copy dataset folder to /content/data for processing

In [56]:
# Mirror the dataset folder into Dest_Dir under its original folder name.
# dirs_exist_ok=True lets the cell be re-run without failing on an existing
# copy. The call stays the last expression so the cell shows the target path.
shutil.copytree(
    src=Source_Dir,
    dst=os.path.join(Dest_Dir, os.path.basename(Source_Dir)),
    dirs_exist_ok=True,
)

'/content/data/next-word-predictor-text-generator-dataset'

In [57]:
# -------------------------------
# Load text file from dataset
# -------------------------------
# Read the main text file into a string variable

In [58]:
# Location of the raw corpus inside the local copy of the dataset.
data_path = "/content/data/next-word-predictor-text-generator-dataset/next_word_predictor.txt"

# Read the whole file into memory as one string. errors="ignore" silently
# drops any bytes that are not valid UTF-8 instead of raising.
with open(data_path, mode="r", encoding="utf-8", errors="ignore") as corpus_file:
    text_data = corpus_file.read()


In [59]:
# Quick sanity check of the loaded corpus. Printed in order:
#   1. the type        -> should be <class 'str'>
#   2. the length      -> should be > 10000 characters (ideally)
#   3. the first 200 characters -> should be readable text
for check in (type(text_data), len(text_data), text_data[:200]):
    print(check)


<class 'str'>
167445
The sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. People were out enjoying the beautiful weather, some sitting in the park, others taking a 


In [60]:
# -------------------------------
# Preprocess the text
# -------------------------------
# Clean text by lowercasing, removing non-alphanumeric chars, and stripping extra spaces


In [61]:
def preprocess_text(text):
    """Normalise raw text before tokenisation.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (not replaced by a space), runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    # Applied in order: strip disallowed characters, then squeeze whitespace.
    substitutions = (
        (r"[^a-z0-9\s]", ""),
        (r"\s+", " "),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Cleaned version of the raw text; this is what the tokenizer is fitted on.
corpus = preprocess_text(text_data)

In [62]:
# -------------------------------
# Tokenize the text
# -------------------------------
# Fit a Keras tokenizer on the cleaned corpus and calculate vocabulary size

In [63]:
# Word-level tokenizer; "<OOV>" is the placeholder token for words that are
# not in the fitted vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts([corpus])

# Encode the whole corpus as one flat list of integer word ids.
tokens = tokenizer.texts_to_sequences([corpus])[0]

# One more than the number of indexed words, so the value can be used both as
# the embedding input size and as the number of output classes.
# NOTE(review): Keras word indices start at 1, leaving 0 free for padding - confirm.
total_words = len(tokenizer.word_index) + 1
print("Vocabulary size:", total_words)


Vocabulary size: 5012


In [64]:
# -------------------------------
# Create sequences for training
# -------------------------------
# Generate fixed-length sequences of tokens for input and next-word prediction

In [65]:
SEQUENCE_LENGTH = 30

# Slide a window of SEQUENCE_LENGTH + 1 tokens over the corpus, one token at
# a time. Within each window the first SEQUENCE_LENGTH ids are the model
# input and the final id is the word to predict.
window_size = SEQUENCE_LENGTH + 1
input_sequences = np.array([
    tokens[start:start + window_size]
    for start in range(len(tokens) - SEQUENCE_LENGTH)
])

# Split every window into (context, next word).
X_train, y_train = input_sequences[:, :-1], input_sequences[:, -1]

print("X shape:", X_train.shape)
print("y shape (before one-hot):", y_train.shape)


X shape: (27598, 30)
y shape (before one-hot): (27598,)


In [66]:
# -------------------------------
# One-hot encode labels
# -------------------------------
# Convert y labels to categorical one-hot vectors

In [67]:
# `to_categorical` comes from the notebook's import cell; it is not
# re-imported here so that all dependencies stay visible in one place.

# One-hot encode the integer targets so they match the softmax output and the
# categorical cross-entropy loss. `y_train` is overwritten in place, so the
# ndim guard keeps this cell idempotent: re-running it would otherwise try to
# one-hot encode the already encoded 2-D array.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=total_words)

print("X shape:", X_train.shape)
print("y shape:", y_train.shape)


X shape: (27598, 30)
y shape: (27598, 5012)


In [68]:
# -------------------------------
# Build LSTM model
# -------------------------------
# Define sequential LSTM model with embedding, dropout, and dense layers

In [69]:


# SEQUENCE_LENGTH is deliberately NOT redefined here: it is set once in the
# sequence-creation cell, and the input width of the network must match the
# width of X_train that was built from that same value.

# Next-word classifier: token ids -> embeddings -> two stacked LSTMs ->
# dense projection -> probability distribution over the vocabulary.
model = Sequential([
    Input(shape=(SEQUENCE_LENGTH,)),
    Embedding(input_dim=total_words, output_dim=128),
    # return_sequences=True hands the full sequence to the second LSTM.
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dense(256, activation="relu"),
    # One output unit per vocabulary index (matches the one-hot targets).
    Dense(total_words, activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [70]:
# -------------------------------
# Define callbacks for training
# -------------------------------
# Save the model whenever the training loss improves, and stop early once the training loss stops improving

In [71]:
# Write the complete model (not just the weights) to disk every time the
# training loss reaches a new best value.
checkpoint = ModelCheckpoint(
    "best_next_word_model.h5",
    monitor="loss",
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
)

# Stop once the training loss has failed to improve for 5 epochs in a row,
# and roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor="loss",
    patience=5,
    verbose=1,
    restore_best_weights=True,
)


In [43]:
# -------------------------------
# Train the model
# -------------------------------
# Fit the model on the training sequences with checkpointing and early stopping

In [26]:
# Train for at most 70 epochs in mini-batches of 64. The callbacks save the
# best model seen so far and may end training before the last epoch.
# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=70,
    callbacks=[checkpoint, early_stop],
)


Epoch 1/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.2368 - loss: 3.4327
Epoch 1: loss improved from 3.64494 to 3.51193, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.2367 - loss: 3.4330
Epoch 2/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.2618 - loss: 3.2917
Epoch 2: loss improved from 3.51193 to 3.36873, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.2617 - loss: 3.2922
Epoch 3/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2804 - loss: 3.1481
Epoch 3: loss improved from 3.36873 to 3.23332, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.2803 - loss: 3.1483
Epoch 4/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.3015 - loss: 3.0158
Epoch 4: loss improved from 3.23332 to 3.10391, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.3015 - loss: 3.0160
Epoch 5/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.3298 - loss: 2.8925
Epoch 5: loss improved from 3.10391 to 2.98446, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.3297 - loss: 2.8929
Epoch 6/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.3483 - loss: 2.7660
Epoch 6: loss improved from 2.98446 to 2.85555, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.3482 - loss: 2.7662
Epoch 7/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.3714 - loss: 2.6536
Epoch 7: loss improved from 2.85555 to 2.73200, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.3714 - loss: 2.6538
Epoch 8/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.3849 - loss: 2.5607
Epoch 8: loss improved from 2.73200 to 2.62734, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.3848 - loss: 2.5612
Epoch 9/70
[1m428/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.4185 - loss: 2.4196
Epoch 9: loss improved from 2.62734 to 2.51235, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.4183 - loss: 2.4207
Epoch 10/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.4268 - loss: 2.3453
Epoch 10: loss improved from 2.51235 to 2.41004, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.4266 - loss: 2.3459
Epoch 11/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.4566 - loss: 2.2252
Epoch 11: loss improved from 2.41004 to 2.30766, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.4564 - loss: 2.2259
Epoch 12/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.4759 - loss: 2.1361
Epoch 12: loss improved from 2.30766 to 2.22073, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.4757 - loss: 2.1368
Epoch 13/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.4869 - loss: 2.0760
Epoch 13: loss improved from 2.22073 to 2.13663, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.4868 - loss: 2.0764
Epoch 14/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.5172 - loss: 1.9573
Epoch 14: loss improved from 2.13663 to 2.03995, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.5171 - loss: 1.9577
Epoch 15/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.5339 - loss: 1.8701
Epoch 15: loss improved from 2.03995 to 1.95152, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.5338 - loss: 1.8707
Epoch 16/70
[1m428/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.5593 - loss: 1.7867
Epoch 16: loss improved from 1.95152 to 1.86789, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5590 - loss: 1.7876
Epoch 17/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.5646 - loss: 1.7285
Epoch 17: loss improved from 1.86789 to 1.80206, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.5644 - loss: 1.7292
Epoch 18/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.5853 - loss: 1.6395
Epoch 18: loss improved from 1.80206 to 1.72464, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5851 - loss: 1.6401
Epoch 19/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.5955 - loss: 1.5969
Epoch 19: loss improved from 1.72464 to 1.65714, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.5953 - loss: 1.5974
Epoch 20/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.6204 - loss: 1.5086
Epoch 20: loss improved from 1.65714 to 1.57513, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.6202 - loss: 1.5091
Epoch 21/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.6311 - loss: 1.4489
Epoch 21: loss improved from 1.57513 to 1.50881, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.6310 - loss: 1.4494
Epoch 22/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.6490 - loss: 1.3788
Epoch 22: loss improved from 1.50881 to 1.44668, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.6488 - loss: 1.3795
Epoch 23/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6644 - loss: 1.3259
Epoch 23: loss improved from 1.44668 to 1.38961, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.6644 - loss: 1.3260
Epoch 24/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.6788 - loss: 1.2522
Epoch 24: loss improved from 1.38961 to 1.33304, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.6787 - loss: 1.2526
Epoch 25/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.6856 - loss: 1.2264
Epoch 25: loss improved from 1.33304 to 1.27738, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.6855 - loss: 1.2268
Epoch 26/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.7045 - loss: 1.1507
Epoch 26: loss improved from 1.27738 to 1.21392, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.7044 - loss: 1.1512
Epoch 27/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.7153 - loss: 1.1094
Epoch 27: loss improved from 1.21392 to 1.15607, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.7152 - loss: 1.1096
Epoch 28/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.7229 - loss: 1.0660
Epoch 28: loss improved from 1.15607 to 1.10594, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.7228 - loss: 1.0663
Epoch 29/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7374 - loss: 1.0122
Epoch 29: loss improved from 1.10594 to 1.09641, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.7373 - loss: 1.0124
Epoch 30/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.7422 - loss: 0.9935
Epoch 30: loss improved from 1.09641 to 1.03685, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.7420 - loss: 0.9939
Epoch 31/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.7592 - loss: 0.9305
Epoch 31: loss improved from 1.03685 to 0.97270, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.7591 - loss: 0.9307
Epoch 32/70
[1m428/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.7695 - loss: 0.8708
Epoch 32: loss improved from 0.97270 to 0.92018, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.7694 - loss: 0.8714
Epoch 33/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.7808 - loss: 0.8370
Epoch 33: loss improved from 0.92018 to 0.88347, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.7807 - loss: 0.8374
Epoch 34/70
[1m428/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.7915 - loss: 0.7953
Epoch 34: loss improved from 0.88347 to 0.85046, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.7913 - loss: 0.7959
Epoch 35/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8005 - loss: 0.7659
Epoch 35: loss improved from 0.85046 to 0.80854, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8005 - loss: 0.7660
Epoch 36/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.8126 - loss: 0.7214
Epoch 36: loss improved from 0.80854 to 0.76908, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8124 - loss: 0.7218
Epoch 37/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.8204 - loss: 0.6836
Epoch 37: loss improved from 0.76908 to 0.72478, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8204 - loss: 0.6839
Epoch 38/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.8283 - loss: 0.6558
Epoch 38: loss improved from 0.72478 to 0.69523, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.8281 - loss: 0.6562
Epoch 39/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8354 - loss: 0.6236
Epoch 39: loss improved from 0.69523 to 0.66881, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8354 - loss: 0.6237
Epoch 40/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.8409 - loss: 0.6056
Epoch 40: loss improved from 0.66881 to 0.64763, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.8408 - loss: 0.6058
Epoch 41/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8475 - loss: 0.5837
Epoch 41: loss improved from 0.64763 to 0.63713, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.8474 - loss: 0.5838
Epoch 42/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.8602 - loss: 0.5440
Epoch 42: loss improved from 0.63713 to 0.57698, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8601 - loss: 0.5443
Epoch 43/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.8634 - loss: 0.5166
Epoch 43: loss improved from 0.57698 to 0.54833, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8634 - loss: 0.5167
Epoch 44/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.8759 - loss: 0.4831
Epoch 44: loss improved from 0.54833 to 0.52032, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.8758 - loss: 0.4835
Epoch 45/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8807 - loss: 0.4554
Epoch 45: loss improved from 0.52032 to 0.49188, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.8807 - loss: 0.4555
Epoch 46/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.8815 - loss: 0.4478
Epoch 46: loss improved from 0.49188 to 0.48517, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8814 - loss: 0.4481
Epoch 47/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8861 - loss: 0.4350
Epoch 47: loss improved from 0.48517 to 0.47045, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.8861 - loss: 0.4351
Epoch 48/70
[1m428/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.8925 - loss: 0.4104
Epoch 48: loss improved from 0.47045 to 0.43251, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8924 - loss: 0.4106
Epoch 49/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.8985 - loss: 0.3801
Epoch 49: loss improved from 0.43251 to 0.41016, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8985 - loss: 0.3803
Epoch 50/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9010 - loss: 0.3753
Epoch 50: loss improved from 0.41016 to 0.40945, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9008 - loss: 0.3756
Epoch 51/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9105 - loss: 0.3496
Epoch 51: loss improved from 0.40945 to 0.37344, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9105 - loss: 0.3497
Epoch 52/70
[1m428/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9038 - loss: 0.3545
Epoch 52: loss improved from 0.37344 to 0.36935, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9037 - loss: 0.3546
Epoch 53/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9163 - loss: 0.3118
Epoch 53: loss improved from 0.36935 to 0.33709, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9163 - loss: 0.3120
Epoch 54/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9188 - loss: 0.3020
Epoch 54: loss improved from 0.33709 to 0.32571, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9187 - loss: 0.3021
Epoch 55/70
[1m428/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9178 - loss: 0.3116
Epoch 55: loss did not improve from 0.32571
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9177 - loss: 0.3119
Epoch 56/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9234 - loss: 0.2840
Epoch 56: loss improved from 0.32571 to 0.31385, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9234 - loss: 0.2841
Epoch 57/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9334 - loss: 0.2616
Epoch 57: loss improved from 0.31385 to 0.28131, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9334 - loss: 0.2617
Epoch 58/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9376 - loss: 0.2425
Epoch 58: loss improved from 0.28131 to 0.26375, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.9376 - loss: 0.2426
Epoch 59/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9424 - loss: 0.2300
Epoch 59: loss improved from 0.26375 to 0.25948, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9423 - loss: 0.2302
Epoch 60/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9362 - loss: 0.2393
Epoch 60: loss did not improve from 0.25948
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9361 - loss: 0.2395
Epoch 61/70
[1m428/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9394 - loss: 0.2324
Epoch 61: loss improved from 0.25948 to 0.25837, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9393 - loss: 0.2327
Epoch 62/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9381 - loss: 0.2308
Epoch 62: loss improved from 0.25837 to 0.24564, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9381 - loss: 0.2309
Epoch 63/70
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9506 - loss: 0.1949
Epoch 63: loss improved from 0.24564 to 0.21694, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9506 - loss: 0.1949
Epoch 64/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9442 - loss: 0.2141
Epoch 64: loss did not improve from 0.21694
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9442 - loss: 0.2142
Epoch 65/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9517 - loss: 0.1927
Epoch 65: loss improved from 0.21694 to 0.20696, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9517 - loss: 0.1927
Epoch 66/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9515 - loss: 0.1849
Epoch 66: loss improved from 0.20696 to 0.19456, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9515 - loss: 0.1850
Epoch 67/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9563 - loss: 0.1752
Epoch 67: loss improved from 0.19456 to 0.18462, saving model to best_next_word_model.h5




[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9562 - loss: 0.1753
Epoch 68/70
[1m430/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9537 - loss: 0.1753
Epoch 68: loss did not improve from 0.18462
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9536 - loss: 0.1755
Epoch 69/70
[1m431/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9508 - loss: 0.1817
Epoch 69: loss did not improve from 0.18462
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9508 - loss: 0.1818
Epoch 70/70
[1m429/432[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9471 - loss: 0.1965
Epoch 70: loss did not improve from 0.18462
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9471 -

In [72]:
# -------------------------------
# Load best saved model
# -------------------------------
# Load the best-performing model saved by the checkpoint callback

In [73]:
# Replace the in-memory model with the one stored in the file that the
# ModelCheckpoint callback writes ("best_next_word_model.h5").
model = load_model("best_next_word_model.h5")



In [74]:
# -------------------------------
# Define next-word prediction function
# -------------------------------
# Generate multiple next words in a loop using temperature-based sampling

In [75]:
def _sample_next_index(probabilities, candidate_indices, temperature):
    """Pick one entry of ``candidate_indices`` by temperature-scaled sampling."""
    # float64 keeps the renormalised distribution within the tolerance that
    # np.random.choice enforces on sum(p); float32 model output can miss it.
    probs = np.asarray(probabilities, dtype="float64")[candidate_indices]

    if temperature <= 0:
        # Zero temperature is the greedy limit; this also avoids dividing by zero.
        return int(candidate_indices[int(np.argmax(probs))])

    scaled = np.log(probs + 1e-8) / temperature
    # Shifting by the maximum leaves the normalised distribution unchanged but
    # prevents exp() from overflowing/underflowing at extreme temperatures.
    scaled -= np.max(scaled)
    weights = np.exp(scaled)
    weights /= np.sum(weights)
    return int(np.random.choice(candidate_indices, p=weights))


def predict_next_words(model, tokenizer, seed_text, num_words=10, sequence_length=30, temperature=1.0):
    """
    Predicts next words in a loop using a trained model.

    The seed is cleaned the same way as the training corpus (lowercased,
    characters other than a-z, 0-9 and whitespace removed, whitespace
    collapsed) before it is tokenized; the returned text keeps the seed
    exactly as the caller typed it. Only indices that map to a word in the
    tokenizer vocabulary can be sampled, so the padding index 0 is never
    emitted.

    Args:
        model: Trained Keras model
        tokenizer: Fitted Keras Tokenizer
        seed_text: Input text to start predictions
        num_words: Number of words to generate
        sequence_length: Length of input sequences used during training
        temperature: Sampling temperature (higher = more random); a value
            <= 0 selects the most probable word at every step

    Returns:
        Generated text including seed_text and predicted words

    Raises:
        ValueError: If no vocabulary index falls inside the model output.
    """
    # Built once instead of scanning tokenizer.word_index for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # Mirror the cleaning applied to the training corpus (see preprocess_text);
    # otherwise punctuation in the seed turns known words into OOV tokens.
    cleaned_seed = re.sub(r"[^a-z0-9\s]", "", seed_text.lower())
    cleaned_seed = re.sub(r"\s+", " ", cleaned_seed).strip()

    # Tokenize the seed once; predicted ids are appended directly afterwards
    # rather than re-tokenizing the growing text on every step.
    token_ids = list(tokenizer.texts_to_sequences([cleaned_seed])[0])

    generated_words = []
    candidate_indices = None

    for _ in range(num_words):
        # Left-pad (or truncate to the most recent tokens) to the model's input width.
        model_input = pad_sequences([token_ids], maxlen=sequence_length, padding='pre')

        # Probability for every output index of the model.
        probabilities = model.predict(model_input, verbose=0)[0]

        if candidate_indices is None:
            # Restrict sampling to indices that actually name a word.
            candidate_indices = np.array(
                sorted(i for i in index_to_word if 0 <= i < len(probabilities)),
                dtype="int64",
            )
            if candidate_indices.size == 0:
                raise ValueError("tokenizer vocabulary has no index covered by the model output")

        next_index = _sample_next_index(probabilities, candidate_indices, temperature)

        generated_words.append(index_to_word[next_index])
        token_ids.append(next_index)

    return " ".join([seed_text] + generated_words)


In [76]:
# -------------------------------
# Generate text from user input
# -------------------------------
# Prompt user for seed text and generate 25 predicted words

In [77]:
# Ask the user for the text that the generated words should continue.
seed_text = input("Enter your text:")

# Continue the seed with 25 sampled words.
# temperature: Lower = more predictable, Higher = more creative
generated_text = predict_next_words(
    model,
    tokenizer,
    seed_text,
    num_words=25,
    sequence_length=SEQUENCE_LENGTH,
    temperature=0.8,
)

print(generated_text)


Enter your text:The sun was shining brightly
The sun was shining brightly wait ross youre going to joey right maybe dont do that alright now they want you to stay with rachel and me how ive should
