# **Next Word Prediction using LSTM**

## Locate the Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Installing Required Libraries and Modules

In [None]:
!pip install tensorflow==2.16.1
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install matplotlib
!pip install nltk

## Data Collection

### Load the Dataset

In [None]:
project_path = '/content/drive/MyDrive/Next Word Prediction using LSTM'

In [3]:
"""
    Dataset: http://www.gutenberg.org/cache/epub/5200/pg5200.txt
"""

file = open(f'{project_path}/data/raw/Gutenberg - Metamorphosis.txt', 'r', encoding='utf8')

## Data Preprocessing

In [4]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

### Preprocess the Text

In [5]:
lines = [line for line in file]
lines[:10]

['\ufeffThe Project Gutenberg eBook of Metamorphosis\n',
 '    \n',
 'This ebook is for the use of anyone anywhere in the United States and\n',
 'most other parts of the world at no cost and with almost no restrictions\n',
 'whatsoever. You may copy it, give it away or re-use it under the terms\n',
 'of the Project Gutenberg License included with this ebook or online\n',
 'at www.gutenberg.org. If you are not located in the United States,\n',
 'you will have to check the laws of the country where you are located\n',
 'before using this eBook.\n',
 '\n']

In [6]:
data = ' '.join(lines)
data



In [7]:
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data



In [8]:
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
data = data.translate(translator)
data



### Tokenization

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[1, 43, 97, 253, 5, 578, 27, 253, 75, 17]

### Convert Tokens into Sequences

In [10]:
sequence_len = 25
length = sequence_len + 1

sequences = []

for i in range(length, len(sequence_data)):
    seq = sequence_data[i-length:i]
    sequences.append(seq)

sequences[:10]

[[1,
  43,
  97,
  253,
  5,
  578,
  27,
  253,
  75,
  17,
  1,
  143,
  5,
  269,
  681,
  8,
  1,
  226,
  167,
  3,
  190,
  68,
  1107,
  5,
  1,
  682],
 [43,
  97,
  253,
  5,
  578,
  27,
  253,
  75,
  17,
  1,
  143,
  5,
  269,
  681,
  8,
  1,
  226,
  167,
  3,
  190,
  68,
  1107,
  5,
  1,
  682,
  22],
 [97,
  253,
  5,
  578,
  27,
  253,
  75,
  17,
  1,
  143,
  5,
  269,
  681,
  8,
  1,
  226,
  167,
  3,
  190,
  68,
  1107,
  5,
  1,
  682,
  22,
  52],
 [253,
  5,
  578,
  27,
  253,
  75,
  17,
  1,
  143,
  5,
  269,
  681,
  8,
  1,
  226,
  167,
  3,
  190,
  68,
  1107,
  5,
  1,
  682,
  22,
  52,
  847],
 [5,
  578,
  27,
  253,
  75,
  17,
  1,
  143,
  5,
  269,
  681,
  8,
  1,
  226,
  167,
  3,
  190,
  68,
  1107,
  5,
  1,
  682,
  22,
  52,
  847,
  3],
 [578,
  27,
  253,
  75,
  17,
  1,
  143,
  5,
  269,
  681,
  8,
  1,
  226,
  167,
  3,
  190,
  68,
  1107,
  5,
  1,
  682,
  22,
  52,
  847,
  3,
  14],
 [27,
  253,
  75,
  17,
  1,
  143

In [11]:
[{ k:v } for k,v in tokenizer.index_word.items()][:10]

[{1: 'the'},
 {2: 'to'},
 {3: 'and'},
 {4: 'he'},
 {5: 'of'},
 {6: 'his'},
 {7: 'was'},
 {8: 'in'},
 {9: 'it'},
 {10: 'had'}]

### Save the Tokenizer

In [12]:
with open(f'{project_path}/data/models/tokenizer.pickle', 'wb') as file:
    pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)

### Create the Input Sequences

In [13]:
vocab_size = len(tokenizer.word_index) + 1

In [14]:
input_sequences = np.array(sequences)
input_sequences.shape

(25240, 26)

### Create the Predictors and Label

In [15]:
X, y = input_sequences[:,:-1], input_sequences[:,-1]

In [16]:
X.shape, y.shape

((25240, 25), (25240,))

### Convert the Label into Categorical Feature

In [17]:
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)
y.shape

(25240, 3246)

### Split the Dataset into Train and Test

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20192, 25), (5048, 25), (20192, 3246), (5048, 3246))

### Save the Processed Train and Test data into CSV Files

In [30]:
pd.DataFrame(X_train).to_csv(f'{project_path}/data/processed/train/X_train.csv', index=False, header=False)
pd.DataFrame(X_test).to_csv(f'{project_path}/data/processed/test/X_test.csv', index=False, header=False)
pd.DataFrame(y_train).to_csv(f'{project_path}/data/processed/train/y_train.csv', index=False, header=False)
pd.DataFrame(y_test).to_csv(f'{project_path}/data/processed/test/y_test.csv', index=False, header=False)

## Model Building and Training

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.regularizers import l2

### GPU Availability

In [21]:
device_name = tf.test.gpu_device_name()
if len(device_name) > 0:
    print("Found GPU at: {}".format(device_name))
else:
    device_name = "/device:CPU:0"
    print("No GPU, using {}.".format(device_name))

Found GPU at: /device:GPU:0


### Callbacks

In [22]:
checkpoint = ModelCheckpoint(f'{project_path}/data/models/model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, min_lr=0.00001, verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

### Define the LSTM Model and Compile Model

In [23]:
with tf.device(device_name):
  model = Sequential()
  model.add(Embedding(vocab_size, 300, mask_zero=True))
  model.add(LSTM(512, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))
  model.add(LSTM(256, dropout=0.3, recurrent_dropout=0.3))
  model.add(Dense(256, activation=tf.nn.relu, kernel_regularizer=l2(0.001)))
  model.add(BatchNormalization())
  model.add(Dense(vocab_size, activation=tf.nn.softmax))

  # Build the model with the input shape
  input_shape = (X_train.shape[1], X_test.shape[1])
  model.build(input_shape)

  optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=True)
  model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=['accuracy'])

  model.summary()

### Train the Model

In [24]:
model.fit(X_train, y_train, batch_size=64, epochs=100, validation_data=(X_test, y_test), callbacks=[checkpoint, reduce, early_stop])

Epoch 1/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step - accuracy: 0.0178 - loss: 7.6506
Epoch 1: val_loss improved from inf to 6.84735, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 257ms/step - accuracy: 0.0178 - loss: 7.6490 - val_accuracy: 0.0416 - val_loss: 6.8473 - learning_rate: 0.0010
Epoch 2/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step - accuracy: 0.0519 - loss: 6.4473
Epoch 2: val_loss improved from 6.84735 to 6.68042, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 225ms/step - accuracy: 0.0519 - loss: 6.4473 - val_accuracy: 0.0529 - val_loss: 6.6804 - learning_rate: 0.0010
Epoch 3/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step - accuracy: 0.0503 - loss: 6.3403
Epoch 3: val_loss improved from 6.68042 to 6.55758, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 233ms/step - accuracy: 0.0503 - loss: 6.3402 - val_accuracy: 0.0529 - val_loss: 6.5576 - learning_rate: 0.0010
Epoch 4/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 218ms/step - accuracy: 0.0528 - loss: 6.1376
Epoch 4: val_loss improved from 6.55758 to 6.55285, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 230ms/step - accuracy: 0.0528 - loss: 6.1376 - val_accuracy: 0.0576 - val_loss: 6.5528 - learning_rate: 0.0010
Epoch 5/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step - accuracy: 0.0621 - loss: 5.9565
Epoch 5: val_loss improved from 6.55285 to 6.42202, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 233ms/step - accuracy: 0.0621 - loss: 5.9565 - val_accuracy: 0.0598 - val_loss: 6.4220 - learning_rate: 0.0010
Epoch 6/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step - accuracy: 0.0727 - loss: 5.7797
Epoch 6: val_loss improved from 6.42202 to 6.28132, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 229ms/step - accuracy: 0.0727 - loss: 5.7797 - val_accuracy: 0.0630 - val_loss: 6.2813 - learning_rate: 0.0010
Epoch 7/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 224ms/step - accuracy: 0.0792 - loss: 5.6926
Epoch 7: val_loss improved from 6.28132 to 6.25256, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 236ms/step - accuracy: 0.0792 - loss: 5.6926 - val_accuracy: 0.0737 - val_loss: 6.2526 - learning_rate: 0.0010
Epoch 8/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step - accuracy: 0.0907 - loss: 5.5373
Epoch 8: val_loss did not improve from 6.25256
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 225ms/step - accuracy: 0.0907 - loss: 5.5374 - val_accuracy: 0.0775 - val_loss: 6.2627 - learning_rate: 0.0010
Epoch 9/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step - accuracy: 0.0929 - loss: 5.4512
Epoch 9: val_loss improved from 6.25256 to 6.15128, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 236ms/step - accuracy: 0.0929 - loss: 5.4513 - val_accuracy: 0.0882 - val_loss: 6.1513 - learning_rate: 0.0010
Epoch 10/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 223ms/step - accuracy: 0.1068 - loss: 5.3077
Epoch 10: val_loss improved from 6.15128 to 6.11621, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 234ms/step - accuracy: 0.1068 - loss: 5.3079 - val_accuracy: 0.0977 - val_loss: 6.1162 - learning_rate: 0.0010
Epoch 11/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 218ms/step - accuracy: 0.1171 - loss: 5.1285
Epoch 11: val_loss improved from 6.11621 to 6.08868, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 230ms/step - accuracy: 0.1171 - loss: 5.1287 - val_accuracy: 0.1099 - val_loss: 6.0887 - learning_rate: 0.0010
Epoch 12/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 224ms/step - accuracy: 0.1322 - loss: 4.9863
Epoch 12: val_loss improved from 6.08868 to 6.01229, saving model to /content/drive/MyDrive/Next Word Prediction using LSTM/data/models/model.h5




[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 236ms/step - accuracy: 0.1322 - loss: 4.9864 - val_accuracy: 0.1121 - val_loss: 6.0123 - learning_rate: 0.0010
Epoch 13/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step - accuracy: 0.1397 - loss: 4.7973
Epoch 13: val_loss did not improve from 6.01229
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 224ms/step - accuracy: 0.1397 - loss: 4.7974 - val_accuracy: 0.1193 - val_loss: 6.0629 - learning_rate: 0.0010
Epoch 14/100
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step - accuracy: 0.1503 - loss: 4.6410
Epoch 14: val_loss did not improve from 6.01229
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 230ms/step - accuracy: 0.1503 - loss: 4.6411 - val_accuracy: 0.1222 - val_loss: 6.0367 - learning_rate: 0.0010
Epoch 15/100
[1m316/316[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e73057222d0>

### Save the Model

In [25]:
model.save(f'{project_path}/data/models/lstm_model.keras')

## Model Evaluation

In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [27]:
evaluation = model.evaluate(X_test, y_test, verbose=1)

[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.1177 - loss: 5.9719


### Create Function to Predict the Next Word

In [28]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len - 1):] # Ensure the sequence length matches max_sequence_len - 1
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)

    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [29]:
input_text = "The Project Gutenberg "
max_sequence_len = model.input_shape[1] + 1

next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Input Text: {input_text}")
print(f"Next Predicted Word: {next_word}")

Input Text: The Project Gutenberg 
Next Predicted Word: work
