#### Next Word Prediction using GRU-RNN

In [1]:
## Data collection
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

##Load the dataset
dataset = gutenberg.raw('shakespeare-hamlet.txt')

##Save the dataset in a file
with open('hamlet.txt', 'w') as file:
    file.write(dataset)


[nltk_data] Downloading package gutenberg to /home/vscode/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
## Data pre-processing

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

##Load the dataset
with open('hamlet.txt', 'r') as file:
    text = file.read().lower()

## Tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text]) #Tokenize the entire dataset
total_words = len(tokenizer.word_index)+1
total_words

2026-01-14 06:35:05.001481: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-14 06:35:05.087840: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-14 06:35:05.087887: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-14 06:35:05.096226: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-14 06:35:05.117264: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-14 06:35:05.119508: I tensorflow/core/platform/cpu_feature_guard.cc:1

4818

In [3]:
## Create input sequences from text
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0] ## Convert sentence into tokens and returns list of lists
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891],
 [57, 407],
 [57, 407, 2],
 [57, 407, 2, 1181],
 [57, 407, 2, 1181, 177],
 [57, 407, 2, 1181, 177, 1892],
 [407, 1182],
 [407, 1182, 63],
 [408, 162],
 [408, 162, 377],
 [408, 162, 377, 21],
 [408, 162, 377, 21, 247],
 [408, 162, 377, 21, 247, 882],
 [18, 66],
 [451, 224],
 [451, 224, 248],
 [451, 224, 248, 1],
 [451, 224, 248, 1, 30],
 [408, 407],
 [451, 25],
 [408, 6],
 [408, 6, 43],
 [408, 6, 43, 62],
 [408, 6, 43, 62, 1893],
 [408, 6, 43, 62, 1893, 96],
 [408, 6, 43, 62, 1893, 96, 18],
 [408, 6, 43, 62, 1893, 96, 18, 566],
 [451, 71],
 [451, 71, 51],
 [451, 71, 51, 1894],
 [451, 71, 51, 1894, 567],
 [451, 71, 51, 1894, 567, 378],
 [451, 71, 51, 1894, 567, 378, 80],
 [451, 71, 51, 1894, 567, 378, 80, 3],
 [451, 71, 51, 1894, 567, 378, 80, 3, 273],
 [451, 71

In [4]:
## Pad sequence

max_seq_len = max(len(x) for x in input_sequences)

input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [5]:
## Create input and output label
x, y = input_sequences[:,:-1], input_sequences[:,-1]
x,y

(array([[   0,    0,    0, ...,    0,    0,    1],
        [   0,    0,    0, ...,    0,    1,  687],
        [   0,    0,    0, ...,    1,  687,    4],
        ...,
        [   0,    0,    0, ...,  687,    4,   45],
        [   0,    0,    0, ...,    4,   45, 1047],
        [   0,    0,    0, ...,   45, 1047,    4]], dtype=int32),
 array([ 687,    4,   45, ..., 1047,    4,  193], dtype=int32))

In [6]:
import tensorflow as tf
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [8]:
## Train the LSTM RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

## Create model
model = Sequential()
model.add(Embedding(total_words, 100, input_shape=(max_seq_len-1,)))
model.add(GRU(150, return_sequences=True))
model.add(Dropout(0.2)) ## Disable 20% of neurons in hidden layer
model.add(GRU(100))
model.add(Dense(total_words, activation='softmax'))

## Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(**kwargs)


In [9]:
## Earlystopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [10]:
## Train the model
history = model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), verbose=1)

2026-01-14 06:35:51.730252: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 396714120 exceeds 10% of free system memory.


Epoch 1/100
[1m643/644[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 47ms/step - accuracy: 0.0312 - loss: 7.1674

2026-01-14 06:36:31.384549: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 99192984 exceeds 10% of free system memory.


[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 51ms/step - accuracy: 0.0325 - loss: 6.9415 - val_accuracy: 0.0235 - val_loss: 6.8194
Epoch 2/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 45ms/step - accuracy: 0.0473 - loss: 6.4100 - val_accuracy: 0.0472 - val_loss: 6.8285
Epoch 3/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 45ms/step - accuracy: 0.0575 - loss: 6.1578 - val_accuracy: 0.0569 - val_loss: 6.8220
Epoch 4/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 48ms/step - accuracy: 0.0678 - loss: 5.9265 - val_accuracy: 0.0604 - val_loss: 6.8667
Epoch 5/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.0787 - loss: 5.7012 - val_accuracy: 0.0663 - val_loss: 6.9501
Epoch 6/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 46ms/step - accuracy: 0.0880 - loss: 5.4835 - val_accuracy: 0.0680 - val_loss: 7.0280
Epoch 7/100
[1m644/64

In [11]:
## Prdict the next word

## Function converts input text into tokens, pads it.
# predicts the most probable next word using the model
# Maps the predicted index back to a word

def predict_next_word(model, tokenizer, text, max_seq_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_seq_len:
        token_list = token_list[-(max_seq_len-1):] #To ensure the sequence length matches max_seq_len
    token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
    prediction = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(prediction, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [15]:
input_text = "Bar. Welcome Horatio, welcome good"
print(f"Input text: {input_text}")
max_seq_len = model.input_shape[1] + 1
predicted_word = predict_next_word(model, tokenizer, input_text, max_seq_len)
print(f"Predicted word: {predicted_word}")


Input text: Bar. Welcome Horatio, welcome good
Predicted word: marcellus


In [17]:
## save the model
model.save("gru_rnn_model.h5")

#Save the tokenizer
import pickle
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)


