# Word Prediction using LSTM model
Data source: https://www.kaggle.com/datasets/ronikdedhia/next-word-prediction/data

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Reading corpus the text file
with open("1661-0.txt", 'r', encoding='utf-8') as myfile:
    mytext = myfile.read()

In [3]:
mytext



In [4]:
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])
total_words = len(mytokenizer.word_index) + 1

In [5]:
mytokenizer.word_index

{'the': 1,
 'and': 2,
 'of': 3,
 'to': 4,
 'a': 5,
 'i': 6,
 '”': 7,
 'in': 8,
 'that': 9,
 'he': 10,
 'it': 11,
 'was': 12,
 'his': 13,
 'you': 14,
 'is': 15,
 'as': 16,
 'have': 17,
 'with': 18,
 'my': 19,
 'had': 20,
 'at': 21,
 'for': 22,
 'which': 23,
 'not': 24,
 'be': 25,
 'me': 26,
 'but': 27,
 'from': 28,
 'holmes': 29,
 'upon': 30,
 'said': 31,
 'him': 32,
 'we': 33,
 'there': 34,
 'this': 35,
 'so': 36,
 '“i': 37,
 'all': 38,
 'on': 39,
 'very': 40,
 'your': 41,
 'an': 42,
 'she': 43,
 'were': 44,
 'would': 45,
 'by': 46,
 'one': 47,
 'when': 48,
 'her': 49,
 'been': 50,
 'mr': 51,
 'are': 52,
 'man': 53,
 '’': 54,
 'out': 55,
 'what': 56,
 'do': 57,
 'could': 58,
 'no': 59,
 'up': 60,
 'then': 61,
 'will': 62,
 'some': 63,
 'or': 64,
 'who': 65,
 'little': 66,
 'if': 67,
 'down': 68,
 'has': 69,
 'see': 70,
 'into': 71,
 'they': 72,
 'may': 73,
 'am': 74,
 'us': 75,
 'come': 76,
 'now': 77,
 'know': 78,
 'did': 79,
 'must': 80,
 'more': 81,
 'over': 82,
 'than': 83,
 'shoul

In [6]:
my_input_sequences = []
for line in mytext.split('\n'):
    #print(line)
    token_list = mytokenizer.texts_to_sequences([line])[0]
    #print(token_list)
    for i in range(1, len(token_list)):
        my_n_gram_sequence = token_list[:i+1]
        #print(my_n_gram_sequence)
        my_input_sequences.append(my_n_gram_sequence)
        #print(input_sequences)

In [7]:
max_sequence_len = max([len(seq) for seq in my_input_sequences])
input_sequences = np.array(pad_sequences(my_input_sequences, maxlen=max_sequence_len, padding='pre'))

In [8]:
input_sequences[50]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,   18,   35, 1039,   64, 2662,   21, 2663], dtype=int32)

In [9]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [10]:
X[50]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,   18,   35, 1039,   64, 2662,   21], dtype=int32)

In [11]:
y

array([2657,    1,  663, ...,    8,  320,    7], dtype=int32)

In [12]:
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

In [13]:
y[1]

array([0., 1., 0., ..., 0., 0., 0.])

In [14]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
print(model.summary())

2025-03-24 10:53:10.453131: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


None


In [32]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=50, verbose=1)

Epoch 1/50
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - accuracy: 0.1782 - loss: 4.7428
Epoch 2/50
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - accuracy: 0.1943 - loss: 4.4830
Epoch 3/50
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - accuracy: 0.2184 - loss: 4.0948
Epoch 4/50
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - accuracy: 0.2501 - loss: 3.7406
Epoch 5/50
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.2963 - loss: 3.4159
Epoch 6/50
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.3404 - loss: 3.1177
Epoch 7/50
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.3891 - loss: 2.8490
Epoch 8/50
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.4368 - loss: 2.6081
Epoch 9/

<keras.src.callbacks.history.History at 0x78a916c3d370>

In [44]:
input_text = "“Thank you,” said Holmes, "
predict_next_words= 10

for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    print(token_list)
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in mytokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    input_text += " " + output_word

print(input_text)

[656, 14, 7, 31, 29]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[656, 14, 7, 31, 29, 5170]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[656, 14, 7, 31, 29, 5170, 60]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[656, 14, 7, 31, 29, 5170, 60, 1]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[656, 14, 7, 31, 29, 5170, 60, 1, 171]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[656, 14, 7, 31, 29, 5170, 60, 1, 171, 2]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[656, 14, 7, 31, 29, 5170, 60, 1, 171, 2, 897]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[656, 14, 7, 31, 29, 5170, 60, 1, 171, 2, 897, 11]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[656, 14, 7, 31, 29, 5170, 60, 1, 171, 2, 897, 11, 4]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[656, 