In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [2]:
file = open("book.txt", "r", encoding = "utf8")

# store file in list
lines = []
for i in file:
    lines.append(i)

# Convert list to string
data = ""
for i in lines:
  data = ' '. join(lines)

#replace unnecessary stuff with space
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')  #new line, carriage return, unicode character --> replace by space

#remove unnecessary spaces
data = data.split()
data = ' '.join(data)
data[:500]

'The Project Gutenberg eBook of The Aldine Speller: Part Two This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBoo'

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 12, 37, 123, 2, 1, 160, 161, 64, 137, 14, 123, 7, 11, 1]

In [4]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

3521


In [5]:
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  13642


array([[  1,  12,  37, 123],
       [ 12,  37, 123,   2],
       [ 37, 123,   2,   1],
       [123,   2,   1, 160],
       [  2,   1, 160, 161],
       [  1, 160, 161,  64],
       [160, 161,  64, 137],
       [161,  64, 137,  14],
       [ 64, 137,  14, 123],
       [137,  14, 123,   7]])

In [6]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])

X = np.array(X)
y = np.array(y)

In [7]:
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[  1  12  37]
 [ 12  37 123]
 [ 37 123   2]
 [123   2   1]
 [  2   1 160]
 [  1 160 161]
 [160 161  64]
 [161  64 137]
 [ 64 137  14]
 [137  14 123]]
Response:  [123   2   1 160 161  64 137  14 123   7]


In [8]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))



In [10]:
model.summary()

In [16]:
pip install keras-vis


Collecting keras-vis
  Downloading keras_vis-0.4.1-py2.py3-none-any.whl.metadata (757 bytes)
Downloading keras_vis-0.4.1-py2.py3-none-any.whl (30 kB)
Installing collected packages: keras-vis
Successfully installed keras-vis-0.4.1


In [22]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# Set up the ModelCheckpoint callback
checkpoint = ModelCheckpoint(
    "next_words.keras",             # File name to save the best model
    monitor='loss',              # Monitor the loss during training
    verbose=1,                   # Print updates to the console
    save_best_only=True,         # Save only the best model based on the monitored metric
    mode='min'                   # Save the model with the minimum loss
)

# Compile the model with loss function and optimizer
model.compile(
    loss="categorical_crossentropy",  # Loss function for multi-class classification
    optimizer=Adam(learning_rate=0.001)  # Optimizer with specified learning rate
)

# Train the model with the checkpoint callback
model.fit(
    X,                              # Input data
    y,                              # Target data
    epochs=10,                       # Number of training epochs
    batch_size=64,                   # Batch size for training
    callbacks=[checkpoint]           # List of callbacks to apply during training
)


Epoch 1/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 770ms/step - loss: 7.4064
Epoch 1: loss improved from inf to 7.32196, saving model to next_words.keras
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 800ms/step - loss: 7.4060
Epoch 2/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 655ms/step - loss: 6.9287
Epoch 2: loss improved from 7.32196 to 6.94156, saving model to next_words.keras
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 690ms/step - loss: 6.9288
Epoch 3/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 649ms/step - loss: 6.7573
Epoch 3: loss improved from 6.94156 to 6.75224, saving model to next_words.keras
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 678ms/step - loss: 6.7573
Epoch 4/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 673ms/step - loss: 6.5122
Epoch 4: loss improved from 6.75224 to 6.53380, saving model to nex

<keras.src.callbacks.history.History at 0x7dc9e00e5b40>

In [23]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('next_words.keras')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = ""

  for key, value in tokenizer.word_index.items():
      if value == preds:
          predicted_word = key
          break

  print(predicted_word)
  return predicted_word

In [24]:
while(True):
  text = input("Enter your line: ")

  if text == "0":
      print("Execution completed.....")
      break

  else:
      try:
          text = text.split(" ")
          text = text[-3:]
          print(text)

          Predict_Next_Words(model, tokenizer, text)

      except Exception as e:
        print("Error occurred: ",e)
        continue

Enter your line: whar are you
['whar', 'are', 'you']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 352ms/step
the
Enter your line: what is this
['what', 'is', 'this']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334ms/step
word
Enter your line: what do you 
['do', 'you', '']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
the
Enter your line: what is
['what', 'is']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
be
Enter your line: i don't
['i', "don't"]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
the
Enter your line: what are
['what', 'are']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
the
Enter your line: what are the
['what', 'are', 'the']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
best
Enter your line: what are the best
['are', 'the', 'best']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
spelle

KeyboardInterrupt: Interrupted by user