In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os


In [None]:
# Read the raw corpus. The context manager closes the file handle as soon as
# the lines are read, even if reading raises.
with open("Pride and Prejudice.txt", "r", encoding="utf8") as file:
    # store file in list (one entry per line, line endings still attached)
    lines = list(file)

# Convert list to string. A single join is enough; repeating it once per line
# only recomputed the same value.
data = ' '.join(lines)

# Remove newlines, carriage returns, the byte-order mark and curly quotes.
# They are replaced by the empty string; words on neighbouring lines stay
# separated by the space that the join above inserted between lines.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“', '').replace('”', '')

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())
data[:500]

'Though her brother and sister were persuaded that there was no real occasion for such a seclusion from the family, they did not attempt to oppose it; for they knew that she had not prudence enough to hold her tongue before the servants, while they waited at table, and judged it better that _one_ only of the household, and the one whom they could most trust, should comprehend all her fears and solicitude on the subject. In the dining-room they were soon joined by Mary and Kitty, who had been too '

In [None]:
# Fit the word -> integer-index vocabulary on the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function; the context manager guarantees
# the pickle is flushed and the file closed before any later cell reads it
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the whole corpus as one flat list of word indices and preview it.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[79, 6, 228, 4, 93, 46, 394, 13, 66, 8, 42, 585, 395, 19, 54]

In [None]:
# Number of word indices in the encoded corpus.
len(sequence_data)

35724

In [None]:
# Vocabulary size: one more than the number of distinct words the tokenizer saw.
# NOTE(review): the +1 presumably accounts for index 0, which the Keras
# Tokenizer reserves and never assigns to a word — confirm against its docs.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

3694


In [None]:
# Slide a four-token window across the encoded corpus: each window holds
# three context words followed by the word that comes next.
windows = [
    sequence_data[start:start + 4]
    for start in range(len(sequence_data) - 3)
]

print("The Length of sequences are: ", len(windows))
sequences = np.array(windows)
sequences[:10]

The Length of sequences are:  35721


array([[ 79,   6, 228,   4],
       [  6, 228,   4,  93],
       [228,   4,  93,  46],
       [  4,  93,  46, 394],
       [ 93,  46, 394,  13],
       [ 46, 394,  13,  66],
       [394,  13,  66,   8],
       [ 13,  66,   8,  42],
       [ 66,   8,  42, 585],
       [  8,  42, 585, 395]])

In [None]:
# Split every four-token window into its three-word context (model input)
# and its final word (prediction target).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [None]:
# Preview the first ten three-word contexts and the word that follows each.
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[ 79   6 228]
 [  6 228   4]
 [228   4  93]
 [  4  93  46]
 [ 93  46 394]
 [ 46 394  13]
 [394  13  66]
 [ 13  66   8]
 [ 66   8  42]
 [  8  42 585]]
Response:  [  4  93  46 394  13  66   8  42 585 395]


In [None]:
# One-hot encode the target word indices into vocab_size classes and preview
# the first five rows.
# NOTE(review): this builds a dense len(y) x vocab_size matrix; integer labels
# with a sparse categorical loss might avoid the memory cost — verify before
# changing, since the training cell must be updated in step.
y = to_categorical(y, num_classes=vocab_size)
y[:5]




array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Next-word classifier: a 10-dimensional embedding of the three context
# words, two stacked 1000-unit LSTMs (the first returns the full sequence so
# the second can consume it), a dense ReLU layer, and a softmax over the
# vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])



In [None]:
# Print Keras' textual summary of the model's layers.
model.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Configure the loss and optimiser before training starts.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# Save the model to next_words.keras only on epochs where the monitored
# training loss improves.
checkpoint = ModelCheckpoint(
    "next_words.keras",
    monitor='loss',
    verbose=1,
    save_best_only=True,
)

model.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint])

Epoch 1/70
[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 6.6933
Epoch 1: loss improved from inf to 6.45052, saving model to next_words.keras
[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 32ms/step - loss: 6.6929
Epoch 2/70
[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 6.0539
Epoch 2: loss improved from 6.45052 to 6.00750, saving model to next_words.keras
[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 22ms/step - loss: 6.0538
Epoch 3/70
[1m558/559[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - loss: 5.6493
Epoch 3: loss improved from 6.00750 to 5.61447, saving model to next_words.keras
[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 22ms/step - loss: 5.6492
Epoch 4/70
[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 5.2993
Epoch 4: loss improved from 5.61447 to 5.27994, saving model to next_words.ke

<keras.src.callbacks.history.History at 0x7b846edbcb80>

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('next_words.keras')

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a token.pkl that this notebook (or another trusted source) produced.
# The context manager closes the file handle once the tokenizer is loaded.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text, seq_len=3):
  """Predict the word most likely to follow ``text``, print it and return it.

  Args:
    model: object whose ``predict(array)`` returns one row of per-word scores
      for a batch containing a single sequence of word indices.
    tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
      ``word_index`` (word -> integer index).
    text: the prompt, either a plain string or a list of words.
    seq_len: positive number of trailing tokens fed to the model; defaults to
      3, the context length used by the callers in this notebook.

  Returns:
    The predicted word, or ``""`` when none of the prompt's words is known to
    the tokenizer or the predicted index has no word attached.
  """
  # A list of words is joined back into one string so the tokenizer applies
  # its own normalisation. Handed a list, the Keras Tokenizer looks each
  # element up verbatim, so entries such as 'months:' or '' are silently
  # dropped and the model receives a shortened context.
  if not isinstance(text, str):
    text = " ".join(text)

  # Keep only the last seq_len known tokens; punctuation splitting can yield
  # more tokens than the caller supplied words.
  token_ids = tokenizer.texts_to_sequences([text])[0][-seq_len:]
  if not token_ids:
    # Nothing recognisable to condition on; avoid feeding an empty array
    # to the model.
    print("No known words in the input; cannot predict.")
    return ""

  sequence = np.array([token_ids])
  preds = int(np.argmax(model.predict(sequence)))

  # Reverse lookup: first word whose index equals the predicted class.
  predicted_word = next(
    (word for word, index in tokenizer.word_index.items() if index == preds),
    "",
  )

  print(predicted_word)
  return predicted_word

In [None]:
 # Interactive prompt: read a line, predict the word that follows its last
 # three words, and repeat until the user enters "0".
 while True:
     text = input("Enter your line: ")

     if text == "0":
         print("Execution completed.....")
         break

     # split() with no argument discards leading, trailing and repeated
     # whitespace; split(" ") produced empty strings for those, which pushed
     # a real context word out of the three-word window.
     text = text.split()[-3:]
     if not text:
         print("Please enter at least one word.")
         continue
     print(text)

     try:
         Predict_Next_Words(model, tokenizer, text)
     except Exception as e:
         # Deliberately broad: report the failure and keep the prompt alive.
         print("Error occurred: ", e)


Enter your line: to inherit this estate after 
['estate', 'after', '']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step
the
Enter your line: the first two months:
['first', 'two', 'months:']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
of
Enter your line: Though her brother and sister were 
['sister', 'were', '']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
to
Enter your line:  As for Mary, she was mistress enough of herself to 
['herself', 'to', '']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
be
Enter your line: Then perceiving in
['Then', 'perceiving', 'in']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
elizabeth
Enter your line: 0
Execution completed.....
