In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [12]:
import pandas as pd

# Parse blue_castle.txt with a tab as the field delimiter.
file = pd.read_csv('blue_castle.txt', sep='\t')

# Show the parsed table.
print("DataFrame:")
print(file)

# NOTE: the output path is the same as the input path, so the text file that
# was just read is replaced by its CSV rendering (without the index column).
file.to_csv('blue_castle.txt', index=False)

print("Text file successfully converted to CSV!")


DataFrame:
     The Project Gutenberg eBook of The Blue Castle, by Lucy Maud Montgomery
0     This eBook is for the use of anyone anywhere i...                     
1     most other parts of the world at no cost and w...                     
2     whatsoever. You may copy it, give it away or r...                     
3     of the Project Gutenberg License included with...                     
4     www.gutenberg.org. If you are not located in t...                     
...                                                 ...                     
6694                        facility: www.gutenberg.org                     
6695  This website includes information about Projec...                     
6696  including how to make donations to the Project...                     
6697  Archive Foundation, how to help produce our ne...                     
6698  subscribe to our email newsletter to hear abou...                     

[6699 rows x 1 columns]
Text file successfully converted to CSV!

In [14]:
import pandas as pd

# Read blue_castle.txt again, this time with pandas' default comma delimiter.
file = pd.read_csv('blue_castle.txt')

# Preview the top rows of the table.
preview = file.head()
print(preview)


  The Project Gutenberg eBook of The Blue Castle, by Lucy Maud Montgomery
0  This eBook is for the use of anyone anywhere i...                     
1  most other parts of the world at no cost and w...                     
2  whatsoever. You may copy it, give it away or r...                     
3  of the Project Gutenberg License included with...                     
4  www.gutenberg.org. If you are not located in t...                     


In [16]:
# Read every line of the book. The context manager closes the file handle as
# soon as reading is finished (the handle was previously left open).
with open("blue_castle.txt", "r", encoding="utf8") as file:
    lines = file.readlines()

# Join the lines into one string. A single join is enough; repeating it once
# per line produced the same string each time.
data = '  '.join(lines)

# Remove newlines, carriage returns, the byte-order mark and curly double
# quotes (they are deleted, not replaced by spaces; the two-space join above
# keeps words on adjacent lines apart).
data = (
    data.replace('\n', '')
        .replace('\r', '')
        .replace('\ufeff', '')
        .replace('“', '')
        .replace('”', '')
)

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())
data[:500]

'"The Project Gutenberg eBook of The Blue Castle, by Lucy Maud Montgomery" This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions "whatsoever. You may copy it, give it away or re-use it under the terms" of the Project Gutenberg License included with this eBook or online at "www.gutenberg.org. If you are not located in the United States, you" will have to check the laws of the country where you are located befo'

In [18]:
# Number of characters in the cleaned corpus string.
len(data)

407834

In [20]:
# Build the word -> integer id vocabulary from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the fitted tokenizer for the prediction step. The with-block closes
# (and therefore flushes) token.pkl; the handle was previously never closed.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of word ids.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]


[1, 112, 97, 587, 4, 1, 94, 147, 58, 2383, 2384, 1818, 51, 587, 42]

In [22]:
# Number of word ids in the encoded corpus.
len(sequence_data)

72052

In [24]:
# Vocabulary size is one more than the number of distinct words in the
# tokenizer's word_index.
distinct_words = len(tokenizer.word_index)
vocab_size = distinct_words + 1
print(vocab_size)

8413


In [26]:
# Slide a window of four consecutive word ids over the corpus, advancing one
# position at a time.
window_count = len(sequence_data) - 3
sequences = [sequence_data[start:start + 4] for start in range(window_count)]

print("The Length if sequences are:" , len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length if sequences are: 72049


array([[   1,  112,   97,  587],
       [ 112,   97,  587,    4],
       [  97,  587,    4,    1],
       [ 587,    4,    1,   94],
       [   4,    1,   94,  147],
       [   1,   94,  147,   58],
       [  94,  147,   58, 2383],
       [ 147,   58, 2383, 2384],
       [  58, 2383, 2384, 1818],
       [2383, 2384, 1818,   51]])

In [28]:
# The first three ids of each window form the input; the fourth is the target.
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [30]:
# Peek at the first ten inputs and the targets that go with them.
first_inputs, first_targets = X[:10], y[:10]
print("Data:", first_inputs)
print("Response:", first_targets)

Data: [[   1  112   97]
 [ 112   97  587]
 [  97  587    4]
 [ 587    4    1]
 [   4    1   94]
 [   1   94  147]
 [  94  147   58]
 [ 147   58 2383]
 [  58 2383 2384]
 [2383 2384 1818]]
Response: [ 587    4    1   94  147   58 2383 2384 1818   51]


In [32]:
# One-hot encode the target ids, with vocab_size classes per target.
# NOTE: `y` is overwritten with the encoded array, so running this cell a
# second time would pass the already-encoded array back into to_categorical.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [54]:
# Import from tensorflow.keras, the same namespace every other cell uses.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Derive the vocabulary size from the fitted tokenizer instead of hard-coding
# 8413, so the embedding and output layers stay correct if the corpus changes.
vocab_size = len(tokenizer.word_index) + 1

# Layer sizes, named so they can be tuned in one place.
EMBEDDING_DIM = 10
HIDDEN_UNITS = 1000

# Embedding -> two stacked LSTMs -> dense hidden layer -> softmax over the
# vocabulary. input_length is not passed to Embedding because it is deprecated.
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM))
model.add(LSTM(HIDDEN_UNITS, return_sequences=True))  # emit the full sequence for the next LSTM
model.add(LSTM(HIDDEN_UNITS))
model.add(Dense(HIDDEN_UNITS, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

# Display the model summary
model.summary()


In [58]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# Write the model to next_word.keras only when the monitored training loss
# improves on the best value seen so far.
checkpoint = ModelCheckpoint(
    filepath="next_word.keras",
    monitor='loss',
    verbose=1,
    save_best_only=True,
)

# Configure the optimizer and loss.
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="categorical_crossentropy")

# Fit for 20 epochs in mini-batches of 64, checkpointing along the way.
model.fit(X, y, batch_size=64, epochs=20, callbacks=[checkpoint])


Epoch 1/20
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388ms/step - loss: 7.0118
Epoch 1: loss improved from inf to 6.76844, saving model to next_word.keras
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m460s[0m 390ms/step - loss: 7.0116
Epoch 2/20
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 367ms/step - loss: 6.2708
Epoch 2: loss improved from 6.76844 to 6.20952, saving model to next_word.keras
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m414s[0m 368ms/step - loss: 6.2707
Epoch 3/20
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 361ms/step - loss: 5.8234
Epoch 3: loss improved from 6.20952 to 5.79924, saving model to next_word.keras
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m407s[0m 362ms/step - loss: 5.8234
Epoch 4/20
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 369ms/step - loss: 5.4940
Epoch 4: loss improved from 5.79924 to 5.49677, saving m

<keras.src.callbacks.history.History at 0x22d76d6d520>

In [64]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the checkpointed model from disk.
model = load_model('next_word.keras')

# Load the saved tokenizer; the with-block closes the file handle afterwards
# (it was previously left open).
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only load a token.pkl that this notebook itself produced.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
    """Print and return the word the model scores highest after ``text``.

    Args:
        model: Object whose ``predict`` accepts the encoded sequence array and
            returns scores; the index of the largest score is used as the
            predicted word id.
        tokenizer: Object providing ``texts_to_sequences`` and a ``word_index``
            mapping of word -> integer id.
        text: The context, passed as a single entry to ``texts_to_sequences``.

    Returns:
        The word whose id equals the highest-scoring index, or ``""`` when no
        entry of ``word_index`` has that id.
    """
    sequence = np.array(tokenizer.texts_to_sequences([text]))
    preds = int(np.argmax(model.predict(sequence)))

    # Fall back to "" when the id is not in the vocabulary. The previous code
    # initialised `predict_word` but read `predicted_word`, so an unmatched id
    # raised UnboundLocalError.
    predicted_word = next(
        (word for word, index in tokenizer.word_index.items() if index == preds),
        "",
    )

    print(predicted_word)
    return predicted_word

In [None]:
# Interactive demo: read a line, predict the word that follows its last three
# words, and repeat until the user enters "0".
while True:
    text = input("Enter your line:")

    # "0" is the exit sentinel; surrounding whitespace is ignored.
    if text.strip() == "0":
        print("Execution completed....")
        break

    # split() with no argument discards the empty strings that split(" ")
    # produced for repeated, leading or trailing spaces.
    words = text.split()[-3:]
    if not words:
        # Nothing to predict from; ask again instead of calling the model.
        print("Please enter at least one word.")
        continue

    try:
        print(words)
        Predict_Next_Words(model, tokenizer, words)
    except Exception as e:
        # Report the failure and keep the prompt loop alive.
        print("Error occured: ", e)

Enter your line: The project gutenburg


['The', 'project', 'gutenburg']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 397ms/step
gutenberg


Enter your line: he was quite


['he', 'was', 'quite']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step
prepared


Enter your line: she is beautiful


['she', 'is', 'beautiful']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
enough


Enter your line: it may all come to


['all', 'come', 'to']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
cool


Enter your line: hello everyone hope you are okayy doing good and fine


['good', 'and', 'fine']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
and


Enter your line: blue castle is looking wonderful


['is', 'looking', 'wonderful']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
and
