In [3]:
# Data Collection
import nltk
# Fetch the 'gutenberg' corpus package into the local NLTK data directory
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


In [4]:
# Pull the raw text of Hamlet out of the NLTK Gutenberg corpus
data = gutenberg.raw('shakespeare-hamlet.txt')

# Write the text to disk so the preprocessing cells can read it back
# NOTE(review): no explicit encoding is given, so the platform default is used
# here and when the file is read again — confirm that is intended.
with open('hamlet.txt', 'w') as out_file:
  out_file.write(data)

In [5]:
# Data Preprocessing
import numpy as np
import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [6]:
# Read the saved play back from disk, then normalise it to lower case
with open('hamlet.txt', 'r') as in_file:
  text = in_file.read()
text = text.lower()

In [7]:
# Fit a tokenizer on the whole (lower-cased) corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size used for the embedding and output layers: one more than the
# number of indexed words (presumably to leave room for index 0, which the
# zero-padding uses — verify against the Tokenizer docs).
total_words = 1 + len(tokenizer.word_index)

In [8]:
# Build the training n-grams: for every line of the play, take each prefix of
# its token ids that is at least two tokens long (context + word to predict).
inputSequences = [
  line_tokens[:prefix_len]
  for line_tokens in (
    tokenizer.texts_to_sequences([raw_line])[0] for raw_line in text.split('\n')
  )
  for prefix_len in range(2, len(line_tokens) + 1)
]

In [9]:
# Left-pad every n-gram up to the length of the longest one so they stack
# into a single rectangular array.
maxSequenceLength = max(len(sequence) for sequence in inputSequences)
inputSequences = np.array(
  pad_sequences(inputSequences, maxlen=maxSequenceLength, padding='pre')
)

In [10]:
# Predictors: every column except the last. Label: the last token of each n-gram.
X = inputSequences[:, :-1]
y = inputSequences[:, -1]

In [11]:
# Peek at the integer labels (the last token id of each padded n-gram)
y

array([ 687,    4,   45, ..., 1047,    4,  193])

In [12]:
# One-hot encode the labels, one column per vocabulary index.
# Encode from the last column of the padded sequences rather than from `y`
# itself: the cell is then idempotent, whereas `y = to_categorical(y)` would
# one-hot an already one-hot matrix if the cell were run a second time.
y = keras.utils.to_categorical(inputSequences[:, -1], num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
)

### Train LSTM Model

In [14]:
# Define the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# dropout are repeatable (as far as the backend allows) on a fresh kernel.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# Declare the input length explicitly. The model is then built as soon as the
# layers are added, so `model.summary()` and `model.input_shape[1]` are valid
# without depending on `fit` having been run first.
model.add(tf.keras.Input(shape=(X.shape[1],), dtype='int32'))
model.add(Embedding(total_words, 100))
# return_sequences=True hands the full sequence of outputs to the next LSTM
model.add(LSTM(150, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(100))
# Linear activation: the layer emits raw logits, matching from_logits=True below
model.add(Dense(total_words, activation='linear'))

# Compile with a logits-based categorical cross-entropy and track accuracy
optimizer = Adam(learning_rate=0.001)
model.compile(
  loss=CategoricalCrossentropy(from_logits=True),
  optimizer=optimizer,
  metrics=['accuracy'],
)

In [72]:
# Fit for 50 epochs in batches of 128, scoring the held-out split after each epoch.
# NOTE(review): the recorded log shows val_loss rising from epoch 2 while the
# training loss keeps falling — consider an EarlyStopping callback, and a
# validation split that is separate from the test data used for evaluation.
history = model.fit(
  X_train,
  y_train,
  batch_size=128,
  epochs=50,
  validation_data=(X_test, y_test),
)

Epoch 1/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 121ms/step - accuracy: 0.0231 - loss: 7.4125 - val_accuracy: 0.0336 - val_loss: 6.7741
Epoch 2/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 142ms/step - accuracy: 0.0321 - loss: 6.6028 - val_accuracy: 0.0336 - val_loss: 6.8056
Epoch 3/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 118ms/step - accuracy: 0.0335 - loss: 6.4862 - val_accuracy: 0.0336 - val_loss: 6.8107
Epoch 4/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 118ms/step - accuracy: 0.0343 - loss: 6.4027 - val_accuracy: 0.0410 - val_loss: 6.8569
Epoch 5/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 135ms/step - accuracy: 0.0442 - loss: 6.3187 - val_accuracy: 0.0470 - val_loss: 6.8273
Epoch 6/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 117ms/step - accuracy: 0.0478 - loss: 6.2216 - val_accuracy: 0.0459 - val_loss: 6.8464
Epoch 7/50

In [73]:
# Show the model's layer summary
model.summary()

In [74]:
# Score the model on the held-out split; evaluate returns the loss followed by
# the compiled accuracy metric.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Model Loss: {loss}, Model Accuracy: {accuracy}')

[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.0663 - loss: 8.2921
Model Loss: 8.205774307250977, Model Accuracy: 0.06353215128183365


In [75]:
# Invert the tokenizer's word -> index mapping so an index can be looked up directly
reverse_word_index = {
  index: word for word, index in tokenizer.word_index.items()
}

# function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
  """Return the word the model scores highest as the continuation of `text`.

  Args:
    model: Trained model; `model.predict` is called on a padded batch of one.
    tokenizer: Fitted tokenizer. It is used both to encode `text` and to decode
      the predicted index, so the result always matches the tokenizer passed in
      rather than a module-level lookup table.
    text: Seed text to continue.
    max_sequence_len: Length of the padded training n-grams; the model input is
      one token shorter because the last token is the label.

  Returns:
    The predicted word, or None if the predicted index has no word in the
    tokenizer's index (for example index 0).
  """
  token_list = tokenizer.texts_to_sequences([text])[0]
  # Left-pad (or left-truncate, keeping the most recent tokens) to the input length
  token_list = pad_sequences(
    [token_list], maxlen=max_sequence_len-1, padding='pre', truncating='pre'
  )
  prediction = model.predict(token_list, verbose=0)
  # Single-row batch: take the arg-max over the vocabulary axis of row 0
  predicted_word_index = int(np.argmax(prediction, axis=1)[0])
  return tokenizer.index_word.get(predicted_word_index)

In [76]:
# Demo: ask the model for the word following a well-known line
input_text = "To be or not to be"
# The model input is one token shorter than the padded n-grams, hence the +1
max_sequence_len = 1 + model.input_shape[1]
predicted_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f'Predicted Next Word: {predicted_word}')

Predicted Next Word: made


In [15]:
# Write the model to a .keras file
model.save('nextWord_LSTM_model.keras')

# Pickle the fitted tokenizer alongside it so the same vocabulary can be reused
# NOTE(review): only unpickle this file from a trusted location — loading a
# pickle can execute arbitrary code.
import pickle
with open('tokenizer.pickle', 'wb') as tokenizer_file:
  pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

  return saving_lib.save_model(model, filepath)
