In [1]:
# Data collection: bring in the libraries, then make sure the corpus is on disk
import nltk
import pandas as pd
from nltk.corpus import gutenberg

# Fetches the Gutenberg corpus into the local nltk_data directory if it is missing
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\bhimr\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
# Pull the raw text of Hamlet out of the NLTK Gutenberg corpus
data = gutenberg.raw('shakespeare-hamlet.txt')

# Keep a local copy on disk; the preprocessing cell reads it back from there
with open('hamlet.txt', 'w') as out_file:
    out_file.write(data)

In [3]:
# data preprocessing 
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## Read the saved play back in and normalise it to lower case
with open('hamlet.txt', 'r') as in_file:
    text = in_file.read()
text = text.lower()

## Fit the tokenizer on the complete text to build the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One more than the vocabulary size, leaving room for index 0 (the padding value)
total_words = 1 + len(tokenizer.word_index)
total_words

4818

In [4]:
## Build the training n-grams: for every line of the play, emit each prefix
## of its token ids that is at least two tokens long (context + next word)
input_sequences = [
    token_ids[:prefix_end]
    for token_ids in tokenizer.texts_to_sequences(text.split('\n'))
    for prefix_end in range(2, len(token_ids) + 1)
]

In [5]:
## Length of the longest n-gram; used as the common width when padding
max_sequence_len = max(len(sequence) for sequence in input_sequences)
max_sequence_len

14

In [6]:
# Left-pad ('pre') every n-gram to max_sequence_len and stack them into one array
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

In [7]:
# Split every padded row into predictors (all tokens but the last)
# and label (the last token)
import tensorflow as tf
x = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [8]:
# Convert the labels into categorical (one-hot) vectors.
# The labels are taken from input_sequences instead of from y itself so the
# cell is idempotent: running it twice must not one-hot encode a matrix that
# is already one-hot encoded.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [9]:
# Train/test split (80% / 20%).
# A fixed random_state makes the split, and therefore the validation metrics
# reported during training, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

## GRU model (used in place of an LSTM)

In [10]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dropout, Dense

# Define the model with GRU.
# Every training row in x holds max_sequence_len - 1 tokens (the last token of
# each padded n-gram is the label), so that is the width the network consumes.
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_len - 1))
# return_sequences=False: only the final GRU state is passed on to the Dense layer
model.add(GRU(150, return_sequences=False))
model.add(Dropout(0.2))
# One softmax output per vocabulary entry, matching the one-hot labels
model.add(Dense(total_words, activation='softmax'))

# Model compiler
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Build explicitly so summary() can report shapes and parameter counts.
# The sequence dimension must equal the width of x (max_sequence_len - 1), not
# max_sequence_len; otherwise model.input_shape disagrees with the data the
# model is trained on. None leaves the batch size unspecified.
model.build(input_shape=(None, max_sequence_len - 1))
model.summary()




Explanation:

- GRU Layer: I replaced the LSTM layer with a GRU layer. Both are recurrent layers, but GRU uses a simplified gating mechanism compared to LSTM, making it less computationally expensive.
- Same Structure: The rest of the model remains the same (Embedding, Dropout, Dense layers) because only the recurrent layer type was changed.
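- Build Method: The model is built explicitly with model.build(input_shape=(None, sequence_length)) so that model.summary() can report output shapes and parameter counts. The sequence length must equal the width of the training inputs x, which is max_sequence_len - 1 because the last token of each padded sequence is used as the label.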
- The GRU model will likely have similar performance to the LSTM but may train faster due to fewer parameters.

In [11]:
## Fit the network; the held-out split is evaluated after every epoch
history = model.fit(
    x_train,
    y_train,
    epochs=85,
    validation_data=(x_test, y_test),
    verbose=1,
)

Epoch 1/85
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 33ms/step - accuracy: 0.0327 - loss: 7.1298 - val_accuracy: 0.0488 - val_loss: 6.6385
Epoch 2/85
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.0515 - loss: 6.3086 - val_accuracy: 0.0589 - val_loss: 6.5844
Epoch 3/85
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 28ms/step - accuracy: 0.0723 - loss: 5.9822 - val_accuracy: 0.0701 - val_loss: 6.6461
Epoch 4/85
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 28ms/step - accuracy: 0.0841 - loss: 5.6293 - val_accuracy: 0.0781 - val_loss: 6.7456
Epoch 5/85
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 29ms/step - accuracy: 0.1070 - loss: 5.2564 - val_accuracy: 0.0746 - val_loss: 6.8492
Epoch 6/85
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 29ms/step - accuracy: 0.1228 - loss: 4.9113 - val_accuracy: 0.0754 - val_loss: 7.0383
Epoch 7/85
[1m6

In [12]:
# Function to predict the next word 
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model considers most likely to follow ``text``.

    Args:
        model: Trained model exposing ``predict(batch, verbose=0)`` that returns
            one row of per-word scores for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (word -> integer id).
        text: Seed text to continue.
        max_sequence_len: Length of the padded training sequences including the
            label token; the model input is ``max_sequence_len - 1`` tokens wide.

    Returns:
        The predicted word, or ``None`` when ``text`` contains no word known to
        the tokenizer or when the predicted id maps to no word.
    """
    # Convert the input text into a sequence of token ids
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Nothing recognisable to condition on: an all-padding input could only
    # produce a context-free guess, so report "no prediction" instead.
    if not token_list:
        return None

    # Keep only the most recent tokens that fit into the model's input window
    input_width = max_sequence_len - 1
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-input_width:]

    # Pad on the left so the sequence matches the input shape used in training
    padded = pad_sequences([token_list], maxlen=input_width, padding='pre')

    # Highest-scoring vocabulary id for the single row in the batch
    predicted = model.predict(padded, verbose=0)
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Scan the vocabulary and stop at the first match instead of materialising
    # a reversed copy of the whole dictionary on every call
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [13]:
input_text = "To be or not to be"
print(f"Input text:{input_text}")
# Pass the max_sequence_len computed during preprocessing as-is. The model
# consumes max_sequence_len - 1 tokens, and reassigning the notebook-wide
# variable from model.input_shape here would silently change it for every
# earlier cell that is run again afterwards.
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next Word Prediction : {next_word}")

Input text:To be or not to be
Next Word Prediction : your


In [15]:
import pickle

# Persist the fitted tokenizer to disk using the newest pickle protocol
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
input_text = "What is the best "
print(f"Input text:{input_text}")
# Pass the max_sequence_len computed during preprocessing as-is. The model
# consumes max_sequence_len - 1 tokens, and reassigning the notebook-wide
# variable from model.input_shape here would silently change it for every
# earlier cell that is run again afterwards.
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next Word Prediction : {next_word}")

Input text:What is the best 
Next Word Prediction : marke


In [18]:
# Write the trained GRU model to disk in the native .keras format
model.save("next_word_gru.keras")