In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Data Collection

In [3]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

## loading the dataset
data = gutenberg.raw("blake-poems.txt")

##Saving to a file
with open("blake-poems.txt","w") as file:
    file.write(data)


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\srish\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


### Data Preprocessing

In [4]:
## Import necessary libraries for text preprocessing and data splitting
from tensorflow.keras.preprocessing.text import Tokenizer  # Tokenizer for converting text to numerical sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences  # For padding sequences to the same length
from sklearn.model_selection import train_test_split  # For splitting dataset into training and testing sets

## Loading the dataset (Jane Austen's 'Emma')
with open('blake-poems.txt', "r") as file:  # Open the text file in read mode
    text = file.read().lower()  # Read the entire text and convert it to lowercase to standardize the data
    
## Tokenizing the text
tokenizer = Tokenizer()  # Initialize the tokenizer
tokenizer.fit_on_texts([text])  # Create a dictionary where each unique word in the text is assigned a unique integer ID
total_words = len(tokenizer.word_index) + 1  # Get the total number of unique words (vocabulary size), adding 1 because Keras starts indexing at 1

print(total_words)


1551


In [5]:
## Creating input sequences
input_sequences = []  # Initialize an empty list to store input sequences

# Split the text into lines, and for each line, create tokenized sequences
for line in text.split('/n'):  # Split the text at every newline ('\n') character, treating each line as a separate sequence
    token_list = tokenizer.texts_to_sequences([line])[0]  # Convert each line into a sequence of tokens (integers) using the tokenizer
    # For each tokenized line, generate n-gram sequences
    for i in range(1, len(token_list)):  # Iterate over the tokenized line, starting from the second token
        n_gram_sequence = token_list[:i+1]  # Create an n-gram sequence: slice the token list from the beginning up to the (i+1)-th token
        input_sequences.append(n_gram_sequence)  # Add the generated n-gram sequence to the input_sequences list

In [7]:
##Pad sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Assuming input_sequences is a list of sequences, where each sequence is a list of integers (e.g., word indices).
# Example: input_sequences = [[1, 2, 3], [4, 5], [6]]
# The goal is to pad all sequences to a uniform length.

# Step 1: Find the maximum sequence length among all input sequences.
# This will determine the length to which all sequences will be padded.
max_sequence_len = max([len(x) for x in input_sequences])

# Step 2: Pad sequences.
# 'pad_sequences' pads each sequence to the same length (max_sequence_len).
# 'padding="pre"' adds padding at the beginning of each sequence, meaning shorter sequences will be padded with zeros in front.
# 'maxlen' is the argument that specifies the maximum length for padding.
# The result is converted to a NumPy array for compatibility with further processing.
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Step 3: Display the padded sequences.
# The output will be an array where all sequences have the same length (max_sequence_len),
# with shorter sequences padded with zeros at the beginning.
input_sequences


array([[  0,   0,   0, ...,   0, 717,  42],
       [  0,   0,   0, ..., 717,  42, 460],
       [  0,   0,   0, ...,  42, 460, 346],
       ...,
       [  0,   0, 717, ..., 249,   1, 130],
       [  0, 717,  42, ...,   1, 130,   3],
       [717,  42, 460, ..., 130,   3, 344]])

In [8]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Step 1: Create predictors (x) and labels (y) from the padded sequences.
# Predictors (x) are all elements of the sequences except the last one (features).
# Labels (y) are the last element of each sequence (targets).
x, y = input_sequences[:, :-1], input_sequences[:, -1]

# Step 2: Convert labels to categorical format.
# 'tf.keras.utils.to_categorical' converts integer labels to one-hot encoded vectors.
# 'num_classes' specifies the total number of classes (e.g., total number of unique words).
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Step 3: Split the data into training and testing sets.
# 'train_test_split' divides the data into training and testing sets based on the 'test_size' ratio.
# Here, 20% of the data is used for testing, and the rest is used for training.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)


### Training the model

In [1]:
from tensorflow.keras.callback import EarlyStopping
early_stopping=EarlyStopping(monitor='val_loss',patience=3,restore_best_weights=True)




ModuleNotFoundError: No module named 'tensorflow.keras.callback'

In [53]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 6838, 100)         155100    
                                                                 
 lstm_8 (LSTM)               (None, 200)               240800    
                                                                 
 dropout_8 (Dropout)         (None, 200)               0         
                                                                 
 dense_8 (Dense)             (None, 1551)              311751    
                                                                 
Total params: 707651 (2.70 MB)
Trainable params: 707651 (2.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [54]:
history=model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1,callbacks=[early_stopping])

Epoch 1/50


  4/171 [..............................] - ETA: 3:39:20 - loss: 7.3457 - accuracy: 0.0234  

In [9]:
# Import necessary modules from tensorflow.keras

# Sequential: Allows us to build a model layer by layer in sequence.
# Embedding: Converts integer-encoded words into dense vectors of fixed size (word embeddings).
# LSTM: Long Short-Term Memory layer for processing sequences.
# Dense: Fully connected layer for making predictions.
# Dropout: Regularization technique that randomly sets some neurons to zero during training to prevent overfitting.
# Bidirectional: Wraps an LSTM layer to process the input sequence in both forward and backward directions.
# BatchNormalization: Normalizes the output of the previous layer to stabilize training.
# Adam: Optimizer that adapts the learning rate during training.
# EarlyStopping: Stops training when the validation performance starts to degrade to prevent overfitting.
# ReduceLROnPlateau: Reduces the learning rate when the model performance plateaus to fine-tune training.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Initialize a Sequential model
# Sequential: Initializes a model where you can stack layers sequentially.
model = Sequential()

# Add an Embedding layer
# Converts words into dense vectors (embeddings).
# total_words: Defines the size of the vocabulary (number of unique words).
# 128: Specifies the dimension of each word vector (embedding size).
# input_length: Defines the maximum length of input sequences (each input will have this fixed length).
model.add(Embedding(total_words, 128, input_length=max_sequence_len-1))

# Add a Bidirectional LSTM layer
# Processes input sequences from both forward and backward directions, improving the context understanding.
# 256: Specifies the number of LSTM units (output dimensions).
# return_sequences=True: Ensures that the full sequence of outputs is returned, which is necessary for stacking additional LSTM layers.
model.add(Bidirectional(LSTM(100)))

# Add a Dropout layer for regularization
# 0.2: Specifies the dropout rate, meaning 20% of the neurons will be randomly set to 0 during training to prevent overfitting and improve generalization.
model.add(Dropout(0.2))

# Add a Batch Normalization layer
# Normalizes the output of the previous layer, which helps to stabilize and accelerate training by reducing the internal covariate shift.
model.add(BatchNormalization())

# Add another Dropout layer for regularization
# 0.2: Another Dropout layer with a 0.2 rate to further reduce overfitting.
model.add(Dropout(0.2))

# Add a Dense layer
# 128: Fully connected layer with 128 units and ReLU activation, which helps to learn complex representations from the LSTM output.
model.add(Dense(128, activation='relu'))

# Add the output Dense layer
# total_words: Output layer with units equal to the vocabulary size, using the softmax activation function for multi-class classification.
model.add(Dense(total_words, activation='softmax'))

# Compile the model
# loss="categorical_crossentropy": Loss function for multi-class classification, suitable for one-hot encoded labels.
# optimizer=Adam(learning_rate=0.001): Adam optimizer with a custom learning rate.
# metrics=["accuracy"]: Specifies accuracy as the metric to evaluate the model's performance during training and testing.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# Set up early stopping
# Stops training when the validation loss stops improving, preventing overfitting by not over-training.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Set up learning rate scheduler
# Reduces the learning rate when the validation loss plateaus, helping the model fine-tune its weights when progress slows down.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 6838, 128)         198528    
                                                                 
 bidirectional (Bidirection  (None, 200)               183200    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 batch_normalization (Batch  (None, 200)               800       
 Normalization)                                                  
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense (Dense)               (None, 128)               

In [10]:

# Train the model
# epochs=20: The model will train for up to 20 epochs, but early stopping may halt it sooner.
# validation_data=(X_val, y_val): Uses a validation set to monitor model performance during training.
# callbacks=[early_stopping, lr_scheduler]: Applies the early stopping and learning rate reduction callbacks during training.
history = model.fit(x_train, y_train, epochs=20, validation_data=(x_test, y_test), callbacks=[early_stopping, lr_scheduler])
model.summary()

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 6838, 128)         198528    
                                                                 
 bidirectional (Bidirection  (None, 200)               183200    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 batch_normalization (Batch  (None, 200)               800       
 Normalization)                                                  
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                           

In [11]:
history = model.fit(x_train, y_train, epochs=100, validation_data=(x_test, y_test), callbacks=[early_stopping, lr_scheduler])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


In [14]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    # Tokenize the input text to convert it into a sequence of integers
    token_list = tokenizer.texts_to_sequences([text])[0]
    
    # If the sequence is longer than or equal to max_sequence_len, truncate it to the last max_sequence_len-1 tokens
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  # Ensure the sequence length matches max_sequence_len-1
    
    # Pad the sequence to ensure it's of length max_sequence_len-1
    # Padding is applied at the beginning ('pre') if the sequence is shorter
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    
    # Use the model to predict the next word, based on the tokenized input
    predicted = model.predict(token_list, verbose=0)
    
    # Get the index of the predicted word by finding the highest probability in the model's output
    predicted_word_index = np.argmax(predicted, axis=1)
    
    # Loop through the tokenizer's word index to find the word corresponding to the predicted index
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word  # Return the predicted word
    
    # If no word is found (which is rare), return None
    return None


In [16]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    # Tokenize the input text to convert it into a sequence of integers
    token_list = tokenizer.texts_to_sequences([text])[0]
    
    # If the sequence is longer than or equal to max_sequence_len, truncate it to the last max_sequence_len-1 tokens
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  # Ensure the sequence length matches max_sequence_len-1
    
    # Pad the sequence to ensure it's of length max_sequence_len-1
    # Padding is applied at the beginning ('pre') if the sequence is shorter
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    
    # Use the model to predict the next word, based on the tokenized input
    predicted = model.predict(token_list, verbose=0)
    
    # Get the index of the predicted word by finding the highest probability in the model's output
    predicted_word_index = np.argmax(predicted, axis=1)
    
    # Loop through the tokenizer's word index to find the word corresponding to the predicted index
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word  # Return the predicted word
    
    # If no word is found (which is rare), return None
    return None

# Example usage
input_text = "Summer breeze"
print(f"Input text: {input_text}")

# The maximum sequence length is determined based on the model's input shape
max_sequence_len = model.input_shape[1] + 1  # Adding 1 because model input is typically one less than sequence length

# Predict the next word
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next Word Prediction: {next_word}")


Input text: Summer breeze
Next Word Prediction: by


In [17]:
# Save the trained model
model.save("next_word_lstm.h5")  # The model is saved in HDF5 format with the filename "next_word_lstm.h5"

# Save the tokenizer using pickle
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    # The tokenizer is saved using pickle for later use
    # The highest protocol ensures efficient saving and loading
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


  saving_api.save_model(


In [19]:
# Input text for prediction
input_text = "  On the echoing"
print(f"Input text: {input_text}")

# Determine the maximum sequence length from the model's input shape
max_sequence_len = model.input_shape[1] + 1  # Adding 1 to match sequence length for prediction

# Predict the next word based on the input text
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)

# Print the predicted next word
print(f"Next Word Prediction: {next_word}")


Input text:   On the echoing
Next Word Prediction: night
