In [16]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

In [None]:
# --- Step 1: Prepare the Text Data ---
# Load the complete Hamlet corpus into memory as one UTF-8 decoded string.
# The context manager closes the file handle as soon as the read finishes.

text_file_path = 'shakespeare-hamlet.txt'
with open(text_file_path, mode='r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()

In [18]:
# Lowercase the whole corpus so the same word in different cases maps to a
# single vocabulary entry.
# NOTE(review): Keras' Tokenizer lowercases by default, so this step may be
# redundant — confirm before removing.

cleaned_text = raw_text.lower()

In [19]:
# --- Step 2: Create and Configure Tokenizer ---
# Instantiate a Keras Tokenizer with its default settings (no arguments are
# passed, so no vocabulary cap or custom filters are configured here).

text_tokenizer = Tokenizer()

In [20]:
# Fit the tokenizer on the cleaned corpus, passed as a one-element list of
# texts. This builds text_tokenizer.word_index (word -> integer index), which
# the following cells read.

text_tokenizer.fit_on_texts([cleaned_text])

In [6]:
# Vocabulary size: the number of distinct words in the tokenizer's word_index,
# plus one extra slot.
total_words = len(text_tokenizer.word_index) + 1  # +1 for padding index

In [None]:
# --- Step 3: Load Pre-trained Model ---
# Load the saved Keras model from a path relative to the current working
# directory.
# NOTE(review): presumably this model was trained with a tokenizer fitted on
# the same text — confirm, since predicted indices are decoded with the
# tokenizer built in this notebook.

model = load_model('./Bidirectional_LSTM.keras')

In [22]:
# --- Step 4: Build the Reverse Lookup ---
# Invert the tokenizer's word -> index table so that a predicted index can be
# turned back into its word.

reverse_word_mapping = {
    token_id: token
    for token, token_id in text_tokenizer.word_index.items()
}

In [24]:
# Read the required input length from the loaded model instead of hard-coding
# it: this takes the second entry of the model's declared input shape.
# NOTE(review): assumes input_shape is (batch, timesteps[, ...]) — confirm.

sequence_length = model.input_shape[1] # Model's expected input size

In [13]:
# --- Step 5: Set Prediction Parameters ---
# Tunable inputs for the generation loop: the seed phrase that is extended and
# how many words to append to it.
# NOTE(review): the seed ends in "a an" — confirm this is intentional and not
# a typo.
starting_text = "i am a an"  # Initial seed text
words_to_predict = 1       # Number of words to generate

In [25]:
# --- Step 6: Generate Words Step-by-Step ---
# Working copy of the text: it starts as the seed and is extended by the
# generation loop, while starting_text is kept unchanged for the final report.

current_text = starting_text

In [15]:
# Extend current_text one word at a time: tokenize the text so far, fit it to
# the model's fixed input length, predict, and append the most probable word.
for prediction_step in range(words_to_predict):

    # Convert current text to numerical tokens
    token_sequence = text_tokenizer.texts_to_sequences([current_text])[0]

    # Fit the tokens to the model's input length in a single call:
    #   - shorter sequences are zero-padded at the end (padding='post');
    #   - longer sequences keep only their last `sequence_length` tokens
    #     (truncating='pre').
    # pad_sequences always returns a 2-D NumPy array of shape
    # (1, sequence_length), i.e. already a batch of one. The earlier manual
    # slice for long inputs produced a plain Python list, which has no
    # .reshape and crashed once the text reached sequence_length tokens.
    model_input = pad_sequences(
        [token_sequence],
        maxlen=sequence_length,
        padding='post',
        truncating='pre',
    )

    # Get prediction probabilities from model
    word_probabilities = model.predict(model_input, verbose=0)

    # Take the probabilities at the last position of the first (only) batch item.
    # NOTE(review): with padding='post' the last position comes after the zero
    # padding, not after the last real token — confirm this matches how the
    # model was trained (pre-padding or indexing the last real token may be
    # what is actually wanted).
    next_word_probs = word_probabilities[0][-1]

    # Prevent selection of padding token (index 0)
    next_word_probs[0] = 0

    # Find highest probability word index (as a plain int for the dict lookup)
    predicted_index = int(np.argmax(next_word_probs))

    # Convert index to actual word
    new_word = reverse_word_mapping.get(predicted_index, "")
    if not new_word:
        # The index has no entry in the vocabulary lookup. Stop generating
        # rather than appending an empty word (which would only add a trailing
        # space and then repeat the same prediction).
        break

    # Update text with new word
    current_text += " " + new_word

# --- Step 7: Display Results ---
print("Starting text:", starting_text)
print("Predicted text:", current_text)
print("\nExplanation:")
print("1. Loaded and preprocessed Shakespeare's Hamlet text")
print("2. Created tokenizer with vocabulary of", total_words, "words")
print("3. Model expects input sequences of", sequence_length, "tokens")

Starting text: i am a an
Predicted text: i am a an an

Explanation:
1. Loaded and preprocessed Shakespeare's Hamlet text
2. Created tokenizer with vocabulary of 4818 words
3. Model expects input sequences of 13 tokens
