In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import LearningRateScheduler
import logging
import os

# Setup logging for debugging and performance tracking
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Specify the path to your text file
file_path = r"1661-0 (1).txt"

# Step 1: Load and preprocess the text data with exception handling
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        book_text = file.read()
except FileNotFoundError as e:
    # Record the failure in the log, then re-raise: nothing below can run
    # without the corpus.
    logging.error(f"File not found: {e}")
    raise

# Tokenize the text. The Keras Tokenizer never assigns index 0 to a word
# (it is reserved for padding), so the model needs len(word_index) + 1 classes.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([book_text])
total_words = len(tokenizer.word_index) + 1

# Create input sequences and labels
input_sequences = tokenizer.texts_to_sequences([book_text])[0]
max_sequence_length = 50  # For example
# The model reads max_sequence_length - 1 context tokens and predicts the
# token that follows them.
input_sequence_length = max_sequence_length - 1

# Fail early with a clear message instead of an opaque IndexError when the
# corpus is too short to produce even one training window.
if len(input_sequences) <= max_sequence_length:
    raise ValueError(
        f"Corpus has only {len(input_sequences)} tokens; more than "
        f"{max_sequence_length} are required to build a training window."
    )

# Each row holds input_sequence_length context tokens followed by the target
# token. Slicing the exact width here replaces the previous approach of
# building a window one token too wide and relying on pad_sequences to
# silently truncate it; the resulting X and y values are unchanged.
sequences = np.array(
    [
        input_sequences[i - input_sequence_length : i + 1]
        for i in range(max_sequence_length, len(input_sequences))
    ],
    dtype=np.int32,
)
X, y = sequences[:, :-1], sequences[:, -1]

# NOTE(review): this materializes a dense (num_samples, total_words) one-hot
# matrix. Integer labels with loss='sparse_categorical_crossentropy' would
# avoid that, but the compile() call further down would have to change too.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Step 2: Dynamic learning rate scheduler
def lr_schedule(epoch, lr):
    """Return the learning rate to use for the given epoch.

    Args:
        epoch: Epoch index supplied by the scheduler callback.
        lr: Learning rate currently in effect.

    Returns:
        ``lr`` unchanged unless ``epoch`` is greater than 10, in which case
        ``lr`` multiplied by 0.95.
    """
    if epoch > 10:
        decayed = lr * 0.95
        return decayed
    return lr

# Define the input layer: one row of input_sequence_length token indices
input_layer = Input(shape=(input_sequence_length,))

# Step 3: Embedding and LSTM layers
# The sequence length is already fixed by input_layer, so the deprecated
# `input_length` argument is not passed to Embedding.
embedding_layer = Embedding(total_words, 100)(input_layer)
# return_sequences=True keeps one output per timestep for the attention layer.
lstm_output = LSTM(150, return_sequences=True)(embedding_layer)

# Step 4: Attention mechanism (query = value = lstm_output)
attention = Attention()([lstm_output, lstm_output])  # Use the same LSTM output as both query and value

# Concatenate the attention output and LSTM output
concatenated = Concatenate()([lstm_output, attention])

# Step 5: Another LSTM layer and Dense output
# This LSTM returns only its final state, which the softmax layer maps to a
# probability for each of the total_words vocabulary indices.
lstm_output_2 = LSTM(100)(concatenated)
output_layer = Dense(total_words, activation='softmax')(lstm_output_2)

# Step 6: Create and compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Add learning rate scheduler
lr_scheduler = LearningRateScheduler(lr_schedule)

# Step 7: Train the model with callbacks and logging
logging.info("Starting model training...")
model.fit(X, y, epochs=50, verbose=1, callbacks=[lr_scheduler])

# Step 8: Save model weights under a versioned file name
model_version = "v1.1"
model_dir = f'models/word_generate_model_{model_version}.h5'
# exist_ok=True avoids the check-then-create race of testing for the
# directory first.
os.makedirs('models', exist_ok=True)

# Only the weights are written; the architecture above must be rebuilt in
# code before they can be loaded again.
# NOTE(review): Keras 3 requires save_weights paths to end in ".weights.h5" -
# confirm the target Keras version before upgrading.
model.save_weights(model_dir)
logging.info(f"Model saved at {model_dir}")

# Save model tokenizer for future predictions
tokenizer_json = tokenizer.to_json()
with open(f'tokenizer_{model_version}.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)
logging.info("Tokenizer saved.")

# Step 9: Generate text with better prediction using beam search
def beam_search(seed_text, num_words, model, max_sequence_length, beam_width=3):
    """Extend ``seed_text`` by ``num_words`` words using beam search.

    At every step each live hypothesis is extended with its ``beam_width``
    most probable next words, and the ``beam_width`` hypotheses with the
    lowest cumulative negative log-probability are kept.

    Text/index conversion uses the module-level ``tokenizer``.

    Args:
        seed_text: Text the generation starts from.
        num_words: Number of words to append to ``seed_text``.
        model: Model whose ``predict`` returns, for each padded input row,
            one probability per vocabulary index.
        max_sequence_length: Inputs are padded or truncated at the front to
            ``max_sequence_length - 1`` tokens.
        beam_width: Number of hypotheses kept after each step.

    Returns:
        The text of the best (lowest-cost) hypothesis.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
    """
    # With beam_width == 0 the slice [-0:] would select the whole vocabulary
    # and the function would later fail on an empty beam list.
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")

    # Each beam is [text, cost]; cost is a cumulative negative
    # log-probability, so lower is better.
    beams = [[seed_text, 0.0]]

    for _ in range(num_words):
        # Score all live beams with a single predict call per step.
        texts = [text for text, _ in beams]
        batch = pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=max_sequence_length - 1,
            padding='pre',
        )
        batch_probabilities = model.predict(batch, verbose=0)

        all_candidates = []
        for (text, cost), probabilities in zip(beams, batch_probabilities):
            # Index 0 is the padding index and has no entry in
            # tokenizer.index_word, so only indices 1.. are ranked.
            top_predictions = np.argsort(probabilities[1:])[-beam_width:] + 1

            for pred in top_predictions:
                word = tokenizer.index_word[int(pred)]
                # Floor the probability so an underflowed softmax output
                # cannot produce log(0) and an infinite cost.
                step_cost = -np.log(max(float(probabilities[pred]), 1e-12))
                all_candidates.append([text + ' ' + word, cost + step_cost])

        # Keep the beam_width candidates with the lowest cost.
        beams = sorted(all_candidates, key=lambda candidate: candidate[1])[:beam_width]

    return beams[0][0]

# Run beam search from a short seed phrase, append ten words, and print the result
generated_text = beam_search(
    seed_text="This is",
    num_words=10,
    model=model,
    max_sequence_length=max_sequence_length,
)
print("Generated Text:", generated_text)





2024-09-11 18:57:49,074 - From C:\Users\sikha\anaconda3\Lib\site-packages\keras\src\optimizers\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

2024-09-11 18:57:49,140 - Starting model training...


Epoch 1/50



2024-09-11 18:58:08,675 - From C:\Users\sikha\anaconda3\Lib\site-packages\keras\src\utils\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead.






2024-09-11 18:58:11,411 - From C:\Users\sikha\anaconda3\Lib\site-packages\keras\src\engine\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead.



Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


2024-09-12 00:45:00,508 - Model saved at models/word_generate_model_v1.1.h5
2024-09-12 00:45:00,708 - Tokenizer saved.


Generated Text: This is became improved since i came down under this morning and
