In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import random

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
SEED = 42
DATA_PATH = "/content/drive/MyDrive/sonnets.txt"
EPOCHS = 10
BATCH_SIZE = 32
sequence_length = 40  # window size: (sequence_length - 1) input chars + 1 target char

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# batch shuffling and the later sampling step are repeatable between runs.
tf.keras.utils.set_random_seed(SEED)

# Load text dataset (lower-cased so upper/lower case share one vocabulary entry).
with open(DATA_PATH, "r", encoding="utf-8") as file:
    text = file.read().lower()

# Tokenization: one integer id per distinct character.
# The corpus is passed as a one-element list (the documented "list of texts"
# form); the resulting character -> id mapping is the same as when the raw
# string is passed, but the tokenizer no longer treats every single character
# as a separate document.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text])
total_chars = len(tokenizer.word_index) + 1  # ids start at 1; slot 0 is left for padding

# Encode the whole corpus once instead of re-encoding every overlapping window.
# Characters missing from the vocabulary are dropped *before* windowing, so
# every window below is guaranteed to hold exactly `sequence_length` ids.
encoded_text = [tokenizer.word_index[ch] for ch in text if ch in tokenizer.word_index]

# Generate character sequences: every contiguous window of `sequence_length`
# ids. The upper bound is len + 1 so the final window (whose target is the
# very last character of the corpus) is included as well.
sequences = [
    encoded_text[end - sequence_length:end]
    for end in range(sequence_length, len(encoded_text) + 1)
]

# Ensure sequences are not empty (e.g. the corpus is shorter than one window).
if not sequences:
    raise ValueError("No valid sequences generated. Check text preprocessing.")

# All windows have equal length, so this is a regular 2-D integer matrix of
# shape (num_windows, sequence_length); no object dtype or stacking is needed.
data = np.array(sequences, dtype=np.int32)
assert data.ndim == 2 and data.shape[1] == sequence_length

# Prepare input and output: the first (sequence_length - 1) ids of each window
# are the model input and the last id is the character to predict.
# No padding is required because every row already has the full length.
x = np.ascontiguousarray(data[:, :-1])
y = tf.keras.utils.to_categorical(data[:, -1], num_classes=total_chars)

# Define the LSTM model. The input shape is declared with an explicit Input
# layer rather than the `input_length` argument of Embedding, which Keras 3
# reports as deprecated.
model = Sequential([
    Input(shape=(sequence_length - 1,)),
    Embedding(input_dim=total_chars, output_dim=32),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dense(total_chars, activation='softmax'),
])

# Compile model: one-hot targets -> categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(x, y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

# Text Generation function
def _sample_next_index(preds, temperature):
    """Draw one class index from `preds` after temperature re-scaling.

    Index 0 is excluded from the draw: the caller uses it only as the padding
    id, which has no character attached to it.
    """
    probs = np.asarray(preds, dtype='float64')
    # Clamp before the log so exact zeros do not yield -inf plus a RuntimeWarning.
    logits = np.log(np.maximum(probs, np.finfo('float64').tiny)) / temperature
    logits[0] = -np.inf  # never sample the padding slot
    # Shift by the maximum so exp() cannot underflow every entry to 0 at small
    # temperatures (which would turn the normalisation into 0 / 0 = NaN).
    logits -= np.max(logits)
    weights = np.exp(logits)
    weights /= np.sum(weights)
    return int(np.random.choice(len(weights), p=weights))


def generate_text(seed_text, next_chars=100, temperature=1.0):
    """Extend `seed_text` one sampled character at a time using `model`.

    Args:
        seed_text: Text that primes the model. It is returned unchanged at the
            start of the result; for encoding it is lower-cased, matching the
            lower-cased training text, and characters unknown to `tokenizer`
            are ignored.
        next_chars: Number of sampling steps to run.
        temperature: Positive scaling factor applied to the log-probabilities
            before sampling. Values below 1 sharpen the distribution, values
            above 1 flatten it.

    Returns:
        `seed_text` followed by the sampled characters.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be strictly positive")

    window = sequence_length - 1
    char_to_id = tokenizer.word_index

    # Keep a running list of ids so the context is not re-encoded on each step.
    context_ids = [char_to_id[ch] for ch in seed_text.lower() if ch in char_to_id]
    pieces = [seed_text]

    for _ in range(next_chars):
        # Left-pad (or truncate) the most recent ids to the model's input length.
        seq = pad_sequences([context_ids[-window:]], maxlen=window, padding='pre')
        preds = model.predict(seq, verbose=0)[0]
        next_index = _sample_next_index(preds, temperature)
        next_char = tokenizer.index_word.get(next_index, '')
        if next_char:
            context_ids.append(next_index)
            pieces.append(next_char)

    return ''.join(pieces)

# Demo: prime the model with the opening line of Sonnet 18 and print the
# seed followed by 200 sampled characters at a slightly conservative temperature.
seed = "shall i compare thee to a summer's day?"
sample_text = generate_text(seed, next_chars=200, temperature=0.8)
print(sample_text)




Epoch 1/10
2989/2989 ━━━━━━━━━━━━━━━━━━━━ 35s 10ms/step - accuracy: 0.2384 - loss: 2.7328
Epoch 2/10
2989/2989 ━━━━━━━━━━━━━━━━━━━━ 25s 8ms/step - accuracy: 0.3747 - loss: 2.1343
Epoch 3/10
2989/2989 ━━━━━━━━━━━━━━━━━━━━ 39s 8ms/step - accuracy: 0.4142 - loss: 1.9721
Epoch 4/10
2989/2989 ━━━━━━━━━━━━━━━━━━━━ 22s 7ms/step - accuracy: 0.4388 - loss: 1.8857
Epoch 5/10
2989/2989 ━━━━━━━━━━━━━━━━━━━━ 42s 8ms/step - accuracy: 0.4548 - loss: 1.8126
Epoch 6/10
2989/2989 ━━━━━━━━━━━━━━━━━━━━ 41s 8ms/step - accuracy: 0.4736 - loss: 1.7409
Epoch 7/10
2989/2989 ━━━━━━━━━━━━━━━━━━━━ 40s 7ms/step - accuracy: 0.4810 - loss: 1.7097
Epoch 8/10
2989/2989 ━━━━━━━━━━━━━━━━━━━━ 41s 7ms/step - accuracy: 0.4888 - loss: 1.6789
Epoch 9/10
[1m