In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Loading Shakespeare dataset

In [4]:
# Read the whole corpus into memory and lower-case it so that the
# vocabulary built later is case-insensitive.
with open("dataset.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

print("Total characters in raw dataset:", len(text))

Total characters in raw dataset: 5458199


# Remove Gutenberg header text

In [22]:
# Drop the front matter: keep everything from the first occurrence of the
# marker onward. Re-running this cell is safe, because after trimming the
# marker sits at index 0.
header_marker = "the sonnets"
start_index = text.find(header_marker)

if start_index == -1:
    # str.find returns -1 when the marker is missing; slicing with -1 would
    # silently reduce the corpus to its last character, so leave it untouched.
    print("Warning: marker", repr(header_marker), "not found; text left unchanged")
else:
    text = text[start_index:]

print("Total characters after cleaning:", len(text))

Total characters after cleaning: 5447737


In [6]:
# Build the word-level vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One extra slot on top of the learned word ids (the vocabulary size used
# by the embedding and output layers later on).
total_words = 1 + len(tokenizer.word_index)
print("Total unique words:", total_words)

Total unique words: 27141


In [8]:
# For every line of the corpus, emit each prefix of its token ids that has
# at least two tokens: the last token of a prefix is the prediction target
# and the tokens before it are the context.
input_sequences = []

for raw_line in text.split("\n"):
    line_ids = tokenizer.texts_to_sequences([raw_line])[0]
    input_sequences.extend(
        line_ids[:end] for end in range(2, len(line_ids) + 1)
    )

In [10]:
# Left-pad every prefix to the length of the longest one so the target
# token always sits in the final column.
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Everything but the last column is the context; the last column is the label.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

print("Input shape:", X.shape)
print("Label shape:", y.shape)

Input shape: (790577, 16)
Label shape: (790577,)


In [11]:
# Cap the training set at the first 200000 samples (inputs and labels are
# sliced identically so they stay aligned).
X, y = X[:200000], y[:200000]

print("Reduced input shape:", X.shape)
print("Reduced label shape:", y.shape)

Reduced input shape: (200000, 16)
Reduced label shape: (200000,)


In [13]:
# Next-word prediction network:
# word ids -> 100-d embeddings -> LSTM(150) -> softmax over the vocabulary.
model = Sequential([
    # Declare the input shape with an explicit Input instead of the
    # Embedding `input_length` argument, which recent Keras releases
    # deprecate. With a known input shape the model is built immediately,
    # so model.summary() below can report output shapes and parameter counts.
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(total_words, 100),
    LSTM(150),
    Dense(total_words, activation='softmax')
])

model.compile(
    # The labels in y are integer word ids, so the sparse variant of the
    # cross-entropy is used (no one-hot encoding of the targets needed).
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()

In [14]:
# Train on the capped dataset. The History object returned by fit() is bound
# to a name so the per-epoch loss/accuracy stay available to later cells
# instead of only appearing as an object repr in the cell output.
history = model.fit(
    X,
    y,
    epochs=10,
    batch_size=128,
    verbose=1
)


Epoch 1/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 113ms/step - accuracy: 0.0398 - loss: 6.8950
Epoch 2/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 113ms/step - accuracy: 0.0697 - loss: 6.3478
Epoch 3/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 113ms/step - accuracy: 0.0970 - loss: 6.0129
Epoch 4/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 122ms/step - accuracy: 0.1121 - loss: 5.7706
Epoch 5/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 119ms/step - accuracy: 0.1234 - loss: 5.5701
Epoch 6/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 115ms/step - accuracy: 0.1297 - loss: 5.3858
Epoch 7/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 112ms/step - accuracy: 0.1361 - loss: 5.2118
Epoch 8/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 111ms/step - accuracy: 0.1426 - loss:

<keras.src.callbacks.history.History at 0x1dc1bd95550>

In [20]:
def generate_text_with_temperature(seed_text, next_words=30, temperature=0.8):
    """Extend ``seed_text`` word by word by sampling from the trained model.

    At each step the current text is tokenized, left-padded to the model's
    input length, and fed to ``model``; the predicted distribution is
    rescaled by ``temperature`` and one word id is sampled from it.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``max_sequence_len``.

    Args:
        seed_text: Starting text; generated words are appended to it.
        next_words: Number of sampling steps to perform.
        temperature: Positive scaling factor for the log-probabilities.
            Values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        ``seed_text`` followed by the sampled words, separated by spaces.
        A step whose sampled id has no vocabulary entry appends nothing,
        so the result may contain fewer than ``next_words`` new words.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences(
            [token_list],
            maxlen=max_sequence_len - 1,
            padding='pre'
        )

        probabilities = model.predict(token_list, verbose=0)[0]

        # Temperature scaling is done in float64 and shifted by the maximum
        # before exponentiating; otherwise small temperatures can underflow
        # every term to zero and produce NaN probabilities.
        scaled = np.log(np.asarray(probabilities, dtype=np.float64) + 1e-9) / temperature
        scaled -= scaled.max()
        exp_preds = np.exp(scaled)
        probabilities = exp_preds / exp_preds.sum()

        predicted_index = int(np.random.choice(len(probabilities), p=probabilities))

        # Direct id -> word lookup instead of scanning word_index linearly
        # for every generated word.
        word = tokenizer.index_word.get(predicted_index)
        if word is not None:
            seed_text += " " + word

    return seed_text

In [21]:
seed_text = "shall i compare thee to a summer"

# Generate 30 words from the same seed at three temperatures; each entry
# pairs the heading to print with the temperature to sample at.
for heading, temp in (
    ("Temperature = 0.5 (More Safe)", 0.5),
    ("\nTemperature = 0.8 (Balanced)", 0.8),
    ("\nTemperature = 1.2 (More Creative)", 1.2),
):
    print(heading)
    print(generate_text_with_temperature(seed_text, 30, temperature=temp))

Temperature = 0.5 (More Safe)
shall i compare thee to a summer and i know it is a good good lord and you sir john i 'my wife to me and i am the chain of me there is it the queen

Temperature = 0.8 (Balanced)
shall i compare thee to a summer world with good mind i will please you none no i heard it for before you for my view he embark'd and you have done sir for't is i am

Temperature = 1.2 (More Creative)
shall i compare thee to a summer helen half possess'd hard good let i see antony god's sake hath prove my dearest steed then unto him still we publish seek him as your own semblance follow saying


Lower temperature values generate safer and more repetitive text,
while higher values increase creativity at the cost of grammatical correctness.
A temperature of 0.8 provides the best balance between coherence and diversity.

Due to hardware limitations, training used only the first 200,000 of the
790,577 input sequences; this subset is assumed to be large enough to preserve
the overall language structure and learning behavior.