In [None]:
import numpy as np
import tensorflow as tf

# Enumerate the GPUs TensorFlow can see; the list is empty on CPU-only machines.
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. The setting is applied to every GPU (TensorFlow requires it to be the
# same on all devices) and the loop is simply skipped when no GPU is present.
for gpu in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPUs are initialised,
        # so re-running this cell in a live kernel ends up here.
        print(f"Could not enable memory growth for {gpu.name}: {err}")

In [None]:
import io

# Location of the training corpus, resolved against the current working directory.
file_path = 'sample_text.txt'

# Read the whole corpus into memory as one UTF-8 decoded string.
with io.open(file_path, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
import re
import string

# Lower-case the corpus so upper/lower variants share one vocabulary entry.
normalized = text.lower()

# Turn line breaks into spaces ...
for line_break in ('\n', '\r'):
    normalized = normalized.replace(line_break, ' ')

# ... then squeeze every run of whitespace to a single space and trim the ends.
text = re.sub(r'\s+', ' ', normalized).strip()

In [None]:
# Sorting the distinct characters gives every run the same id assignment.
chars = sorted(set(text))
vocabulary_size = len(chars)

# Lookup tables in both directions: id -> character and character -> id.
int_to_char = dict(enumerate(chars))
char_to_int = {symbol: index for index, symbol in int_to_char.items()}

print(f"Total unique characters (vocabulary size): {vocabulary_size}")

In [None]:
# Number of characters the model sees before predicting the next one.
sequence_length = 70

# Encode the corpus once; every training window is then a slice of this list.
encoded_text = [char_to_int[char] for char in text]

# One window per start offset (stride of 1 character). Each input window is
# paired with the id of the character that immediately follows it.
window_starts = range(len(text) - sequence_length)
data_X = [encoded_text[start:start + sequence_length] for start in window_starts]
data_Y = [encoded_text[start + sequence_length] for start in window_starts]

n_patterns = len(data_X)
print(f"Total patterns: {n_patterns}")

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the inputs: X[i, t, c] is True exactly when pattern i holds
# character id c at timestep t. The explicit reshape keeps the (n_patterns,
# sequence_length) layout even when there are no patterns at all.
encoded_X = np.asarray(data_X, dtype=np.int64).reshape(n_patterns, sequence_length)
X = np.zeros((n_patterns, sequence_length, vocabulary_size), dtype=bool)

# Broadcast (pattern, timestep) index grids against the id matrix so that all
# cells are set in a single vectorised assignment.
pattern_idx = np.arange(n_patterns)[:, None]
step_idx = np.arange(sequence_length)[None, :]
X[pattern_idx, step_idx, encoded_X] = True

# One-hot encode the output (Y)
y = to_categorical(data_Y, num_classes=vocabulary_size)

print(f"Shape of X (input sequences): {X.shape}")
print(f"Shape of y (target characters): {y.shape}")

In [None]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Re-derive the model dimensions from the encoded tensors themselves:
# axis 1 of X is the number of timesteps, axis 1 of y is the number of classes.
sequence_length, vocabulary_size = X.shape[1], y.shape[1]

In [None]:
# Stacked LSTM next-character classifier, assembled layer by layer.
model = Sequential()

# First recurrent layer returns its full output sequence for the next LSTM.
model.add(LSTM(256, input_shape=(sequence_length, vocabulary_size), return_sequences=True))
model.add(Dropout(0.2))

# Second recurrent layer returns only its final output.
model.add(LSTM(512))
model.add(Dropout(0.2))

# Softmax over the vocabulary: one probability per candidate next character.
model.add(Dense(vocabulary_size, activation='softmax'))

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy; accuracy is
# reported alongside the loss during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Checkpoint file name template; Keras fills in the epoch number and that
# epoch's training loss.
filepath = "Basic-Text-Gen-{epoch:02d}-{loss:.4f}.keras"

# Judge "best" on the held-out split created by `validation_split` below.
# Training loss keeps falling as the model memorises the corpus, so it is not
# a useful criterion for picking a checkpoint.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Stop once validation loss has not improved for 3 consecutive epochs and
# leave the model holding the weights of its best epoch.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True, verbose=1)
callbacks_list = [checkpoint, early_stopping]

# Keep the History object so the loss/accuracy curves can be inspected later.
history = model.fit(X, y, epochs=15, validation_split=0.2, batch_size=128, callbacks=callbacks_list)

In [None]:
# Seed fed to the model as the first input window; it must be exactly
# `sequence_length` characters long and use only characters the model knows.
custom_seed_text = "the quick brown fox jumps over the lazy dog and wishes for a sunny day"

# Check the length before encoding so a wrong-sized seed gets this message
# rather than an unrelated lookup error.
if len(custom_seed_text) != sequence_length:
    raise ValueError(f"Custom seed text length ({len(custom_seed_text)}) does not match sequence_length ({sequence_length}).")

# Characters missing from the training vocabulary have no integer id; report
# them all at once instead of failing with a bare KeyError on the first one.
unknown_chars = sorted(set(custom_seed_text) - set(char_to_int))
if unknown_chars:
    raise ValueError(f"Custom seed text contains characters not in the training vocabulary: {unknown_chars}")

pattern = [char_to_int[char] for char in custom_seed_text]

seed_text = custom_seed_text # Keep for printing

In [None]:
import sys

def sample(preds, temperature=1.0):
    """Draw one class index from ``preds`` after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from the result. Values below
    1 sharpen the distribution, values above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative class probabilities.
        temperature: Non-negative scaling factor. ``0`` selects the most
            probable class deterministically.

    Returns:
        The sampled class index as a Python ``int``.

    Raises:
        ValueError: If ``temperature`` is negative, or if ``preds`` contains
            no positive finite probability.
    """
    preds = np.asarray(preds).astype('float64')

    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")
    if temperature == 0:
        # Limit of the scaling as temperature -> 0: always pick the mode.
        return int(np.argmax(preds))

    # log(0) = -inf is intended here: a zero-probability class stays at zero.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError("preds must contain at least one positive, finite probability")

    # Subtracting the maximum keeps exp() from underflowing every entry to 0
    # (which would yield NaN probabilities) when the temperature is tiny.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return int(np.argmax(probas))

# Echo the seed first so the streamed output reads as one continuous text.
sys.stdout.write(seed_text)

num_characters_to_generate = 200

# Work on a copy of the encoded seed: the global `pattern` stays untouched, so
# re-running this cell always starts generation from the seed again.
window = list(pattern)
generated_chars = []
timesteps = np.arange(sequence_length)

for _ in range(num_characters_to_generate):
    # One-hot encode the current window as a batch holding a single sequence.
    x_pred = np.zeros((1, sequence_length, vocabulary_size), dtype=bool)
    x_pred[0, timesteps, window] = True

    preds = model.predict(x_pred, verbose=0)[0]

    next_char_index = int(sample(preds, temperature=0.7))
    next_char = int_to_char[next_char_index]
    generated_chars.append(next_char)

    # Slide the window: drop the oldest character id, append the new one.
    window = window[1:] + [next_char_index]

    # Stream each character as soon as it is produced.
    sys.stdout.write(next_char)
    sys.stdout.flush()

# Seed plus everything the model produced, kept for later use.
generated_text = seed_text + ''.join(generated_chars)