In [1]:
import tensorflow as tf
import numpy as np
import os
import time

# 1. Download the dataset
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# 2. Read the data
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print(f'Length of text: {len(text)} characters')

# 3. Take a look at the first 250 characters
print("\n--- FIRST 250 CHARACTERS ---")
print(text[:250])
print("----------------------------")

# 4. Get the unique characters (The Vocabulary)
vocab = sorted(set(text))
print(f'\n{len(vocab)} unique characters: {vocab}')

Length of text: 1115394 characters

--- FIRST 250 CHARACTERS ---
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

----------------------------

65 unique characters: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [2]:
# Create a mapping from unique characters to indices
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

# Create a mapping from indices back to characters (so we can read the output)
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

# Helper function to turn a list of numbers back into a text string
def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

# Test it out!
test_ids = ids_from_chars(["H", "e", "l", "l", "o"])
print("IDs:", test_ids.numpy())
print("Back to Text:", text_from_ids(test_ids).numpy())

IDs: [21 44 51 51 54]
Back to Text: b'Hello'


In [3]:
# Convert the entire text into IDs
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

# Create a Dataset object
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# The length of the sequence we will train on
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

# Turn the individual characters into sequences of 101 characters
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

# Helper function to split "Hello" into input: "Hell" and target: "ello"
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

# Map this function to the dataset
dataset = sequences.map(split_input_target)

# Shuffle and batch the data for training (Optimization stuff)
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

print("Data is ready! Input shape:", dataset.element_spec)

Data is ready! Input shape: (TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))


In [4]:
# The size of our vocabulary
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension (how complex the vector for each char is)
embedding_dim = 256

# Number of RNN units (how big the "memory" is)
rnn_units = 1024

# --- THE CORRECTED MODEL CLASS ---
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__() # FIXED: Removed 'self' which causes bugs in TF
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # GRU with return_state=True always returns (output, state)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = self.embedding(inputs, training=training)

    # FIXED: We let the GRU handle the None state automatically.
    # It creates the zero-state for us internally.
    x, states = self.gru(x, initial_state=states, training=training)

    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

# Re-create the model instance with the clean class
model = MyModel(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=256,
    rnn_units=1024)

print("Model brain re-built successfully.")

Model brain re-built successfully.


In [5]:
import tensorflow as tf
import os

class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # We ask for the state, so we will ALWAYS get a list/tuple back
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = self.embedding(inputs, training=training)

    # --- SAFE EXECUTION BLOCK ---
    # We capture the full result object
    gru_result = self.gru(x, initial_state=states, training=training)

    # We explicitly access items by index.
    # item 0 is always the sequence. item 1 is always the state.
    # This ignores any 3rd or 4th item Keras 3 might throw in.
    x = gru_result[0]
    states = gru_result[1]
    # ----------------------------

    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

# Clean up old models
if 'model' in globals():
    del model

# Build the model fresh
vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim = 256
rnn_units = 1024

model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

# Test it immediately
print("Sanity Check: Running one batch...")
for input_ex, target_ex in dataset.take(1):
    output = model(input_ex)
    print("✅ SUCCESS! Output shape:", output.shape)

Sanity Check: Running one batch...
✅ SUCCESS! Output shape: (64, 100, 66)


In [8]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

EPOCHS = 20
print("Starting Training (GPU Mode)...")
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Starting Training (GPU Mode)...
Epoch 1/20


OperatorNotAllowedInGraphError: Exception encountered when calling GRU.call().

[1mIterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.[0m

Arguments received by GRU.call():
  • sequences=tf.Tensor(shape=(64, 100, 256), dtype=float32)
  • initial_state=None
  • mask=None
  • training=True

In [9]:
import os

# --- 1. DEFINING THE MODEL (Simple Version) ---
# We use a standard Sequential model. It works automatically.
# No custom __init__ or call() functions to debug.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(ids_from_chars.get_vocabulary()), 256),
    # We set return_sequences=True so it predicts a character for every character input
    tf.keras.layers.GRU(1024, return_sequences=True),
    tf.keras.layers.Dense(len(ids_from_chars.get_vocabulary()))
])

# --- 2. FIXING THE CHECKPOINT ERROR ---
# Keras 3 requires the file extension to be .weights.h5
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5") # <--- FIXED HERE

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# --- 3. COMPILE AND VERIFY ---
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

# Run a dummy input just to build the model shape in memory
for example_input, example_target in dataset.take(1):
    example_output = model(example_input)
    print("New Model Shape:", example_output.shape) # Should be (64, 100, 66)

print("Model built and ready for training.")

New Model Shape: (64, 100, 66)
Model built and ready for training.


In [14]:
EPOCHS = 50
print(f"Starting training on GPU...")
# The logic handles itself now. No manual loops.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
print("Training complete!")

Starting training on GPU...
Epoch 1/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 62ms/step - loss: 0.6518
Epoch 2/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 62ms/step - loss: 0.6173
Epoch 3/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 61ms/step - loss: 0.5806
Epoch 4/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 61ms/step - loss: 0.5550
Epoch 5/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 60ms/step - loss: 0.5347
Epoch 6/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 61ms/step - loss: 0.5121
Epoch 7/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 61ms/step - loss: 0.4957
Epoch 8/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 61ms/step - loss: 0.4838
Epoch 9/50
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 61ms/step - loss: 0.4747
Epoch 10/50
[1m172/172[0m [32m━━━━━━━━━━━━

In [11]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits = self.model(input_ids, training=False)

    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Apply the prediction mask: prevent "[UNK]" from being sampled.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

# Instantiate the one-step generator
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

print("Generator built successfully!")

Generator built successfully!


In [15]:
start = time.time()
states = None
next_char = tf.constant(['ROMEO:']) # <--- You can change the starting prompt here!
result = [next_char]

print("Generating text...")

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()

print("\n------------------------------------------------")
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print(f"\n(Run time: {end - start:.2f} seconds)")

Generating text...

------------------------------------------------
ROMEO:
EThisclland chet, veceacowhosthinowall t mocourt
VE ist he t th Mand ley athan!

I'BUSTher't's ullyonth? litoonsancke all Condgs wil tharots,
FRetour? fe:
S
Younis morthachier, by.
Ay pr GLO:
TClsthan me t
AMNatire ound wherr oun:
MNBUCAS:
I' han.
CA:
LONCHAThing hinoucorathos,
I ave,
Yowhoure aigucochou y che,

ULUSA thes thanghave fifon, taf metour har ind fo yocuryou bud hande!
Thome fears Vit r Pll ant cesuthomy t taters, nchomavea!
G ttod ce't.
NGHABRWimiord ais o hito fond ERDont ticure gh an ENCuk meare,
BRUSoolas ge, sthyorooome int

GHI p hitest bede his msockis oundonds ne amay, he t:
Beanearst me t teshind he w whyom g m ourest bu m, ou moranaditicimonthavim machyo mpe ther?
I ovaif hont wng RI

VE s, wacha ares thorew mangnde; INGouk d dst t ded shell u yourat th bu ove, s sthitil utho. t pe metha g's, t thetho, owiliotora g'se;
D yownd oreset,
IORENUCHe ithit istonchook.
Sthis.
So.
AMysamereake tel

In [16]:
class Generator(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # The crucial part: return_state=True lets us remember the past
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, training=False):
    x = self.embedding(inputs, training=training)

    # Keras 3 safe execution: explicitly grabbing output and state
    gru_result = self.gru(x, initial_state=states, training=training)
    x = gru_result[0]
    states = gru_result[1]

    x = self.dense(x, training=training)
    return x, states

In [17]:
# 1. Create the new body
vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim = 256
rnn_units = 1024
gen_model = Generator(vocab_size, embedding_dim, rnn_units)

# 2. Build it by passing a dummy input (needed to initialize shapes)
_ = gen_model(tf.zeros([1, 1]))

# 3. TRANSPLANT THE BRAIN
# We copy the weights from your trained 'model' to 'gen_model'
gen_model.set_weights(model.get_weights())

print("Brain transplant successful! Logic transferred.")

Brain transplant successful! Logic transferred.


In [18]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model # This will now be the gen_model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Mask to prevent [UNK] characters
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Turn text into numbers
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model, PASSING and RECEIVING the states (Memory!)
    predicted_logits, states = self.model(input_ids, states=states)

    # Focus only on the last character's prediction
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits / self.temperature
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the next character
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert back to text
    predicted_chars = self.chars_from_ids(predicted_ids)

    return predicted_chars, states

# Create the generator with the SMART model
one_step_model = OneStep(gen_model, chars_from_ids, ids_from_chars, temperature=0.7)
# Note: I lowered temperature to 0.7 to make it a bit more coherent

In [19]:
start = time.time()
states = None
next_char = tf.constant(['ROMEO:']) # You can start with any word
result = [next_char]

print("Generating Shakespeare...")

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)

Generating Shakespeare...


OperatorNotAllowedInGraphError: in user code:

    File "/tmp/ipython-input-3453425432.py", line 24, in generate_one_step  *
        predicted_logits, states = self.model(input_ids, states=states)
    File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/ipython-input-1576750373.py", line 15, in call
        gru_result = self.gru(x, initial_state=states, training=training)

    OperatorNotAllowedInGraphError: Exception encountered when calling GRU.call().
    
    [1mIterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.[0m
    
    Arguments received by GRU.call():
      • sequences=tf.Tensor(shape=(1, None, 256), dtype=float32)
      • initial_state=None
      • mask=None
      • training=False


In [20]:
import time

# --- A SIMPLE GENERATION FUNCTION ---
def generate_text_simple(model, start_string, num_generate=1000, temperature=1.0):
  # 1. Setup the starting text
  input_eval = [ids_from_chars([s]) for s in tf.strings.unicode_split(start_string, 'UTF-8')]
  input_eval = tf.convert_to_tensor(input_eval) # Convert to IDs
  input_eval = tf.expand_dims(input_eval, 0)    # Add batch dimension -> (1, length, 1)

  # Store result
  text_generated = []

  # 2. Reset the model's internal states implicitly by building a fresh logic
  print("Generating text...")
  start_time = time.time()

  for i in range(num_generate):
      # To avoid slowing down as the text gets huge, only use the last 100 chars
      # (This matches what we trained on)
      input_segment = input_eval[:, -100:, :]

      # 3. Get predictions
      predictions = model(input_segment)

      # 4. Look only at the LAST character prediction
      predictions = predictions[:, -1, :]
      predictions = predictions / temperature

      # 5. Randomly sample the next character ID
      predicted_id = tf.random.categorical(predictions, num_samples=1)

      # 6. Add it to our list and append to the input for the next loop
      input_eval = tf.concat([input_eval, predicted_id], axis=1)

      # Convert ID back to readable text char
      text_generated.append(chars_from_ids(predicted_id)[0, 0].numpy().decode('utf-8'))

  return (start_string + ''.join(text_generated))

# --- RUN IT! ---
print("Shakespeare AI is thinking...")
txt = generate_text_simple(model, start_string="ROMEO: ", num_generate=1000, temperature=0.7)

print("\n" + "="*80)
print(txt)
print("="*80)

Shakespeare AI is thinking...
Generating text...


InvalidArgumentError: {{function_node __wrapped__ConcatV2_N_2_device_/job:localhost/replica:0/task:0/device:GPU:0}} ConcatOp : Ranks of all input tensors should match: shape[0] = [1,7,1] vs. shape[1] = [1,1] [Op:ConcatV2] name: concat

In [21]:
import time

def generate_text_simple(model, start_string, num_generate=1000, temperature=1.0):
  # --- FIXING THE SHAPE ERROR HERE ---
  # 1. Convert string to characters
  input_chars = tf.strings.unicode_split(start_string, 'UTF-8')
  # 2. Convert to IDs (result is shape [Length])
  input_ids = ids_from_chars(input_chars)
  # 3. Add batch dimension (result is shape [1, Length]) - THIS IS THE CRITICAL FIX
  input_eval = tf.expand_dims(input_ids, 0)

  text_generated = []

  print("Generating text...")
  for i in range(num_generate):
      # Use only the last 100 characters as context
      input_segment = input_eval[:, -100:]

      # Predict
      predictions = model(input_segment)
      # Remove the batch dimension to get predictions for the last character
      predictions = predictions[:, -1, :] / temperature

      # Pick the next character ID
      predicted_id = tf.random.categorical(predictions, num_samples=1)

      # Concat: Now both are Rank 2 tensors [1, N] and [1, 1], so they glue perfectly.
      input_eval = tf.concat([input_eval, predicted_id], axis=1)

      # Convert back to readable text
      text_generated.append(chars_from_ids(predicted_id)[0, 0].numpy().decode('utf-8'))

  return (start_string + ''.join(text_generated))

# --- RUN IT ---
print("Shakespeare AI is thinking...")
txt = generate_text_simple(model, start_string="ROMEO: ", num_generate=1000, temperature=0.7)

print("\n" + "="*80)
print(txt)
print("="*80)

Shakespeare AI is thinking...
Generating text...

ROMEO: mine honest
grace I hair, if it be more than a great desired
Cool my honour, please you this contrady.

POMPEY:
Spir him, or lose the duke asfect your worships. My man
great eaten hate the mother of the king,
He seit his ears a quarrel upon that.

WARWICK:
This same ancernet of mine conceit in him
Than the ears--my brother's life,--
That you shall choose but loss of my mouth, to whose eaten back
And gasping bend that seek to fight.

ISABELLA:
This festirument as a name of death.
Thou fly it will not have it strange,
I'll go and excuse the while
Somewhat was your turn; and I am out--

JULIET:
Our spoiling loss might help in Marcius,
A happy villain, death, sir, herein about him to age,
And hope I throw my gage, by nothing
but some bond; or pardon thee this night.

DUCHESS OF YORK:
Now, for I must not, I'll to my bed;
But softer I see you wot or talk of Juliet,
Destrudgious sweet feasts that fellow or four affairs,
That clog'd the 