### Setup

In [None]:
# Import libraries

import contextlib
import gc
import io
import numpy as np
import os
import random
import re
import string
import time

import tensorflow as tf
from tensorflow.keras import Sequential # type: ignore
from tensorflow.keras.callbacks import EarlyStopping # type: ignore
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, StringLookup, TextVectorization # type: ignore

# NOTE: testing
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2025-03-20 01:52:39.007305: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-20 01:52:39.139210: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742457159.189067  459171 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742457159.204269  459171 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-20 01:52:39.329560: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Run configuration shared by the cells below.

# True: build the vocabulary from the corpus and start at epoch 0.
# False: reload vocabulary.txt and previously saved weights.
RESTART = True
# Epoch index that training starts from when RESTART is False.
EPOCH_TO_PICKUP = 0
# Prefix prepended to vocabulary.txt, tree.txt and the resume checkpoint path.
PATH = ''
# Passed as max_tokens to TextVectorization and as vocab_size to the model.
VOCAB_SIZE = 8192
# Tokens per training window in teacher-forced mode; the curriculum builder's
# default maximum is twice this value.
SEQUENCE_LENGTH = 128
# Number of sequences per training batch (see setup_dataset).
BATCH_SIZE = 64
# Shuffle buffer size (see setup_dataset).
BUFFER_SIZE = 10000
# True selects text_to_dataset_teacher_forced, False selects
# text_to_dataset_curriculum_learning.
TEACHER_FORCED = False


### Text Processing

In [3]:
def preprocess_text(text):
    """Turn raw text into a space-separated token stream for TextVectorization.

    Steps, in order:
      1. Curly double quotes are normalised to a straight double quote.
      2. A capital letter that starts a word becomes ``^`` + lowercase letter;
         every other capital is simply lower-cased.
      3. Each whitespace character is replaced by a placeholder word
         (``zztabzz``, ``zznewlinezz``, or ``zzspacezz`` for anything else).
      4. Every character in ``string.punctuation`` is padded with spaces so it
         becomes its own token. ``^`` is itself punctuation, so the capital
         marker ends up as a separate token.

    Parameters:
        text (str): The raw text.

    Returns:
        str: The encoded text; ``postprocess_text`` reverses the encoding once
        the tokens are joined back together.
    """

    # text = text.replace("Project Gutenberg", "")
    # text = text.replace("Gutenberg", "")

    # Remove carriage returns
    # text = text.replace("\r", "")

    # fix quotes
    text = text.replace("“", "\"")
    text = text.replace("”", "\"")

    # Replace any capital letter at the start of a word with ^ followed by the lowercase letter
    text = re.sub(r"(?<![a-zA-Z])([A-Z])", lambda match: f"^{match.group(0).lower()}", text)

    # Replace all other capital letters with lowercase
    text = re.sub(r"[A-Z]", lambda match: match.group(0).lower(), text)

    # Remove duplicate whitespace
    # text = re.sub(r"\s+", " ", text)
    # text = re.sub(r"\n+", "\n", text)
    # text = re.sub(r"\t+", "\t", text)

    # Replace whitespace characters with special words.
    # This must happen in ONE pass: substituting tabs and newlines first and
    # then running a separate "\s" substitution would also convert the padding
    # spaces around the freshly inserted placeholders, so every tab/newline
    # would come back surrounded by extra spaces after decoding.
    whitespace_words = {"\t": " zztabzz ", "\n": " zznewlinezz "}
    text = re.sub(
        r"\s",
        lambda match: whitespace_words.get(match.group(0), " zzspacezz "),
        text,
    )

    # Split before and after punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, f" {punctuation} ")

    return text

def postprocess_text(text):
    """Decode generated text back into readable form.

    Placeholder words are turned back into the whitespace characters they stand
    for, ``^x`` markers are turned back into capital letters, and any caret
    left over afterwards is removed.

    Parameters:
        text (str): Text containing placeholder words and ``^`` markers.

    Returns:
        str: The decoded text.
    """
    # Swap each placeholder word for its whitespace character
    placeholder_pairs = (
        ("zztabzz", "\t"),
        ("zznewlinezz", "\n"),
        ("zzspacezz", " "),
    )
    for placeholder, whitespace in placeholder_pairs:
        text = text.replace(placeholder, whitespace)

    # A caret directly before a lowercase letter marks a capital letter
    text = re.sub(r"\^([a-z])", lambda marker: marker.group(1).upper(), text)

    # Carets that did not precede a lowercase letter are dropped
    return text.replace("^", "")

def getMyText(filename='all_talks.txt'):
    """Load a text file from the ``saved_files`` directory and preprocess it.

    The directory is created when it does not exist yet. Problems are reported
    by printing a message, never by raising.

    Parameters:
        filename (str): Path of the file relative to ``saved_files``.

    Returns:
        str | None: The output of ``preprocess_text`` for the file's UTF-8
        decoded contents, or None when the file is missing or any error occurs.
    """
    local_dir = 'saved_files'
    local_path = os.path.join(local_dir, filename)

    try:
        # Make sure the directory is there before looking for the file
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)

        # Nothing to read: report it and give up
        if not os.path.exists(local_path):
            print(f"File '{filename}' not found.")
            return None

        print(f"File '{filename}' found locally. Using it.")

        # Read raw bytes and decode explicitly as UTF-8
        with open(local_path, 'rb') as handle:
            raw_bytes = handle.read()

        return preprocess_text(raw_bytes.decode(encoding='utf-8'))

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
# Build the vocabulary and the TextVectorization layer.
# RESTART=True : learn the vocabulary from the corpus and save it to disk.
# RESTART=False: reload the vocabulary that an earlier run saved.
if RESTART:
  vocab_text = getMyText()
  if vocab_text is None:
    # getMyText reports failures by returning None; stop here with a clear
    # message instead of failing later inside adapt().
    raise RuntimeError("Could not load the training text; cannot build the vocabulary.")

  ### Make vocabulary (Adapted from TensorFlow word embedding tutorial)
  # Use the text vectorization layer to split on whitespace and map tokens to
  # integers. The text has already been through preprocess_text, so the layer
  # itself only lower-cases.
  vectorize_layer = TextVectorization(
      standardize='lower',
      split='whitespace',
      max_tokens=VOCAB_SIZE,
      output_mode='int',
      )
  # Call adapt on the text (no labels) to build the vocabulary.
  vectorize_layer.adapt([vocab_text])
  vocabulary = vectorize_layer.get_vocabulary()

  ### Save Vocabulary
  # One token per line. The encoding is explicit because tokens can contain
  # non-ASCII characters and the platform default encoding is not portable.
  with open(PATH + "vocabulary.txt", "w", encoding="utf-8") as file:
    for word in vocabulary:
      file.write(word + "\n")
else:
  ### Load Saved Vocabulary
  with open(PATH + "vocabulary.txt", "r", encoding="utf-8") as file:
    vocabulary = [word.strip() for word in file.readlines()]

  vectorize_layer = TextVectorization(
      vocabulary=vocabulary,
      standardize='lower',
      split='whitespace',
      max_tokens=VOCAB_SIZE,
      output_mode='int',
      )
  
# NOTE: testing
# print(vocabulary[:20])
# print(vocabulary[-20:])

File 'all_talks.txt' found locally. Using it.


I0000 00:00:1742457189.913623  459171 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13499 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4080 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9


In [5]:
# Turn one window of token ids into an (input, target) training pair
def split_input_target(sequence):
    """Return ``(sequence without its last item, sequence without its first item)``.

    The target is the input shifted one position to the left, so position i of
    the target is the item that follows position i of the input.
    """
    return sequence[:-1], sequence[1:]

# Build the fixed-window training dataset
def text_to_dataset_teacher_forced(text, *args, **kwargs):
  """Convert preprocessed text into a dataset of (input, target) id windows.

  The text is vectorized with the global ``vectorize_layer``, cut into
  consecutive windows of ``SEQUENCE_LENGTH + 1`` ids (an incomplete final
  window is dropped), and each window is split by ``split_input_target``.

  Extra positional and keyword arguments are accepted and ignored so that this
  function can be called the same way as the curriculum-learning builder.
  """
  token_ids = vectorize_layer(text)

  # One extra id per window so input and target both have SEQUENCE_LENGTH ids
  windows = tf.data.Dataset.from_tensor_slices(token_ids).batch(
      SEQUENCE_LENGTH+1, drop_remainder=True)

  # Map every window to its input->target pair
  return windows.map(split_input_target)

def text_to_dataset_curriculum_learning(text, initial_seq_length=24, max_seq_length=SEQUENCE_LENGTH*2, growth_factor=1.05, current_epoch=0):
    """
    Build an (input, target) dataset whose window length grows with the epoch.

    The window length is ``initial_seq_length * growth_factor ** current_epoch``
    truncated to an integer and capped at ``max_seq_length``. The chosen length
    is printed each time the function is called.

    Parameters:
        text (str): The preprocessed input text.
        initial_seq_length (int): Window length used at epoch 0.
        max_seq_length (int): Upper bound on the window length.
        growth_factor (float): Multiplicative growth of the length per epoch.
        current_epoch (int): Zero-based index of the current training epoch.

    Returns:
        tf.data.Dataset: Unbatched (input_ids, target_ids) pairs.
    """
    # Grow geometrically with the epoch, then clamp to the maximum
    grown_length = int(initial_seq_length * (growth_factor ** current_epoch))
    sequence_length = min(max_seq_length, grown_length)

    print(f"Using sequence length: {sequence_length} for epoch {current_epoch}")

    # Text -> token ids via the global vectorization layer
    token_ids = vectorize_layer(text)

    # Consecutive windows of sequence_length + 1 ids; a short final window is dropped
    windows = tf.data.Dataset.from_tensor_slices(token_ids).batch(
        sequence_length + 1, drop_remainder=True)

    # Input is the window minus its last id, target is the window minus its first id
    return windows.map(split_input_target)


def text_from_ids(ids):
  """Look up each id in the global ``vocabulary``, concatenate the tokens
  without a separator, and decode the result with ``postprocess_text``."""
  tokens = [vocabulary[index] for index in ids]
  joined = ''.join(tokens)
  return postprocess_text(joined)

def setup_dataset(dataset):
  """Prepare a dataset of sequence pairs for training.

  Shuffles with a buffer of ``BUFFER_SIZE``, groups into batches of
  ``BATCH_SIZE`` (dropping an incomplete final batch), and prefetches with an
  automatically tuned buffer.
  """
  shuffled = dataset.shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE, drop_remainder=True)
  return batched.prefetch(tf.data.experimental.AUTOTUNE)

In [6]:
# Pick the dataset builder that matches the configured learning mode.
if TEACHER_FORCED:
    learning_mode_fn = text_to_dataset_teacher_forced
else:
    learning_mode_fn = text_to_dataset_curriculum_learning

# CAUTION: this binds a second name to the SAME list object, it does not copy.
# The two assignments below therefore also overwrite vocabulary[0] and
# vocabulary[1]. Code that reads `vocabulary` after this cell (text_from_ids,
# and everything that receives `vocabulary` as an argument) sees the
# overwritten entries.
# NOTE(review): confirm nothing depends on the overwritten entries before
# replacing this alias with a copy.
vocabulary_adjusted = vocabulary
vocabulary_adjusted[0] = '[UNK]'
vocabulary_adjusted[1] = ''
# Inverse lookup (token id -> token string), used below and when sampling.
words_from_ids = tf.keras.layers.StringLookup(vocabulary=vocabulary_adjusted, invert=True)

# On a fresh start, build the dataset from the vocabulary text and print the
# first (input, target) pair as a sanity check of the encoding.
if RESTART:
  vocab_ds = learning_mode_fn(vocab_text)
  for input_example, target_example in vocab_ds.take(1):
    print("Input: ")
    print(input_example)
    print(text_from_ids(input_example))
    print(words_from_ids(input_example))
    print("Target: ")
    print(target_example)
    print(text_from_ids(target_example))
    
  # Shuffle, batch and prefetch for training.
  vocab_ds = setup_dataset(vocab_ds)

Using sequence length: 24 for epoch 0
Input: 
tf.Tensor(
[   3  169    2   37    5    2   19    2   13    2 2909    5    2 3558
    2   21    2 1035    2  908    2 3379    2    9], shape=(24,), dtype=int64)
Let us, for a minute, examine our leadership report card to
tf.Tensor(
[b'^' b'let' b'zzspacezz' b'us' b',' b'zzspacezz' b'for' b'zzspacezz' b'a'
 b'zzspacezz' b'minute' b',' b'zzspacezz' b'examine' b'zzspacezz' b'our'
 b'zzspacezz' b'leadership' b'zzspacezz' b'report' b'zzspacezz' b'card'
 b'zzspacezz' b'to'], shape=(24,), dtype=string)
Target: 
tf.Tensor(
[ 169    2   37    5    2   19    2   13    2 2909    5    2 3558    2
   21    2 1035    2  908    2 3379    2    9    2], shape=(24,), dtype=int64)
let us, for a minute, examine our leadership report card to 


2025-03-20 01:53:19.109745: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


### Model Building


In [7]:
# Create our custom model. Given a sequence of token ids, this
# model's job is to predict which token should come next.
class ConferenceTextModel(tf.keras.Model):
  """Next-token language model.

  Architecture: embedding -> three stacked LSTM layers -> two ReLU Dense
  layers -> Dense layer producing one logit per vocabulary entry.

  Parameters:
      vocab_size (int): Number of token ids (embedding rows and output logits).
      embedding_dim (int): Width of the token embeddings. The two hidden Dense
          layers are sized ``embedding_dim*64`` and ``embedding_dim*16``.
      rnn_units (int): Number of units in each of the three LSTM layers.
  """

  # This is our class constructor method, it will be executed when
  # we first create an instance of the class
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__()

    # 1. An embedding layer that handles the encoding of our vocabulary into
    #    a vector of values suitable for a neural network
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # 2. Three stacked LSTM layers that handle the "memory" aspects of our RNN.
    #    Each returns the full output sequence plus its final hidden and cell
    #    state so the state can be carried between calls while sampling.
    #    For a GRU-vs-LSTM discussion see:
    #    https://datascience.stackexchange.com/questions/14581/when-to-use-gru-over-lstm
    self.lstm1 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
    self.lstm2 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
    self.lstm3 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)

    # 3. Two fully connected ReLU layers between the recurrent stack and the output
    self.hidden1 = tf.keras.layers.Dense(embedding_dim*64, activation='relu')
    self.hidden2 = tf.keras.layers.Dense(embedding_dim*16, activation='relu')

    # 4. Our output layer that will give us one logit for each
    #    token in our vocabulary.
    self.dense = tf.keras.layers.Dense(vocab_size)

    # Not read anywhere in this class; kept so the instance keeps the same
    # attributes as before.
    self.initial_states = None

  def get_initial_states(self, batch_size):
    """Return zero-filled initial states for the three LSTM layers.

    Returns:
        list: ``[[h1, c1], [h2, c2], [h3, c3]]`` where every tensor has shape
        ``[batch_size, units]`` for the corresponding layer.
    """
    return [
        [tf.zeros([batch_size, layer.units]), tf.zeros([batch_size, layer.units])]
        for layer in (self.lstm1, self.lstm2, self.lstm3)
    ]

  # The forward pass. Here we manually feed information from one layer of our
  # network to the next.
  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Parameters:
        inputs: Token ids; the first dimension is used as the batch size.
        states: Optional ``[[h, c], [h, c], [h, c]]`` initial states, one pair
            per LSTM layer. Zero states are used when None.
        return_state (bool): Also return the final LSTM states.
        training (bool): Forwarded to every layer.

    Returns:
        The logits, or ``(logits, states_out)`` when ``return_state`` is True,
        with ``states_out`` in the same layout as ``states``.
    """
    x = self.embedding(inputs, training=training)
    batch_size = tf.shape(inputs)[0]

    # Start from zero states when the caller did not supply any
    if states is None:
        states = self.get_initial_states(batch_size)

    # Each LSTM consumes the previous layer's output sequence and its own state
    x, state_h_1, state_c_1 = self.lstm1(x, initial_state=states[0], training=training)
    states_out_1 = [state_h_1, state_c_1]

    x, state_h_2, state_c_2 = self.lstm2(x, initial_state=states[1], training=training)
    states_out_2 = [state_h_2, state_c_2]

    x, state_h_3, state_c_3 = self.lstm3(x, initial_state=states[2], training=training)
    states_out_3 = [state_h_3, state_c_3]

    states_out = [states_out_1, states_out_2, states_out_3]

    # Hidden Dense layers, then the projection to vocabulary logits
    x = self.hidden1(x, training=training)
    x = self.hidden2(x, training=training)
    x = self.dense(x, training=training)

    return (x, states_out) if return_state else x

In [None]:
# Model hyperparameters
embedding_dim = 128
rnn_units = 512

# Training dataset: on a fresh start reuse the one already built from the
# vocabulary text (and drop the names that are no longer needed); otherwise
# build it from the corpus on disk.
if not RESTART:
  dataset = setup_dataset(learning_mode_fn(getMyText()))
else:
  dataset = vocab_ds
  del vocab_text, vocab_ds

model = ConferenceTextModel(VOCAB_SIZE, embedding_dim, rnn_units)

# NOTE: testing
# Verify the output of our model is correct by running one sample through
# This will also compile the model for us. This step will take a bit.
# for input_example_batch, target_example_batch in dataset.take(1):
#     example_batch_predictions = model(input_example_batch)
#     print(example_batch_predictions.shape, "# (BATCH_SIZE, sequence_length, VOCAB_SIZE)")

In [None]:
# NOTE: testing
# model.summary()

In [9]:
# Here's the code we'll use to sample for us. It has some extra steps to apply
# the temperature to the distribution, and to make sure we don't get empty
# or unknown tokens in our text. Most importantly, it will keep track of our
# model state for us.

class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one token at a time.

  Parameters:
      model: The trained model; called with ``inputs``, ``states`` and
          ``return_state=True``.
      vectorize_layer: Layer that converts input strings to token ids.
      vocabulary (list[str]): Token strings, indexed by token id.
      temperature (float): Logits are divided by this before sampling.
  """

  def __init__(self, model, vectorize_layer, vocabulary, temperature=1):
    super().__init__()
    self.temperature=temperature
    self.model = model
    self.vectorize_layer = vectorize_layer
    self.vocabulary = vocabulary

    # Create a mask to prevent "" or "[UNK]" from being generated: -inf at the
    # position of each blocked token, 0 everywhere else. The positions are read
    # straight from the list, so the mask does not depend on the order in which
    # the two special tokens appear in `vocabulary`.
    blocked_tokens = ('', '[UNK]')
    mask_values = [
        -float('inf') if token in blocked_tokens else 0.0
        for token in vocabulary
    ]
    # One entry per vocabulary item, so it lines up with the logits.
    self.prediction_mask = tf.constant(mask_values, dtype=tf.float32)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next token for each input string.

    Parameters:
        inputs: Batch of strings to continue.
        states: Model states returned by the previous step, or None.

    Returns:
        (next_tokens, states): The sampled token strings, looked up through the
        notebook-level ``words_from_ids`` layer, and the updated model states.
    """
    # Convert strings to token IDs.
    input_ids = self.vectorize_layer(inputs)

    # Run the model.
    # predicted_logits.shape is [batch, token, next_token_logits]
    predicted_logits, states =  self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Return the tokens and model state.
    return words_from_ids(predicted_ids), states


In [10]:
def produce_sample(model, vectorize_layer, vocabulary, temp, epoch, prompt, length=200, do_print=True):
  """Generate ``length`` tokens that continue ``prompt``.

  Parameters:
      model: The trained model to sample from.
      vectorize_layer: Layer that converts strings to token ids.
      vocabulary (list[str]): Token strings, indexed by token id.
      temp (float): Sampling temperature.
      epoch: Epoch label written to the log file (only used when printing).
      prompt (str): Raw (not preprocessed) seed text.
      length (int): Number of tokens to generate.
      do_print (bool): When True, print the sample, append it to
          ``PATH + 'tree.txt'`` and return None. When False, return the sample.

  Returns:
      str | None: The decoded sample when ``do_print`` is False, else None.
  """
  one_step_model = OneStep(model, vectorize_layer, vocabulary, temp)
  states = None
  # The model sees the encoded prompt; the readable prompt starts the result
  next_char = tf.constant([preprocess_text(prompt)])
  result = [tf.constant([prompt])]

  for _ in range(length):
    # Feed the newest token back in together with the carried-over state
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

  # Join the pieces and decode once; both output paths use the same text
  sample_text = postprocess_text(tf.strings.join(result)[0].numpy().decode('utf-8'))
  if not do_print:
    return sample_text

  print(sample_text)
  # A single handle that is closed on exit, instead of one never-closed
  # handle per print call
  with open(PATH + 'tree.txt', 'a') as log_file:
    print('Epoch: ' + str(epoch) + '\n', file=log_file)
    print('Temp: ' + str(temp) + '\n', file=log_file)
    print(sample_text, file=log_file)
    print('\n\n', file=log_file)
  return None

### Model Training

In [11]:
# When resuming, load previously saved weights into the freshly built model.
# NOTE(review): the checkpoint name is hard-coded; confirm it is the run that
# EPOCH_TO_PICKUP refers to.
if not RESTART:
  model.load_weights(PATH + "models/conf-test2-400ep.keras")

In [None]:
# Early-stopping settings: watch the training loss, require an improvement of
# at least 0.002, allow 10 epochs without one, keep the best weights.
early_stop = EarlyStopping(monitor='loss', min_delta=0.002, patience=10, restore_best_weights=True)

# Optimizer and loss. The model outputs raw logits, hence from_logits=True.
LEARNING_RATE = 0.002
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=opt, loss=loss)

### NOTE:
# Look into learning rate more (& decay?)
# Set learning rate based on if transfer learning or not

# Epoch budget and the test numbers that go into saved file names
base_epochs = 200
transfer_epochs = 0
num_epochs_total = base_epochs + transfer_epochs
main_test_num = 4
transfer_test_num = 2

# Short tag for the learning mode, used in file names
learn_type = 'teacher' if TEACHER_FORCED else 'curric'

# A fresh start begins at epoch 0; a resumed run begins at the configured epoch
start_epoch = 0 if RESTART else EPOCH_TO_PICKUP

In [15]:
def run(name='all_talks', max_retries=3):
  """Train the global ``model`` on ``saved_files/{name}.txt`` and save it.

  For every epoch from ``start_epoch`` to ``num_epochs_total`` the text is
  reloaded, a dataset is built with ``learning_mode_fn``, the learning rate is
  set to ``LEARNING_RATE * 0.99 ** epoch``, the model is fitted for one epoch,
  and samples are generated at several temperatures.

  Early stopping is tracked here, across the outer epochs, using the
  ``monitor``, ``min_delta``, ``patience`` and ``restore_best_weights``
  settings of the global ``early_stop``. Passing the callback to a one-epoch
  ``fit`` call cannot work, because its counters are reset at the start of
  every ``fit``.

  Parameters:
      name (str): File name without ``.txt``, relative to ``saved_files``.
      max_retries (int): How many times a failed epoch is retried before the
          error is re-raised.

  Raises:
      FileNotFoundError: When the training text cannot be loaded.
      Exception: The last error of an epoch that failed ``max_retries + 1`` times.
  """
  min_delta = abs(early_stop.min_delta)
  patience = early_stop.patience
  best_loss = float('inf')
  best_weights = None
  epochs_without_improvement = 0

  stop_training = False
  for e in range(start_epoch, num_epochs_total):
    if stop_training:
      print(f"Early stopping triggered at epoch {e}. Exiting training.")
      break

    failed_attempts = 0
    while True:
      print(f'epoch: {e + 1}/{num_epochs_total}')
      new_text = getMyText(f'{name}.txt')
      if new_text is None:
        # A missing or unreadable file will not fix itself; do not retry
        raise FileNotFoundError(f"Could not load training text '{name}.txt'.")

      dataset = None
      try:
        dataset = learning_mode_fn(new_text, current_epoch=e)
        del new_text
        dataset = setup_dataset(dataset)
        model.optimizer.learning_rate.assign(LEARNING_RATE*(0.99**e))
        history = model.fit(dataset, epochs=1, verbose=1)

        # Early-stopping bookkeeping for this epoch
        epoch_loss = history.history[early_stop.monitor][-1]
        if epoch_loss < best_loss - min_delta:
          best_loss = epoch_loss
          epochs_without_improvement = 0
          if early_stop.restore_best_weights:
            best_weights = model.get_weights()
        else:
          epochs_without_improvement += 1
          if epochs_without_improvement >= patience:
            stop_training = True

        print("finished training...")
        del dataset
        for temp in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
          produce_sample(model, vectorize_layer, vocabulary, temp, e, 'As a disciple of Jesus Christ, I testify')
        print("samples produced...")
        gc.collect()
        print("garbage collected...")
        tf.keras.backend.clear_session()
        print("session cleared (to save memory)...")
        break
      except Exception as error:
        # Only ordinary errors are retried; KeyboardInterrupt is not caught,
        # so the run can still be interrupted.
        failed_attempts += 1
        dataset = None
        gc.collect()
        tf.keras.backend.clear_session()
        if failed_attempts > max_retries:
          raise
        print(f"epoch failed: {type(error).__name__}: {error}")
        print("retrying epoch: " , e)

  if stop_training:
    # Go back to the best weights seen, when that option is enabled
    if best_weights is not None:
      model.set_weights(best_weights)
      print("Restored best weights.")
  else:
    print(f'Final epoch ({num_epochs_total}) reached.')

  filename = f'models/conf-t{main_test_num}-{base_epochs}e_{learn_type}'
  if base_epochs < num_epochs_total:
    # Transfer run: add the last part of the name and the transfer settings
    filename += f'_{name.split("-")[-1]}-t{transfer_test_num}-{transfer_epochs}e'
  filename += '.keras'
  # Make sure the target directory exists before saving
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  model.save(filename)
  print(f"Model saved as {filename}")

In [16]:
# Train on the default corpus ('all_talks') and save the resulting model.
run() # just once, for training from scratch

epoch: 1/200
File 'all_talks.txt' found locally. Using it.
Using sequence length: 24 for epoch 0


I0000 00:00:1742457304.897278  460334 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m4966/4966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 70ms/step - loss: 2.7488
finished training...
As a disciple of Jesus Christ, I testify of the Savior Jesus Christ, we must be saved, we will be a loving Father in His Son. Jesus Christ, is the Savior of Israel and His Atonement. The Lord has said that the Savior is the Savior and His Atonement, and His Atonement is not surprising that we are not accountable to God. 
 I know that we are not careful to be blessed with the Lord’s Church and to the Church. The Lord said, "I know that the 
As a disciple of Jesus Christ, I testify that His Atonement is not surprising that the gospel of Jesus Christ is the Lord and His Son, Jesus Christ, who holds the gospel of Jesus Christ and be added to the Savior’s Atonement. We are grateful for the priesthood and auxiliary leaders and of the ward and ward presidencies. The Lord promised that "the Lord hath commanded in heaven to his people, and the Lord shall be accomplished in heaven.

In [17]:
# Speaker slugs. Each one names a text file under saved_files/speakers/, and
# its last hyphen-separated part is used in saved file names.
speakers = ['david-a-bednar', 'dieter-f-uchtdorf', 'jeffrey-r-holland', 'russell-m-nelson', 'thomas-s-monson', 'patrick-kearon']

In [None]:
# Train on each speaker's text in turn. The same global model is used for every
# call, so training continues from wherever the previous call left off.
for speaker in speakers:
  name = f'speakers/{speaker}'
  run(name)

In [None]:
# Load saved weights and print one sample per temperature for a fixed prompt.
# NOTE(review): run() saves to 'models/conf-t{main_test_num}-{base_epochs}e_{learn_type}.keras';
# this literal has no learn_type suffix — confirm the file exists.
model.load_weights("models/conf-t4-200e.keras")
for temp in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    produce_sample(model, vectorize_layer, vocabulary, temp, num_epochs_total - 1, 'What does the fox say?')

In [18]:
# Prompt used for the final saved samples.
final_seed_text = "The world seemed like such a peaceful place until the magic tree was discovered in London."
# Directory prefixes (with trailing slash) for saved models and generated responses.
model_path = 'models/'
response_path = 'saved_files/responses/'

# Base checkpoint name; same pattern run() uses before any speaker suffix.
base_filename = f'conf-t{main_test_num}-{base_epochs}e_{learn_type}'
# Suffix of per-speaker checkpoints; it follows the speaker's last name.
file_end = f'-t{transfer_test_num}-{transfer_epochs}e'

# File extensions for model files and response files.
keras_ext = '.keras'
response_ext = '.txt'

In [25]:
def save_responses(model_file, response_file, seed_text):
    """Load weights from ``model_file`` and append samples to ``response_file``.

    One 1000-token sample is generated per temperature from 0.4 to 0.9 and
    appended to the file right away, each preceded by a ``Temp:`` line.

    Parameters:
        model_file (str): Path of the saved weights to load into the global model.
        response_file (str): Full path of the text file to append to.
        seed_text (str): Prompt that every sample continues.
    """
    model.load_weights(model_file)

    # Make sure the output directory exists before the first append
    response_dir = os.path.dirname(response_file)
    if response_dir:
        os.makedirs(response_dir, exist_ok=True)

    for temp in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        response = produce_sample(
            model,
            vectorize_layer,
            vocabulary,
            temp,
            num_epochs_total - 1,
            seed_text,
            length=1000,
            do_print=False)
        with open(response_file, 'a') as f:
            f.write(f"Temp: {temp}\n")
            f.write(response)
            f.write('\n\n')

    # response_file is already a full path; prefixing response_path again
    # printed the directory twice.
    print(f"Response saved as {response_file}")

In [None]:
# For every speaker, load that speaker's checkpoint and save samples for the
# final seed text.
for speaker in speakers:
    # Last hyphen-separated part of the slug, e.g. 'bednar' for 'david-a-bednar'
    last_name = '_' + speaker.split('-')[-1]
    keras_filename = model_path + base_filename + last_name + file_end + keras_ext
    response_filename = response_path + base_filename + last_name + file_end + response_ext
    save_responses(keras_filename, response_filename, final_seed_text)

In [26]:
# Save samples from the base checkpoint (file name without a speaker suffix).
general_keras_filename = model_path + base_filename + keras_ext
general_response_filename = response_path + base_filename + response_ext
save_responses(general_keras_filename, general_response_filename, final_seed_text)

Response saved as saved_files/responses/saved_files/responses/conf-t4-200e_curric.txt
