### Setup

In [1]:
# Import libraries

# import contextlib
import gc
# import io
# import numpy as np
import os
# import random
import re
import string
import time

os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

import tensorflow as tf
from tensorflow.keras import Sequential # type: ignore
from tensorflow.keras.callbacks import EarlyStopping # type: ignore
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, StringLookup, TextVectorization # type: ignore

# NOTE: testing
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2025-03-26 18:10:12.691039: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-26 18:10:12.701409: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743034212.713304   29607 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743034212.717057   29607 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-26 18:10:12.729238: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Run-mode switches.
RESTART = False    # True: rebuild the vocabulary from the corpus and start at epoch 0
TRANSFER = False   # True: load previously saved weights and fine-tune

# Fine-tuning uses a smaller batch and a 10x smaller learning rate.
BATCH_SIZE, LEARNING_RATE = (64, 0.00005) if TRANSFER else (128, 0.0005)

EPOCH_TO_PICKUP = 42      # epoch index training resumes from when RESTART is False
PATH = ''                 # prefix for the vocabulary, sample-log and weight files
VOCAB_SIZE = 8192         # max_tokens of the vectorizer and width of the model output
SEQUENCE_LENGTH = 128     # tokens per training sequence
BUFFER_SIZE = 10000       # shuffle buffer size
TEACHER_FORCED = False    # False selects the curriculum-learning dataset builder

### Text Processing

In [3]:
def preprocess_text(text):
    """Rewrite raw text as a stream of whitespace-separated tokens.

    The steps run in this order:
      1. curly double quotes become the plain ``"`` character;
      2. an ASCII capital that is not preceded by an ASCII letter becomes
         ``^`` followed by its lowercase form; remaining ASCII capitals are
         lowercased;
      3. tabs, newlines and then every remaining whitespace character are
         replaced by the marker words ``zztabzz`` / ``zznewlinezz`` /
         ``zzspacezz`` (each padded with spaces);
      4. every character in ``string.punctuation`` is padded with one space
         on each side.

    Parameters:
        text (str): The raw text.

    Returns:
        str: The tokenisable text.
    """
    # Step 1: normalise curly quotes.
    for curly_quote in ("“", "”"):
        text = text.replace(curly_quote, "\"")

    # Step 2: word-initial capitals keep a "^" marker, the rest are lowercased.
    text = re.sub(r"(?<![a-zA-Z])[A-Z]", lambda m: "^" + m.group().lower(), text)
    text = re.sub(r"[A-Z]", lambda m: m.group().lower(), text)

    # Step 3: whitespace markers. The generic \s pass runs last, so the padding
    # spaces placed around the tab/newline markers are themselves turned into
    # space markers.
    text = text.replace("\t", " zztabzz ")
    text = text.replace("\n", " zznewlinezz ")
    text = re.sub(r"\s", " zzspacezz ", text)

    # Step 4: isolate punctuation (this includes the "^" marker from step 2).
    padded_punctuation = {ord(mark): f" {mark} " for mark in string.punctuation}
    return text.translate(padded_punctuation)

def postprocess_text(text):
    """Turn marker words and ``^`` capital markers back into readable text.

    Parameters:
        text (str): Text containing ``zztabzz`` / ``zznewlinezz`` /
            ``zzspacezz`` markers and ``^``-prefixed letters.

    Returns:
        str: Text with the markers replaced by real whitespace, ``^x``
        replaced by ``X`` for lowercase ASCII ``x``, and any remaining ``^``
        removed.
    """
    whitespace_markers = (
        ("zztabzz", "\t"),
        ("zznewlinezz", "\n"),
        ("zzspacezz", " "),
    )
    for marker, whitespace in whitespace_markers:
        text = text.replace(marker, whitespace)

    # "^" directly followed by a lowercase letter marks a capital.
    text = re.sub(r"\^([a-z])", lambda m: m.group(1).upper(), text)

    # Carets that did not mark a capital are dropped.
    return text.replace("^", "")

def getMyText(filename='all_talks_2000.txt'):
    """Read a UTF-8 text file from ``saved_files/`` and return it preprocessed.

    Parameters:
        filename (str): File name (may include sub-directories) relative to
            the ``saved_files`` directory.

    Returns:
        str | None: The file contents passed through ``preprocess_text``, or
        ``None`` when the file does not exist or cannot be read/decoded.

    Only I/O and decoding failures are converted to ``None``; an error raised
    by ``preprocess_text`` itself propagates so that programming mistakes are
    not silently reported as "file problems".
    """
    local_dir = 'saved_files'
    local_path = os.path.join(local_dir, filename)

    try:
        # Make sure the data directory exists (no-op when it already does).
        os.makedirs(local_dir, exist_ok=True)

        if not os.path.exists(local_path):
            print(f"File '{filename}' not found.")
            return None
        print(f"File '{filename}' found locally. Using it.")

        # Read bytes and decode explicitly so line endings are left untouched.
        with open(local_path, 'rb') as file:
            text = file.read().decode(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as e:
        print(f"An error occurred: {e}")
        return None

    return preprocess_text(text)

In [4]:
if RESTART:
  # Build the vocabulary from the full corpus.
  vocab_text = getMyText()

  # The vectorization layer lowercases the (already preprocessed) text, splits
  # it on whitespace and maps each token to an integer id. At most VOCAB_SIZE
  # distinct tokens are kept.
  vectorize_layer = TextVectorization(
      standardize='lower',
      split='whitespace',
      max_tokens=VOCAB_SIZE,
      output_mode='int',
      )
  # adapt() scans the text and builds the vocabulary.
  vectorize_layer.adapt([vocab_text])
  vocabulary = vectorize_layer.get_vocabulary()

  # Save the vocabulary, one token per line. The encoding is explicit so the
  # file round-trips regardless of the platform's default encoding.
  with open(PATH + "vocabulary.txt", "w", encoding="utf-8") as file:
    file.writelines(word + "\n" for word in vocabulary)
else:
  # Load the saved vocabulary, one token per line.
  with open(PATH + "vocabulary.txt", "r", encoding="utf-8") as file:
    vocabulary = [word.strip() for word in file]

  # Same configuration as the RESTART branch, with the vocabulary supplied
  # directly instead of being learned through adapt().
  vectorize_layer = TextVectorization(
      vocabulary=vocabulary,
      standardize='lower',
      split='whitespace',
      max_tokens=VOCAB_SIZE,
      output_mode='int',
      )
  
# NOTE: testing
# print(vocabulary[:20])
# print(vocabulary[-20:])

I0000 00:00:1743034219.537008   29607 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1743034219.537194   29607 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13499 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4080 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9


In [5]:
# This function will generate our sequence pairs:
def split_input_target(sequence):
    """Return ``(input, target)`` where the target is the input shifted by one.

    The input is ``sequence`` without its last element and the target is
    ``sequence`` without its first element.
    """
    return sequence[:-1], sequence[1:]

# This function will create the dataset
def text_to_dataset_teacher_forced(text, *args, **kwargs):
    """Build a dataset of fixed-length (input, target) pairs from ``text``.

    The text is vectorized into token ids, cut into consecutive chunks of
    ``SEQUENCE_LENGTH + 1`` ids (a trailing partial chunk is dropped), and each
    chunk is split by ``split_input_target``.

    Extra positional and keyword arguments are accepted and ignored, so this
    function can be called with the same arguments as the curriculum-learning
    builder.
    """
    chunk_size = SEQUENCE_LENGTH + 1
    return (
        tf.data.Dataset.from_tensor_slices(vectorize_layer(text))
        .batch(chunk_size, drop_remainder=True)
        .map(split_input_target)
    )

def text_to_dataset_curriculum_learning(text, initial_seq_length=24, max_seq_length=SEQUENCE_LENGTH*1.5, growth_factor=1.025, current_epoch=0):
    """
    Build (input, target) pairs whose length grows with the training epoch.

    The chunk length is ``initial_seq_length * growth_factor ** current_epoch``
    truncated to an int and capped at ``int(max_seq_length)``. Each chunk is
    split by ``split_input_target``, so input and target are one token shorter
    than the chunk.

    Parameters:
        text (str): The input text data.
        initial_seq_length (int): Chunk length at epoch 0.
        max_seq_length (int | float): Upper bound on the chunk length
            (truncated to an int before use).
        growth_factor (float): Multiplicative growth of the length per epoch.
        current_epoch (int): Zero-based epoch index.

    Returns:
        tf.data.Dataset: Dataset of (input_ids, target_ids) pairs.
    """
    length_cap = int(max_seq_length)
    grown_length = int(initial_seq_length * (growth_factor ** current_epoch))
    sequence_length = min(length_cap, grown_length)

    print(f"Using sequence length: {sequence_length} for epoch {current_epoch + 1}")

    # Token ids -> chunks of `sequence_length` ids (partial last chunk dropped)
    # -> shifted input/target pairs.
    token_ids = vectorize_layer(text)
    chunks = tf.data.Dataset.from_tensor_slices(token_ids).batch(
        sequence_length, drop_remainder=True)
    return chunks.map(split_input_target)


def text_from_ids(ids):
    """Look each id up in ``vocabulary``, concatenate the tokens and post-process.

    Tokens are joined with no separator; the whitespace marker tokens are
    converted back to real whitespace by ``postprocess_text``.
    """
    tokens = []
    for index in ids:
        tokens.append(vocabulary[index])
    return postprocess_text(''.join(tokens))

def setup_dataset(dataset):
    """Shuffle, batch and prefetch a dataset of (input, target) pairs.

    Uses a shuffle buffer of ``BUFFER_SIZE`` elements and batches of
    ``BATCH_SIZE`` (an incomplete final batch is dropped).
    """
    shuffled = dataset.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE, drop_remainder=True)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)

In [6]:
# Pick the dataset builder that matches the configured training mode.
if TEACHER_FORCED:
    learning_mode_fn = text_to_dataset_teacher_forced
else:
    learning_mode_fn = text_to_dataset_curriculum_learning

# NOTE(review): `vocabulary_adjusted` is the SAME list object as `vocabulary`
# (no copy is made), so the two assignments below also overwrite entries 0 and
# 1 of `vocabulary` itself. Later code that reads `vocabulary` (text_from_ids,
# OneStep) therefore sees '[UNK]' at index 0 and '' at index 1.
# Presumably the swap exists so StringLookup accepts the list with its OOV
# token first — confirm before changing this to a copy.
vocabulary_adjusted = vocabulary
vocabulary_adjusted[0] = '[UNK]'
vocabulary_adjusted[1] = ''
# Inverse lookup: token id -> token string.
words_from_ids = tf.keras.layers.StringLookup(vocabulary=vocabulary_adjusted, invert=True)

if RESTART:
  # Reuse the corpus text loaded while building the vocabulary and show one
  # example input/target pair as a sanity check.
  vocab_ds = learning_mode_fn(vocab_text)
  for input_example, target_example in vocab_ds.take(1):
    print("Input: ")
    print(input_example)
    print(text_from_ids(input_example))
    print(words_from_ids(input_example))
    print("Target: ")
    print(target_example)
    print(text_from_ids(target_example))
    
  vocab_ds = setup_dataset(vocab_ds)

  dataset = vocab_ds
  # Drop the names that are no longer needed.
  del vocab_text
  del vocab_ds
else:
  # Load the default corpus from disk and build the shuffled, batched dataset.
  dataset = learning_mode_fn(getMyText())
  dataset = setup_dataset(dataset)

File 'all_talks_2000.txt' found locally. Using it.
Using sequence length: 24 for epoch 1


### Model Building


In [7]:
# Create our custom model. Given a sequence of tokens (words, punctuation and
# whitespace markers), this model's job is to predict which token comes next.
class ConferenceTextModel(tf.keras.Model):
  """Next-token model: embedding -> three stacked LSTMs -> two ReLU Dense layers -> output Dense.

  The final Dense layer has ``vocab_size`` units and no activation, so the
  model outputs unnormalised scores (logits) for every position.
  """

  # Constructor: creates all layers. Nothing is connected here; the wiring
  # happens in call().
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Parameters:
        vocab_size: Size of the embedding table and width of the output layer.
        embedding_dim: Width of each embedding vector; the two hidden Dense
            layers are sized as multiples of it (x64 and x16).
        rnn_units: Number of units in each of the three LSTM layers.
    """
    super().__init__()

    # The model is made of four groups of layers:

    # 1. An embedding layer that handles the encoding of our vocabulary into
    #    a vector of values suitable for a neural network
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # 2. Three stacked LSTM layers that handle the "memory" aspects of our RNN.
    #    Each returns the full output sequence plus its final hidden and cell
    #    state. For background on GRU vs. LSTM, see:
    #    https://datascience.stackexchange.com/questions/14581/when-to-use-gru-over-lstm
    self.lstm1 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
    self.lstm2 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
    self.lstm3 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)


    # 3. Two fully connected ReLU layers between the LSTM stack and the output.
    self.hidden1 = tf.keras.layers.Dense(embedding_dim*64, activation='relu')
    self.hidden2 = tf.keras.layers.Dense(embedding_dim*16, activation='relu')

    # 4. Our output layer that will give us one score (logit) for each
    #    token in our vocabulary.
    self.dense = tf.keras.layers.Dense(vocab_size)

    # Placeholder attribute; not read by any method of this class.
    self.initial_states = None

  def get_initial_states(self, batch_size):
    """Return zero initial states: one [hidden, cell] pair per LSTM layer.

    Each tensor has shape [batch_size, units] for the corresponding layer.
    """
    return [
        [tf.zeros([batch_size, self.lstm1.units]), tf.zeros([batch_size, self.lstm1.units])],
        [tf.zeros([batch_size, self.lstm2.units]), tf.zeros([batch_size, self.lstm2.units])],
        [tf.zeros([batch_size, self.lstm3.units]), tf.zeros([batch_size, self.lstm3.units])],
    ]
  
  # Forward pass. Here we manually feed the output of each layer of our
  # network into the next one, threading the LSTM states through explicitly.

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the forward pass.

    Parameters:
        inputs: Batch of token-id sequences; the first dimension is the batch size.
        states: Optional list of three [hidden, cell] pairs, one per LSTM
            layer. Zero states are used when omitted.
        return_state: When True, also return the final LSTM states.
        training: Forwarded to every layer.

    Returns:
        The logits, or ``(logits, states_out)`` when ``return_state`` is True,
        where ``states_out`` has the same structure as ``states``.
    """
    x = self.embedding(inputs, training=training)
    batch_size = tf.shape(inputs)[0]

    # Start from zero states unless the caller supplied states to continue from.
    if states is None:
        states = self.get_initial_states(batch_size)

    # Each LSTM consumes the previous layer's output sequence and its own
    # initial state, and returns (sequence, final hidden, final cell).
    x, state_h_1, state_c_1 = self.lstm1(x, initial_state=states[0], training=training)
    states_out_1 = [state_h_1, state_c_1]

    x, state_h_2, state_c_2 = self.lstm2(x, initial_state=states[1], training=training)
    states_out_2 = [state_h_2, state_c_2]

    x, state_h_3, state_c_3 = self.lstm3(x, initial_state=states[2], training=training)
    states_out_3 = [state_h_3, state_c_3]

    # Same nesting as the `states` argument, so it can be fed straight back in.
    states_out = [states_out_1, states_out_2, states_out_3]

    x = self.hidden1(x, training=training)
    x = self.hidden2(x, training=training)
    x = self.dense(x, training=training)

    return (x, states_out) if return_state else x
  
'''
----------------------------------------------------------------------------------------------------
'''
# Here's the code we'll use to sample for us. It has some extra steps to apply
# the temperature to the distribution, and to make sure we never generate the
# empty token or the unknown token. Most importantly, it will keep track of our
# model state for us.

class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one next token at a time.

  Parameters:
      model: Model callable as ``model(inputs=ids, states=..., return_state=True)``
          returning ``(logits, states)``.
      vectorize_layer: Layer that maps a batch of strings to token ids.
      vocabulary: Token list used to build the id -> token lookup and the
          mask that forbids ``''`` and ``'[UNK]'``. It is passed to
          ``StringLookup`` as-is.
      temperature: Divisor applied to the logits before sampling; values
          below 1 make sampling more conservative.
  """

  def __init__(self, model, vectorize_layer, vocabulary, temperature=1):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.vectorize_layer = vectorize_layer
    self.vocabulary = vocabulary

    # id -> token lookup owned by this instance, so sampling does not depend
    # on a lookup layer defined elsewhere in the notebook.
    self.words_from_ids = StringLookup(vocabulary=list(vocabulary), invert=True)

    # Create a mask to prevent "" or "[UNK]" from being generated.
    skip_ids = StringLookup(vocabulary=list(vocabulary))(['', '[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(vocabulary)])
    # Dense vector: -inf at the forbidden ids, 0 everywhere else.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask, validate_indices=False)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next token for each string in ``inputs``.

    Parameters:
        inputs: Batch of strings (already preprocessed into tokenisable text).
        states: LSTM states returned by the previous call, or None to start
            from the model's default initial states.

    Returns:
        ``(tokens, states)``: the sampled token strings (one per batch entry)
        and the updated model states.
    """
    # Convert strings to token IDs.
    input_ids = self.vectorize_layer(inputs)

    # Run the model.
    # predicted_logits.shape is [batch, token position, next_token_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)

    # Only use the prediction at the last position.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Return the sampled tokens and the model state.
    return self.words_from_ids(predicted_ids), states

'''
----------------------------------------------------------------------------------------------------
'''

def produce_sample(model, vectorize_layer, vocabulary, temp, epoch, prompt, length=200, do_print=True):
  """Generate ``length`` tokens continuing ``prompt`` and print/log or return them.

  Parameters:
      model: Trained model handed to ``OneStep``.
      vectorize_layer: Text vectorization layer handed to ``OneStep``.
      vocabulary: Token list handed to ``OneStep``.
      temp: Sampling temperature.
      epoch: Epoch label written to the log file (only used when ``do_print``).
      prompt: Seed text. It is preprocessed before being fed to the model; the
          raw prompt is what appears at the start of the result.
      length: Number of tokens to generate.
      do_print: When True, print the sample and append it to
          ``PATH + 'tree.txt'``; when False, return the sample instead.

  Returns:
      str | None: The generated text when ``do_print`` is False, else None.
  """
  one_step_model = OneStep(model, vectorize_layer, vocabulary, temp)
  states = None
  next_char = tf.constant([preprocess_text(prompt)])
  result = [tf.constant([prompt])]

  # Feed each sampled token back in, carrying the LSTM states along.
  for _ in range(length):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

  # Concatenate prompt + tokens (no separator) and decode once.
  sample_text = postprocess_text(tf.strings.join(result)[0].numpy().decode('utf-8'))

  if not do_print:
    return sample_text

  print(sample_text)
  # One handle, closed deterministically; UTF-8 so non-ASCII characters in the
  # sample are written the same way on every platform.
  with open(PATH + 'tree.txt', 'a', encoding='utf-8') as log_file:
    print('Epoch: ' + str(epoch) + '\n', file=log_file)
    print('Temp: ' + str(temp) + '\n', file=log_file)
    print(sample_text, file=log_file)
    print('\n\n', file=log_file)
  return None

In [8]:
# Model hyperparameters: width of each token embedding vector and the number
# of units in each LSTM layer. The output layer is VOCAB_SIZE wide.
embedding_dim = 128
rnn_units = 512
model = ConferenceTextModel(VOCAB_SIZE, embedding_dim, rnn_units)

# NOTE: testing
# Verify the output of our model is correct by running one sample through
# This will also compile the model for us. This step will take a bit.
# for input_example_batch, target_example_batch in dataset.take(1):
#     example_batch_predictions = model(input_example_batch)
#     print(example_batch_predictions.shape, "# (BATCH_SIZE, sequence_length, VOCAB_SIZE)")

# NOTE: more testing
# model.summary()

### Model Training

In [9]:
# When fine-tuning (TRANSFER), start from previously saved weights found under
# PATH + 'models/' instead of a freshly initialised model.
if TRANSFER:
  model_file_to_load = "conf-test2-400ep.keras"
  model.load_weights(PATH + 'models/' + model_file_to_load)

In [10]:
# Early-stopping callback: watches the training loss, requires an improvement
# of at least 0.01, tolerates 5 epochs without one, and restores the best weights.
early_stop = EarlyStopping(monitor='loss', min_delta=0.01, patience=5, restore_best_weights=True)

# The model outputs raw logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=opt, loss=loss)

# Epoch budget and the run identifiers used in saved-model file names.
base_epochs, transfer_epochs = 100, 0
num_epochs_total = base_epochs + transfer_epochs
main_test_num, transfer_test_num = 5, 1

# Short tag for the training mode, used in file names.
learn_type = 'teacher' if TEACHER_FORCED else 'curric'

# A fresh run starts at epoch 0; otherwise resume at the configured epoch.
start_epoch = 0 if RESTART else EPOCH_TO_PICKUP

In [11]:
def run(name='all_talks_2000', max_retries=5):
  """Train the global ``model`` one epoch at a time, sample after each epoch, then save it.

  For every epoch from ``start_epoch`` to ``num_epochs_total - 1`` the corpus
  is reloaded, a dataset is rebuilt for that epoch (the curriculum builder
  uses the epoch to pick the sequence length), the learning rate is decayed by
  0.99 per epoch, the model is fitted for one epoch, and samples are generated
  at several temperatures.

  Parameters:
      name (str): Corpus file name without the ``.txt`` extension, relative to
          the directory ``getMyText`` reads from.
      max_retries (int | None): How many times a failing epoch is retried
          before the error is re-raised. ``None`` retries without limit.

  Raises:
      RuntimeError: If the corpus text cannot be loaded (not retried).
      Exception: The last error of an epoch that failed more than
          ``max_retries`` times. ``KeyboardInterrupt`` is never swallowed.
  """
  stop_training = False
  for e in range(start_epoch, num_epochs_total):
    if stop_training:
      print(f"Early stopping triggered at epoch {e + 1}. Exiting training.")
      break

    failures = 0
    while True:
      print(f'{time.strftime("%H:%M:%S", time.localtime(time.time()))} - epoch: {e + 1}/{num_epochs_total}')

      # A missing or unreadable corpus will not fix itself, so fail fast
      # instead of retrying.
      new_text = getMyText(f'{name}.txt')
      if new_text is None:
        raise RuntimeError(f"could not load training text '{name}.txt'")

      dataset = None
      try:
        dataset = learning_mode_fn(new_text, growth_factor=1.05, current_epoch=e)
        del new_text
        dataset = setup_dataset(dataset)

        # Exponential learning-rate decay: 1% per epoch.
        model.optimizer.learning_rate.assign(LEARNING_RATE*(0.99**e))

        # NOTE(review): fit() is called with epochs=1 each time, so the
        # EarlyStopping callback likely never accumulates patience across
        # calls and `stopped_epoch` stays 0 — confirm against the Keras docs.
        model.fit(dataset, epochs=1, verbose=1, callbacks=[early_stop])
        if early_stop.stopped_epoch > 0:
          stop_training = True
        print(f"finished training epoch: {e + 1}...\n")

        # Release the dataset before sampling to keep memory usage down.
        dataset = None
        for temp in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
          produce_sample(model, vectorize_layer, vocabulary, temp, e, 'As a disciple of Jesus Christ, I testify')
        print("\nsamples produced...")
        gc.collect()
        print("garbage collected...")
        tf.keras.backend.clear_session()
        print("session cleared (to save memory)...\n")
        break
      except Exception as err:
        # Only ordinary errors are retried; Ctrl-C still interrupts training.
        failures += 1
        print(f"epoch {e + 1} failed: {type(err).__name__}: {err}")
        dataset = None
        gc.collect()
        tf.keras.backend.clear_session()
        if max_retries is not None and failures > max_retries:
          raise
        print("retrying epoch: " , e + 1)

  print(f'\nFinal epoch ({num_epochs_total}) reached.')

  # Name the saved model after the run configuration; fine-tuning runs also
  # carry the last hyphen-separated part of `name` and the transfer identifiers.
  filename = f'models/conf-t{main_test_num}-{base_epochs}e_{learn_type}'
  if base_epochs < num_epochs_total:
    filename += f'_{name.split("-")[-1]}-t{transfer_test_num}-{transfer_epochs}e'
  filename += '.keras'
  model.save(filename)
  print(f"Model saved as {filename}")

In [20]:
# NOTE: testing
# dataset = learning_mode_fn(getMyText(), growth_factor=1.05, current_epoch=43)

In [None]:
# Train on the default corpus. Run this cell just once, when training from scratch.
run()

18:10:45 - epoch: 43/100
File 'all_talks_2000.txt' found locally. Using it.
Using sequence length: 186 for epoch 43


I0000 00:00:1743034252.312030   29771 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 43/333[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:36[0m 332ms/step - loss: 6.5525

In [None]:
# Train on each speaker's corpus in turn. Every call continues from the current
# weights of the global `model` and reads 'speakers/<speaker>.txt' via getMyText.
speakers = ['david-a-bednar', 'dieter-f-uchtdorf', 'jeffrey-r-holland', 'russell-m-nelson', 'thomas-s-monson', 'patrick-kearon']
for speaker in speakers:
  filename = f'speakers/{speaker}'
  run(filename)

epoch: 201/250
File 'speakers/david-a-bednar.txt' found locally. Using it.
Using sequence length: 256 for epoch 200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 228ms/step - loss: 0.4447
finished training...
As a disciple of Jesus Christ, I testify to you that our Heavenly Father lives and loves each of us and that Jesus is the Christ, our Lord and Savior. This is their instruction, individually and collectively. He has called us and directed our prophet to serve President Hinckley. There is nothing more important than honoring the covenants it gives a full measure of love for the Lord and His children. Please remember this sure evening. We have heard again some of my most poignant missionary experiences. I don’t know about that, but I do
As a disciple of Jesus Christ, I testify to you that our Heavenly Father lives and loves each of us and that Jesus is the Christ, our Lord and Savior. Joseph Smith is the prophet through whom the Lord restored the gospel in these la

In [None]:
# Load a specific saved model (hard-coded file name) and print one sample per
# temperature for an out-of-domain prompt. The samples are also appended to the
# log file written by produce_sample.
model.load_weights("models/conf-t4-200e.keras")
for temp in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    produce_sample(model, vectorize_layer, vocabulary, temp, num_epochs_total - 1, 'What does the fox say?')

In [None]:
# Seed prompt and the directories used when generating the final responses.
final_seed_text = "The world seemed like such a peaceful place until the tree of life was discovered in London."
model_path = 'models/'
response_path = 'saved_files/responses/'

# File-name pieces that mirror the naming scheme run() uses when saving models:
# <base_filename>[_<speaker last name><speaker_file_end>]<extension>
base_filename = f'conf-t{main_test_num}-{base_epochs}e_{learn_type}'
speaker_file_end = f'-t{transfer_test_num}-{transfer_epochs}e'

keras_ext = '.keras'
response_ext = '.txt'

In [None]:
def save_responses(model_file, response_file, seed_text):
    """Load weights from ``model_file`` and append one sample per temperature to ``response_file``.

    For each temperature a 1000-token continuation of ``seed_text`` is
    generated and appended as ``Temp: <t>``, the text, and a blank line. The
    file is opened in append mode, so repeated calls add to existing content.
    """
    model.load_weights(model_file)
    sample_epoch = num_epochs_total - 1

    for temp in (0.4, 0.5, 0.6, 0.7, 0.8, 0.9):
        response = produce_sample(
            model, vectorize_layer, vocabulary, temp, sample_epoch, seed_text,
            length=1000, do_print=False)
        with open(response_file, 'a') as f:
            f.writelines((f"Temp: {temp}\n", response, '\n\n'))

    print(f"Response saved as {response_file}")

In [None]:
# Generate responses from each speaker-specific model. The file stem uses the
# last hyphen-separated part of the speaker id (e.g. 'david-a-bednar' -> '_bednar').
for speaker in speakers:
    last_name = '_' + speaker.split('-')[-1]
    speaker_keras_filename = model_path + base_filename + last_name + speaker_file_end + keras_ext
    speaker_response_filename = response_path + base_filename + last_name + speaker_file_end + response_ext
    save_responses(speaker_keras_filename, speaker_response_filename, final_seed_text)

  saveable.load_own_variables(weights_store.get(inner_path))


Response saved as saved_files/responses/saved_files/responses/conf-t4-200e_curric_bednar-t1-50e.txt
Response saved as saved_files/responses/saved_files/responses/conf-t4-200e_curric_uchtdorf-t1-50e.txt
Response saved as saved_files/responses/saved_files/responses/conf-t4-200e_curric_holland-t1-50e.txt
Response saved as saved_files/responses/saved_files/responses/conf-t4-200e_curric_nelson-t1-50e.txt
Response saved as saved_files/responses/saved_files/responses/conf-t4-200e_curric_monson-t1-50e.txt
Response saved as saved_files/responses/saved_files/responses/conf-t4-200e_curric_kearon-t1-50e.txt


In [26]:
# Generate responses from the general model (file names carry no speaker suffix).
general_keras_filename = model_path + base_filename + keras_ext
general_response_filename = response_path + base_filename + response_ext
save_responses(general_keras_filename, general_response_filename, final_seed_text)

Response saved as saved_files/responses/saved_files/responses/conf-t4-200e_curric.txt
