<a href="https://colab.research.google.com/github/Trev621/GenAI/blob/main/HW5/Problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Author: Trevor Lacoste
The purpose of this assignment is to develop an LSTM (Long Short-Term Memory) model to generate text.

In [1]:
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

# Parameters

In [2]:
VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 128
BATCH_SIZE = 32
EPOCHS = 75

# Data Collection and Preparation

In [3]:
import requests

#List of URLs for additional texts (e.g., different Shakespeare plays)
urls = [
  "https://www.gutenberg.org/files/1041/1041-0.txt",  #Hamlet
  "https://www.gutenberg.org/files/152/152-0.txt",   #Macbeth
  "https://www.gutenberg.org/files/1112/1112-0.txt"   #Othello
]

#Initialize an empty string to hold all text
all_text = ""

#Download each text file and append to all_text
for url in urls:
  response = requests.get(url)
  text = response.text
  all_text += text + "\n\n"  #Separate texts by newlines

#Save combined text to a single file
with open("combined_shakespeare.txt", "w", encoding="utf-8") as file:
  file.write(all_text)

In [4]:
def preprocess_text(text):
    #Remove the header and footer
    start_index = text.find('*** START OF THIS PROJECT GUTENBERG EBOOK')
    end_index = text.find('*** END OF THIS PROJECT GUTENBERG EBOOK')
    if start_index != -1 and end_index != -1:
        text = text[start_index:end_index]

    text = re.sub(r'[^a-zA-Z\s]', '', text)  #Remove non-letter characters
    text = text.lower()  #Convert text to lowercase
    return text

cleaned_text = preprocess_text(all_text)

#Tokenization
tokens = cleaned_text.split() #Split the text into words (tokens)

In [5]:
#Convert to a Tensorflow Dataset
text_ds = (
    tf.data.Dataset.from_tensor_slices(tokens)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [6]:
#Create a vectorisation layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [7]:
#Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [19]:
#Create the training set, shifted by one word
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]  #Input sequence
    y = tokenized_sentences[:, -1]   #Predict next word
    return x, y

train_ds = text_ds.map(prepare_inputs)

# LSTM Baseline Model

In [20]:
#One layer LSTM model
inputs_base = layers.Input(shape=(None,), dtype="int32")  #Input layer

x_base = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs_base) #Embedding layer

x_base = layers.LSTM(N_UNITS)(x_base) #LSTM layer

outputs_base = layers.Dense(VOCAB_SIZE, activation="softmax")(x_base) #Output layer

base_lstm = models.Model(inputs_base, outputs_base)
base_lstm.summary() #Model summary

In [21]:
#Compiling base model
loss_base = losses.SparseCategoricalCrossentropy()
base_lstm.compile("adam", loss_base)

In [22]:
#Training base model
base_lstm.fit(
    train_ds,
    epochs=50
)

Epoch 1/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - loss: 0.6928
Epoch 2/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 2.6170e-04
Epoch 3/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 8.0717e-05
Epoch 4/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 3.3741e-05
Epoch 5/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 1.5854e-05
Epoch 6/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - loss: 7.8839e-06
Epoch 7/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - loss: 4.0791e-06
Epoch 8/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - loss: 2.1749e-06
Epoch 9/50
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 1.1919e-06
Epoch 10/50
[1m1353/13

<keras.src.callbacks.history.History at 0x7ae4fe552e90>

In [23]:
def generate_text(seed, model, vectorize_layer, vocab, temperature, max_length=100):
    for _ in range(max_length):
        #Convert the seed to tokenized numbers
        tokenized_input = vectorize_layer([seed])
        #Predict next word probabilities
        predictions = model.predict(tokenized_input, verbose=0)
        logits = predictions[0, :]  #For 2D output

        scaled_logits = logits / temperature #Apply temp scaling
        probabilities = tf.nn.softmax(scaled_logits).numpy()

        #Ensure valid sampling within vocab range
        next_word_index = np.random.choice(len(probabilities), p=probabilities)
        if next_word_index >= len(vocab):
            next_word_index = len(vocab) - 1

        #Get the next word and add it to the seed
        next_word = vocab[next_word_index]
        if next_word == "":  #If empty, continue
            continue
        seed += " " + next_word

    return seed

In [24]:
#Sample prompts
prompts = [
    "To be, or not to be",
    "Shall I compare thee to a summer's day",
    "All the world's a stage"
]

temperatures = [0.1, 0.5, 1.0]

#Test output for each prompt with varying temperatures
for prompt in prompts:
    print(f"\nPrompt: {prompt}")
    for temp in temperatures:
        generated_text = generate_text(prompt, base_lstm, vectorize_layer, vocab, temperature=temp)
        print(f"\nTemperature {temp}:\n{generated_text}")



Prompt: To be, or not to be

Temperature 0.1:
To be, or not to be willing widowd abate shamed wine abate spring toile abate abate legions these shady goarie abate asked abate abate abate abate abate struck fell booke heartlesse respects abate abate juliet abate thinkest abate impiety hrefpolicytermsofusehtmlterms abate abate blots abate abate selfdoing worst abate abate wring goose feet

Temperature 0.5:
To be, or not to be abate abate angell hairs abate theeuish form abate feed abate abate orecouered fortunes weeps abate shoomaker language abate true ah much surly hall enjoyer hoares abate abate abate white beg beautious immortal excesse abate happlie abate abate crossd abate abate abate abate acted diseasd mousehunt goodness lxiii abate abate abate liud abate abate tarrie abate abate abate abate offend stayd abate perilous fruite according abate abate liuer abate anticipate abate diuers whereupon abate abate abate push goddess cut abroad arrest hath vnhallowed xxxii abate abate cons

# Evaluation

The basic LSTM, with one layer, trained in about half of the time of more complex LSTM. The results with 50 epochs and only layer are better than expected, although there are still clear issues with the model. It is able to generate a variety of different words, but some of the words like "carefull" are mispelled and other words like, "douefeatherd" seem to be made up. Another clear issue with this model is that the word "abate" seems to get repeated a lot. So overall, the model has a very basic idea of the words that Shakespeare uses. As for the style, none of the phrases really make sense. The words seem to be randomly ordered. To improve on the coherence and style, the model needs a lot more training.

# LSTM Model & Training

In [None]:
inputs = layers.Input(shape=(None,), dtype="int32") #Input layer

x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs) #Embedding layer

x = layers.LSTM(N_UNITS, return_sequences=True)(x) #First LSTM layer
x = layers.LSTM(N_UNITS, return_sequences=True)(x) #Second LstM layer
x = layers.LSTM(N_UNITS)(x) #Last LSTM Layer

outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x) #Output layer

lstm = models.Model(inputs, outputs)
lstm.summary() #Model summary

In [None]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn) #Compile the LSTM

In [None]:
#Training LSTM
lstm.fit(
    train_ds,
    epochs=EPOCHS
)

Epoch 1/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 24ms/step - loss: 0.6949
Epoch 2/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 24ms/step - loss: 2.6220e-04
Epoch 3/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 24ms/step - loss: 8.0888e-05
Epoch 4/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 24ms/step - loss: 3.3863e-05
Epoch 5/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 23ms/step - loss: 1.5956e-05
Epoch 6/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 23ms/step - loss: 7.9517e-06
Epoch 7/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 23ms/step - loss: 4.1216e-06
Epoch 8/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 23ms/step - loss: 2.1962e-06
Epoch 9/75
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 24ms/step - loss: 1.2028e-06
Epoch 10/75
[1m1353/13

<keras.src.callbacks.history.History at 0x7ab184ec6c20>

# Temperature and Prompt Variations

In [None]:
#Sample prompts
prompts = [
    "To be, or not to be",
    "Shall I compare thee to a summer's day",
    "All the world's a stage"
]

temperatures = [0.1, 0.5, 1.0]

#Test output for each prompt with varying temperatures
for prompt in prompts:
    print(f"\nPrompt: {prompt}")
    for temp in temperatures:
        generated_text = generate_text(prompt, lstm, vectorize_layer, vocab, temperature=temp)
        print(f"\nTemperature {temp}:\n{generated_text}")



Prompt: To be, or not to be

Temperature 0.1:
To be, or not to be lath waxe richer abate cvi largess snow widowed loe retire abate ready abate abate abate abate abate meete water abate cozins replete abate linger cheuerell nourishd exchange cheares briefly abate sale uneard godgigoden abate

Temperature 0.5:
To be, or not to be francis abate abate displant abate orecouered earthquake christian argues fall wenches abate abate woos abate abate stand injury yeares steeld pencil abate abate chaunces abate abate afflicted abate abate curls ghostly read threatned abate abate sings murderous abate messenger enctypemultipartformdata propertyogtitle ground seemd abate suffers canons godden abate abate abate performe heartinflaming center seene abate abate abate lamentable abate abate abate fountaines harmful hated abate mind abate gloue value step abate arithmeticke abate miscarried streaks abate abate hrefebookssearch husbandry estate abate abate abate cozen pays niggarding abate filching aba

# Evaluation of Generated Text



*   The basic LSTM model with only one layer trained quickly, but the output was not good. most of the words were mispelled or just a bunch of random letters and the sentences made no sense.
*   Increasing the units in each LSTM layer also increased training time. However, using 128 units did produce better results than 64 units.
*   Looking at the results of the generated text with .1 temperature, the word "abate" gets repeated a lot. The output is incoherent and lacks variety of words. The generated text for .5 temperature has a lot more variation in word choice. It is more creative but it is also incoherent because the text doesn't make any sense. Lastly, the text using 1.0 temperature shows the most creativity. The word "abate" is repeated less but many of the words seem made up. For example, in the output from the second prompt, words like "valuesxclick" appear that are not actual words. So while there is more variety, the text still lacks coherence.
*   Overall, the model can form a lot of words that were used by Shakespeare, but the ordering is random. The generated text makes no sense because the model needs a lot more training. It has some of the basic vocabulary down, but needs more epochs in order to create phrases that are similar to Shakespeare's


