<a href="https://colab.research.google.com/github/Rtniewi/kcwiertniewicz-IDS/blob/main/GenAI/HW5/Assignment5_3_LSTM_Layers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

//***Katrina Cwiertniewicz
//*** CSC 330
//11/--/2024
//Assignment 5: Text Generation Using LSTM on Project Gutenberg Training Data
#### The purpose of this assignment is to develop an LSTM model that generates text. The goal is to produce coherent and stylistically relevant text based on prompts.

In [None]:
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

## 0. Parameters <a name="parameters"></a>

In [None]:
# Hyperparameters and run settings shared by the cells below.
VOCAB_SIZE = 10000  # max_tokens of the TextVectorization layer; also the Embedding input size and softmax width
MAX_LEN = 500  # lines are vectorised to MAX_LEN + 1 tokens, then split into input/target sequences
EMBEDDING_DIM = 100  # output dimension of the Embedding layer
N_UNITS = 128  # units in each LSTM layer
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced by any visible cell
SEED = 42  # NOTE(review): not referenced by any visible cell; nothing visible seeds an RNG
LOAD_MODEL = False  # NOTE(review): not referenced by any visible cell
BATCH_SIZE = 32  # batch size of the tf.data pipeline
EPOCHS = 25  # number of epochs passed to lstm.fit

## 1. Load the data <a name="load"></a>

In [None]:
# Mount Google Drive; the source texts are read from /content/drive/MyDrive/text below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%pwd

'/content'

In [None]:
import requests  # NOTE(review): not used by any visible cell
import os


# Drive folder that holds the source texts, and the file that receives the
# concatenated corpus.
TEXT_DIR = '/content/drive/MyDrive/text'
COMBINED_PATH = '/content/combined_poe.txt'

# List of files for additional texts (e.g., different Edgar Allan Poe Works)
file_paths = [
    os.path.join(TEXT_DIR, file_name)
    for file_name in (
        'The_Tell_Tale_Heart.txt',             # The Tell Tale Heart
        'The_Cask_of_Amontillado.txt',         # The Cask of Amontillado
        'The_Raven.txt',                       # The Raven
        'The_Masque.txt',                      # The Masque of the Red Death
        'Annabel_Lee.txt',                     # Annabel Lee
        'Lenore.txt',                          # Lenore
        'The_Bells.txt',                       # The Bells
        'The_Black_Cat.txt',                   # The Black Cat
        'The_Fall_of_the_House_of_Usher.txt',  # The Fall of the House of Usher
        'The_Oval_Portrait.txt',               # The Oval Portrait
        'The_Pit_and_the_Pendulum.txt',        # The Pit and the Pendulum
        'The_Premature_Burial.txt',            # The Premature Burial
        'The_Narrative_of_Arthur_Gordon.txt',  # The Narrative of Arthur Gordon Pym of Nantucket
        'Al Aaraaf.txt',                       # Al Aaraaf
    )
]

# Read every text explicitly as UTF-8 (the combined file is written as UTF-8
# too), so decoding does not depend on the platform's default encoding.
texts = []
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        texts.append(file.read())

# Each text is followed by a blank line so consecutive works stay separated.
all_text = "".join(text + "\n\n" for text in texts)

# Save combined text to a single file, once, after all texts have been read.
with open(COMBINED_PATH, "w", encoding="utf-8") as file:
    file.write(all_text)


In [None]:
# Load the combined corpus and report how many whitespace-separated words it holds
with open('/content/combined_poe.txt', "r", encoding="utf-8") as file:
    file_content = file.read()

words = file_content.split()
n_words = len(words)
print(f"{n_words} words loaded")

112272 words loaded


In [None]:
# Example Sentence of First Ten Words
# `words` comes from a plain whitespace split, so punctuation is still attached to its neighbours here.
example_sentence = words[:10]
print(f"Example Sentence: {example_sentence}")

Example Sentence: ['True!—nervous—very,', 'very', 'dreadfully', 'nervous', 'I', 'had', 'been', 'and', 'am;', 'but']


## 2. Tokenise the data

In [None]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s, punctuation=string.punctuation):
    """Surround every punctuation character with spaces and collapse space runs.

    Args:
        s: Text to process.
        punctuation: Characters to treat as punctuation. Defaults to
            ``string.punctuation`` (ASCII only); pass e.g.
            ``string.punctuation + "—“”"`` to also split typographic marks.

    Returns:
        ``s`` with each punctuation character padded by a space on both sides
        and every run of spaces reduced to a single space.
    """
    if punctuation:
        # Escape the set before placing it in a character class: unescaped,
        # the "\]" inside string.punctuation is read as an escaped bracket,
        # so the backslash itself would never be matched.
        pattern = f"([{re.escape(punctuation)}])"
        s = re.sub(pattern, r" \1 ", s)
    return re.sub(" +", " ", s)


# Read the combined corpus from the absolute path it was written to, so this
# cell does not depend on the kernel's current working directory.
# Each physical line of the file becomes one punctuation-padded string.
with open("/content/combined_poe.txt", "r", encoding="utf-8") as file:
    text_data = [pad_punctuation(line) for line in file]

In [None]:
# Show one punctuation-padded line of the corpus as a sanity check.
# NOTE(review): the name `example_date` looks like a typo for `example_data`.
example_date = text_data[80]
print(example_date)

merely a cricket which has made a single chirp . ” Yes , he had been



In [None]:
# Build the tf.data pipeline: one element per corpus line, grouped into
# batches of BATCH_SIZE, with the batches shuffled through a 1000-element buffer
text_ds = tf.data.Dataset.from_tensor_slices(text_data)
text_ds = text_ds.batch(BATCH_SIZE).shuffle(1000)

In [None]:
# Create a vectorisation layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",  # lower-case only; punctuation was already split off by pad_punctuation
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,  # one extra token so input and target can each be MAX_LEN long
)

In [None]:
# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
# `vocab` is used below as an index -> word list (a word's position is its token id)
vocab = vectorize_layer.get_vocabulary()

# Length of Vocabulary
print(f"Length of Vocabulary: {len(vocab)}")

Length of Vocabulary: 10000


In [None]:
# Display some token:word mappings (token ids 10-29)
# enumerate starts at 10 so the printed number is the word's real token id
# rather than its offset within the slice.
for i, word in enumerate(vocab[10:30], start=10):
    print(f"{i}: {word}")

0: i
1: was
2: -
3: that
4: it
5: with
6: had
7: my
8: as
9: we
10: which
11: he
12: at
13: from
14: for
15: his
16: by
17: this
18: ;
19: be


In [None]:
# Display the same example converted to ints
# Vectorise only the example line printed earlier (wrapped in a list to form
# a batch of one) instead of the whole corpus.
example_tokenised = vectorize_layer([example_date])[0]
print(example_tokenised.numpy())

[[ 572   49 5754 ...    0    0    0]
 [  34  736  124 ...    0    0    0]
 [6811 9475 9282 ...    0    0    0]
 ...
 [  60  170  174 ...    0    0    0]
 [  77  763   31 ...    0    0    0]
 [   0    0    0 ...    0    0    0]]


## 3. Create the Training Set

In [None]:
# Create the training set of text and the same text shifted by one word
def prepare_inputs(text):
    """Turn a batch of raw strings into (input, target) token-id sequences.

    The batch is vectorised with ``vectorize_layer``; the input drops the last
    token of each sequence and the target drops the first, so the target is
    the input shifted by one position.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    return token_ids[:, :-1], token_ids[:, 1:]


# Map every batch of text_ds through prepare_inputs to get (x, y) training pairs.
train_ds = text_ds.map(prepare_inputs)

## 4. Build the LSTM <a name="build"></a>

In [None]:
# Next-word model: token ids -> embeddings -> three stacked LSTMs -> softmax over the vocabulary at every position.
inputs = layers.Input(shape=(None,), dtype="int32")  # variable-length sequences of token ids
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
# return_sequences=True keeps one output per time step, so each position gets its own prediction.
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
# NOTE(review): padding (token 0) is not masked, so padded positions presumably count towards
# the loss — confirm whether Embedding(mask_zero=True) or a masked loss is wanted.
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

## 5. Train the LSTM <a name="train"></a>

In [None]:
# Targets are integer token ids, hence the sparse form of categorical cross-entropy
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile(optimizer="adam", loss=loss_fn)

In [None]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    """Keras callback that samples text from the model after every epoch.

    Args:
        index_to_word: Vocabulary list in which a word's position is its
            token id.
        top_k: Accepted for backward compatibility; not used by the sampling
            code.
    """

    def __init__(self, index_to_word, top_k=10):
        # Initialise the base Callback state before adding our own attributes.
        super().__init__()
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }
        self.top_k = top_k

    @staticmethod
    def _tokenize_prompt(prompt):
        """Split a prompt into lower-cased words with punctuation as separate tokens.

        Mirrors the notebook's preprocessing of the training text (punctuation
        padded with spaces, text lower-cased) so that prompt words such as
        "Once" or "dreary," can be found in the vocabulary.
        """
        padded = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", prompt)
        return padded.lower().split()

    def sample_from(self, probs, temperature):
        """Rescale ``probs`` by ``temperature`` and draw one token id.

        Args:
            probs: Probabilities over the vocabulary.
            temperature: Positive scaling factor; values below 1 sharpen the
                distribution, values above 1 flatten it.

        Returns:
            A ``(token_id, rescaled_probs)`` tuple.
        """
        # Work in float64 so small probabilities raised to a large power
        # (low temperature) do not underflow before renormalising.
        probs = np.asarray(probs, dtype=np.float64)
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend ``start_prompt`` one sampled word at a time and print the result.

        Generation stops when the sequence reaches ``max_tokens`` tokens or
        when token 0 is sampled.

        Args:
            start_prompt: Text to continue.
            max_tokens: Maximum total number of tokens, prompt included.
            temperature: Sampling temperature passed to ``sample_from``.

        Returns:
            A list with one dict per generated token, holding the text so far
            (``"prompt"``) and the distribution the token was drawn from
            (``"word_probs"``).

        Raises:
            ValueError: If ``start_prompt`` contains no tokens.
        """
        # Words missing from the vocabulary fall back to token id 1.
        start_tokens = [
            self.word_to_index.get(word, 1)
            for word in self._tokenize_prompt(start_prompt)
        ]
        if not start_tokens:
            raise ValueError("start_prompt must contain at least one token")
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)
            # Only the distribution predicted for the last position is sampled.
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample generated from a fixed prompt after each epoch."""
        self.generate("Text:", max_tokens=500, temperature=1.0)

In [None]:
# Build the text-generation callback; `vocab` supplies the index -> word mapping

text_generator = TextGenerator(vocab)

In [None]:
# Train on the (input, target) batches; text_generator prints a generated sample after every epoch.
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/25
[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - loss: 1.9737
generated text:
Text: wearisomeness resounded successfully leaving 

[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 94ms/step - loss: 1.9699
Epoch 2/25
[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - loss: 0.1519
generated text:
Text: stream fury appear accomplished stunted through - figure a between a that slept 

[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 94ms/step - loss: 0.1519
Epoch 3/25
[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - loss: 0.1472
generated text:
Text: folly and ; force - indebted 3 of with many never different ; 

[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 95ms/step - loss: 0.1472
Epoch 4/25
[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - loss: 0.1396
generated text:
Text: this one at apprehensions confirmat

<keras.src.callbacks.history.History at 0x7a9b615b3580>

## 6. Generate text using the LSTM

In [None]:
def print_probs(info, vocab, top_k=5):
    """Print the ``top_k`` most probable next words for each generation step.

    Args:
        info: List of dicts with a ``"prompt"`` string and a ``"word_probs"``
            array, as returned by ``TextGenerator.generate``.
        vocab: Index -> word list used to label the probabilities.
        top_k: Number of candidates to print per step.
    """
    for step in info:
        print(f"\nPROMPT: {step['prompt']}")
        word_probs = step["word_probs"]
        # Indices of the largest probabilities, highest first.
        top_indices = np.argsort(word_probs)[::-1][:top_k]
        for token_id in top_indices:
            print(f"{vocab[token_id]}:   \t{np.round(100*word_probs[token_id],2)}%")
        print("--------\n")

In [None]:
# Prompt 1: From "The Raven"
# A temperature of 0.1 raises the probabilities to the 10th power, sharpening the distribution.
info = text_generator.generate(
    "Once upon a midnight dreary,", max_tokens=500, temperature=0.1
)


generated text:
Once upon a midnight dreary, these were , the obligation , 



In [None]:
# Show the five most probable candidates at every generation step of the last prompt.
print_probs(info, vocab)


PROMPT: Once upon a midnight dreary,
,:   	73.12%
.:   	4.48%
the:   	3.62%
;:   	1.1%
in:   	0.71%
--------


PROMPT: Once upon a midnight dreary, these
,:   	31.86%
:   	2.62%
[UNK]:   	2.03%
were:   	1.67%
!:   	1.57%
--------


PROMPT: Once upon a midnight dreary, these were
:   	14.23%
,:   	7.46%
the:   	6.74%
in:   	3.47%
-:   	2.95%
--------


PROMPT: Once upon a midnight dreary, these were ,
in:   	16.28%
and:   	13.17%
:   	8.85%
the:   	5.26%
with:   	3.64%
--------


PROMPT: Once upon a midnight dreary, these were , the
:   	20.62%
bells:   	1.49%
most:   	1.15%
sea:   	1.13%
[UNK]:   	1.09%
--------


PROMPT: Once upon a midnight dreary, these were , the obligation
,:   	93.65%
:   	3.25%
of:   	0.37%
!:   	0.32%
was:   	0.24%
--------


PROMPT: Once upon a midnight dreary, these were , the obligation ,
:   	91.64%
bells:   	0.65%
in:   	0.43%
i:   	0.34%
the:   	0.24%
--------



In [None]:
# Prompt 2: From "The Tell Tale Heart"
# A temperature of 1.0 samples from the model's distribution unchanged.
info = text_generator.generate(
    "And have I not told you that what you mistake for madness is but over-acuteness of the sense?", max_tokens=500, temperature=1.0
)


generated text:
And have I not told you that what you mistake for madness is but over-acuteness of the sense? 



In [None]:
# Show the five most probable candidates at every generation step of the last prompt.
print_probs(info, vocab)


PROMPT: And have I not told you that what you mistake for madness is but over-acuteness of the sense?
:   	100.0%
,:   	0.0%
of:   	0.0%
the:   	0.0%
i:   	0.0%
--------



In [None]:
# Prompt 3: attributed to "The Cask of Amontillado" (the original comment was cut off — confirm the source)
info = text_generator.generate(
    "A million candles have burned themselves out. Still I read on.", max_tokens=500, temperature=0.5
)
print_probs(info, vocab)


generated text:
A million candles have burned themselves out. Still I read on. 


PROMPT: A million candles have burned themselves out. Still I read on.
:   	100.0%
,:   	0.0%
of:   	0.0%
in:   	0.0%
;:   	0.0%
--------



In [None]:
# NOTE(review): repeats the print_probs call that ends the previous cell; `info` is unchanged, so the output is identical.
print_probs(info, vocab)


PROMPT: A million candles have burned themselves out. Still I read on.
:   	100.0%
,:   	0.0%
of:   	0.0%
in:   	0.0%
;:   	0.0%
--------

