# Scott Breitbach
## 28-May-2022
# LSTM AI Text Generator
## Trained using text of *The Ultimate Hitchhiker's Guide to the Galaxy* by Douglas Adams
Source [text](https://archive.org/stream/TheultimateHitchhikersGuide/The%20Hitchhiker%27s%20Guide%20To%20The%20Galaxy_djvu.txt).

## Get the data

In [8]:
# Load libraries
import tensorflow as tf
import keras
import numpy as np
from keras import layers

# Seed NumPy's global RNG so NumPy-based sampling is repeatable.
# NOTE: this does not seed Python's `random` module.
np.random.seed(seed=42)

# Read the whole corpus and lower-case it to shrink the character vocabulary.
path = 'data/h2g2.txt'
# The context manager closes the file handle promptly; the explicit encoding
# keeps decoding independent of the platform's default locale.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print(f'Corpus length: {len(text)}')

Corpus length: 1561887


## Vectorize the text

In [6]:
# Vectorizing sequences of characters
maxlen = 60     # Length of each extracted input sequence, in characters
step = 3        # A new sequence starts every `step` characters

# Each window text[i : i + maxlen] is an input sequence; the single
# character that follows it (text[i + maxlen]) is its prediction target.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[i: i + maxlen] for i in window_starts]   # Extracted sequences
next_chars = [text[i + maxlen] for i in window_starts]     # Follow-up characters

print(f'Number of sequences: {len(sentences)}')

# Sorted list of unique characters in the corpus
chars = sorted(set(text))
print(f'Unique characters: {len(chars)}')

# Dict that maps each unique character to its index in the list `chars`
char_indices = {char: index for index, char in enumerate(chars)}

print('Vectorization...')
# One-hot encodes the characters into boolean arrays:
#   x[i, t, c] is True when position t of sequence i holds character index c
#   y[i, c]    is True when the character following sequence i has index c
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    # The target depends only on the sequence, so set it once per sequence
    # rather than once per character.
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences: 520609
Unique characters: 60
Vectorization...


## Set up the model

In [7]:
# Set up single-layer LSTM model for next-character prediction.
# The model is built from `tf.keras` so that it comes from the same Keras
# implementation as the `tf.keras` optimizer it is compiled with; mixing the
# standalone `keras` package with `tf.keras` objects can fail when the two
# installed versions do not match.
model = tf.keras.models.Sequential()
# Input: one window of `maxlen` one-hot encoded characters.
model.add(tf.keras.layers.LSTM(128, input_shape=(maxlen, len(chars))))
# Output: a softmax probability for every character in `chars`.
model.add(tf.keras.layers.Dense(len(chars), activation='softmax'))

ImportError: cannot import name 'tf2' from 'tensorflow.python' (unknown location)

In [None]:
# Training configuration: RMSprop at a 0.01 learning rate, minimising
# categorical cross-entropy between the softmax output and one-hot targets.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [None]:
def sample(preds, temperature=1.0):
    """Sample the index of the next character from the model's predictions.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from that distribution with
    NumPy's global random generator.

    Args:
        preds: 1-D array-like of probabilities, one per character.
        temperature: Values below 1 sharpen the distribution, values above 1
            flatten it. A value of 0 or less selects the most probable index
            directly (greedy decoding) instead of dividing by zero.

    Returns:
        The sampled index into ``preds`` as a Python ``int``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Limit of temperature -> 0: all mass goes to the most likely index.
        return int(np.argmax(preds))
    # log(0) is -inf, which correctly becomes probability 0 after exp();
    # silence the divide-by-zero warning it would otherwise raise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Subtract the maximum before exponentiating: at low temperatures every
    # exp() would otherwise underflow to 0 and the normalisation would be 0/0.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    # One multinomial trial yields a one-hot vector; argmax recovers its index.
    probas = np.random.multinomial(1, probs, 1)
    return int(np.argmax(probas))

## Fit the model
Note: the text turned to gibberish around Epoch 25, so I set training to stop early, after Epoch 22 (the loop below ends before Epoch 23).

In [None]:
# Text-generation loop
import random
import sys

# Train one epoch at a time so sample text can be generated between epochs.
for epoch in range(1, 23):  # Runs epochs 1 through 22 (the range end is exclusive)
    print(f'\nEpoch {epoch}:')
    model.fit(x, y, batch_size=128, epochs=1)  # Fits model for 1 pass over the data
    # Generate example text every third epoch while training:
    if epoch % 3 == 0:
        # Selects a text seed at random:
        start_index = random.randint(0, len(text) - maxlen - 1)
        seed_text = text[start_index: start_index + maxlen]
        print(f'---\nGenerating with seed:\n"{seed_text}"\n---')
        for temperature in [0.2, 0.5, 1.0]:  # Tries a range of sampling temperatures
            print(f'\n------ temperature: {temperature}\n')
            # Restart from the original seed for every temperature; otherwise
            # each run would continue from the previous temperature's output.
            generated_text = seed_text
            sys.stdout.write(generated_text)
            for _ in range(200):  # Generates 200 characters, starting from seed text
                # One-hot encodes the current window of `maxlen` characters:
                sampled = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(generated_text):
                    sampled[0, t, char_indices[char]] = 1.
                # Samples the next character from the predicted distribution
                preds = model.predict(sampled, verbose=0)[0]
                next_index = sample(preds, temperature)
                next_char = chars[next_index]
                # Slide the window: drop the oldest character, append the new one
                generated_text = generated_text[1:] + next_char
                sys.stdout.write(next_char)
            print()


Epoch 1

Epoch 2

Epoch 3

Epoch 4

Epoch 5

Epoch 6

Epoch 7

Epoch 8

Epoch 9

Epoch 10

Epoch 11

Epoch 12

Epoch 13

Epoch 14

Epoch 15

Epoch 16

Epoch 17

Epoch 18

Epoch 19

Epoch 20

Epoch 21

Epoch 22


## Generate text:

In [None]:
# Selects a text seed at random:
start_index = random.randint(0, len(text) - maxlen - 1)
seed_text = text[start_index: start_index + maxlen]
print(f'---\nGenerating with seed:\n"{seed_text}"\n---')
for temperature in [0.2, 0.5, 1.0]:  # Tries a range of different sampling temperatures
    print(f'\n------ temperature: {temperature}\n')
    # Restart from the original seed for every temperature; otherwise each
    # run would continue from the previous temperature's output.
    generated_text = seed_text
    sys.stdout.write(generated_text)
    for _ in range(400):  # Generates 400 characters, starting from seed text
        # One-hot encodes the current window of `maxlen` characters:
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1.
        # Samples the next character from the predicted distribution
        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]
        # Slide the window: drop the oldest character, append the new one
        generated_text = generated_text[1:] + next_char
        sys.stdout.write(next_char)
    print()

--- Generating with seed: "id. "the most powerful computational force 
known to parasci"

------ temperature: 0.5 

id. "the most powerful computational force 
known to parascion and most time," said the old brie into and 
would be into the crowd money of more of the planet of a contrance 
and the door of the planet of the mind can about the beamer 
direction to the angric from the ship, so only sure a big was 
about in the stare of things were contrations of the answer 
to have something suddenly to do anything and means to be the 
thing it was controlobinitely because


## Generate 20 samples:

In [None]:
# Generates 20 independent samples, each from its own random seed text.
for i in range(0, 20):
    print(f'\n== GENERATED TEXT #{i+1}: ==\n')
    # Selects a text seed at random:
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print(f'---\nGenerating with seed:\n"{generated_text}"\n---\n')
    temperature = 0.5
    sys.stdout.write(generated_text)
    # The inner counter is unused, so it gets a throwaway name instead of
    # reusing (and overwriting) the outer sample index `i`.
    for _ in range(400):  # Generates 400 characters, starting from seed text
        # One-hot encodes the current window of `maxlen` characters:
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1.
        # Samples the next character from the predicted distribution
        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]
        # Slide the window: drop the oldest character, append the new one
        generated_text = generated_text[1:] + next_char
        sys.stdout.write(next_char)
    print()


==GENERATED TEXT #1:

--- Generating with seed: " to be 
beautiful. am i?" 

"you're pretty direct, aren't yo"

------ temperature: 0.5 

 to be 
beautiful. am i?" 

"you're pretty direct, aren't you to erotion you stream, and 
then at the trees of the planet that was one of the strange 
words because the other of the people of the sort of weight 
many money and stream, there was back at the million thing and 
tried in the a man. 

"i have been to absoluther," said arthur leaded, "oh it was the 
still in front of things again when it wasn't world the computer 
captain, from people on the who

==GENERATED TEXT #2:

--- Generating with seed: "eel that very strongly." 



chapter 33 


the sun was shini"

------ temperature: 0.5 

eel that very strongly." 



chapter 33 


the sun was shining as the noise. 

"you don't know what your manding and and the planet in the floor 
that was after and with the sheed was as firsted at them to 
the stories of a stared light and silence because the 

  after removing the cwd from sys.path.


rotines of the speed to be a hitchhas) vhhg !quached 
then the ship, then then it was the night of instead. 

"when it's eventually to see the stores were the your perfectl

==GENERATED TEXT #5:

--- Generating with seed: "e in their right minds would want 
to buy a nowwhattian bogh"

------ temperature: 0.5 

e in their right minds would want 
to buy a nowwhattian boghon foot. 

"he could have something you now," said the robot. 

"well, "i don't know well," he said and streeked again. 

"what do you was any year?" 

arthur stole looked to read to see a set to people the big one 
one that was now what it was a sand from the million again. 

"there was anything to see the moment of the door," said known 
all to the back of the strange as it was capering the stre

==GENERATED TEXT #6:

--- Generating with seed: "lier air." 

"that's one future," said harl. "that's your fu"

------ temperature: 0.5 

lier air." 

"that's one future," said harl. "that's your full with the earth. 

"there 