In [1]:
#The goal is to develop a model that can generate new text that is similar in style
# to the input document


#Breaks the text down, character by character - uses memory of previous character to predict the next one

#Three main steps:
# 1) Prepare the data
# 2) Build the RNN model
# 3) Perform next-character prediction

#Step 1A) Preparing the data

import numpy as np

##reading and processing text

# Read the raw book text. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm for your copy.
with open('1268-0.txt', 'r', encoding='utf-8') as fp:
    text = fp.read()

# Locate the markers that delimit the actual book content
start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')

# str.find() returns -1 when the marker is missing; slicing with -1 would
# silently produce a wrong (or empty) text, so fail loudly instead.
if start_indx == -1 or end_indx == -1:
    raise ValueError(
        'Could not locate the start/end markers in 1268-0.txt '
        '(start_indx={}, end_indx={})'.format(start_indx, end_indx))

text = text[start_indx:end_indx]
char_set = set(text) #unique characters

print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112350
Unique Characters: 80


In [2]:
# Step 1B) Mapping characters to integers using a dictionary
# NN and RNN cannot work with strings so must convert characters to integers
# However, must reverse this process to get the results in text

#Create two different mappings - char2int (dict: character -> integer) and char_array (array: integer -> character)

# Sorted vocabulary: the position of a character in this list is its integer id
chars_sorted = sorted(char_set)
char2int = dict(zip(chars_sorted, range(len(chars_sorted)))) # character -> integer id

char_array = np.array(chars_sorted) # integer id -> character (reverse mapper)

# Encode every character of the text as its integer id (int32)
text_encoded = np.fromiter(
    (char2int[ch] for ch in text),
    dtype = np.int32,
    count = len(text))

print('Text encoded shape:', text_encoded.shape)

# Round-trip check: encode the first 15 characters, decode the following 6
print(text[:15], '== Encoding ==>', text_encoded[:15])
encoded_slice = text_encoded[15:21]
print(encoded_slice, '== Reverse ==>', ''.join(char_array[encoded_slice]))

Text encoded shape: (1112350,)
THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reverse ==> ISLAND


In [3]:
# Step 1C) Creating a TF dataset from the array, text_encoded

import tensorflow as tf

# One dataset element per encoded character
ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

# Show the first five elements together with the character they decode to
for ex in ds_text_encoded.take(5):
    char_id = ex.numpy()
    print('{} -> {}'.format(char_id, char_array[char_id]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [4]:
# Step 1D) Splitting the text into chunks using batch()

# The hyperparameter for sequence length will be set at 40 characters ("sweet spot")
# The input x and target y are offset by 1, so each chunk will be 41 characters.
# We will then apply a transformation using the map() method to separate the x and the y sequences

seq_length = 40
chunk_size = seq_length + 1 # one extra character so x and y can be offset by 1

# Group consecutive characters into chunks of chunk_size; a trailing
# incomplete chunk is dropped
ds_chunks = ds_text_encoded.batch(
    batch_size=chunk_size,
    drop_remainder=True)

##define the function for splitting x & y
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every
    element except the first.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
ds_sequences = ds_chunks.map(split_input_target)

#Example sequences from the transformed dataset

for input_seq, target_seq in ds_sequences.take(2):
    decoded_input = ''.join(char_array[input_seq.numpy()])
    decoded_target = ''.join(char_array[target_seq.numpy()])
    print(' Input (x): ', repr(decoded_input))
    print(' Target (y): ', repr(decoded_target))
    print()

 Input (x):  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
 Target (y):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'

 Input (x):  ' Anthony Matonak, and Trevor Carlson\n\n\n\n'
 Target (y):  'Anthony Matonak, and Trevor Carlson\n\n\n\n\n'



In [5]:
# Step 1E) Divide the dataset into mini-batches
#Shuffle the training examples and divide the inputs into mini-batches - each batch will contain
# multiple training examples - multiple sentences

BATCH_SIZE = 64     # number of (input, target) sequences per mini-batch
BUFFER_SIZE = 10000 # size of the shuffle buffer

# Shuffle the sequences, then group them into mini-batches
ds = ds_sequences.shuffle(buffer_size=BUFFER_SIZE).batch(batch_size=BATCH_SIZE)

In [6]:
# Step 2) Building a Character-Level RNN Model
# For reusability - we will build a function, build_model() that defines an RNN model
# using the Keras Sequential class

def build_model(vocab_size, embedding_dim, rnn_units):
    """Build the character-level RNN as a Keras Sequential model.

    Args:
        vocab_size: Number of distinct characters; used as the input size of
            the embedding and as the number of output units.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.

    Returns:
        An uncompiled tf.keras.Sequential model: Embedding -> LSTM -> Dense.
    """
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences=True: emit an output for every position of the input
    recurrent = tf.keras.layers.LSTM(rnn_units, return_sequences = True)
    # No activation on the output layer - the model returns raw logits
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

## Setting the training parameters
charset_size = len(char_array) # one output unit per distinct character
embedding_dim = 256
rnn_units = 512

# Seed TensorFlow before the model is created
tf.random.set_seed(1)
model = build_model(charset_size, embedding_dim, rnn_units)

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         20480     
_________________________________________________________________
lstm (LSTM)                  (None, None, 512)         1574912   
_________________________________________________________________
dense (Dense)                (None, None, 80)          41040     
Total params: 1,636,432
Trainable params: 1,636,432
Non-trainable params: 0
_________________________________________________________________


In [12]:
#Training the above model

# The model outputs raw logits (no softmax), hence from_logits=True
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)

model.compile(optimizer='adam', loss=loss_fn)

model.fit(ds, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7ff4d2c12990>

In [14]:
# Evaluating phase - generating new text passages

# Can use the softmax function to normalize the logit output into probabilities
# Need to randomly sample from the outputs though instead of just choosing the element with the maximum value

# An example is below:

tf.random.set_seed(1)

# Identical logits -> the three categories [0, 1, 2] are equally probable
logits = [[1.0, 1.0, 1.0]]

# Softmax turns the logits into probabilities (equal for every class here)
probabilities = tf.math.softmax(logits).numpy()[0]
print('Probabilities:', probabilities)

# Draw 10 category indices according to the logits
samples = tf.random.categorical(logits = logits, num_samples = 10)

tf.print(samples.numpy())

#Would expect with infinite sample size - occurrences would be 1/3 for each category

Probabilities: [0.33333334 0.33333334 0.33333334]
array([[0, 0, 1, 2, 0, 0, 0, 0, 1, 0]])


In [17]:
#Changing the logits to favor the third category

tf.random.set_seed(1)

# The third category (index 2) gets a larger logit than the other two
logits = [[1.0, 1.0, 3.0]]

# Expect more samples to be drawn from category 2
probabilities = tf.math.softmax(logits).numpy()[0]
print('Probabilities:', probabilities)

# Draw 10 category indices according to the logits
samples = tf.random.categorical(logits = logits, num_samples = 10)

tf.print(samples.numpy())

Probabilities: [0.10650698 0.10650698 0.78698605]
array([[2, 0, 2, 2, 2, 0, 1, 2, 2, 0]])


In [None]:
#Can use the tf.random.categorical() function to generate samples based on the logits from the model
# We can define a function, sample(), that:
    #1) Receives a short starting string, starting_str
    #2) Creates a new string, generated_str, initially set to starting_str
    #3) Takes a string of at most max_input_length characters from the end of generated_str and
    # encodes it to a sequence of integers, encoded_input
    #4) Passes encoded_input to the RNN to compute the logits
    #5) Uses tf.random.categorical() to sample a new character index for the last element of the output logits
    #6) Converts the sampled index to a character and appends it to generated_str
    #7) Repeats the above process until the string reaches the desired length
# Note that the output from the RNN is a sequence of logits with the same length as the input sequence
# since we specified return_sequences=True
# Each element in the output represents the logits (vector of size 80) for the next character after
# observing the input sequence by the model

In [None]:
# We will use the last element of the output logits (O^(t)), which is passed to tf.random.categorical()
# to generate a new sample.
# The new sample is converted to a character, which is then appended to the end of the generated string,
# generated_str, increasing its length by 1.
# The process is repeated, taking the last max_input_length characters from the end of
# generated_str and using them to generate a new character, until the string reaches the desired length
# This process is referred to as auto-regression

In [20]:
def sample(model, starting_str, 
          len_generated_text = 500,
          max_input_length = 40,
          scale_factor = 1.0):
    """Generate text character by character, seeded with starting_str.

    At every step the current input window is fed to the model, a character
    index is sampled from the (scaled) logits, and the sampled character is
    appended both to the result and to the input window.

    Args:
        model: Callable returning logits for a (1, seq_len) integer input;
            its output is squeezed along axis 0 before sampling.
        starting_str: Non-empty seed text. Every character must be a key of
            the global char2int mapping.
        len_generated_text: Number of characters to generate.
        max_input_length: Maximum number of trailing characters kept as
            model input after each step.
        scale_factor: Multiplier applied to the logits before sampling.
            Values below 1 flatten the distribution (more random output),
            values above 1 sharpen it (more predictable output).

    Returns:
        starting_str followed by len_generated_text sampled characters.

    Raises:
        ValueError: If starting_str is empty.
        KeyError: If starting_str contains a character missing from char2int.
    """
    if not starting_str:
        raise ValueError('starting_str must contain at least one character')

    encoded_input = [char2int[s] for s in starting_str]
    encoded_input = tf.reshape(encoded_input, (1, -1)) # shape (1, seq_len)
    
    generated_str = starting_str
    
    model.reset_states()
    for _ in range(len_generated_text):
        logits = model(encoded_input) # model with the new input
        logits = tf.squeeze(logits, 0) # drop the batch axis
        
        scaled_logits = logits * scale_factor
        new_char_indx = tf.random.categorical(
            scaled_logits, num_samples=1) # one sample per input position
        
        # Flatten with reshape rather than squeeze: squeeze would collapse a
        # one-character window to a scalar, and the [-1] below would fail.
        # Only the sample for the last position is kept.
        new_char_indx = tf.reshape(new_char_indx, [-1])[-1].numpy()
        
        generated_str += str(char_array[new_char_indx])
        
        # Append the sampled index to the input and keep only the most
        # recent max_input_length positions
        new_char_indx = tf.expand_dims([new_char_indx], 0)
        encoded_input = tf.concat(
            [encoded_input, new_char_indx],
            axis=1)
        
        encoded_input = encoded_input[:, -max_input_length:]
        
    return generated_str

In [21]:
#generating new text

# Seed TensorFlow's random generator before sampling
tf.random.set_seed(1)
generated_text = sample(model, starting_str='The Island')
print(generated_text)

The Island is not deceived. The tide were not certain? It had landed on, he must listen, mingled with the cart soil, not even seemed as if he would have been irrows and point of discovery.

All first necessary for several days to Port Balloon. They halted seen in some determined even top of a fine sension of fact? It appeared sail now to reach
the voyage
violence to roughly, without any sounday.

The settlers had not believe that the rocks for heard. We will celled, it was composed to sail in basadiars. B


In [8]:
#You can alter how the text is generated to make it less random and follow the 
# learned text patterns better
# This is controlled via the scaling factor, alpha, which multiplies the logits (sample() used 1.0 above):
# alpha < 1 makes the sampling more random (less predictable), alpha > 1 makes it more predictable

import numpy as np


logits = np.array([[1.0, 1.0, 3.0]])

# Each entry pairs the printed label with the logits to pass through softmax
scaled_variants = [
    ('Probabilities before scaling:      ', logits),
    ('Probabilities after scaling with 0.5:    ', logits * 0.5),
    ('Probabilities after scaling with 0.1:     ', logits * 0.1),
]

for label, variant in scaled_variants:
    print(label, tf.math.softmax(variant).numpy()[0])

#By scaling with <1 - the probabilities computed by softmax become more uniform

Probabilities before scaling:       [0.10650698 0.10650698 0.78698604]
Probabilities after scaling with 0.5:     [0.21194156 0.21194156 0.57611688]
Probabilities after scaling with 0.1:      [0.31042377 0.31042377 0.37915245]
