# Generating Shakespeare
The following is the building of a character-based model that learns English on its own

In [12]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time

In [45]:
tf.config.list_physical_devices()


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [13]:
# Fetch the Shakespeare corpus and keep the local path it was saved to.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [14]:
path_to_file

'/home/serialguitarist/.keras/datasets/shakespeare.txt'

In [15]:
# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of the text as the number of characters
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [16]:
# first 250 characters
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [17]:
# number of unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

65 unique characters


In [18]:
vocab

['\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

## Preprocessing
### Vectorizing
The `preprocessing.StringLookup` layer can convert each character into a numeric ID, but it needs the text to be split into tokens first:

In [19]:
example_texts = ['abcdefg', 'xyz', '\n !$']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z'], [b'\n', b' ', b'!', b'$']]>

In [20]:
# the StringLookup layer
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab),
    mask_token=None
)

In [21]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65], [1, 2, 3, 4]]>

To recover human readable text, use `preprocessing.StringLookup(..., invert=True)`

In [22]:
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None
)

In [23]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z'], [b'\n', b' ', b'!', b'$']]>

`tf.strings.reduce_join` can be used to join the characters back into strings

In [24]:
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz', b'\n !$'], dtype=object)

In [25]:
def text_from_ids(ids):
    """Turn character IDs back into text.

    Looks each ID up with `chars_from_ids`, then joins the resulting
    characters along the last axis into a single string tensor.
    """
    decoded_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(decoded_chars, axis=-1)

## Prediction
We're training the model to predict, given a sequence of text, what's the most likely next character

The text needs to be divided into example sequences. For each input sequence, the target is the same sequence, but shifted one character to the right from the text. So we're breaking the text into chunks of `seq_length+1`

`tf.data.Dataset.from_tensor_slices` can convert the text vector into a stream of character indices

In [26]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>

In [27]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [28]:
# Peek at the first ten elements of the ID stream, decoded back to characters.
# The loop variable is deliberately not called `ids`: that name already holds
# the example ID tensor built earlier, and rebinding it here would change the
# result of re-running the cells that use it.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

F
i
r
s
t
 
C
i
t
i


In [29]:
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

The `batch` method converts these individual characters into sequences of the desired length

In [30]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
    print(chars_from_ids(seq))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


To join them back into strings:

In [31]:
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


A function that takes said sequences, and shifts and creates the input, label pairs:

In [32]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return sequence[:-1], sequence[1:]

In [33]:
split_input_target('Factorio')

('Factori', 'actorio')

In [34]:
dataset = sequences.map(split_input_target)

In [35]:
for X, y in dataset.take(1):
    print('Input: ', text_from_ids(X).numpy())
    print('Target: ', text_from_ids(y).numpy())

Input:  b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target:  b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


## Training Batches

In [36]:
# batch size
BATCH_SIZE = 64

# buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory.
# Instead, it maintains a buffer in which it shuffles elements)
BUFFER_SIZE = 10000

# Build the training pipeline directly from `sequences` instead of rebinding
# `dataset` in terms of itself, so re-running this cell yields the same
# pipeline rather than shuffling and batching an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [37]:
# Inspect the input IDs of one training batch.
for X, y in dataset.take(1):
    print(X)

tf.Tensor(
[[15 60 59 ... 59 54  2]
 [62  2 53 ... 14 31 17]
 [ 2 45 54 ...  2 62 44]
 ...
 [36 48 59 ... 50  2 40]
 [ 2 45 40 ...  7  2 58]
 [44  2 57 ... 64  2 47]], shape=(64, 100), dtype=int64)


## Building the Model
The GRU layer can be replaced by an LSTM

In [38]:
len(ids_from_chars.get_vocabulary())

66

In [40]:
len(vocab)

65

In [47]:
# Size of the model's vocabulary. This is taken from the `StringLookup`
# layer rather than `len(vocab)` because the layer adds an "[UNK]" token,
# so the IDs it emits span one more value than the raw character set.
vocab_size = len(ids_from_chars.get_vocabulary())

# embedding dimension
embedding_dim = 256

# number of RNN units
rnn_units = 1024

The internals of the model are managed explicitly in a `tf.keras.Model` subclass to make it much simpler to generate text later on

However, `keras.Sequential` can also be used for training

In [48]:
class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    The final Dense layer has no activation, so the model outputs one logit
    per vocabulary entry for every position of the input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the layers.

        Args:
            vocab_size: number of distinct token IDs; sizes both the
                embedding table and the output layer.
            embedding_dim: width of each character embedding.
            rnn_units: number of units in the GRU layer.
        """
        # `super()` already binds the instance, so no argument is passed here;
        # `super().__init__(self)` would hand `self` to Keras as an extra
        # positional argument.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of token IDs.

        Args:
            inputs: batch of token-ID sequences.
            states: GRU state to start from; when None, the GRU's own
                initial state is used.
            return_state: when True, also return the final GRU state so the
                caller can feed it back in on the next call.
            training: forwarded to every layer.

        Returns:
            The logits, or a `(logits, states)` pair when `return_state` is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

In [49]:
model = MyModel(
    # vocab size must match the `StringLookup` layers
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units
)

## Running the Model
First, check the shape of the output

In [50]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 66) # (batch_size, sequence_length, vocab_size)


In [51]:
model.summary()

Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  16896     
_________________________________________________________________
gru_1 (GRU)                  multiple                  3938304   
_________________________________________________________________
dense_1 (Dense)              multiple                  67650     
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


### Getting Actual Predictions
To get actual character indices, we need to sample from the output distribution

In [52]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([20, 64, 15,  9, 61, 62, 43, 50, 20,  7,  2, 42, 35, 64, 44,  8, 54,
       31, 21, 34,  0, 54, 35, 45, 63, 33, 57, 35, 43, 23, 45, 47, 65, 34,
       27, 25, 49, 17, 25, 34,  8, 17, 10, 63, 32, 18, 44, 36, 41, 40, 48,
       14, 40, 43, 57, 34, 34, 64, 11, 37,  8, 42, 60, 20, 31,  6, 27, 27,
       19, 55, 37, 21, 15, 31, 17, 51, 36, 19, 57, 22, 37,  1, 11, 59, 37,
       33, 18, 41, 17, 60, 34, 61, 62, 13, 24, 51, 16, 15, 58, 48])

In [53]:
# decoding the indices
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next char predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'rother; and then the prince my brother and\nthe princess my sister called my father father; and\nso we'

Next char predictions:
 b"GyB.vwdkG, cVye-oRHU[UNK]oVfxTrVdJfhzUNLjDLU-D3xSEeWbaiAadrUUy:X-cuGR'NNFpXHBRDlWFrIX\n:tXTEbDuUvw?KlCBsi"


## Training
Now it's a standard classification problem: Given the previous RNN state, and the input this time step, predict the class of the next character

### Optimizer and a Loss function
The standard `tf.keras.losses.sparse_categorical_crossentropy` loss works here because it is applied across the last dimension of the predictions

Because the model returns logits, `from_logits` flag needs to be true

In [55]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [57]:
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print('Prediction Shape: ', example_batch_predictions.shape, '# (batch_size, sequence_length, vocab_size)')
print('Mean loss:        ', mean_loss)

Prediction Shape:  (64, 100, 66) # (batch_size, sequence_length, vocab_size)
Mean loss:         4.188983


A newly initialized model should be unsure of itself, giving every character a more or less similar score. To make sure of this, we can check that the exponential of the mean loss (e raised to the power of the loss) is approximately equal to the vocabulary size. A much higher value means the model is sure of its wrong answers, and is badly initialized

In [58]:
tf.exp(mean_loss).numpy()

65.95567

In [59]:
model.compile(optimizer='adam', loss=loss)

### Checkpoints
`tf.keras.callbacks.ModelCheckpoint` ensures checkpoints are saved during training

In [61]:
# directory where the checkpoints will be saved
checkpoint_dir = './models/training_checkpoints'

# name of the checkpoint file
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# callback that writes weights-only checkpoints to the path above
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

### Training itself

In [78]:
EPOCHS = 30

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Text Generation
The easiest method is to run it in a loop, and keep track of the internal state as you do it

In [79]:
class OneStep(tf.keras.Model):
    """Wraps a trained model so it can sample one next character per call."""

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the model and lookup layers and precompute the "[UNK]" mask.

        Args:
            model: model called with `inputs`, `states` and `return_state=True`,
                returning `(logits, states)`.
            chars_from_ids: lookup layer mapping token IDs to characters.
            ids_from_chars: lookup layer mapping characters to token IDs.
            temperature: value the logits are divided by before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Additive mask over the vocabulary: -inf at the "[UNK]" index and
        # zero elsewhere, so "[UNK]" is never sampled.
        vocab_len = len(ids_from_chars.get_vocabulary())
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        neg_inf = [-float('inf')]*len(unk_ids)
        unk_mask = tf.SparseTensor(
            indices=unk_ids,
            values=neg_inf,
            dense_shape=[vocab_len])
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: string tensor holding the text generated so far.
            states: model state from the previous call, or None to start fresh.

        Returns:
            A `(predicted_chars, states)` pair: the sampled characters and the
            model state to pass to the next call.
        """
        # Strings -> characters -> token IDs (dense tensor).
        split_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        token_ids = self.ids_from_chars(split_chars).to_tensor()

        # logits.shape is [batch, char, next_char_logits]
        logits, states = self.model(inputs=token_ids, states=states,
                                    return_state=True)

        # Keep only the last position, apply the temperature, then mask "[UNK]".
        last_logits = logits[:, -1, :]
        scaled_logits = last_logits/self.temperature
        masked_logits = scaled_logits + self.prediction_mask

        # Draw one token ID per batch row from the masked logits.
        sampled = tf.random.categorical(masked_logits, num_samples=1)
        next_ids = tf.squeeze(sampled, axis=-1)

        # Token IDs -> characters, returned together with the new state.
        return self.chars_from_ids(next_ids), states

In [80]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

Now we just run it in a loop to generate text

In [81]:
# Seed the generator with a prompt and sample 1000 characters one at a time,
# feeding each sampled character and the returned state back in.
start = time.time()
next_char = tf.constant(['ROMEO:'])
states = None
result = [next_char]

for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
generated_text = result[0].numpy().decode('utf-8')
print(generated_text, '\n\n' + '_'*80)
print('\nRun time:', end - start)

ROMEO:
By rote of him, the time is very neither.

COMINIUS:
O, teephess; gast thou, the news is spraw fond corks?

MENENIUS:
Come, contrad, to the garden for the mind hate the word:
Go to the maid; my loving Montague,
When now, to assive from hence hath sorrow
Or eye, with known too light and princely gaze;
Show 'tis shed post: he shall renown lady. But, as I said.

Second Citizen:
Construe till a Richard kill'd him;
I call'd thee thus partly to bed.
Sir, that he wounded hence,
For Warwick and what wit innocent mine,
With repetition of Hortensio.

ROMEO:
Take honour on him.

First Servingman:
Nay, come to me, Tyrrel, soon do your bless
Unto my nothing. Women are miseries!
O, well she's yet; the doint of York as bonn
gently conscience, and tell them at are already, and he
is even: while the term of thine.

POLIXENES:
I have let fitting for his wander: she
welcome home: he would not say how true--
Widow, tell me where setting a week. We have ask,
When true gant tyrannous lambs that treas

## Exporting the generator

In [82]:
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')






FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets


In [83]:
# Run the same sampling loop against the reloaded SavedModel for 100 characters.
next_char = tf.constant(['ROMEO:'])
states = None
result = [next_char]

for n in range(100):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)

reloaded_text = tf.strings.join(result)[0].numpy().decode("utf-8")
print(reloaded_text)


ROMEO:
How now, fair sire! Now the greater place of star,
Having no other abed, and cannot woe wangs draw-
