In [1]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

### Dataset
For this example, we only need one piece of training data. In fact, we can write our own poem/play and pass them to the network for training if we'd like. However, to make things easy, we'll use an extract from a shakespeare play


In [2]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


### Loading Your Own Data
To load your own data you'll need to upload a file from the dialog below. Then, you'll need to follow the steps from above but load in this new file instead

In [None]:
from google.colab import files
path_to_file = list(files.upload().keys())[0]

### Read Contents of File
Let's look at the contents of the file

In [3]:
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# Length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [4]:
# Take a look aht the first 250 characters in text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



### Encoding
Since this text isn't encoded yet, we'll need to do that ourself. We are going to encode each unique character as a different integer.

In [5]:
vocab = sorted(set(text))
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

def text_to_int(text):
  return np.array([char2idx[c] for c in text])

text_as_int = text_to_int(text)

In [6]:
# let's look at how part of our text is encoded
print("Text:", text[:13])
print("Encoded:", text_to_int(text[:13]))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


And here we will make a function that can convert our numeric values to text.

In [7]:
def int_to_text(ints):
  try:
    ints = ints.numpy()
  except:
    pass
  return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:13]))

First Citizen


### Creating Training Examples
Remember our task is to feed the model a sequence and have it return to us the next character. THis means we need to split our text data from above into many shorter sequences that we can pass to the model as training examples.
The training examples we will prepare will use a seq_length sequence as input and a seq_length sequence as the output where that sequence is the original sequence shifted one letter to the right. For example:
> input: Hell | output: ello

Our first step will be to create a stream of characters from our text data.

In [8]:
seq_length = 100 # length of sequence
example_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

Next, we can use the batch method to turn this stream of characters into batches of desired length.

In [9]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

Now we need to use these sequences of length 101 and split them into input and output

In [10]:
def split_input_target(chunk): # for the example: Hello
  input_text = chunk[:-1] # hell
  target_text = chunk[1:] # ello
  return input_text, target_text # hell, ello

dataset = sequences.map(split_input_target) # We use map to apply the above function to every entry

In [11]:
for x,y in dataset.take(2):
  print("\n\nEXAMPLE\n")
  print("INPUT")
  print(int_to_text(x))
  print("\nOUTPUT")
  print(int_to_text(y))



EXAMPLE

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


Finally, we need to make training batches.

In [12]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab) # vocab is number of unique characters
EMBEDDING_DIM = 256 # Embedding dimension
RNN_UNITS = 1024

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences.
# So it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

### Building the Model
Now, it is time to build the model. We will use an embedding layer a LSTM and one dense layer that contains a node for each unique character in our training data. The dense layer will give us a probability distribution over all nodes.

In [13]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                batch_input_shape=[batch_size, None]),
      tf.keras.layers.LSTM(rnn_units,
                           return_sequences=True,
                           stateful=True,
                           recurrent_initializer='glorot_uniform'),
      tf.keras.layers.Dense(vocab_size)
  ])
  return model

model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5330241 (20.33 MB)
Trainable params: 5330241 (20.33 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Creating a Loss Function
Now, we are actually going to create our own loss function for this problem. This is because our model will output a (64, sequence_length, 65) shaped tensor that represents the probability distribution of each character at each timestep for every sequence in the batch.


However, before we do that let's have a look at a sample input and the output from our untrained model. This is so we can understand what the model is actually giving us.

In [14]:
for input_example_batch, target_example_batch in data.take(1):
  example_batch_predictions = model(input_example_batch) # ask our model for a prediction on our first batch of training data
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)") # print out the output shape

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [15]:
# we can see that the prediction is an array of 64 arrays, one for each entry in the batch
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[ 1.24310679e-03 -1.38775771e-03  7.80540239e-03 ... -5.07937372e-03
   -2.30725214e-04  3.16768931e-03]
  [ 3.43893608e-03 -4.32379870e-03  5.31213637e-03 ... -5.38396975e-03
   -1.12600857e-03  2.63892487e-03]
  [-1.36967446e-03 -1.89426041e-03 -1.67591323e-03 ... -3.42722866e-03
    1.25501095e-03  1.35820045e-03]
  ...
  [-1.88882537e-02  1.02675278e-02  1.08020399e-02 ... -9.61457193e-03
   -2.32856983e-04  5.05560404e-03]
  [-2.01928206e-02  1.23879360e-02  1.08411992e-02 ... -3.10784951e-03
   -1.83413678e-03  1.00526237e-03]
  [-1.82020515e-02  1.24959527e-02  1.50073916e-02 ... -5.21414075e-03
   -1.14089798e-03  1.66952447e-03]]

 [[ 2.53625354e-03 -3.39479838e-03 -5.32922160e-04 ... -1.42911170e-03
    2.08990343e-04  1.01084192e-03]
  [ 4.96663526e-03 -2.33011879e-03  2.52894568e-03 ...  2.47258181e-03
    4.21032170e-03  1.18804863e-04]
  [ 1.59818504e-03 -3.21005005e-03 -8.94901808e-04 ...  9.15822200e-03
    3.37169529e-03  1.03145430e-03]
  ...
  [-2.781

In [16]:
# Lets examine one preidction
pred = example_batch_predictions[0]
print(len(pred))
print(pred)
# Notice this is a 2d array of length 100, where each interior array is the prediction for the next character at each time step

100
tf.Tensor(
[[ 0.00124311 -0.00138776  0.0078054  ... -0.00507937 -0.00023073
   0.00316769]
 [ 0.00343894 -0.0043238   0.00531214 ... -0.00538397 -0.00112601
   0.00263892]
 [-0.00136967 -0.00189426 -0.00167591 ... -0.00342723  0.00125501
   0.0013582 ]
 ...
 [-0.01888825  0.01026753  0.01080204 ... -0.00961457 -0.00023286
   0.0050556 ]
 [-0.02019282  0.01238794  0.0108412  ... -0.00310785 -0.00183414
   0.00100526]
 [-0.01820205  0.01249595  0.01500739 ... -0.00521414 -0.0011409
   0.00166952]], shape=(100, 65), dtype=float32)


In [17]:
# and finally, we'll look at a prediction at the first time step
time_pred = pred[0]
print(len(time_pred))
print(time_pred)
# and of course, its 65 values representing the probability of each character occuring next

65
tf.Tensor(
[ 1.2431068e-03 -1.3877577e-03  7.8054024e-03  1.7059306e-03
  2.2184555e-03  1.8448047e-03 -3.2172566e-03 -4.9815448e-03
 -1.2645014e-03  7.4485540e-03 -2.0052339e-03  1.9681163e-03
  3.5501525e-03 -3.1083501e-03  6.6014356e-05 -1.6745169e-03
 -4.3036276e-04 -1.4305249e-03 -1.1192957e-03 -1.2423301e-03
 -1.6009051e-03 -3.9800727e-03 -8.1567082e-04 -4.0770480e-03
  2.2465423e-05  1.5362388e-03 -2.1774821e-04  1.9189571e-03
 -1.2404469e-03  2.6046536e-03  1.7554802e-03  4.3156156e-03
  4.4477230e-05 -1.6377487e-03 -5.4761600e-03  2.3041535e-03
 -1.0817337e-03 -3.2985792e-04 -3.4538247e-03  2.0026586e-03
  9.2095369e-04  4.0474272e-04 -1.0759853e-03  5.3865230e-03
 -5.2514439e-04 -4.2396132e-06  4.8871781e-04 -3.6668645e-03
  3.7289164e-03  3.8604781e-03 -7.7950573e-03 -2.2898263e-03
 -3.8184682e-03  6.3271909e-03 -4.2641275e-03 -6.9144234e-04
 -2.8268935e-03  1.8944460e-04  3.3309371e-03  1.3953017e-03
 -4.2734900e-04 -4.1541073e-04 -5.0793737e-03 -2.3072521e-04
  3.167689

In [18]:
# if we want to determine the predicted character, we need to sample the output distribution (pick a bvalue based on probability)
sampled_indices = tf.random.categorical(pred, num_samples=1)

# now, we can reshape that array and convert all the integers to numbers to see the actual characters
sampled_indices = np.reshape(sampled_indices, (1, -1))[0]
predicted_chars = int_to_text(sampled_indices)

predicted_chars # and this is what the model predicted for training sequence 1

"GClQvqp3ErV!FowXU?\nkzoYMbNXfXq!dC,iOSniN'AU;mJDY?-fHMztder\nm;zqvJAV&aTyJ?wuE& o,M3d:manveL!Sr3XrDD,!"

So now, we need to create a loss function that can compare that output to the expected output and give us some numeric value representing how close the two were.

In [19]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

### Compiling the Model
At this point, we can think of our problem as a classification problem where the model predicts the probability of each unique letter coming next.

In [20]:
model.compile(optimizer='adam', loss=loss)

### Creating Checkpoints
Now, we are going to setup and configure our model to save checkpoints as it trains. This will allow us to load our model from a checkpoint and continue training it.

In [21]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only=True
)

### Training
Finally, we will start training the model.
If this is taking a while, go to change Runtime type and choose 'GPU' under the hardware accelerator

In [22]:
history = model.fit(data, epochs=2, callbacks=[checkpoint_callback])
# For this kind of model, the more the epoch, the more the accuracy because what we want to do is to learn how the language works and to replicate that but we use 2 because of time

Epoch 1/2
Epoch 2/2


Loading the Model
We'll rebuild the model from a checkpoint using a batch_size of 1 so that we can feed one peice of text to the model and have it make a prediction.

In [23]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

Once the model is finished training, we can find the latest checkpoint that stores the models weights using the following line

In [29]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

We can load any checkpoint we want by specifying teh exact file to load.

In [31]:
checkpoint_num = 1
model.load_weights(tf.train.load_checkpoint("./training_checkpoints/ckpt_" + str(checkpoint_num)))
model.build(tf.TensorShape([1, None]))

AttributeError: 'tensorflow.python.util._pywrap_checkpoint_reader.C' object has no attribute 'endswith'

### Generating Text
Now, we can use the lovely function provided by tensorflow to generate some text using any starting string we'd like

In [35]:
def generate_text(model, start_string):
  # Evaluate step (generating text using the learned model)

  # Number of characters to gnerate
  num_generate = 800

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  # Turning our input_eval into a double list , i.e [[]]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperature results in more predictable text.
  # Higher temperature results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size = 1
  model.reset_states()
  for i in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension

    # This makes our predictions a simple list and removes the extra dimension
    predictions = tf.squeeze(predictions, 0)

    # Using a categorical distrivution to predict the character returned by the model
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [36]:
inp = input("Type a starting string: ")
print(generate_text(model, inp))

Type a starting string: Moses
Moses the hand;
Mewarce you kned jower so save the ginse to breathter,
Peivit as you.
The kned quemere Kath,
Mesboves fat a marmeed, as go-thy fair with but spracesing
Therefied's tootherfs, and in of it your visted,
The kinstul mo lobder; and low'd my; we
leave onen,
His dint him forrather of gove unt and fromnears.
Sputice.

LOD ANGES:
Yot son, you, both chrewennch on sour ag soched
O an show know gony I sugenfuls
Hadhing aplands ow brading.

QUEEN MARGURE:
I wire Betararge, for you peach you are me: me tore!

CUMIRGUS:
Which I thy unfin them our fatcerpan than,
that's givion. O?
How and comes of my edervantes.
But digrsty thant: no most hel haspented to be mads
Ke twill he ofterty whan's, shan's exsent? which cosedn to bean disteror,
What, you.
Mestrow you con ut friending.
Speak I ame is wh
