## Text to text generation using RNN model

This model takes a text file and tokenizes it at the character level. These tokens are fed to our RNN model, which makes the model context-aware. This helps us generate text when a prompt is given to it.

In [None]:
# Colab-specific line magic; the cell output below reports that it has no effect.
# NOTE(review): presumably unavailable on a plain Jupyter kernel — confirm before running outside Colab.
%tensorflow_version 2.x

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
import tensorflow as tf
import os
import numpy as np

In [None]:
from tensorflow.keras.preprocessing import sequence


The text file below contains a Shakespeare play, which is the corpus I am going to work with.

In [None]:
# Download the Shakespeare corpus (or reuse the cached copy) and keep its local path.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [None]:
# Read the whole corpus as bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the read finishes instead of leaving it open.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding = 'utf-8')

In [None]:
# Total number of characters in the decoded corpus.
len(text)

1115394

In [None]:
# Preview the first 50 characters of the corpus.
text[:50]

'First Citizen:\nBefore we proceed any further, hear'

The number of characters in our play is 1115394.

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()

In [None]:
# length of our vocabulary (number of distinct characters)
len(vocab)

65

In [None]:
# Lookup table from each character to its integer id; ids are non-negative
# and follow the (sorted) order of the vocabulary.
char_to_idx = dict(zip(vocab, range(len(vocab))))
char_to_idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [None]:
# The reverse mapping (index -> character) is simply the vocabulary as a
# NumPy array: indexing it with an integer id yields that id's character.
idx_to_char = np.array(vocab)
idx_to_char

array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [None]:
def encoder(sentence):
  """Convert a string into a NumPy array of integer character ids.

  Args:
    sentence: iterable of characters; every character must be a key of
      `char_to_idx`.

  Returns:
    1-D integer NumPy array with one id per character of `sentence`.

  Raises:
    KeyError: if a character is not present in `char_to_idx`.
  """
  # dtype=int keeps the platform's default integer type for non-empty input and
  # stops NumPy from falling back to float64 when `sentence` is empty.
  return np.array([char_to_idx[ch] for ch in sentence], dtype = int)

def decoder(arr):
  """Convert integer character ids back into a string.

  Args:
    arr: ids to decode. If the object exposes a `.numpy()` method, it is
      converted with it first; anything else is used as-is to index
      `idx_to_char`.

  Returns:
    The characters selected from `idx_to_char`, joined into one string.
  """
  try:
    arr = arr.numpy()
  except AttributeError:
    # Objects without a `.numpy()` method (lists, NumPy arrays, scalars) are
    # used directly. Only this case is ignored so real errors still surface.
    pass
  return ''.join(idx_to_char[arr])

In [None]:
# Sanity check: encode a sample string into its integer ids.
encoder("Oh Juliet!")

array([27, 46,  1, 22, 59, 50, 47, 43, 58,  2])

In [None]:
# Round trip: decoding the ids produced above gives back the original string.
decoder([27, 46,  1, 22, 59, 50, 47, 43, 58,  2])

'Oh Juliet!'

Each training example pairs an input sequence with a target sequence that is the same text shifted forward by one character.

The input example is bounded by vertical bars: |fdslakfjdsl|k jfasdlk fj;adslkjf ....
The output example is bounded by vertical bars: f|dslakfjdslk| jfasdlk fj;adslkjf ....
Input: fdslakfjdsl, then output: dslakfjdslk


In [None]:
seq_length = 200 # length of one input sequence, in characters
# Number of full (seq_length + 1)-character chunks that fit in the corpus; the
# extra character is needed because the target is the input shifted by one.
examples_per_epoch = len(text) // (seq_length + 1)

In [None]:
# Encode the entire corpus into an array of integer character ids.
encoded_text = encoder(text)

In [None]:
# Dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [None]:
# Group the character stream into chunks of seq_length + 1 ids, dropping the
# final incomplete chunk.
# NOTE(review): this rebinds `sequence`, shadowing the module imported from
# tensorflow.keras.preprocessing earlier in the notebook.
sequence = char_dataset.batch(seq_length + 1, drop_remainder = True )

In [None]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  The input is the chunk without its last element and the target is the chunk
  without its first element, so target[i] is the element that follows input[i].
  """
  return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequence.map(split_input_target)

In [None]:
BATCH_SIZE = 32            # sequences per training batch
VOCAB_SIZE = len(vocab)    # number of distinct characters
EMBEDDING_DIM = 256        # passed to build_model as the embedding dimension
RNN_UNITS = 2048           # passed to build_model as the number of LSTM units

BUFFER_SIZE = 10000        # shuffle buffer size, in (input, target) pairs

# Shuffle the pairs and group them into fixed-size batches; the last incomplete
# batch is dropped so every batch has exactly BATCH_SIZE sequences.
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)

Building the model using an LSTM (Long Short-Term Memory) layer

In [None]:
def build_model(vocab_size, embedding_dim ,rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: size of the embedding's input vocabulary and of the Dense
      output layer.
    embedding_dim: output dimension of the embedding layer.
    rnn_units: number of units in the LSTM layer.
    batch_size: fixed batch size placed in the embedding's batch_input_shape;
      the sequence length is left as None.

  Returns:
    An uncompiled tf.keras.Sequential model.
  """
  embedding_layer = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape = [batch_size, None],
  )
  # return_sequences=True produces an output for every timestep.
  lstm_layer = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences = True,
      stateful = True,
      recurrent_initializer = 'glorot_uniform',
  )
  output_layer = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding_layer, lstm_layer, output_layer])

# Build the training model with the training batch size and print its layers.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (32, None, 256)           16640     
                                                                 
 lstm_2 (LSTM)               (32, None, 2048)          18882560  
                                                                 
 dense_2 (Dense)             (32, None, 65)            133185    
                                                                 
Total params: 19032385 (72.60 MB)
Trainable params: 19032385 (72.60 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


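Loss Function
Desc: Before defining the loss, run one batch through the untrained model and inspect the shape and contents of its predictions.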

In [None]:
# Run a single batch through the untrained model and report the prediction
# shape. The loop variables remain defined afterwards and are reused below.
for input_example_batch, target_example_batch in data.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(32, 200, 65) # (batch_size, sequence_length, vocab_size)


In [None]:
# len() of the prediction tensor is its first dimension (the batch size);
# the second print shows the raw, untrained prediction values.
print("length of example_batch_predictions",len(example_batch_predictions), "\n")
print(example_batch_predictions)

length of example_batch_predictions 32 

tf.Tensor(
[[[ 2.7246634e-03  1.1690214e-03 -1.5965368e-03 ... -2.0077222e-03
    3.6962546e-04  3.3993216e-04]
  [ 1.5784656e-03 -2.2637506e-03 -1.5222246e-04 ... -1.4191504e-03
   -1.2149583e-03 -3.1869390e-04]
  [ 2.8345603e-03 -5.2033388e-03 -2.3174698e-03 ... -6.4780549e-03
   -5.3607593e-03  6.2577655e-03]
  ...
  [ 1.4481845e-03  4.8759533e-03 -7.3092151e-03 ...  1.9128630e-03
   -2.6438916e-03  4.8190202e-03]
  [ 2.8555584e-03 -1.5968952e-03 -8.7337010e-03 ... -4.3838006e-03
   -7.0376880e-03  1.0341837e-02]
  [ 3.5035997e-03 -2.2408848e-03 -6.9076419e-03 ... -6.8659405e-03
   -6.3328124e-03  1.0142503e-02]]

 [[ 1.7843676e-03 -7.7717390e-04  8.3439099e-03 ... -6.2653527e-04
   -3.1500403e-03  4.5896473e-04]
  [-8.4852567e-05  1.6541654e-03  5.0980095e-03 ... -5.6973461e-04
   -4.9515618e-03 -8.6011225e-04]
  [-6.3149852e-04 -5.5379770e-04  4.4541005e-03 ... -4.1442085e-04
   -4.8808600e-03 -7.0883322e-04]
  ...
  [ 6.7332471e-03  1.3161

Creating a custom loss function to perform our optimization

In [None]:
# Predictions for the first sequence of the batch: one row per timestep.
pred = example_batch_predictions[0]
print(len(pred))
print(pred)

200
tf.Tensor(
[[ 0.00272466  0.00116902 -0.00159654 ... -0.00200772  0.00036963
   0.00033993]
 [ 0.00157847 -0.00226375 -0.00015222 ... -0.00141915 -0.00121496
  -0.00031869]
 [ 0.00283456 -0.00520334 -0.00231747 ... -0.00647805 -0.00536076
   0.00625777]
 ...
 [ 0.00144818  0.00487595 -0.00730922 ...  0.00191286 -0.00264389
   0.00481902]
 [ 0.00285556 -0.0015969  -0.0087337  ... -0.0043838  -0.00703769
   0.01034184]
 [ 0.0035036  -0.00224088 -0.00690764 ... -0.00686594 -0.00633281
   0.0101425 ]], shape=(200, 65), dtype=float32)


In [None]:
# Prediction at the first timestep: one value per vocabulary entry.
time_pred = pred[0]
print(len(time_pred))
print(time_pred)

65
tf.Tensor(
[ 2.7246634e-03  1.1690214e-03 -1.5965368e-03 -1.7640952e-03
 -5.9527368e-04  1.1023791e-03 -1.4383618e-03  2.0313880e-03
 -2.8294602e-03  3.1035452e-03 -4.1371291e-03  1.7659286e-03
 -5.9656601e-04 -2.2176290e-03  3.7069845e-03 -7.8749261e-04
  8.6131843e-04 -4.2455550e-04 -1.4757439e-03 -5.8861001e-04
 -3.9184517e-03  3.1359824e-03 -1.6766804e-03 -6.5812643e-04
  7.0843118e-05 -1.3016215e-04  1.1619422e-04  6.7235163e-04
 -1.0893401e-03  1.7666025e-06  5.0595973e-04  3.2460345e-03
  3.1148572e-04 -8.2633906e-04 -3.3998252e-03  3.3215834e-03
  2.2598309e-03 -1.7614985e-03  4.4242060e-04  3.3725540e-03
  1.4756802e-03  1.6007364e-03  2.2256949e-03 -4.1714702e-03
 -5.6756556e-04 -1.8880441e-03  1.1056193e-04  1.4885502e-03
  8.1109419e-04 -1.5448087e-03  2.3874517e-03  2.6266424e-03
  6.2636170e-03 -1.4396058e-03 -3.4890294e-03 -3.0538521e-03
  3.9501791e-03 -5.2246096e-04  4.5357074e-04  1.6164255e-03
 -3.7670250e-03 -3.8552284e-04 -2.0077222e-03  3.6962546e-04
  3.399321

In [None]:
# Treat each timestep's row as logits and draw one character id from it.
sampled_indeces = tf.random.categorical(pred, num_samples = 1)
sampled_indeces.shape

TensorShape([200, 1])

In [None]:
# Show the sampled ids (one column, one row per timestep).
sampled_indeces

<tf.Tensor: shape=(200, 1), dtype=int64, numpy=
array([[12],
       [18],
       [50],
       [28],
       [20],
       [62],
       [23],
       [50],
       [16],
       [45],
       [59],
       [37],
       [25],
       [40],
       [ 0],
       [58],
       [33],
       [ 8],
       [ 6],
       [51],
       [40],
       [57],
       [25],
       [10],
       [52],
       [ 5],
       [47],
       [18],
       [26],
       [26],
       [25],
       [62],
       [39],
       [51],
       [42],
       [ 5],
       [31],
       [21],
       [ 8],
       [ 0],
       [49],
       [60],
       [52],
       [40],
       [62],
       [38],
       [40],
       [ 7],
       [ 8],
       [ 1],
       [21],
       [25],
       [25],
       [49],
       [39],
       [29],
       [59],
       [60],
       [42],
       [44],
       [ 7],
       [51],
       [61],
       [33],
       [30],
       [57],
       [25],
       [31],
       [43],
       [31],
       [46],
       [ 9],
       [34],
   

In [None]:
# Flatten the single-column samples into a 1-D NumPy array of character ids.
sampled_indeces = np.reshape(sampled_indeces, -1)
sampled_indeces

array([12, 18, 50, 28, 20, 62, 23, 50, 16, 45, 59, 37, 25, 40,  0, 58, 33,
        8,  6, 51, 40, 57, 25, 10, 52,  5, 47, 18, 26, 26, 25, 62, 39, 51,
       42,  5, 31, 21,  8,  0, 49, 60, 52, 40, 62, 38, 40,  7,  8,  1, 21,
       25, 25, 49, 39, 29, 59, 60, 42, 44,  7, 51, 61, 33, 30, 57, 25, 31,
       43, 31, 46,  9, 34, 47, 46, 20, 41, 31, 39, 29, 40,  1, 35, 57, 36,
       49,  5,  2, 56,  2, 27, 18, 33, 43,  7, 35,  7, 60, 46, 17, 58, 52,
       12, 37, 60,  5, 47, 58, 63, 12, 48, 46, 16, 34, 42,  8, 41, 39, 34,
        4, 50, 64, 16, 46,  3, 26, 50, 48, 56, 37, 62,  8, 42,  4, 28, 41,
       42, 47, 35, 15, 18, 53, 12, 50, 16, 37,  3, 17, 13, 60, 39, 56, 12,
       26,  7, 63, 13, 10, 63,  1, 60, 25, 43, 50, 20, 32, 47, 33,  2, 62,
       30, 37, 30, 28, 55,  8,  5, 64, 34, 43,  8, 27, 23, 41,  9,  8, 50,
       57, 28, 57, 51, 41, 47, 30, 17, 15, 39, 53, 10,  5])

In [None]:
# Decode the sampled ids; the model is untrained here, so the text is noise.
predicted_chars = decoder(sampled_indeces)
predicted_chars

"?FlPHxKlDguYMb\ntU.,mbsM:n'iFNNMxamd'SI.\nkvnbxZb-. IMMkaQuvdf-mwURsMSeSh3VihHcSaQb WsXk'!r!OFUe-W-vhEtn?Yv'ity?jhDVd.caV&lzDh$NljrYx.d&PcdiWCFo?lDY$EAvar?N-yA:y vMelHTiU!xRYRPq.'zVe.OKc3.lsPsmciRECao:'"

In [None]:
def custom_loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  Args:
    labels: integer class ids (the target characters).
    logits: unnormalised model outputs; from_logits=True tells the loss not
      to expect probabilities.

  Returns:
    The value produced by tf.keras.losses.SparseCategoricalCrossentropy.
  """
  return tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)(labels, logits)


### Model Compilation

In [None]:
# Configure training: Adam optimizer with the logits-based loss defined above.
model.compile(optimizer = 'adam', loss = custom_loss)

In [None]:
# Directory where the training checkpoints are written.
checkpoint_dr = './training_checkpoints'

# Checkpoint path template; `{epoch}` is left as a placeholder in the path.
checkpoint_prefix = os.path.join(checkpoint_dr, 'ckpt_{epoch}')

# Callback passed to model.fit; it stores only the weights, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)

In [None]:
# Train for 4 epochs with the checkpoint callback attached.
history = model.fit(data, epochs = 4 ,callbacks = [checkpoint_callback])

In [None]:
# Rebuild the same architecture with batch size 1 for text generation, since
# build_model fixes the batch size in the input shape.
# Note: this rebinds `model`, replacing the trained training-time instance.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size = 1)

In [None]:
# Restore the most recent training checkpoint into the batch-size-1 model.
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint, so fail with a clear message instead of passing None on.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dr)
if latest_checkpoint_path is None:
  raise FileNotFoundError(
      f"No checkpoint found in {checkpoint_dr!r}; run the training cell first."
  )
model.load_weights(latest_checkpoint_path)
model.build(tf.TensorShape([1, None]))

In [None]:
def generate_text(model, start_string, num_generated = 500, temperature = 1.0):
  """Generate text one character at a time, seeded with `start_string`.

  Args:
    model: stateful model built for batch size 1 that maps a batch of
      character ids to per-timestep logits over the vocabulary.
    start_string: non-empty prompt; it is encoded with `encoder`, so every
      character must be known to it.
    num_generated: number of characters to generate after the prompt.
    temperature: positive value the logits are divided by before sampling.
      Values below 1 make the output more predictable, values above 1 make
      it more random.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be a positive number")

  # Encode the prompt and add the batch dimension the model expects.
  input_eval = tf.expand_dims(encoder(start_string), 0)

  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak in.
  model.reset_states()
  for _ in range(num_generated):
    predictions = model(input_eval)

    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample from a categorical distribution over the temperature-scaled
    # logits and keep the sample drawn for the last timestep.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples = 1)[-1,0].numpy()

    # The sampled character becomes the next input; the stateful model
    # carries its hidden state over from the previous step.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(decoder(predicted_id))

  return (start_string + ''.join(text_generated))

In [None]:
# Ask for a prompt interactively and print the text generated from it.
inp = input("Type a starting string: ")
print(generate_text(model, inp))