<a href="https://colab.research.google.com/github/SethurajS/DeepLearning_Snippets/blob/master/Text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IMPORTING THE MODULES**

In [0]:
%tensorflow_version 2.x

import numpy as np
import tensorflow as tf
import os
import time

# Report the library versions in use and whether TensorFlow can see a GPU.
tf_version_line = "TensorFlow : {}".format(tf.__version__)
np_version_line = "Numpy : {}".format(np.__version__)
print(tf_version_line)
print(np_version_line)
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
print("GPU is", "available" if gpu_devices else "not available")

TensorFlow : 2.2.0-rc3
Numpy : 1.18.3
GPU is available


# **IMPORTING THE DATA**

In [0]:
# Fetch the Shakespeare corpus through the Keras download helper; the value
# it returns is used below as the local path of the text file.
file_path = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [0]:
# Read the whole corpus as bytes and decode it as UTF-8. A context manager is
# used so the file handle is closed as soon as the read finishes.
with open(file_path, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')

print("First 250 chars : ----------> \n\n{}".format(text[:250]))
print("Length of the text : ----------> {}".format(len(text)))

First 250 chars : ----------> 

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

Length of the text : ----------> 1115394


# **PREPROCESSING THE DATA**

**FETCHING UNIQUE CHARACTERS**

In [0]:
# The vocabulary is the set of distinct characters in the corpus, sorted so
# that every character gets a stable position.
unique_chars = set(text)
vocab = sorted(unique_chars)

vocab_size_line = "Vocab Length : {}".format(len(vocab))
print("Vocab List : {}".format(vocab))
print(vocab_size_line)

Vocab List : ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Vocab Length : 65


**DATA CONVERSION**

In [0]:
# Two lookup tables over the same ordering of `vocab`:
#   char_to_idx: character -> position in `vocab`
#   idx_to_char: position  -> character (NumPy array, so it can be indexed
#                with a whole array of positions at once)
char_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_char = np.asarray(vocab)

print("Char --> Index dict : {}".format(char_to_idx))
print("Index --> Char array : {}".format(idx_to_char))

Char --> Index dict : {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
Index --> Char array : ['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


**LOOKING INTO THE DATA AFTER CONVERSION**

In [0]:
# Encode the corpus: one vocabulary index per character, in corpus order.
text_as_int = list(map(char_to_idx.__getitem__, text))

preview_length = 250
print("Actual text : -------------> \n\n{}".format(text[:preview_length]))
print("Converted text : -------------> \n\n{}".format(text_as_int[:preview_length]))

Actual text : -------------> 

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

Converted text : -------------> 

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59, 1, 39, 56, 43, 1, 39, 50, 50, 1, 56, 43, 57, 53, 50, 60, 43, 42, 1, 56, 39, 58, 46, 43, 56, 1, 58, 53, 1, 42, 47, 43, 1, 58, 46, 39, 52, 1, 58, 53, 1, 44, 39, 51, 47, 57, 46, 12, 0, 0, 13, 50, 50, 10, 0, 30, 43, 57, 53, 50, 60, 43, 42, 8, 1, 56, 43, 57, 53, 50, 60, 43, 42, 8, 0, 0, 18, 47, 56, 57, 58, 1

**CONVERTING THE DATA INTO TF DATASETS**

In [0]:
# Wrap the encoded corpus in a tf.data pipeline that yields one index at a time.
datasets = tf.data.Dataset.from_tensor_slices(tensors=text_as_int)

**CREATING INPUT SEQUENCE OF LENGTH - (100)**

In [0]:
sequence_length = 100

# Each chunk holds one element more than `sequence_length`; a trailing chunk
# that would come up short is dropped.
chunk_length = sequence_length + 1
sequence = datasets.batch(batch_size=chunk_length, drop_remainder=True)

**SPLITTING THE SEQUENCES INTO INPUT AND OUTPUT DATA**

In [0]:
def input_output(data):
  """Split a sequence into an (input, target) pair shifted by one position.

  Args:
    data: Any sliceable sequence.

  Returns:
    A tuple ``(data[:-1], data[1:])``: everything except the last element,
    and everything except the first element.
  """
  return data[:-1], data[1:]

# Turn every chunk into an (input, target) pair.
data = sequence.map(input_output)

# Show the first pair both as raw indices and decoded back to characters.
for inputs, outputs in data.take(1):
  input_text = repr(''.join(idx_to_char[inputs.numpy()]))
  output_text = repr(''.join(idx_to_char[outputs.numpy()]))

  print("Input data : ------> \n\n{}".format(inputs), end="\n\n")
  print("Input data text format : ------> \n\n{}".format(input_text), end="\n\n\n\n")
  print("Output data : ------> \n\n{}".format(outputs), end="\n\n")
  print("Output data text format : ------> \n\n{}".format(output_text))

Input data : ------> 

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]

Input data text format : ------> 

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'



Output data : ------> 

[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]

Output data text format : ------> 

'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


## **CREATING THE TRAINING BATCHES**

In [0]:
BATCH_SIZE = 64
SHUFFLE_SIZE = 10000

# Shuffle the (input, target) pairs using a SHUFFLE_SIZE-element buffer, then
# group them into batches of exactly BATCH_SIZE (a short final batch is dropped).
shuffled_pairs = data.shuffle(buffer_size=SHUFFLE_SIZE)
dataset = shuffled_pairs.batch(batch_size=BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

# **BUILDING THE MODEL**

In [0]:
# Model hyperparameters.
VOCAB_SIZE = len(vocab)   # one output logit per distinct character
EMBEDDING_DIM = 256
RNN_UNITS = 1024
# BATCH_SIZE is intentionally not reassigned here: the model below is built
# with the same BATCH_SIZE that was used to batch `dataset`, and a second
# assignment could silently let the two drift apart.

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> LSTM -> Dense character model.

  Args:
    vocab_size: Size of the embedding table and of the final Dense layer.
    embedding_dim: Width of the embedding vectors.
    rnn_units: Number of LSTM units.
    batch_size: Batch dimension baked into the model's input shape; the
      sequence dimension is left as None.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None],
  )
  # The LSTM is stateful and returns an output for every time step.
  recurrent = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform',
  )
  projection = tf.keras.layers.Dense(vocab_size)

  return tf.keras.Sequential([embedding, recurrent, projection])

# Build the training model and print its layer/parameter overview.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


**TRYING MODEL**

In [0]:
# Smoke test: push one batch through the untrained model and show the shape
# of what it returns. The targets are not needed for this check.
for input_data, _ in dataset.take(1):
  print("Prediction shape : {}".format(model(input_data).shape))

Input_data shape : (64, 100, 65)


# **COMPILING THE MODEL**

In [0]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  Args:
    labels: Integer class indices.
    logits: Unnormalised scores; ``from_logits=True`` is passed so the Keras
      loss treats them as logits rather than probabilities.

  Returns:
    Whatever ``tf.keras.losses.sparse_categorical_crossentropy`` returns for
    these arguments (no reduction is applied here).
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels,
      y_pred=logits,
      from_logits=True,
  )

# Attach the custom loss and the Adam optimizer (selected by name).
model.compile(loss=loss, optimizer='adam')

## **CONFIGURING CHECKPOINTS**

In [0]:
checkpoint_dir = './training_checkpoints'
# '{epoch}' is kept as a literal placeholder in the path template.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# Callback that writes weights only (not the full model) to the path above.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

# **TRAINING THE MODEL**

In [0]:
EPOCHS = 50

# Train with Keras' built-in loop; the checkpoint callback is passed in so it
# runs during training.
history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# **GENERATING TEXT**

**RESTORING THE LATEST CHECKPOINTS**

In [0]:
# Rebuild the network with batch_size=1 before restoring weights. The model is
# created with a fixed batch dimension and a stateful LSTM, so the instance
# built for training batches cannot be fed a single prompt; a fresh instance
# with a batch dimension of 1 is needed for generation.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

# Load the most recent checkpoint found in `checkpoint_dir`.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


**TEXT GENERATION**

In [0]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Sample characters from `model`, seeded with `start_string`.

  Args:
    model: Model that is called on a (1, n) batch of character indices and
      returns per-position logits; it must also provide `reset_states()`.
    start_string: Non-empty seed text. Every character must be a key of
      `char_to_idx`, otherwise the lookup raises KeyError.
    num_generate: Number of characters to sample (default 1000).
    temperature: Divisor applied to the logits before sampling (default 1.0,
      i.e. the logits are used unchanged).

  Returns:
    `start_string` followed by the sampled characters.

  Raises:
    ValueError: If `start_string` is empty.
    KeyError: If `start_string` contains a character missing from
      `char_to_idx`.
  """
  if not start_string:
    # An empty prompt gives the model nothing to predict from.
    raise ValueError("start_string must contain at least one character")

  # Encode the prompt and add a leading batch dimension of 1.
  input_eval = tf.expand_dims([char_to_idx[s] for s in start_string], 0)

  text_generated = []

  # Start every generation run from a cleared recurrent state.
  model.reset_states()
  for _ in range(num_generate):
    # Drop the batch dimension, leaving one row of logits per input position.
    predictions = tf.squeeze(model(input_eval), 0)
    predictions = predictions / temperature

    # Draw one sample per row and keep only the draw for the last position.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Only the sampled character is fed back on the next step.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx_to_char[predicted_id])

  return start_string + ''.join(text_generated)

In [0]:
# Ask for a seed string interactively and print the model's continuation.
inp = input(u"Type a starting string: \n\n")
generated_text = generate_text(model, start_string=inp)
print(generated_text)

# Non-interactive alternative:
# print(generate_text(model, start_string=u"ROMEO: "))

Type a starting string: 

romeo
romeones not found? O that may let it friend, I cry thee banishment: I receive my
natious youngest-day never cut for Claudio:
Ba taste the streets, and so storm
That we have broken shows a wife to thee.

QUEEN ELIZABETH:
Oh, who shall poison here,
Whose arms and soldiers, and the thorn be seen as I can
learnedge to this proud we'll nor slarpestagener, like my breast!
How nearly will command thee say that I have King Lewns,
Hath he keposs he may live: and so do I will abservant villain!
Well, dead my marriage!

Both Tribunes, which strike upon my father's laid,
Whose
' would be ready with her to imprison't be done,
Enforceal summons of our commonwealth
'Gainst fair Exellikence honour of a pleasure;
Thy slaughter'd birth, you'll me, ghands:
Hold huntsmine honour to loving spring;
But sees you think the worst can curds she will be oat.

ROMEO:
Did they be satisfied. Give me a travel,
Till he had gone at your power to do thee this place.

CLARENCE:
My LADY V

# **CUSTOMIZED MODEL**

In [0]:
# Fresh, untrained model for the hand-written training loop below.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)

In [0]:
# Adam optimizer constructed with no arguments (library defaults); it is the
# optimizer the custom `train` step applies gradients with.
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function
def train(inp, target):
  """Run one gradient step on a single batch and return its mean loss.

  Uses the module-level `model` and `optimizer`.

  Args:
    inp: Batch of inputs passed to `model`.
    target: Integer labels compared against the model's logits.

  Returns:
    The sparse categorical cross-entropy averaged over all elements.
  """
  with tf.GradientTape() as tape:
    logits = model(inp)
    elementwise_loss = tf.keras.losses.sparse_categorical_crossentropy(
        target, logits, from_logits=True)
    mean_loss = tf.reduce_mean(elementwise_loss)

  # Differentiate outside the tape context, then apply the update.
  gradients = tape.gradient(mean_loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  return mean_loss


In [0]:
EPOCHS = 50

for epoch in range(EPOCHS):
  start = time.time()

  # Clear the recurrent state at the start of every epoch. The call is made
  # for its side effect only, so its result is not bound to a name.
  model.reset_states()

  for batch, (inp, target) in enumerate(dataset):

    # Named `batch_loss` so the `loss` function defined for model.compile
    # is not overwritten by a tensor.
    batch_loss = train(inp, target)

    if batch % 100 == 0:
      print("Epochs {} -- Batchs {} -- Loss {}".format(epoch+1, batch, batch_loss))

  # Save every fifth epoch, numbering the file with the same 1-based epoch
  # that the condition and the progress messages use.
  if (epoch+1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch+1))

  print("Epochs {} -- Loss {}".format(epoch+1, batch_loss))
  print("Time for epoch {} -- {} sec\n".format(epoch+1, time.time() - start))

Epochs 1 -- Batchs 0 -- Loss 4.175083160400391
Epochs 1 -- Batchs 100 -- Loss 2.3756961822509766
Epochs 1 -- Loss 2.121181011199951
Time for epoch 1 -- 10.38376522064209 sec

Epochs 2 -- Batchs 0 -- Loss 2.601860284805298
Epochs 2 -- Batchs 100 -- Loss 1.949493408203125
Epochs 2 -- Loss 1.8026353120803833
Time for epoch 2 -- 9.266782760620117 sec

Epochs 3 -- Batchs 0 -- Loss 1.78058660030365
Epochs 3 -- Batchs 100 -- Loss 1.6772582530975342
Epochs 3 -- Loss 1.6218703985214233
Time for epoch 3 -- 9.30790376663208 sec

Epochs 4 -- Batchs 0 -- Loss 1.5847848653793335
Epochs 4 -- Batchs 100 -- Loss 1.5768003463745117
Epochs 4 -- Loss 1.510960578918457
Time for epoch 4 -- 9.38113808631897 sec

Epochs 5 -- Batchs 0 -- Loss 1.4702422618865967
Epochs 5 -- Batchs 100 -- Loss 1.4624484777450562
Epochs 5 -- Loss 1.3865175247192383
Time for epoch 5 -- 9.364110469818115 sec

Epochs 6 -- Batchs 0 -- Loss 1.3845782279968262
Epochs 6 -- Batchs 100 -- Loss 1.397548794746399
Epochs 6 -- Loss 1.33430624