In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Deep Neural Networks 
## Session 20a : Lecture

## Text Generation using RNN
<img src='../../prasami_images/prasami_color_tutorials_small.png' style = 'width:400px;' alt="By Pramod Sharma : pramod.sharma@prasami.com" align="left"/>

### Import TensorFlow and other libraries

In [2]:
# Lets import some libraries
import os
import time
import datetime
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import tensorflow as tf

%matplotlib inline




In [3]:
def fn_verify_dir(_path):
    '''
    Make sure a path is present on disk, creating it as a directory if needed.

    Arg:
        _path: path to check; an existing file or folder both count as present
    returns:
        None. A message is printed saying whether the path already existed
        or has just been created.
    '''
    if not os.path.exists(_path): # nothing at this location yet

        os.makedirs(_path) # build the folder along with any missing parents

        print("Created folder : ", _path)

        return

    print(_path, ' exists') # already there (file or folder); leave it untouched

In [43]:
# Some basic parameters

inpDir = '../../input' # location where input data is stored
outDir = '../output' # location to store outputs
modelDir = '../../models' # location to store models
subDir = 'text_gen' # sub-folder (joined onto inpDir and modelDir) used by this notebook


RANDOM_STATE = 24 # for initialization ----- REMEMBER: to remove at the time of promotion to production

np.random.seed(RANDOM_STATE) # Set Random Seed for reproducible  results

# NumPy's seed does not reach TensorFlow. The stochastic steps in this notebook
# (dataset shuffling, weight initialisation, tf.random.categorical sampling) are
# TensorFlow operations, so TensorFlow's global seed is set as well.
tf.random.set_seed(RANDOM_STATE)

BATCH_SIZE = 64 # number of sequences per training batch

EPOCHS = 50 # number of cycles to run

# learning rate
# NOTE: ALPHA is not passed to any optimizer in this notebook; the model is
# compiled with optimizer='adam', i.e. Adam with its default settings.
ALPHA = 0.1

In [5]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
physical_devices = tf.config.list_physical_devices('GPU')

try:
    # The memory-growth setting has to be the same on every visible GPU, so it
    # is applied to all of them rather than only to the first one.
    for gpu_device in physical_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)

except RuntimeError as err:
    # Raised when the GPUs have already been initialised (for example when this
    # cell is re-run in a live kernel); the setting cannot be changed any more.
    print('Could not set memory growth:', err)

### Shakespeare dataset

In [6]:
# Location of the corpus: <inpDir>/<subDir>/shakespeare.txt
filePath = os.path.join(inpDir, subDir, 'shakespeare.txt')
filePath

'../../input\\text_gen\\shakespeare.txt'

In [7]:
# Read the corpus as raw bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the read has finished.
with open(filePath, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

len(text) # number of characters in the corpus

# Alternative using TensorFlow I/O:
# tf.io.read_file(filePath).numpy().decode(encoding='utf-8')

1115395

In [8]:
#text

In [9]:
# Show the first 400 characters of the corpus
print(text[:400])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 


In [10]:
# Distinct characters of the corpus in sorted order; this list is the vocabulary
vocab = sorted(set(text))
len(vocab)

65

In [11]:
# Forward lookup: character -> integer id (its position in the sorted vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Reverse lookup: integer id -> character. Kept as an array so that a whole
# array of ids can be decoded with a single indexing operation.
idx2char = np.array(vocab)

# Encode the full corpus, one integer id per character
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

text_as_int.shape

(1115395,)

In [12]:
# Peek at the encoded corpus
text_as_int

array([18, 47, 56, ...,  8,  0,  0])

In [13]:
# Confirm the container type of the encoded corpus
type(text_as_int)

numpy.ndarray

In [14]:
# One integer id per character of the corpus
text_as_int.shape

(1115395,)

In [15]:
# Decode a single id back to its character
idx2char[47]

'i'

In [42]:
#char2idx

In [17]:
# Toy example: build a Dataset from an in-memory list and read it back as NumPy values.
# The name `dataset` is re-assigned further down for the real training pipeline.
dataset = tf.data.Dataset.from_tensor_slices([1.,2.,3.])

print (list(dataset.as_numpy_iterator()))

[1.0, 2.0, 3.0]


In [18]:
seq_length = 100 # number of characters in one model input

# Number of (seq_length + 1)-character chunks that fit into the corpus
example_per_epoch = len(text) // (seq_length + 1)

# Stream the encoded corpus one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: the first ten ids next to the characters they decode to
for char_id in char_dataset.take(10).as_numpy_iterator():
    print (char_id, '|', idx2char[char_id])

18 | F
47 | i
56 | r
57 | s
58 | t
1 |  
15 | C
47 | i
58 | t
47 | i


In [19]:
# Group the character stream into chunks of seq_length + 1 ids;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first two chunks as raw id tensors
for item in sequences.take(2):
    
    print (item)

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)


In [20]:
# Same two chunks, decoded back to text; repr() keeps the newlines visible as '\n'
for item in sequences.take(2):
    
    print (repr( ''.join(idx2char[item.numpy()] ) ) )

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [21]:
def split_input_target(chunk):
    '''
    Turn one chunk into an (input, target) pair offset by one position.

    Arg:
        chunk: any sliceable sequence
    returns:
        tuple of (chunk without its last element, chunk without its first
        element), so target[k] is the element that follows input[k]
    '''
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk, giving a dataset of (input, target) pairs
dataset = sequences.map(split_input_target)

In [22]:
# Decode the first two (input, target) pairs: each target is its input shifted by one character
for inp_ex, tar_ex in dataset.take (2):
    print (repr( ''.join(idx2char[inp_ex.numpy()] ) ))
    print (repr( ''.join(idx2char[tar_ex.numpy()] ) ))
    print ('*'*50, '\n')

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
************************************************** 

'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
************************************************** 



In [23]:
BUFFER_SIZE = 10000 # size of the buffer that shuffle() samples from

# Build the training pipeline from `sequences` instead of from `dataset` itself.
# The previous form (dataset = dataset.shuffle(...).batch(...)) was
# self-referential: running the cell a second time would batch the already
# batched data. Starting from `sequences` makes the cell safe to re-run.
dataset = (sequences
           .map(split_input_target)                  # (input, target) pairs
           .shuffle(BUFFER_SIZE)                     # randomise the order of the pairs
           .batch(BATCH_SIZE, drop_remainder=True))  # fixed-size batches; a short final batch is dropped

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int32, name=None), TensorSpec(shape=(64, 100), dtype=tf.int32, name=None))>

In [24]:
vocab_size = len(vocab) # number of distinct characters; sizes the embedding input and the output layer

embedding_dim = 256 # length of each character embedding vector

rnn_units = 1024 # number of units in the GRU layer


In [45]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    '''
    Assemble the character-level network: Embedding -> GRU -> Dense.

    Args:
        vocab_size: number of distinct tokens; used as the embedding input
            size and as the width of the output layer
        embedding_dim: length of each embedding vector
        rnn_units: number of units in the GRU layer
        batch_size: fixed batch dimension declared on the first layer
    returns:
        an uncompiled tf.keras Sequential model. The final Dense layer has no
        activation, so the model outputs raw scores (logits).
    '''
    # Fixed batch size, sequence length left open (None)
    embedding_layer = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )

    recurrent_layer = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,   # emit an output for every time step
        stateful=True,           # keep the hidden state between successive calls
        recurrent_initializer='glorot_uniform',
    )

    # One score per vocabulary entry at each time step
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.models.Sequential([embedding_layer, recurrent_layer, output_layer])

In [46]:
# Training-time model: inputs arrive in fixed batches of BATCH_SIZE sequences.
# Argument order: vocab_size, embedding_dim, rnn_units, batch_size
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

In [47]:
# Run a single batch through the untrained model as a smoke test of the output shape
for input_ex_batch, target_ex_batch in dataset.take(1):
    ex_batch_pred = model(input_ex_batch)

In [48]:
# (batch, sequence length, vocabulary size)
ex_batch_pred.shape

TensorShape([64, 100, 65])

In [49]:
# Layer-by-layer overview with output shapes and parameter counts
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (64, None, 256)           16640     
                                                                 
 gru_2 (GRU)                 (64, None, 1024)          3938304   
                                                                 
 dense_2 (Dense)             (64, None, 65)            66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [50]:
# Sample one character id per time step for the first sequence of the batch,
# treating the model outputs as logits
sampled_indices = tf.random.categorical(ex_batch_pred[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array
sampled_indices = tf.squeeze(sampled_indices, axis = -1).numpy()

sampled_indices

array([12, 36,  2, 64, 51,  0, 12,  8, 64,  7, 27, 55, 57, 22, 49, 38, 57,
       41, 59, 11, 59, 11, 17, 12, 21, 63, 55, 47, 31, 53, 29, 30,  4, 35,
        7,  1, 28, 36, 39,  8, 36, 25, 27,  3, 44, 32, 27, 41, 29, 63, 49,
       60, 25, 14, 64, 21, 13, 62, 22, 24, 25, 19, 15,  3, 31,  4,  6, 27,
       42, 39, 34, 11, 29, 17, 13, 36, 51, 24, 40,  1, 48, 56, 47,  6, 54,
       18, 26, 33, 37, 34, 33, 56, 40, 42, 52,  0, 24, 45, 54, 38],
      dtype=int64)

In [51]:
# Targets are integer ids; from_logits=True because the final Dense layer applies no softmax
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [52]:
# Optimizer given by name, so its default settings apply (ALPHA from the parameters cell is not used here)
model.compile(optimizer = 'adam', loss=loss_fn)

In [53]:
# Folder that holds the checkpoints written by this notebook
chkPtPath = os.path.join(modelDir, subDir)

# '{epoch}' is left as a placeholder in the name; presumably Keras fills in the
# epoch number (the latest checkpoint after the 50-epoch run is 'chkpt_50')
chkPtPrefix = os.path.join(chkPtPath, 'chkpt_{epoch}')

# Store the weights only, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=chkPtPrefix,
                                                        save_weights_only=True)

In [54]:
# Train for EPOCHS passes over the dataset with the checkpoint callback attached
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [55]:
# Path prefix of the most recent checkpoint found in chkPtPath
tf.train.latest_checkpoint(chkPtPath)

'../../models\\text_gen\\chkpt_50'

In [59]:
# Rebuild the same architecture with batch_size=1 so a single prompt can be fed at generation time.
# This re-binds `model`; the training model with batch size BATCH_SIZE is no longer reachable by that name.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent checkpoint
model.load_weights(tf.train.latest_checkpoint(chkPtPath))

# Declare the input shape: batch of 1, sequence length left open
model.build ( tf.TensorShape ( [1, None ] ) )

In [56]:
# NOTE(review): the recorded output of this cell still shows a batch dimension of 64,
# and its execution counter (56) is lower than that of the batch_size=1 rebuild cell (59),
# so the output was produced before the rebuild. Re-run after the rebuild to refresh it.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (64, None, 256)           16640     
                                                                 
 gru_2 (GRU)                 (64, None, 1024)          3938304   
                                                                 
 dense_2 (Dense)             (64, None, 65)            66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [57]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    '''
    Generate text one character at a time, starting from a prompt.

    The prompt is fed through the model once; after that, each sampled
    character is fed back as the next input.

    Args:
        model: model called as model(ids) that returns one row of logits per
            time step; its reset_states() method is called before generation.
            The input always has a batch dimension of 1.
        start_string: non-empty prompt; every character must be a key of the
            module-level char2idx mapping
        num_generate: number of characters to generate (default 1000)
        temperature: positive number the logits are divided by before
            sampling. 1.0 (default) samples from the unmodified distribution;
            lower values make the output more predictable, higher values
            make it more random.
    returns:
        start_string followed by the generated characters
    raises:
        ValueError: if start_string is empty or temperature is not positive
        KeyError: if start_string contains a character missing from char2idx
    '''
    # Fail early with a clear message instead of an obscure shape error inside the model
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    if temperature <= 0:
        raise ValueError('temperature must be a positive number')

    # Encode the prompt, e.g. 'ROMEO:' -> [30, 27, 25, 17, 27, 10]
    input_eval = [char2idx[s] for s in start_string]
    print (f'Input: {start_string} | {input_eval}\n')

    # Add the batch axis: shape (1, len(start_string))
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    # Start from a clean recurrent state
    model.reset_states()

    for _ in range(num_generate):

        predictions = model(input_eval)

        # Remove the batch axis: one row of logits per time step
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits; dividing by 1.0 leaves them unchanged
        predictions = predictions / temperature

        # Sample a character id from every row and keep the one for the last time step
        predict_td = tf.random.categorical(predictions,
                                           num_samples=1)[-1,0].numpy()

        # The sampled character is the only input for the next step
        input_eval = tf.expand_dims([predict_td], 0)
        text_generated.append(idx2char[predict_td])

    return start_string+''.join(text_generated)

In [60]:
print (generate_text(model, start_string=u'ROMEO:'))

Input: ROMEO: | [30, 27, 25, 17, 27, 10]

ROMEO:
The begging.

VOLUNNA:
Now of our will, sit faults
Before I have told this firmle hurt not; that it may nay
Young Romeo let pale counsel!
The Expose is misleigh,
To offerce well for Rome.

First Soldier:
I shall, sir. Fare you well.

BAUNT:
Come, come, tready met I speak blown with the bone.
But what of him?

BIONDELLO:
Where lies hard,
Which sorrow wither.

SAMPSON:
Great lord, myself, and that.

BENVOLIO:
Say, what! get thee my lord?

BUCKINGHAM:
Great Aufeous slave,
From our freech-beauteful Clarence,
Our darges banish'd and faster to thy dagger toward that names' twenty you will go whis ended question.
My soul.

Third Servingman:
Who might have kept that name is omis, his lady may speak
Of what you lose her that would mus apactor?
Thou foot may please the house of Lancaster.

BUCKINGHAM:
Welcome, sweet house?

KATHARINA:
Part of merity.

GREMIO:
Adieu, good fellows; but I cannot meet him; he hath
After the case of Longord? But
he wou