In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Deep Neural Networks 

## Text Generation using RNN (GRU)

### Import TensorFlow and other libraries

In [1]:
# Lets import some libraries
import os
import time
import datetime
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import tensorflow as tf




In [2]:
# Some basic parameters

# Forward slashes only: a literal '\C' is an invalid escape sequence in a normal
# string, and backslash separators only resolve on Windows.
inpDir = '../../Classwork/input' # location where input data is stored
outDir = '../output' # location to store outputs
modelDir = '../models' # location to store models
subDir = 'text_gen' # sub-directory of modelDir that holds this notebook's checkpoints


RANDOM_STATE = 24 # for initialization ----- REMEMBER: to remove at the time of promotion to production
np.random.seed(RANDOM_STATE) # Set Random Seed for reproducible results
# NOTE(review): this seeds NumPy only. The shuffling, weight initialisation and
# sampling below are TensorFlow ops; add tf.random.set_seed(RANDOM_STATE) if
# run-to-run reproducibility is required.

BATCH_SIZE = 64 # number of sequences per training batch

EPOCHS = 50 # number of cycles to run

ALPHA = 0.001 # learning rate

### Shakespeare dataset

In [4]:
filePath = os.path.join(inpDir, 'text_gen', 'shakespeare.txt')
filePath

'../..\\Classwork/input\\text_gen\\shakespeare.txt'

In [5]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read completes.
with open(filePath, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Alternative: tf.io.read_file(filePath).numpy().decode(encoding='utf-8')

len(text)

1115395

In [6]:
#text

In [7]:
print(text[:400])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 


In [8]:
## character level vocab
vocab = sorted(set(text))
len(vocab)

65

In [10]:
# Forward lookup: character -> integer id (ids follow the order of `vocab`)
char2idx = dict(zip(vocab, range(len(vocab))))

# Reverse lookup: integer id -> character, kept as an array so it can be
# indexed with a whole array of ids at once
idx2char = np.array(vocab)

# The complete corpus encoded as a 1-D array of character ids
text_as_int = np.array([char2idx[ch] for ch in text])

text_as_int.shape

(1115395,)

In [13]:
text_as_int

array([18, 47, 56, ...,  8,  0,  0])

In [14]:
type(text_as_int)

numpy.ndarray

In [15]:
text_as_int.shape

(1115395,)

In [18]:
idx2char[18]

'F'

In [17]:
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [19]:
dataset = tf.data.Dataset.from_tensor_slices([1.,2.,3.])

print (list(dataset.as_numpy_iterator()))

[1.0, 2.0, 3.0]


In [20]:
seq_length = 100

# Every training chunk holds seq_length + 1 characters: characters 1..100 are
# the input and characters 2..101 are the target, so one extra character is
# needed per chunk. Hence the "+ 1" in the denominator.
example_per_epoch = len(text) // (seq_length + 1)

# Stream of individual character ids
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first ten ids together with the characters they stand for
for char_id in char_dataset.take(10):
    code = char_id.numpy()
    print (code, '|', idx2char[code])

18 | F
47 | i
56 | r
57 | s
58 | t
1 |  
15 | C
47 | i
58 | t
47 | i


In [21]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(2):
    
    print (item)

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)


In [22]:
for item in sequences.take(2):
    
    print (repr( ''.join(idx2char[item.numpy()] ) ) )

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [23]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair shifted by one position.

    The input is every element except the last and the target is every
    element except the first, so target[t] is the element that follows
    input[t] in the chunk.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [24]:
for inp_ex, tar_ex in dataset.take (2):
    print (repr( ''.join(idx2char[inp_ex.numpy()] ) ))
    print (repr( ''.join(idx2char[tar_ex.numpy()] ) ))
    print ('*'*50, '\n')

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
************************************************** 

'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
************************************************** 



In [25]:
# Size of the shuffle buffer, counted in (input, target) sequence pairs
BUFFER_SIZE = 10000

# drop_remainder=True keeps every batch at exactly BATCH_SIZE sequences, which
# matches the fixed batch dimension the model is built with.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int32, name=None), TensorSpec(shape=(64, 100), dtype=tf.int32, name=None))>

In [27]:
vocab_size = len(vocab)

embedding_dim = 256

rnn_units = 1024


In [28]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct characters; used as both the embedding
            input size and the width of the output layer.
        embedding_dim: length of each character embedding vector.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch dimension baked into the input shape.

    Returns:
        An un-compiled tf.keras Sequential model. The final Dense layer has no
        activation, so the model emits one logit per vocabulary entry for
        every time step.
    """
    # The sequence length is left as None so inputs of any length are accepted.
    embedding = tf.keras.layers.Embedding(vocab_size,
                                          embedding_dim,
                                          batch_input_shape=[batch_size, None])

    # return_sequences=True yields an output for every time step; stateful=True
    # keeps the recurrent state between successive calls.
    recurrent = tf.keras.layers.GRU(rnn_units,
                                    return_sequences=True,
                                    stateful=True,
                                    recurrent_initializer='glorot_uniform')

    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.models.Sequential([embedding, recurrent, projection])

In [29]:
# Training model: uses the hyper-parameters defined above and is fixed to
# BATCH_SIZE sequences per batch.
model = build_model(vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=BATCH_SIZE)




In [30]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [31]:
for input_ex_batch, target_ex_batch in dataset.take(1):
    ex_batch_pred = model(input_ex_batch)

In [32]:
ex_batch_pred.shape

TensorShape([64, 100, 65])

In [33]:
sampled_indices = tf.random.categorical(ex_batch_pred[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis = -1).numpy()

sampled_indices

array([16, 11, 40, 57, 53,  3,  7, 37, 52, 55, 33, 14, 30, 36, 22, 17,  7,
       26, 16, 26, 35, 18, 35, 15,  9, 19, 10, 43, 53, 16, 38, 36, 62, 12,
       48, 43, 45,  8, 29,  4, 53, 53,  1,  1, 63, 11, 36, 49, 30, 57, 30,
       62, 20, 53, 38, 11, 41, 46, 46, 57, 44, 54, 49, 46, 31, 13,  0,  8,
       35, 48, 56, 31, 30, 57, 37, 34, 16, 43, 30, 42,  4, 28, 39, 28, 60,
       44, 13, 33, 39, 15, 57, 28, 62, 21,  5, 56, 40, 29,  1, 37],
      dtype=int64)

In [37]:
# Targets are integer character ids and the model emits raw logits (its last
# layer has no activation), hence the sparse loss with from_logits=True.
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Build the optimizer explicitly so the ALPHA learning rate from the parameter
# cell is actually applied; the bare 'adam' string ignores it and always uses
# the Keras default.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=ALPHA), loss=loss_fn)




In [38]:
# Checkpoints for this notebook are written under <modelDir>/<subDir>
chkPtPath = os.path.join(modelDir, subDir)

# '{epoch}' is a placeholder that the callback fills in with the epoch number,
# giving one checkpoint per epoch (a 50-epoch run ends with 'chkpt_50').
chkPtPrefix = os.path.join(chkPtPath, 'chkpt_{epoch}')

# Only the weights are saved, not the whole model; the generation step rebuilds
# the architecture and loads these weights into it.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=chkPtPrefix,
                                                        save_weights_only=True)

In [39]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [40]:
tf.train.latest_checkpoint(chkPtPath)

'../models\\text_gen\\chkpt_50'

In [41]:
### model for generating output

In [42]:
# Rebuild the same architecture with batch_size=1 so a single seed sequence can
# be fed in; the training model is fixed to BATCH_SIZE sequences per batch.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent checkpoint in chkPtPath
model.load_weights(tf.train.latest_checkpoint(chkPtPath))

model.build ( tf.TensorShape ( [1, None ] ) ) # input shape: a batch of 1 sequence of any length

In [43]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_1 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_1 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Sample text from the trained model, one character at a time.

    The seed is fed through the model once; after that only the most recently
    sampled character is fed back in, relying on the model's recurrent state
    to carry the context forward.

    Args:
        model: network returned by build_model with batch_size=1. Its
            recurrent state is reset before generation starts.
        start_string: non-empty seed text. Every character must be a key of
            char2idx, otherwise the lookup raises KeyError.
        num_generate: number of characters to generate after the seed.
        temperature: positive value the logits are divided by before
            sampling. Values below 1.0 give more predictable text, values
            above 1.0 more surprising text; 1.0 leaves the logits unchanged.

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: if start_string is empty or temperature is not positive.
        KeyError: if start_string contains a character missing from char2idx.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    input_eval = [char2idx[s] for s in start_string] # seed as a list of character ids
    print (f'Input: {start_string} | {input_eval}\n')
    input_eval = tf.expand_dims(input_eval, 0) # add the batch axis: shape (1, len(start_string))
    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in
    model.reset_states()

    for _ in range(num_generate):

        predictions = model(input_eval)           # (1, steps, vocab_size) logits
        predictions = tf.squeeze(predictions, 0)  # drop the batch axis

        # Only the last time step predicts the next character, so sample from
        # that single row instead of sampling every step and discarding most.
        last_logits = predictions[-1:, :] / temperature
        predicted_id = tf.random.categorical(last_logits,
                                             num_samples=1)[0, 0].numpy()

        # The sampled character becomes the sole input of the next step
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [45]:
print (generate_text(model, start_string=u'ROMEO:'))

Input: ROMEO: | [30, 27, 25, 17, 27, 10]

ROMEO:
Not a doubt it Edward, as we see it;
And soon pay death may be king, if 'em
To make thee joy were thou would have spoke.

BRUTUS:
We stand by, we can: much up those rough against
The gates of Milan, and much more, gration
They say it was, but he is come of Mine,
Put in your mother!' then; O thou tyrant!
Do not yet that untide my hope whereon you?

CORIOLANUS:
Yes, madam: learned Neighbour, this is no liquid in this wide,
And more come but form: such is this in
pull?

AEdile:
Pray you, go with me this most we saddly in the time.

BENVOLIO:
Good king so fair and affards.

Servant:
What, is the may be entreated.
Pow, Paulina,
Who calls so our disords and mortalius to so.

BAPTISTA:
Now, good sir!

POMPEY:
I thank your worship.

PETRUCHIO:
Conscience, master; whiles I say to thee?
Deep fow's temple triumph? Or my secret shameful:
Which tre our cousin' us withal?

GLOUCESTER:
Would you think I unto these words:
Bound to him with ruth and be i