# Char-RNN-TensorFlow

In [1]:
import numpy as np
import os
import sys
import time

import tensorflow as tf
tf.enable_eager_execution()

In [2]:
print(f'sys.version: {sys.version}')
print(f'tf.__version__: {tf.__version__}')
!cat /usr/local/cuda/version.txt

sys.version: 3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 19:16:44) 
[GCC 7.3.0]
tf.__version__: 1.11.0
CUDA Version 9.0.176


## 1. Load the data
`./data/input.txt` contains 3 Harry Potter books (1st, 3rd, 4th).

In [3]:
# Read the whole corpus into memory. The context manager closes the file even
# if read() raises, and the explicit encoding keeps decoding independent of
# the platform's locale default (assumes the corpus is UTF-8 -- confirm).
with open('./data/input.txt', encoding='utf-8') as fi:
    text = fi.read()

print(f'Length of text: {len(text)} characters')

Length of text: 2162018 characters


### 1.1 Look at the first 1000 characters in the text

In [4]:
print(text[:1000])

Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. 

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. 

The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters

### 1.2 The unique characters in the file

In [5]:
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

89 unique characters


## 2. Process the text

### 2.1 Vectorize the text

In [6]:
# Lookup tables between characters and integer ids: a character's id is its
# position in `vocab`, and `idx2char[id]` gives the character back.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the full text as an array of character ids
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [7]:
# Show the integer id of (at most) the first 30 characters in the mapping
for position, char in enumerate(char2idx):
    if position >= 30:
        break
    print(f'{repr(char):6s} ---> {char2idx[char]:4d}')

'\n'   --->    0
' '    --->    1
'!'    --->    2
'"'    --->    3
'&'    --->    4
"'"    --->    5
'('    --->    6
')'    --->    7
'*'    --->    8
','    --->    9
'-'    --->   10
'.'    --->   11
'/'    --->   12
'0'    --->   13
'1'    --->   14
'2'    --->   15
'3'    --->   16
'4'    --->   17
'5'    --->   18
'6'    --->   19
'7'    --->   20
'8'    --->   21
'9'    --->   22
':'    --->   23
';'    --->   24
'?'    --->   25
'A'    --->   26
'B'    --->   27
'C'    --->   28
'D'    --->   29


In [8]:
# Show how the first 16 characters from the text are mapped to integers
print(f'{text[:16]} ---- characters mapped to int ---- > {text_as_int[:16]}')

Harry Potter and ---- characters mapped to int ---- > [33 56 73 73 80  1 41 70 75 75 60 73  1 56 69 59]


## 3. Creating training examples and targets
Each training example contains `seq_length` characters from the text. The corresponding target contains the same number of characters, shifted one character to the right.

In [9]:
# Maximum number of characters in a single input sequence
seq_length = 100

# Cut the encoded text into consecutive chunks of seq_length + 1 ids; the one
# extra id lets each chunk be split into an input and a target that are
# offset by one character. A trailing partial chunk is dropped.
chunks = tf.data.Dataset.from_tensor_slices(text_as_int)
chunks = chunks.batch(seq_length + 1, drop_remainder=True)

# Preview the first five chunks decoded back to text
for item in chunks.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

"Harry Potter and the Sorcerer's Stone \n\nCHAPTER ONE \n\nTHE BOY WHO LIVED \n\nMr. and Mrs. Dursley, of nu"
'mber four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They'
" were the last people you'd expect to be involved in anything strange or mysterious, because they jus"
"t didn't hold with such nonsense. \n\nMr. Dursley was the director of a firm called Grunnings, which ma"
'de drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. '


In [10]:
# Split one chunk into its (input, target) pair
def split_input_target(chunk):
    """Return `(chunk[:-1], chunk[1:])`.

    The first item is the chunk without its last element and the second is
    the chunk without its first element, so the target at every position is
    the element that follows the corresponding input element.
    """
    return chunk[:-1], chunk[1:]

dataset = chunks.map(split_input_target)

In [11]:
# Decode the first (input, target) pair back to text and print both in full
for input_example, target_example in dataset.take(1):
    input_chars = ''.join(idx2char[input_example.numpy()])
    target_chars = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_chars))
    print('Target data:', repr(target_chars))

Input data:  "Harry Potter and the Sorcerer's Stone \n\nCHAPTER ONE \n\nTHE BOY WHO LIVED \n\nMr. and Mrs. Dursley, of n"
Target data: "arry Potter and the Sorcerer's Stone \n\nCHAPTER ONE \n\nTHE BOY WHO LIVED \n\nMr. and Mrs. Dursley, of nu"


## 4. Creating batches and shuffling them using tf.data

In [12]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the pipeline from `chunks` rather than re-assigning `dataset` from
# itself: the cell is then idempotent, whereas `dataset = dataset...batch(...)`
# would batch the already-batched dataset again if the cell were re-run.
dataset = (chunks
           .map(split_input_target)
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True))

## 5. The Model

### 5.1 Implement the model

In [13]:
class Model(tf.keras.Model):
    """Character-level language model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: number of distinct character ids; used as the embedding
            input size and as the width of the output layer.
        embedding_dim: size of each embedding vector.
        units: number of LSTM units.
    """

    def __init__(self, vocab_size, embedding_dim, units):
        super(Model, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # Arguments shared by both recurrent implementations.
        # return_sequences=True yields an output for every time step, and
        # stateful=True keeps the recurrent state between calls until
        # reset_states() is invoked.
        recurrent_kwargs = dict(return_sequences=True,
                                recurrent_initializer='glorot_uniform',
                                stateful=True)

        if tf.test.is_gpu_available():
            self.lstm = tf.keras.layers.CuDNNLSTM(self.units, **recurrent_kwargs)
        else:
            # NOTE(review): 'sigmoid' is presumably chosen so the CPU layer
            # matches the CuDNN implementation -- confirm.
            self.lstm = tf.keras.layers.LSTM(self.units,
                                             recurrent_activation='sigmoid',
                                             **recurrent_kwargs)

        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Return unnormalised scores (logits) for every time step.

        Args:
            x: batch of character ids, expected shape (batch_size, seq_length).

        Returns:
            Logits of shape (batch_size, seq_length, vocab_size). The
            recurrent state is not returned; it is held inside the stateful
            LSTM layer.
        """
        embedded = self.embedding(x)

        # One output vector per time step: (batch_size, seq_length, units)
        hidden_sequence = self.lstm(embedded)

        # The dense layer is applied to the last axis of every time step
        return self.fc(hidden_sequence)

### 5.2 Instantiate the model

In [14]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension 
embedding_dim = 256

# Number of RNN units
units = 512

model = Model(vocab_size, embedding_dim, units)

### 5.3 Instantiate optimizer and loss

In [15]:
# Adam optimizer with its default arguments
optimizer = tf.train.AdamOptimizer()

def loss_function(real, preds):
    """Sparse softmax cross-entropy between labels `real` and logits `preds`.

    The sparse variant takes integer class ids as labels, so the targets do
    not have to be converted to one-hot vectors first.
    """
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=real,
                                                           logits=preds)
    return cross_entropy

### 5.4 Checkpoints

In [16]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Checkpoint instance
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

### 5.5 Train the model

In [17]:
model.build(tf.TensorShape([BATCH_SIZE, seq_length]))

In [18]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  22784     
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       multiple                  1576960   
_________________________________________________________________
dense (Dense)                multiple                  45657     
Total params: 1,645,401
Trainable params: 1,645,401
Non-trainable params: 0
_________________________________________________________________


In [19]:
def train_model(EPOCHS=10):
    """Train the global `model` on the global `dataset` for `EPOCHS` epochs.

    Uses the notebook-level `model`, `dataset`, `optimizer`, `loss_function`,
    `checkpoint` and `checkpoint_prefix`. Progress is printed every 100
    batches and a checkpoint is written every 5 epochs.

    Args:
        EPOCHS: number of passes over `dataset`.

    Raises:
        ValueError: if `dataset` yields no batches, so there is no loss to
            optimise or report.
    """
    for epoch in range(EPOCHS):
        start = time.time()

        # The LSTM is stateful, so clear the state left over from the previous
        # epoch. reset_states() returns None; the state lives inside the layer
        # and is never passed around explicitly.
        # NOTE(review): the state is only reset here, so within an epoch it is
        # carried from one shuffled batch to the next.
        model.reset_states()

        # Stays None when the dataset yields no batches
        loss = None

        for (batch, (inp, target)) in enumerate(dataset):
            with tf.GradientTape() as tape:
                predictions = model(inp)
                loss = loss_function(target, predictions)

            # Differentiate with respect to the trainable weights only
            trainable = model.trainable_variables
            grads = tape.gradient(loss, trainable)
            optimizer.apply_gradients(zip(grads, trainable))

            if batch % 100 == 0:
                print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,
                                                            batch,
                                                            loss))

        if loss is None:
            raise ValueError('dataset produced no batches; nothing to train on')

        # saving (checkpoint) the model every 5 epochs
        if (epoch + 1) % 5 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)

        # This is the loss of the last batch of the epoch, not an epoch average
        print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
        print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [20]:
# Training step
EPOCHS = 150
train_model(EPOCHS)

Epoch 1 Batch 0 Loss 4.4889
Epoch 1 Batch 100 Loss 2.4287
Epoch 1 Batch 200 Loss 2.0609
Epoch 1 Batch 300 Loss 1.8467
Epoch 1 Loss 1.8402
Time taken for 1 epoch 25.274771451950073 sec

Epoch 2 Batch 0 Loss 1.8853
Epoch 2 Batch 100 Loss 1.7140
Epoch 2 Batch 200 Loss 1.6073
Epoch 2 Batch 300 Loss 1.5782
Epoch 2 Loss 1.6051
Time taken for 1 epoch 24.304714918136597 sec

Epoch 3 Batch 0 Loss 1.5724
Epoch 3 Batch 100 Loss 1.5270
Epoch 3 Batch 200 Loss 1.5086
Epoch 3 Batch 300 Loss 1.4021
Epoch 3 Loss 1.3950
Time taken for 1 epoch 24.57616400718689 sec

Epoch 4 Batch 0 Loss 1.4677
Epoch 4 Batch 100 Loss 1.4006
Epoch 4 Batch 200 Loss 1.3581
Epoch 4 Batch 300 Loss 1.3453
Epoch 4 Loss 1.3637
Time taken for 1 epoch 24.421303272247314 sec

Epoch 5 Batch 0 Loss 1.4008
Epoch 5 Batch 100 Loss 1.3350
Epoch 5 Batch 200 Loss 1.3138
Epoch 5 Batch 300 Loss 1.3027
Epoch 5 Loss 1.3430
Time taken for 1 epoch 24.512234449386597 sec

Epoch 6 Batch 0 Loss 1.2917
Epoch 6 Batch 100 Loss 1.3202
Epoch 6 Batch 200 

Epoch 44 Batch 300 Loss 0.9432
Epoch 44 Loss 0.9186
Time taken for 1 epoch 23.932286024093628 sec

Epoch 45 Batch 0 Loss 0.9418
Epoch 45 Batch 100 Loss 0.9506
Epoch 45 Batch 200 Loss 0.9101
Epoch 45 Batch 300 Loss 0.9506
Epoch 45 Loss 0.9441
Time taken for 1 epoch 24.099271059036255 sec

Epoch 46 Batch 0 Loss 0.9141
Epoch 46 Batch 100 Loss 0.9378
Epoch 46 Batch 200 Loss 0.9628
Epoch 46 Batch 300 Loss 0.9403
Epoch 46 Loss 0.9463
Time taken for 1 epoch 23.94114089012146 sec

Epoch 47 Batch 0 Loss 0.9334
Epoch 47 Batch 100 Loss 0.9311
Epoch 47 Batch 200 Loss 0.9363
Epoch 47 Batch 300 Loss 0.8969
Epoch 47 Loss 0.9173
Time taken for 1 epoch 23.98555874824524 sec

Epoch 48 Batch 0 Loss 0.9099
Epoch 48 Batch 100 Loss 0.9189
Epoch 48 Batch 200 Loss 0.9389
Epoch 48 Batch 300 Loss 0.9336
Epoch 48 Loss 0.9430
Time taken for 1 epoch 23.911340713500977 sec

Epoch 49 Batch 0 Loss 0.9198
Epoch 49 Batch 100 Loss 0.8979
Epoch 49 Batch 200 Loss 0.9213
Epoch 49 Batch 300 Loss 0.8921
Epoch 49 Loss 0.9336


Epoch 88 Batch 0 Loss 0.8350
Epoch 88 Batch 100 Loss 0.8533
Epoch 88 Batch 200 Loss 0.8927
Epoch 88 Batch 300 Loss 0.8705
Epoch 88 Loss 0.8552
Time taken for 1 epoch 23.76588273048401 sec

Epoch 89 Batch 0 Loss 0.8508
Epoch 89 Batch 100 Loss 0.8815
Epoch 89 Batch 200 Loss 0.8684
Epoch 89 Batch 300 Loss 0.8502
Epoch 89 Loss 0.8822
Time taken for 1 epoch 23.278547286987305 sec

Epoch 90 Batch 0 Loss 0.8397
Epoch 90 Batch 100 Loss 0.8327
Epoch 90 Batch 200 Loss 0.8705
Epoch 90 Batch 300 Loss 0.8466
Epoch 90 Loss 0.8670
Time taken for 1 epoch 24.09212040901184 sec

Epoch 91 Batch 0 Loss 0.8588
Epoch 91 Batch 100 Loss 0.8663
Epoch 91 Batch 200 Loss 0.8715
Epoch 91 Batch 300 Loss 0.8917
Epoch 91 Loss 0.8516
Time taken for 1 epoch 24.06751799583435 sec

Epoch 92 Batch 0 Loss 0.8573
Epoch 92 Batch 100 Loss 0.8645
Epoch 92 Batch 200 Loss 0.9080
Epoch 92 Batch 300 Loss 0.8475
Epoch 92 Loss 0.8626
Time taken for 1 epoch 24.31036376953125 sec

Epoch 93 Batch 0 Loss 0.8518
Epoch 93 Batch 100 Loss 0

Epoch 130 Batch 300 Loss 0.8263
Epoch 130 Loss 0.8329
Time taken for 1 epoch 24.293970108032227 sec

Epoch 131 Batch 0 Loss 0.8212
Epoch 131 Batch 100 Loss 0.8263
Epoch 131 Batch 200 Loss 0.8365
Epoch 131 Batch 300 Loss 0.8384
Epoch 131 Loss 0.8666
Time taken for 1 epoch 25.46133804321289 sec

Epoch 132 Batch 0 Loss 0.8177
Epoch 132 Batch 100 Loss 0.8725
Epoch 132 Batch 200 Loss 0.8350
Epoch 132 Batch 300 Loss 0.8401
Epoch 132 Loss 0.8617
Time taken for 1 epoch 25.064318656921387 sec

Epoch 133 Batch 0 Loss 0.7943
Epoch 133 Batch 100 Loss 0.8228
Epoch 133 Batch 200 Loss 0.8761
Epoch 133 Batch 300 Loss 0.8300
Epoch 133 Loss 0.8375
Time taken for 1 epoch 24.281461477279663 sec

Epoch 134 Batch 0 Loss 0.8083
Epoch 134 Batch 100 Loss 0.8615
Epoch 134 Batch 200 Loss 0.8552
Epoch 134 Batch 300 Loss 0.8722
Epoch 134 Loss 0.8187
Time taken for 1 epoch 23.725439071655273 sec

Epoch 135 Batch 0 Loss 0.8269
Epoch 135 Batch 100 Loss 0.8582
Epoch 135 Batch 200 Loss 0.8627
Epoch 135 Batch 300 Loss 0

In [21]:
# checkpoint.save(file_prefix = checkpoint_prefix)

### 5.6 Restore the latest checkpoint

In [22]:
!ls {checkpoint_dir}

checkpoint		     ckpt-24.data-00000-of-00001
ckpt-10.data-00000-of-00001  ckpt-24.index
ckpt-10.index		     ckpt-25.data-00000-of-00001
ckpt-11.data-00000-of-00001  ckpt-25.index
ckpt-11.index		     ckpt-26.data-00000-of-00001
ckpt-12.data-00000-of-00001  ckpt-26.index
ckpt-12.index		     ckpt-27.data-00000-of-00001
ckpt-13.data-00000-of-00001  ckpt-27.index
ckpt-13.index		     ckpt-28.data-00000-of-00001
ckpt-14.data-00000-of-00001  ckpt-28.index
ckpt-14.index		     ckpt-29.data-00000-of-00001
ckpt-15.data-00000-of-00001  ckpt-29.index
ckpt-15.index		     ckpt-2.data-00000-of-00001
ckpt-16.data-00000-of-00001  ckpt-2.index
ckpt-16.index		     ckpt-30.data-00000-of-00001
ckpt-17.data-00000-of-00001  ckpt-30.index
ckpt-17.index		     ckpt-3.data-00000-of-00001
ckpt-18.data-00000-of-00001  ckpt-3.index
ckpt-18.index		     ckpt-4.data-00000-of-00001
ckpt-19.data-00000-of-00001  ckpt-4.index
ckpt-19.index		     ckpt-5.data-00000-of-00001
ckpt-1.data-00000-of-00001   ck

In [23]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt-30'

In [24]:
# Rebuild the model for generation and load the most recent checkpoint into it
model = Model(vocab_size, embedding_dim, units)

latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
    # restore(None) would load nothing and leave the model with freshly
    # initialised weights, so fail loudly instead.
    raise FileNotFoundError(f'No checkpoint found in {checkpoint_dir!r}')

checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(latest_checkpoint_path)

# Batch size 1 and a variable sequence length, as used during generation
model.build(tf.TensorShape([1, None]))

### 5.7 Generate text using the learned model

In [25]:
def generate_text(start_string='A', num_generate=1000):
    """Generate text with the global `model`, starting from `start_string`.

    The start string is fed to the model once; after that each predicted
    character is fed back as the next input. The most likely character
    (argmax) is taken at every step, so the output is deterministic for a
    given model and start string.

    Args:
        start_string: non-empty seed text; every character must be a key of
            `char2idx`.
        num_generate: number of characters to generate after the seed.

    Returns:
        `start_string` followed by the `num_generate` generated characters.

    Raises:
        ValueError: if `start_string` is empty.
        KeyError: if `start_string` contains a character missing from
            `char2idx`.
    """
    if not start_string:
        # An empty seed would reach the model as an empty tensor and fail
        # there with an unrelated-looking error.
        raise ValueError('start_string must contain at least one character')

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Characters generated so far
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
        # Only the prediction for the last time step chooses the next character
        predicted_id = tf.argmax(predictions[-1]).numpy()

        # The predicted character becomes the next input; the previous
        # context is carried by the state held inside the stateful LSTM
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [31]:
print(generate_text(start_string='H', num_generate=500))

H"

Harry felt a great witch in front of him, and there was a soft thump and dangerous as his feet and set off at the sight of the first years. 

A sudden seats draw under his brain.

He could see a silence so that his face was still staring at the egg in the dark and silver and glasses.

"Harry!" she said. "They were trying to stop us!"

Harry didn't say anything. He pulled out the stairs to the ground behind him.

"So that's what you mean, they were playing competing," said Lupin, still sitting


In [32]:
print(generate_text(start_string='Harry is', num_generate=500))

Harry is going to get there!"

"I think I will be able to see the bushes," said Hagrid in a very fine at Krum's voice. He was looking at the first task filled the castle and started to show the house­elves in the shadows, and the sound of the solidement was so dark they could see through the darkness, the hand pointed out of the way to Harry to his feet. "It was a minute ­­"

"The Dark Mark what they were talking about?" said Ron, who was still sitting on the stone steps into the castle and the stairs t


In [33]:
print(generate_text(start_string='Azkaban', num_generate=500))

Azkaban trees behind him.

"So what are you doing here?" said Harry. "I don't know what I see the third task."

"What?" said Harry. "There was the best school aloud. 

"And what are you talking about?" said Ron, who was still sitting on the stone steps into the castle and the stairs to the door, whose handle was sitting at the stands around the side of the staircase and pulled on the stone steps, to the floor. He was wearing a long sigh and started to do it. 

"It's too late," said Harry. "I would all 
