In [7]:
import os
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
from tensorflow import keras

path_2_file = tf.keras.utils.get_file('shakespeare.txt',
                                      'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')



TensorFlow 2.x selected.
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [8]:
text = open(path_2_file, 'rb').read().decode(encoding='utf-8')
print(text[:250])
vocab = sorted(set(text))
print("{} unique characters".format(len(vocab)))


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

65 unique characters


In [9]:
char2int = {unique:intg for intg, unique in enumerate(vocab)}
int2char = np.array(vocab)

text_as_int = np.array([char2int[char] for char in text])
print("{")
for char,_ in zip(char2int, range(20)):
  print("    {:4s}: {:3d},".format(repr(char), char2int[char]))

print('...\n')
print('{} ----> characters append to int ----> {}'.format(repr(text[:13]),
                                                          text_as_int[:13]))

{
    '\n':   0,
    ' ' :   1,
    '!' :   2,
    '$' :   3,
    '&' :   4,
    "'" :   5,
    ',' :   6,
    '-' :   7,
    '.' :   8,
    '3' :   9,
    ':' :  10,
    ';' :  11,
    '?' :  12,
    'A' :  13,
    'B' :  14,
    'C' :  15,
    'D' :  16,
    'E' :  17,
    'F' :  18,
    'G' :  19,
...

'First Citizen' ----> characters append to int ----> [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [10]:
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
for i in char_dataset.take(5):
  print(int2char[i.numpy()])

F
i
r
s
t


In [11]:
seq_length = 100
examples_per_epoch = len(text) // (seq_length +1)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
for item in sequences.take(5):
  print(repr("".join(int2char[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [12]:
def split_input(chunk):
  input_text = chunk[:-1]
  target_text = chunk[1:]
  return input_text, target_text

dataset = sequences.map(split_input)

for input_eg, target_eg in dataset.take(1):
  print("input data", repr(''.join(int2char[input_eg.numpy()])))
  print("target data", repr(''.join(int2char[target_eg.numpy()])))
  

input data 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
target data 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [13]:
for i, (input_int, target_int) in enumerate(zip(input_eg[:5],
                                            target_eg[:5])):
  print('step {:4d}'.format(i))
  print('  input {} ({:s})'.format(input_int, repr(int2char[input_int])))
  print('expected output {} ({:s})'.format(target_int, repr(int2char[target_int])))
  

step    0
  input 18 ('F')
expected output 47 ('i')
step    1
  input 47 ('i')
expected output 56 ('r')
step    2
  input 56 ('r')
expected output 57 ('s')
step    3
  input 57 ('s')
expected output 58 ('t')
step    4
  input 58 ('t')
expected output 1 (' ')


In [14]:
batch_size = 64
buffer_size = 10000

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
                               tf.keras.layers.Embedding(
                                   vocab_size, embedding_dim,
                                   batch_input_shape=[batch_size,None]),
                               tf.keras.layers.GRU(rnn_units, return_sequences=True,
                                                   stateful=True,
                                                   recurrent_initializer='glorot_uniform'),
                               tf.keras.layers.Dense(vocab_size)
  ])
  return model
model = build_model(vocab_size = len(vocab), 
                    embedding_dim = embedding_dim,
                    rnn_units= rnn_units, 
                    batch_size= batch_size
                    )
for input_eg_batch, target_eg_batch in dataset.take(1):
  eg_batch_pred = model(input_eg_batch)
  print(eg_batch_pred.shape)
model.summary()


(64, 100, 65)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [16]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels,
                                                         logits, from_logits=True)
model.compile(optimizer='adam', loss=loss)
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'chkpr_{epoch}')
checkpoint_call = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)
epochs = 25
history = model.fit(dataset, epochs = epochs, callbacks = [checkpoint_call])

Train for 172 steps
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [19]:
example_batch_loss  = loss(target_eg_batch, eg_batch_pred)
print("Prediction shape: ", input_eg_batch.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", eg_batch_pred.numpy().mean())

Prediction shape:  (64, 100)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       0.00038520095


In [20]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

model.summary()

def generate_text(model, start_string):
  generate_num = 10000
  input_eval = [char2int[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval,0)

  text_generated = []
  temperature = 1.0

  model.reset_states()
  for i in range(generate_num):
    predictions = model(input_eval)
    predictions = tf.squeeze(predictions,0)

    predictions = predictions/ temperature
    predict_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
    input_eval = tf.expand_dims([predict_id],0)

    text_generated.append(int2char[predict_id])

  return(start_string + ''.join(text_generated))

print(generate_text(model, start_string='Romeo: '))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________
Romeo: he is gone here in safety
Of boans but well, and what the humour wounds
With his reporting.

COMINIUS:
He's simple at the legs.

BLUNT:
Hargin, commend me to: the twice stand to see.

KING RICHARD III:
Servant, sir, a gentlemannoryork and beauty sound,
A thousand tybants; for I have heard to do
Wilt thou had?

POMPEY:
Sir, so shame too true: and to't o' the moon!

