In [2]:
import tensorflow as tf

import os
import time

In [3]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read completes instead of
# leaving it open for the lifetime of the kernel.
with open('shakespeare.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Preview the first 250 characters of the corpus.
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [4]:
# The vocabulary is every distinct character of the corpus, in sorted order.
vocab = sorted(set(text))

# Report the corpus size and the vocabulary size.
print(f'Length of text: {len(text)} characters')
print(f'{len(vocab)} unique characters')

Length of text: 1115394 characters
65 unique characters


In [5]:
# Lookup layer that maps characters to integer IDs, built from `vocab`.
ids_from_chars = tf.keras.layers.StringLookup(
    mask_token=None,
    vocabulary=list(vocab),
)

# Sanity check: look up the IDs of a short sample string.
ids_from_chars(tf.strings.unicode_split('abc', 'UTF-8'))

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([40, 41, 42], dtype=int64)>

In [6]:
# Inverse lookup layer (IDs -> characters). It reuses the vocabulary of
# `ids_from_chars` so both layers agree on the ID assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    mask_token=None,
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary(),
)

In [7]:
def text_from_ids(m_ids):
    """Convert character IDs back into joined string tensor(s).

    The IDs are mapped to characters with `chars_from_ids`, then the
    characters are concatenated along the last axis.
    """
    chars = chars_from_ids(m_ids)
    return tf.strings.reduce_join(chars, axis=-1)

In [8]:
# Normalise the corpus: lowercase it, delete ASCII punctuation and turn
# newlines into spaces.
import string

text = (
    text.lower()
    .translate(str.maketrans(dict.fromkeys(string.punctuation)))
    .replace('\n', ' ')
)

text[:250]

'first citizen before we proceed any further hear me speak  all speak speak  first citizen you are all resolved rather to die than to famish  all resolved resolved  first citizen first you know caius marcius is chief enemy to the people  all we knowt '

In [9]:
# Split the corpus into individual characters and look up the ID of each one.
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, input_encoding='UTF-8')
)
all_ids

<tf.Tensor: shape=(1060997,), dtype=int64, numpy=array([45, 48, 57, ..., 53, 46,  2], dtype=int64)>

In [10]:
# Wrap the ID tensor in a tf.data.Dataset via from_tensor_slices; the recorded
# element_spec shows each element is a scalar int64 (one character ID).
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
ids_dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [11]:
# Decode and print the first ten IDs to confirm the ID -> character round trip.
for char_id in ids_dataset.take(10):
    char_bytes = chars_from_ids(char_id).numpy()
    print(char_bytes.decode('utf-8'))

f
i
r
s
t
 
c
i
t
i


In [12]:
# Number of characters per model input. The ID stream is later chunked into
# windows of seq_length + 1 so input and target can be offset by one.
seq_length = 100

In [13]:
# Chunk the ID stream into fixed-size windows of seq_length + 1 IDs; an
# incomplete final window is dropped.
window_size = seq_length + 1
sequences = ids_dataset.batch(window_size, drop_remainder=True)

# Show the characters of the first window.
for first_window in sequences.take(1):
    print(chars_from_ids(first_window).numpy())

[b'f' b'i' b'r' b's' b't' b' ' b'c' b'i' b't' b'i' b'z' b'e' b'n' b' '
 b'b' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o' b'c'
 b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h' b'e'
 b'r' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p' b'e' b'a'
 b'k' b' ' b' ' b'a' b'l' b'l' b' ' b's' b'p' b'e' b'a' b'k' b' ' b's'
 b'p' b'e' b'a' b'k' b' ' b' ' b'f' b'i' b'r' b's' b't' b' ' b'c' b'i'
 b't' b'i' b'z' b'e' b'n' b' ' b'y' b'o' b'u' b' ' b'a' b'r' b'e' b' '
 b'a' b'l' b'l']


In [14]:
# Print the first five windows as joined strings.
for window in sequences.take(5):
    joined = text_from_ids(window)
    print(joined.numpy())

b'first citizen before we proceed any further hear me speak  all speak speak  first citizen you are all'
b' resolved rather to die than to famish  all resolved resolved  first citizen first you know caius mar'
b'cius is chief enemy to the people  all we knowt we knowt  first citizen let us kill him and well have'
b' corn at our own price ist a verdict  all no more talking ont let it be done away away  second citize'
b'n one word good citizens  first citizen we are accounted poor citizens the patricians good what autho'


In [15]:
def split_input_target(sequence):
    """Return an `(input, target)` pair of slices offset by one position.

    The input is everything except the last element and the target is
    everything except the first, so target[i] is the element that follows
    input[i] in `sequence`.
    """
    return sequence[:-1], sequence[1:]

In [16]:
# Turn every window into an (input, target) pair using split_input_target.
dataset = sequences.map(split_input_target)

In [17]:
# Show the first (input, target) pair as text; the target is the input shifted by one.
for input_example, target_example in dataset.take(1):
    for label, example in (("Input :", input_example), ("Target:", target_example)):
        print(label, text_from_ids(example).numpy())

Input : b'first citizen before we proceed any further hear me speak  all speak speak  first citizen you are al'
Target: b'irst citizen before we proceed any further hear me speak  all speak speak  first citizen you are all'


In [18]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64

# Size of the shuffle buffer: tf.data shuffles elements within a buffer of
# this many entries rather than shuffling the whole dataset in memory.
BUFFER_SIZE = 10000

dataset = (
    dataset
    # Shuffle the sequences before batching. BUFFER_SIZE was previously
    # defined but never used, so training saw the sequences in corpus order.
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # Overlap input preparation with model execution; AUTOTUNE lets tf.data
    # pick the prefetch buffer size.
    .prefetch(tf.data.AUTOTUNE)
)

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [19]:
# Number of entries in the StringLookup vocabulary; passed to Model as the
# Embedding input size and the Dense output width.
vocab_size = len(ids_from_chars.get_vocabulary())

# Width of the character embedding vectors.
embedding_dim = 256

# Number of units in the recurrent (LSTM) layer.
rnn_units = 1024

In [20]:
class Model(tf.keras.Model):
    """Character-level language model: Embedding -> LSTM -> Dense.

    The Dense layer has no activation, so the model outputs one raw score
    (logit) per vocabulary entry for every input position.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: number of vocabulary entries (embedding rows and
                output logits).
            embedding_dim: width of the embedding vectors.
            rnn_units: number of LSTM units.
        """
        # `super()` already binds `self`; passing it again would hand the base
        # initialiser a stray positional argument.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of ID sequences.

        Args:
            inputs: batch of character-ID sequences fed to the embedding.
            states: optional initial LSTM state; when None the layer's own
                initial state is used.
            return_state: when True, also return the final LSTM state.
            training: forwarded to every layer.

        Returns:
            The Dense output, or `(output, [state_h, state_c])` when
            `return_state` is True.
        """
        x = self.embedding(inputs, training=training)
        if states is None:
            states = self.lstm.get_initial_state(x)
        x, state_h, state_c = self.lstm(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, [state_h, state_c]
        return x

In [21]:
# Instantiate the network with the hyper-parameters chosen for this notebook.
model = Model(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)

In [22]:
# Run a single batch through the untrained model; this builds its weights and
# lets us check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 66) # (batch_size, sequence_length, vocab_size)


In [23]:
# Print the layer list and parameter counts of the built model.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 lstm (LSTM)                 multiple                  5246976   
                                                                 
 dense (Dense)               multiple                  67650     
                                                                 
Total params: 5,331,522
Trainable params: 5,331,522
Non-trainable params: 0
_________________________________________________________________


In [24]:
# The model emits raw logits (its Dense layer has no activation) and the
# targets are integer IDs, hence sparse cross-entropy with from_logits=True.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(),
)

In [25]:
# Number of epochs for the model.fit call in the next cell (currently commented out).
EPOCHS = 20

In [26]:
# Training is disabled in this notebook; text generation further down uses
# weights loaded from disk instead.
# NOTE(review): these lines would save to './mw.h5', while the reload cell
# reads "./mw3.h5" - confirm which weights file is intended.
# history = model.fit(dataset, epochs=EPOCHS)
# model.save_weights('./mw.h5')

In [27]:
class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one next character per call."""

  def __init__(self, m_model, m_chars_from_ids, m_ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and build the "[UNK]" mask.

    Args:
      m_model: model called with `inputs`, `states` and `return_state=True`.
      m_chars_from_ids: layer mapping IDs back to characters.
      m_ids_from_chars: layer mapping characters to IDs.
      temperature: divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = m_model
    self.chars_from_ids = m_chars_from_ids
    self.ids_from_chars = m_ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Size the mask from the lookup layer passed to this instance rather
        # than from a module-level variable that happens to share its name.
        dense_shape=[len(self.ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: string tensor; each string is split into characters.
      states: optional recurrent state from a previous call.

    Returns:
      `(predicted_chars, states)`: the sampled characters and the state
      returned by the wrapped model.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model and keep only the logits of the last position.
    predicted_logits, states = self.model(inputs=input_ids, states=states, return_state=True)
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits / self.temperature
    # Adding -inf to the "[UNK]" logit prevents it from being sampled.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert the sampled IDs back to characters.
    predicted_chars = self.chars_from_ids(predicted_ids)

    return predicted_chars, states

In [28]:
# Fresh, untrained model with the same hyper-parameters as `model`.
reload_model = Model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

# Call the model once on a real batch so its weights are created before
# loading the saved values; keep this ahead of load_weights.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = reload_model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# Restore trained weights from disk.
# NOTE(review): the commented-out training cell saves to './mw.h5', not
# "./mw3.h5" - confirm this is the intended file.
reload_model.load_weights("./mw3.h5")

# Single-step sampler used by the generation cell.
one_step_model = OneStep(reload_model, chars_from_ids, ids_from_chars)

(64, 100, 66) # (batch_size, sequence_length, vocab_size)


In [30]:
import functools


def timer(func):
    """Decorator that prints how long each call to `func` took.

    After the wrapped call returns, the elapsed wall-clock time (measured with
    time.perf_counter) is printed, followed by a line of 80 underscores and
    blank lines. The wrapped function's return value is passed through.
    """
    @functools.wraps(func)
    def timed(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        finished = time.perf_counter()
        elapsed = finished - began
        print(f"Elapsed time: {elapsed:.6f} seconds\n" + '_'*80 + '\n\n')
        return outcome

    return timed

# NOTE(review): this function is named `text`, which rebinds the module-level
# `text` variable that holds the corpus string in earlier cells. Re-running
# those cells after this one will see the function instead - consider renaming.
@timer
def text(next_char, states=None, num_generate=1000):
    """Generate text by repeatedly sampling from `one_step_model`.

    Args:
        next_char: seed string the generation starts from.
        states: optional initial recurrent state passed to the first step.
        num_generate: number of sampling steps to run (default 1000, the
            value that used to be hard-coded).

    Returns:
        The seed followed by the generated characters, as a Python str.
    """
    next_char = tf.constant([next_char])
    result = [next_char]
    for _ in range(num_generate):
        # Feed the previous sample and state back in to get the next one.
        next_char, states = one_step_model.generate_one_step(next_char, states=states)
        result.append(next_char)
    # Join the pieces, take the single batch element and decode the bytes.
    return tf.strings.join(result)[0].numpy().decode('utf-8')

print(text('ROMEO:'))

Elapsed time: 5.986987 seconds
________________________________________________________________________________


ROMEO:
Go, seve at her news into beheld the him
And labbed the pay and saved his unlawful fed
Murders not sent footing less. This is the sword:
Yet yout persuasion bad it is not in.'

Nurse:
My lord, what false Ladys you be drawn'd sound?

First Officer:
Marry: masters! First.

MERCUTIO:
O, though they are fornick I had been enemies,
An't. Inless a soar plainly bishopouse,
Which almost Soliciar-sarricands, worthy, as they
with those tongue, to turn thy chair her shed;
Be-chasuded that thy schoop'd my brother
With revenges diked thee on thy speech,
In the peace to have him leave a value veil
The vall of your majesty is almost,
As dog into something be brief.

LEONTES:
Is it they gosset and fair son?
Can you are well in Secanday? and the ream
Luintle imina much flown as Capulet
Fit for differences the patrenting sweets!

ROMEO:
Aufidius coriol-Bent for, it send
How he indeed 