In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

In [2]:
import numpy as np
import os
import time

In [3]:
import pickle
with open("data/wsb_vocab.pickle", 'rb') as f:
    vocab = pickle.load(f)

# Vectorizing Text

In [6]:
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [7]:
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab))

In [8]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[73, 74, 75, 76, 77, 78, 79], [96, 97, 98]]>

In [9]:
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)

In [10]:
chars_from_ids(ids)

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [11]:
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

In [12]:
text_from_ids(ids)

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'abcdefg', b'xyz'], dtype=object)>

# Create dataset

In [13]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [14]:
TEXT_SIZE = 250

data = open("data/wsb_script.txt", "r")
def data_gen():
    while True:
        text = next(data)
        yield ids_from_chars((list(text)+['']*TEXT_SIZE)[:TEXT_SIZE])

In [15]:
next(data_gen())

<tf.Tensor: shape=(250,), dtype=int64, numpy=
array([52, 87, 84, 24, 10, 65, 77, 73, 80, 22, 10, 63, 77, 84, 88, 24,  4,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

In [16]:
dataset = tf.data.Dataset.from_generator(data_gen, output_types=tf.int64)

In [17]:
for ids in dataset.take(10):
    print(ids)

tf.Tensor(
[59 87 84 76 10 73 10 75 87 94 77 90 77 76 10 75 73 84 84 10 73 92 10 31
 32 26 24 10 49 10 73 84 85 87 91 92 10 95 81 91 80 10 81 92 10 76 90 87
 88 91 10 74 73 75 83 10 76 87 95 86 10 73 10 74 81 92 24 60 87 10 92 80
 81 86 83 10 49 10 76 81 76 86 17 92 10 74 93 97 10 73 86 10 77 96 92 90
 73 10 31 30 26 10 75 73 84 84 10 74 77 75 73 93 91 77 10 92 80 77 10 88
 90 81 75 77 10 76 81 76 86 17 92 10 76 90 87 88 10 73 86 87 92 80 77 90
 10 31 75 24 10 55 80 10 95 77 84 84 11  4  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0], shape=(250,), dtype=int64)
tf.Tensor(
[54 46 52 64 10 74 77 75 73 93 91 77 10 77 73 90 86 81 86 79 10 90 77 88
 87 90 92 91 10 90 77 84 77 73 91 77 91 24 10 21 27 26 23 27 28 15 10 78
 87 90 10 92 80 77 10 95 77 77 83 10 81 91

In [18]:
dataset = dataset.map(split_input_target)
dataset

<MapDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>

In [19]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 20

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(1))

dataset

<PrefetchDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>

In [20]:
for ids in dataset.take(1):
    print(ids)

(<tf.Tensor: shape=(64, 249), dtype=int64, numpy=
array([[49, 10, 84, ...,  0,  0,  0],
       [46, 81, 90, ...,  0,  0,  0],
       [25, 90, 25, ...,  0,  0,  0],
       ...,
       [53, 81, 79, ...,  0,  0,  0],
       [92, 80, 81, ...,  0,  0,  0],
       [60, 80, 77, ...,  0,  0,  0]])>, <tf.Tensor: shape=(64, 249), dtype=int64, numpy=
array([[10, 84, 87, ...,  0,  0,  0],
       [81, 90, 91, ...,  0,  0,  0],
       [90, 25, 81, ...,  0,  0,  0],
       ...,
       [81, 79, 80, ...,  0,  0,  0],
       [80, 81, 91, ...,  0,  0,  0],
       [80, 77, 10, ...,  0,  0,  0]])>)


# Model

In [21]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [22]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [23]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [24]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 249, 2061) # (batch_size, sequence_length, vocab_size)


In [25]:
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  527616    
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  2112525   
Total params: 6,578,445
Trainable params: 6,578,445
Non-trainable params: 0
_________________________________________________________________


In [27]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [28]:
sampled_indices

array([1416,  669,  988, 1024,  627, 1138,  493, 1584,  148,  131,  432,
       1336, 1469, 1613,   97, 1347,   78, 1805, 1905, 1025,  252,  639,
         55,  537, 1529, 2042, 1219, 1906,  618,  680,  352,  812, 1750,
       1635,  487, 1803, 1967, 1161, 1375,  118, 1539, 1105, 1565, 1216,
       1342,  582,  583,  529, 1573, 1060, 2005,  680,  672, 1498, 1513,
       1638, 1042, 1722,  805,  237,  997, 2048,  400,   97,  528,  297,
       2024,  213,  256, 1654, 1612,  804, 1589,  909,  423, 1324, 1764,
        427,  961,   92, 1151, 1110, 1388, 1901,  560,  953,  818, 1042,
         63,  596, 1815, 1261, 1901, 1291, 1484, 1154, 1114, 1659, 1718,
        581,  813,  609,  581,  858,  991, 2045, 1563,  828,  803, 1548,
       1055,  955, 1382, 1246,  872,  597, 1204, 1516, 1286,  426, 1604,
       1875,  201,   57, 1343,   27, 1209, 2023,  999,  674, 2011, 1027,
       1294, 1780,  585, 1483, 2014,  166, 1888, 1170, 1838, 1931,  921,
        219,  331, 1673,   14, 1639, 1935,  389,  2

In [29]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())


Input:
 b"That's silly, softbank has ADRs and all\n"

Next Char Predictions:
 b'\xe6\x82\xa8\xe1\xb6\x85\xe2\xa1\x81\xe3\x80\x8b\xe0\xbc\xac\xe3\x83\xba\xd1\x97\xe7\xa0\x94\xc3\x9b\xc2\xbf\xcf\x8d\xe5\xa7\xbf\xe6\x96\xb0\xe7\xbb\xbfy\xe5\xae\x97f\xeb\x9e\x9c\xef\xbc\x98\xe3\x80\x8c\xca\x8e\xe1\x95\xa6O\xd8\xa5\xe6\xb2\xa1\xf0\x9f\x98\xa0\xe5\x81\x9a\xef\xbc\x9a\xe0\xb9\x84\xe1\xba\xa7\xcd\x88\xe2\x94\xbb\xe9\xab\x94\xe8\x89\xba\xd1\x8d\xeb\x94\x94\xef\xbe\x9f\xe4\xb8\xaa\xe5\xb2\x81\xc2\xb0\xe6\xb7\xb7\xe3\x82\xb9\xe7\x94\xa8\xe5\x80\x8b\xe5\xad\xa6\xe0\xae\x87\xe0\xae\x9c\xd7\xa4\xe7\x9b\xae\xe3\x81\xa9\xf0\x9f\x92\x9d\xe1\xba\xa7\xe1\xb6\xa0\xe6\x9e\x9c\xe6\xad\x89\xe8\x8b\xa5\xe3\x81\x8f\xe9\x87\x8c\xe2\x94\x98\xc9\x9f\xe2\xa3\x80\xf0\x9f\x98\xaa\xce\x9cy\xd7\xa2\xcc\x91\xf0\x9f\x98\x80\xc5\xab\xca\x96\xe8\xa1\x97\xe7\xbb\xad\xe2\x94\x97\xe7\xa5\x96\xe2\x99\xad\xcf\x80\xe5\xa4\xb1\xea\x80\xb8\xcf\x84\xe2\x9c\xbft\xe3\x85\x9b\xe3\x83\x86\xe5\xb8\xbd\xef\xbc\x8f\xd9\x85\xe2\x9c\x97\xe

# Train

In [30]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [31]:
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 249, 2061)  # (batch_size, sequence_length, vocab_size)
Mean loss:         7.624325


In [32]:
tf.exp(mean_loss).numpy()

2047.3976

In [33]:
model.compile(optimizer='adam', loss=loss)

In [34]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [35]:
EPOCHS=20
SPE=100 

In [36]:
def train_model():
    acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
    for epoch in range(EPOCHS):
        for (x_batch, y_batch) in dataset.range(epoch*SPE, epoch*SPE + SPE):
            with tf.GradientTape() as tape:
                y_pred = model(x_batch, training=True)
                loss = loss_fn(y_batch, y_pred)
            
            gradients = tape.gradients(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))
            acc_metric.update_state(y_batch, y_pred)
        print(f"Accuracy over epoch {acc_metric}")
        acc_metric.reset_states()

In [37]:
for X, y in dataset:
  model.fit(X, y)



























































KeyboardInterrupt: 

In [38]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "" or "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['', '[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [39]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars, 1.0)

In [111]:
one_step_model.temperature = 1.0

In [6]:
import random

In [10]:
start = time.time()
states = None
next_char = tf.constant(['moo, '])
result = [next_char]

tweet_len = random.randint(12, 104)
i = 0
next_char, states = one_step_model.generate_one_step(next_char, states=states)
while (not next_char in [' ', '\n']) or i<tweet_len:
    result.append(next_char)
    i+=1
    next_char, states = one_step_model.generate_one_step(next_char, states=states)

print(f"{tweet_len}|{len(result)}")
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

82|86
moo, gotta see Thursday, how did you buy down ? Grat.
oooo stock RARD and AIA for laughing 

________________________________________________________________________________

Run time: 0.2729949951171875


In [9]:
text = str(result[0].numpy().decode('utf-8'))
print(f"{text}\n{text.encode()}")

moo, joining it at your newsger to that little, -15%
b'moo, joining it at your newsger to that little, -15%'


In [41]:
tf.saved_model.save(one_step_model, 'data/one_step')





INFO:tensorflow:Assets written to: data/one_step/assets


INFO:tensorflow:Assets written to: data/one_step/assets


In [4]:
one_step_model = tf.saved_model.load('data/one_step')