In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

In [2]:
import numpy as np
import os
import time

In [3]:
import pickle
with open("../data/wsb_vocab.pickle", 'rb') as f:
    vocab = pickle.load(f)

# Vectorizing Text

In [4]:
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [5]:
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab))

In [6]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[73, 74, 75, 76, 77, 78, 79], [96, 97, 98]]>

In [7]:
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)

In [8]:
chars_from_ids(ids)

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [9]:
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

In [10]:
text_from_ids(ids)

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'abcdefg', b'xyz'], dtype=object)>

# Create dataset

In [11]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [12]:
TEXT_SIZE = 250

data = open("../data/wsb_script.txt", "r")
def data_gen():
    while True:
        text = next(data)
        yield ids_from_chars((list(text)+['']*TEXT_SIZE)[:TEXT_SIZE])

In [13]:
next(data_gen())

<tf.Tensor: shape=(250,), dtype=int64, numpy=
array([52, 87, 84, 24, 10, 65, 77, 73, 80, 22, 10, 63, 77, 84, 88, 24,  4,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

In [14]:
dataset = tf.data.Dataset.from_generator(data_gen, output_types=tf.int64)

In [15]:
for ids in dataset.take(10):
    print(ids)

tf.Tensor(
[59 87 84 76 10 73 10 75 87 94 77 90 77 76 10 75 73 84 84 10 73 92 10 31
 32 26 24 10 49 10 73 84 85 87 91 92 10 95 81 91 80 10 81 92 10 76 90 87
 88 91 10 74 73 75 83 10 76 87 95 86 10 73 10 74 81 92 24 60 87 10 92 80
 81 86 83 10 49 10 76 81 76 86 17 92 10 74 93 97 10 73 86 10 77 96 92 90
 73 10 31 30 26 10 75 73 84 84 10 74 77 75 73 93 91 77 10 92 80 77 10 88
 90 81 75 77 10 76 81 76 86 17 92 10 76 90 87 88 10 73 86 87 92 80 77 90
 10 31 75 24 10 55 80 10 95 77 84 84 11  4  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0], shape=(250,), dtype=int64)
tf.Tensor(
[54 46 52 64 10 74 77 75 73 93 91 77 10 77 73 90 86 81 86 79 10 90 77 88
 87 90 92 91 10 90 77 84 77 73 91 77 91 24 10 21 27 26 23 27 28 15 10 78
 87 90 10 92 80 77 10 95 77 77 83 10 81 91

In [16]:
dataset = dataset.map(split_input_target)
dataset

<MapDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>

In [17]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 20

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(1))

dataset

<PrefetchDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>

In [18]:
for ids in dataset.take(1):
    print(ids)

(<tf.Tensor: shape=(64, 249), dtype=int64, numpy=
array([[49, 10, 92, ...,  0,  0,  0],
       [54, 87, 81, ...,  0,  0,  0],
       [44, 87, 86, ...,  0,  0,  0],
       ...,
       [85, 77, 10, ...,  0,  0,  0],
       [60, 80, 73, ...,  0,  0,  0],
       [49, 86, 92, ...,  0,  0,  0]])>, <tf.Tensor: shape=(64, 249), dtype=int64, numpy=
array([[10, 92, 80, ...,  0,  0,  0],
       [87, 81, 91, ...,  0,  0,  0],
       [87, 86, 17, ...,  0,  0,  0],
       ...,
       [77, 10, 92, ...,  0,  0,  0],
       [80, 73, 92, ...,  0,  0,  0],
       [86, 92, 77, ...,  0,  0,  0]])>)


# Model

In [19]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [20]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [21]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [22]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 249, 2061) # (batch_size, sequence_length, vocab_size)


In [23]:
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  527616    
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  2112525   
Total params: 6,578,445
Trainable params: 6,578,445
Non-trainable params: 0
_________________________________________________________________


In [24]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [25]:
sampled_indices

array([ 698, 1017, 1045,  277, 1648, 1683,  378, 1796, 1583,   85,  776,
       1872, 1456, 1690, 1777, 1412, 1834, 1780, 1458,  298,   37, 1178,
        966, 1343, 1192,  227, 1346, 1924, 1185,  513, 1887, 2059,  680,
       1892,  368,  333,  983,  578,  710, 1112, 1318,  446, 1366,  329,
       1716,  835, 1501,   99, 1741, 1399, 1417,  990,   20,  333,  260,
        207, 1631, 1051, 1495,  178,  117,   90,  688,  549, 1317,  127,
        725,  329,  884, 1134, 1786, 1614, 1952,   10, 1871, 1430,  948,
        318, 1841,  201, 1156,  769,  584,  272,  592, 1526,   29,  786,
        489, 1161,  348,  435, 1733, 1802,  911,  479, 1609, 1410,  578,
       1600,  730, 1238,  557, 1819, 1700, 1661,  715, 2045, 1798, 1031,
       1268,   68,  689,  716, 1253,  207, 1883, 1228, 1500, 1053, 2040,
        141,  366,  677, 1310, 1952,  564,  722, 2026,  496, 1133,  515,
       1677, 1998, 1812,  560, 1391,  929, 1406, 1711, 1646, 2049,  253,
        448, 1540, 1681, 1203,  724, 1684, 1292,  7

In [26]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())


Input:
 b"Thays funny because my bullshit alarm blew out all the windows in the room I'm in upon reading YOUR comment.Oncs is the real deal bruh.\n"

Next Char Predictions:
 b'\xe2\x80\x98\xe2\xb8\xae\xe3\x81\x95\xcb\xb5\xe8\x92\xb8\xe8\xb0\xb7\xcd\xa2\xeb\x8a\x94\xe7\x9f\xa5m\xe2\x89\xa1\xed\x82\xa8\xe6\x92\xad\xe8\xb4\xb4\xea\xb0\x80\xe6\x80\xbb\xec\x8b\x9c\xea\xb0\x9c\xe6\x93\x94\xcc\x92;\xe4\xba\x8e\xe2\x9d\x97\xe5\xad\xa9\xe4\xbb\x98\xc6\xbd\xe5\xae\x8c\xef\xbc\xb0\xe4\xba\xb2\xd7\x92\xef\xb8\xba\xf0\x9f\xa4\x91\xe1\xba\xa7\xef\xbb\xbf\xcd\x98\xcc\xb5\xe2\xa0\xb0\xdb\xa9\xe2\x80\xb2\xe3\x83\x88\xe5\xa4\x96\xd0\x9d\xe5\xb0\x94\xcc\xb1\xe9\x81\xa0\xe2\x96\x88\xe6\x9f\xb4{\xe9\x9b\x9e\xe5\xbc\x95\xe6\x82\xaa\xe2\xa1\x9b*\xcc\xb5\xca\x9f\xc5\x91\xe8\x85\xbe\xe3\x81\x9f\xe6\x9d\xa1\xc3\xba\xc2\xafr\xe2\x80\x8a\xd8\xb3\xe5\xa4\x87\xc2\xba\xe2\x82\xaa\xcc\xb1\xe2\x98\xad\xe3\x83\xaa\xea\xb8\x95\xe7\xbe\x8e\xef\xbd\x90 \xec\xb9\x9c\xe6\x89\x8d\xe2\x9c\x8f\xcc\xa6\xec\x95\xbc\xc4\xb9\xe4\x

# Train

In [27]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [28]:
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 249, 2061)  # (batch_size, sequence_length, vocab_size)
Mean loss:         7.6326137


In [29]:
tf.exp(mean_loss).numpy()

2064.4387

In [30]:
model.compile(optimizer='adam', loss=loss)

In [31]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [32]:
EPOCHS=20
SPE=100 

In [33]:
def train_model():
    acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
    for epoch in range(EPOCHS):
        for (x_batch, y_batch) in dataset.range(epoch*SPE, epoch*SPE + SPE):
            with tf.GradientTape() as tape:
                y_pred = model(x_batch, training=True)
                loss = loss_fn(y_batch, y_pred)
            
            gradients = tape.gradients(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))
            acc_metric.update_state(y_batch, y_pred)
        print(f"Accuracy over epoch {acc_metric}")
        acc_metric.reset_states()

In [34]:
for X, y in dataset:
  model.fit(X, y)



KeyboardInterrupt: 

In [35]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "" or "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['', '[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [36]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars, 1.0)

In [37]:
one_step_model.temperature = 1.0

In [38]:
import random

In [39]:
start = time.time()
states = None
next_char = tf.constant(['moo, '])
result = [next_char]

tweet_len = random.randint(12, 104)
i = 0
next_char, states = one_step_model.generate_one_step(next_char, states=states)
while (not next_char in [' ', '\n']) or i<tweet_len:
    result.append(next_char)
    i+=1
    next_char, states = one_step_model.generate_one_step(next_char, states=states)

print(f"{tweet_len}|{len(result)}")
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

93|95
moo, ⏳יyUdu3 tsaoedsj.a.cl ihet t anayn sor tnlmQ0ni mot .i!tcdtttbbth ,lo  aea,i csmTett 岛5eak:tef 

________________________________________________________________________________

Run time: 1.7885942459106445


In [None]:
text = str(result[0].numpy().decode('utf-8'))
print(f"{text}\n{text.encode()}")

In [None]:
tf.saved_model.save(one_step_model, '../data/one_step')

In [44]:
one_step_reloaded = tf.saved_model.load('../data/one_step')



In [47]:
one_step_reloaded.model.save_weights('../data/weights')


AttributeError: '_UserObject' object has no attribute 'save_weights'

In [None]:
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

for n in range(100):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode("utf-8"))


In [41]:
start = time.time()
states = None
next_char = tf.constant(['ROMEO:']*5)
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)


tf.Tensor(
[b"ROMEO:\nI s !raey'e e ea m\n uotee n arrl fr  ahceo o in  ytorn hsaodlooosnews.-st earldG WeefMatawneuasstaeta sdy. dj ghhn aer3 t iw sieaeolnleio  awtdutleeoesb eat/bmh tc kawodw i nowti. vtr yuud.hdidRtwaHneveFtrtr oullt istyeh ns mri  taaa iFeni3srnumr ttheetryke'ihhfual.eev   remtetut e'r h tm.t ainttet a'ieitpetf$.thl3tu tpt.oeifsCace1iais e d 0oe astrtefDmkome'5w5  artltneslyo Tate.gi t?klwe\nt ria t o  eaerettibhuenehe\ntjMw   atrvrg  inftraa!oka0athoaeIafiaaisros    tiOd de nc pw drrctrifei ar.  nrdt .ei sCooreinai sb  onfsho geuw at yovdeesrimty llsostyyn\ndi:erio37ieyegg3t h /YrdIe th nLsaenyaol okl  i dpeh tdetbr  M.fe aoisnah oay ws te  'e\nuee btt onr.ebl a sei sdoeeh0m.sw ageha b,fhoe hf w awo sE tjis\njtrdhn0buuentoinffnifnowtonsoot rynm om ma \ngaaouhkowcpsssem swoeteLm emlreqCn d  .NkI\npet,,bvaoensitNritttiaher Tt\n  isd Yider/msoo,dt .tmmx t yeel1rbywga utka a t lomoa teeuihae.fail0,eo  it i,sew reentpeim tu0rgaehj i ti 4e,1dgyh,si! 0iaeftdni t.ry oyetw