# Greedy decoding

In [None]:
import tensorflow as tf

In [None]:
# Download all of Shakespeare's works and read them into one string
shakespeare_url = "https://homl.info/shakespeare" # shortcut URL
# get_file's return value is used below as the path of the local copy
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
  shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Peek at the first 80 characters of the corpus
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [None]:
# Encode the text at character level
text_vec_layer = tf.keras.layers.TextVectorization(split="character", # one token per character
                                                   standardize="lower") # lowercase everything

# Build the vocabulary from the whole corpus (passed as a single document)
text_vec_layer.adapt([shakespeare_text])
# Vectorize the corpus; [0] selects the only row of the one-document batch
encoded = text_vec_layer([shakespeare_text])[0]

In [None]:
# Inspect the encoded token IDs
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [None]:
# Let's count characters
# `encoded` is recomputed from the source text instead of being updated with
# `encoded -= 2`, so re-running this cell cannot shift the token IDs a second time.
encoded = text_vec_layer([shakespeare_text])[0] - 2 # drop tokens 0 (pad) and 1 (unknown), which won't be used
n_tokens = text_vec_layer.vocabulary_size() - 2 # number of distinct chars = 39
dataset_size = len(encoded)
dataset_size

1115394

In [None]:
# Create input/target window pairs
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
  """Build a batched dataset of (input, target) windows from a token sequence.

  Every window holds `length + 1` consecutive tokens and consecutive windows
  start one token apart. The input is the window without its last token and
  the target is the window without its first token.

  Args:
    sequence: 1-D sequence of token IDs.
    length: number of tokens in each input (and target) window.
    shuffle: if True, shuffle the windows with a 100,000-element buffer.
    seed: random seed passed to the shuffle.
    batch_size: number of windows per batch.

  Returns:
    A prefetched tf.data.Dataset yielding (inputs, targets) batches.
  """
  window_size = length + 1
  windows = (
      tf.data.Dataset.from_tensor_slices(sequence)
      .window(window_size, shift=1, drop_remainder=True)
      # each window is itself a small dataset; batch it into a single tensor
      .flat_map(lambda nested: nested.batch(window_size))
  )
  if shuffle:
    windows = windows.shuffle(buffer_size=100_000, seed=seed)

  def split_input_target(batch):
    return batch[:, :-1], batch[:, 1:]

  return windows.batch(batch_size).map(split_input_target).prefetch(1)

In [None]:
length = 100 # number of input tokens per window
tf.random.set_seed(42)
# First 1,000,000 tokens for training (shuffled), the next 60,000 for validation,
# and the remainder for testing
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True,
                       seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

In [None]:
# Let's build the model: embedding -> GRU -> softmax over the characters at every time step
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
# Checkpoint callback monitoring validation accuracy, keeping only the best model
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True)
history = model.fit(train_set, validation_data=valid_set, epochs=2,
                    callbacks=[model_ckpt], verbose=1)

Epoch 1/2
  31246/Unknown [1m408s[0m 12ms/step - accuracy: 0.5449 - loss: 1.5073



[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m425s[0m 13ms/step - accuracy: 0.5449 - loss: 1.5073 - val_accuracy: 0.5329 - val_loss: 1.6036
Epoch 2/2
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m407s[0m 12ms/step - accuracy: 0.5987 - loss: 1.2895 - val_accuracy: 0.5394 - val_loss: 1.5806


In [None]:
# Save the model as it stands after the last epoch (separate from the best-accuracy checkpoint file)
model.save("char_rnn.keras")

In [None]:
# Preprocessing: wrap the trained model so that it accepts raw strings
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X - 2), # shift IDs: no pad (0) or unknown (1) tokens
    model
])

NameError: name 'model' is not defined

In [None]:
# Greedy prediction of the next character for a prompt.
# The model was trained on IDs shifted down by 2 (pad/unknown removed), so the
# vectorized prompt must be shifted the same way before it is fed to the model.
prompt_ids = text_vec_layer(["Wear my heart upon my sleev"]) - 2
predicted_index = tf.argmax(model.predict(prompt_ids)[0, -1])
# Shift back by 2 to index into the vectorizer's vocabulary
text_vec_layer.get_vocabulary()[predicted_index + 2]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


np.str_('s')

# Let's randomize our Data!

In [None]:
import tensorflow as tf

In [None]:
# Draw 8 class samples from a categorical distribution given as log-probabilities
log_probas = tf.math.log([[0.5, 0.4, 0.1]]) # class 0 = 50%, class 1 = 40%, 2 = 10%
tf.random.set_seed(2138)
tf.random.categorical(log_probas, num_samples=8)

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 1, 0, 0, 1, 1, 0, 0]])>

In [None]:
# Rescale the log-values by a temperature of 0.5, then draw a single sample
logits = tf.math.log([3.3, 1.2, 4.3, 10.]) / 0.5
print(logits)
# expand_dims adds the leading batch axis before sampling
int(tf.random.categorical(tf.expand_dims(logits, 0), num_samples=1)[0, 0])

tf.Tensor([2.387845  0.3646432 2.9172301 4.6051702], shape=(4,), dtype=float32)


3

In [None]:
def next_char(text, temperature=1):
  """Sample the character that follows `text` using `shakespeare_model`.

  Args:
    text: the prompt, passed to `shakespeare_model.predict` wrapped in a list.
    temperature: divisor applied to the log-probabilities before sampling.

  Returns:
    The sampled vocabulary entry (one character).
  """
  # verbose=0: this runs once per generated character, so the default progress
  # bar would flood the cell output.
  y_proba = shakespeare_model.predict([text], verbose=0)[0, -1:]
  rescaled_logits = tf.math.log(y_proba) / temperature
  char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
  # +2 undoes the ID shift applied inside shakespeare_model
  return text_vec_layer.get_vocabulary()[char_id + 2]

next_char(tf.constant(['Wear my heart upon my sleev']))

In [None]:
def extend_test(text, n_chars=50, temperature=1):
  """Append `n_chars` sampled characters to `text`, one at a time, and return the result."""
  for _step in range(n_chars):
    sampled = next_char(text, temperature)
    text = text + sampled
  return text

extend_test(tf.constant(["My name is"]), temperature=0.01)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42

<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'My name is the season, and therefore see\nher the death to th'],
      dtype=object)>

# Stateful RNN

In [None]:
# A stateful RNN must receive consecutive, non-overlapping windows (not windows
# such as 1 to 32 followed by 2 to 33), so the windows are emitted in order with batch size 1
def to_dataset_for_stateful_rnn(sequence, length):
  """Build (input, target) batches of size 1 from consecutive windows of `sequence`.

  Windows hold `length + 1` tokens and start `length` tokens apart, so the
  last token of one window is the first token of the next.
  """
  window_size = length + 1
  windows = tf.data.Dataset.from_tensor_slices(sequence).window(
      window_size, shift=length, drop_remainder=True)
  # flat_map turns the dataset of window-datasets into one dataset of tensors
  single_batches = windows.flat_map(
      lambda nested: nested.batch(window_size)).batch(1)

  def split_input_target(batch):
    return batch[:, :-1], batch[:, 1:]

  return single_batches.map(split_input_target).prefetch(1)

stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length)
stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000], length)
stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length)

In [None]:
# Inspect the first (inputs, targets) batch of the stateful test set
list(stateful_test_set)[0]

(<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
 array([[ 0, 19,  3,  8,  0, 15,  3, 13,  8,  0, 12,  4, 13, 20,  6,  2,
          1,  8,  0, 21, 15,  0, 15,  3, 13,  8,  0,  7,  1,  8, 25,  4,
          9,  2,  0,  6,  1,  8,  1, 23, 10, 14, 15,  0, 21,  3, 15,  0,
          7,  6,  4, 11, 11,  0, 19,  1,  2, 18,  6,  0,  2,  6,  1,  0,
          7, 18,  8,  5, 25,  1,  9,  1,  8,  0, 22,  8,  1,  7,  1,  9,
          2, 11, 15, 26, 10,  2,  6,  1,  0, 16,  3,  8,  7,  2,  0,  5,
          7,  0,  2,  6]])>,
 <tf.Tensor: shape=(1, 100), dtype=int64, numpy=
 array([[19,  3,  8,  0, 15,  3, 13,  8,  0, 12,  4, 13, 20,  6,  2,  1,
          8,  0, 21, 15,  0, 15,  3, 13,  8,  0,  7,  1,  8, 25,  4,  9,
          2,  0,  6,  1,  8,  1, 23, 10, 14, 15,  0, 21,  3, 15,  0,  7,
          6,  4, 11, 11,  0, 19,  1,  2, 18,  6,  0,  2,  6,  1,  0,  7,
         18,  8,  5, 25,  1,  9,  1,  8,  0, 22,  8,  1,  7,  1,  9,  2,
         11, 15, 26, 10,  2,  6,  1,  0, 16,  3,  8,  7,  2,  0,  5,  

In [None]:
# Extra code – shows one way to prepare a batched dataset for a stateful RNN
import numpy as np

def to_non_overlapping_windows(sequence, length):
    """Return a dataset of windows of `length + 1` tokens that start `length` tokens apart."""
    window_size = length + 1
    nested_windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True)
    return nested_windows.flat_map(lambda nested: nested.batch(window_size))

def to_batched_dataset_for_stateful_rnn(sequence, length, batch_size=32):
    """Build (input, target) batches whose rows each follow their own part of `sequence`.

    The sequence is split into `batch_size` parts; row i of every batch comes
    from part i, so each row continues where it left off in the previous batch.
    """
    per_row_datasets = tuple(
        to_non_overlapping_windows(chunk, length)
        for chunk in np.array_split(sequence, batch_size)
    )
    # zip yields one window per part; stack them into a single batch tensor
    stacked = tf.data.Dataset.zip(per_row_datasets).map(
        lambda *row_windows: tf.stack(row_windows))
    return stacked.map(lambda batch: (batch[:, :-1], batch[:, 1:])).prefetch(1)

list(to_batched_dataset_for_stateful_rnn(tf.range(20), length=3, batch_size=2))[0]

(<tf.Tensor: shape=(2, 3), dtype=int32, numpy=
 array([[ 0,  1,  2],
        [10, 11, 12]], dtype=int32)>,
 <tf.Tensor: shape=(2, 3), dtype=int32, numpy=
 array([[ 1,  2,  3],
        [11, 12, 13]], dtype=int32)>)

In [None]:
# Let's build a Stateful RNN itself
model = tf.keras.Sequential([
    # The batch size is fixed at 1 to match the stateful datasets above. The
    # sequence length is left open (None) rather than declared as 1, because the
    # training windows are `length` tokens long and generation uses other lengths.
    tf.keras.Input(shape=[None], batch_size=1),
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

# We need to reset the states before we go back to the beginning of the text
class ResetStatesCallback(tf.keras.callbacks.Callback):
  """Resets the recurrent layer's state at the start of every epoch."""
  def on_epoch_begin(self, epoch, logs=None):
    # layers[1] is expected to be the GRU — TODO confirm if the layer list changes
    self.model.layers[1].reset_states()

# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
# NOTE(review): this checkpoint path is the same one used for the earlier
# stateless model, so that file gets overwritten here.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True)
history = model.fit(stateful_train_set, validation_data=stateful_valid_set,
                    epochs=2, callbacks=[ResetStatesCallback(), model_ckpt])

Epoch 1/2
   9995/Unknown [1m87s[0m 8ms/step - accuracy: 0.3896 - loss: 2.1094



[1m9999/9999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 9ms/step - accuracy: 0.3897 - loss: 2.1093 - val_accuracy: 0.4930 - val_loss: 1.6966
Epoch 2/2
[1m9999/9999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 9ms/step - accuracy: 0.5227 - loss: 1.5849 - val_accuracy: 0.5193 - val_loss: 1.6031


In [None]:
# Save only the weights of the stateful model, replacing any existing file
model.save_weights("model.weights.h5", overwrite=True)

In [None]:
# Preprocessing: wrap the stateful model so that it accepts raw strings
# NOTE(review): the next_char helper defined below builds its own wrapper, so
# this variable appears unused in the cells shown — confirm before removing
stateful_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X - 2), # shift IDs: no pad (0) or unknown (1) tokens
    model
])

In [None]:
def next_char(text, model, temperature=1):
  """Sample the character that follows the string `text` using `model`.

  Note: this replaces the earlier `next_char(text, temperature)` defined in the
  greedy-decoding section; the model is now an explicit argument.

  Args:
    text: prompt string.
    model: character model taking token IDs shifted down by 2.
    temperature: divisor applied to the log-probabilities before sampling.

  Returns:
    The sampled vocabulary entry (one character).
  """
  # Vectorize and shift the IDs directly rather than building a new Sequential
  # wrapper on every call (this function runs once per generated character).
  token_ids = text_vec_layer(tf.constant([text])) - 2 # no pad (0) or unknown (1) tokens
  y_proba = model.predict(token_ids, verbose=0)[0, -1:]
  rescaled_logits = tf.math.log(y_proba) / temperature
  char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
  return text_vec_layer.get_vocabulary()[char_id + 2]

def next_chars(text, length, model, temperature=1):
  """Append `length` sampled characters to `text` and return the extended string."""
  for _ in range(length):
    text += next_char(text, model, temperature)
  return text

next_char("Hello my name i", model=model)
next_chars("Hello my", 10, model=model, temperature=0.1)



'Hello my lord,\nand'

In [None]:
# Apply weights from stateful RNN to stateless RNN
# Same layers as the stateful model, without stateful=True or a fixed batch size
stateless_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

# Build with unspecified batch size and sequence length so the weights exist before copying
stateless_model.build(tf.TensorShape([None, None]))

stateless_model.set_weights(model.get_weights())

In [None]:
# Generate 10 more characters with the stateless copy
next_chars("Hello my nam", length=10, model=stateless_model, temperature=0.5)

'Hello my name,\nwe will'

# Sentiment Analysis

In [None]:
# Import the IMDb reviews dataset
import tensorflow_datasets as tfds

# First 90% of the "train" split for training, the last 10% for validation, plus the "test" split
raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name="imdb_reviews",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True
)
# Batches of 32; only the training set is shuffled.
# Note: train_set / valid_set / test_set replace the Shakespeare datasets of the same names.
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.W4MNTD_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.W4MNTD_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.W4MNTD_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [None]:
# Let's look at a few reviews and their labels
for review, label in raw_train_set.take(4):
  print(review.numpy().decode("utf-8"))
  print("Label:", label.numpy())

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
Label: 0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development

In [None]:
# Tokenize
# Note: this rebinds text_vec_layer, replacing the character-level vectorizer used above
vocab_size = 1000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
# Adapt on the review texts only (labels dropped)
text_vec_layer.adapt(train_set.map(lambda review, label: review))

In [None]:
# Create the model (no masking): vectorizer -> embedding -> GRU -> sigmoid output
embed_size = 128
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=2)

Epoch 1/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.5061 - loss: 0.6935 - val_accuracy: 0.5012 - val_loss: 0.6930
Epoch 2/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 32ms/step - accuracy: 0.4948 - loss: 0.6933 - val_accuracy: 0.5008 - val_loss: 0.6936


### Masking

In [None]:
# Accuracy stayed at about 50% in the previous run; the suspected cause is that
# the model also processes the padding tokens.
# Setting mask_zero=True on the Embedding layer is meant to avoid that.
# Create the model
embed_size = 128
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size, mask_zero=True),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=2)

Epoch 1/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 34ms/step - accuracy: 0.6696 - loss: 0.5887 - val_accuracy: 0.8392 - val_loss: 0.3935
Epoch 2/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 33ms/step - accuracy: 0.8421 - loss: 0.3803 - val_accuracy: 0.8648 - val_loss: 0.3210


In [None]:
embed_size = 128

class MaskLayer(tf.keras.Layer):
    """Computes a boolean padding mask: True wherever the token ID is not 0."""
    def call(self, x):
        # Use the layer's own input; the original read the global `token_ids`
        return tf.math.not_equal(x, 0)

# Model with manual Masking
inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
token_ids = text_vec_layer(inputs)
# Call the layer on the token IDs so `mask` is a tensor, not a Layer object
mask = MaskLayer()(token_ids)
Z = tf.keras.layers.Embedding(vocab_size, embed_size)(token_ids)
Z = tf.keras.layers.GRU(128, dropout=0.2)(Z, mask=mask)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(Z)
model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

In [None]:
# A vectorizer that outputs ragged tensors (no padding), for feeding the model ragged inputs
text_vec_layer_ragged = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, ragged=True
)

text_vec_layer_ragged.adapt(train_set.map(lambda reviews, labels: reviews))
# Compare the ragged output with the padded output of the regular vectorizer
text_vec_layer_ragged(["Shit movie!", "This is bullshit"]), text_vec_layer(["Shit movie!", "This is bullshit"])

(<tf.RaggedTensor [[1, 18], [11, 7, 1]]>,
 <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
 array([[ 1, 18,  0],
        [11,  7,  1]])>)

In [None]:
embed_size = 128

# Functional-API model relying on mask_zero=True in the Embedding layer
# (no explicit mask is passed to the GRU here, unlike the manual-masking cell)
inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
token_ids = text_vec_layer(inputs)
Z = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)(token_ids)
Z = tf.keras.layers.GRU(128, dropout=0.2)(Z)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(Z)
model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

In [None]:
# Try using tf.keras.callbacks.TensorBoard() to visualize the embeddings
# NOTE(review): only histogram_freq is set here; if embedding visualization is
# the goal, the callback's embeddings option probably needs enabling too — confirm
tensor_board = tf.keras.callbacks.TensorBoard(histogram_freq=1)
model.compile(optimizer="nadam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(train_set, validation_data=valid_set, epochs=3, callbacks=[tensor_board])

Epoch 1/3
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.6345 - loss: 0.6249 - val_accuracy: 0.8092 - val_loss: 0.4302
Epoch 2/3
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 34ms/step - accuracy: 0.7456 - loss: 0.5302 - val_accuracy: 0.8408 - val_loss: 0.3601
Epoch 3/3
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 32ms/step - accuracy: 0.8588 - loss: 0.3341 - val_accuracy: 0.8640 - val_loss: 0.3187


<keras.src.callbacks.history.History at 0x78f876c94770>

In [None]:
# Evaluate on the test set; the output lists the loss followed by the accuracy
model.evaluate(test_set)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 17ms/step - accuracy: 0.8571 - loss: 0.3280


[0.3254099488258362, 0.8587999939918518]

In [None]:
# NOTE(review): the recorded outputs of this cell and the next one do not match
# this assignment (stale kernel state) — confirm what test_sample should hold
test_sample = ""

TypeError: Can't instantiate abstract class DatasetV2 without an implementation for abstract methods '_inputs', 'element_spec'

In [None]:
# Display the current value of test_sample
test_sample

<tf.Tensor: shape=(), dtype=string, numpy=b'What the hell is that?'>

In [None]:
# Print the second review of the first test batch (i[0] holds the reviews of the batch)
for i in test_set.take(1):
  print(i[0][1])

tf.Tensor(b"A blackly comic tale of a down-trodden priest, Nazarin showcases the economy that Luis Bunuel was able to achieve in being able to tell a deeply humanist fable with a minimum of fuss. As an output from his Mexican era of film making, it was an invaluable talent to possess, with little money and extremely tight schedules. Nazarin, however, surpasses many of Bunuel's previous Mexican films in terms of the acting (Francisco Rabal is excellent), narrative and theme.<br /><br />The theme, interestingly, is something that was explored again in Viridiana, made three years later in Spain. It concerns the individual's struggle for humanity and altruism amongst a society that rejects any notion of virtue. Father Nazarin, however, is portrayed more sympathetically than Sister Viridiana. Whereas the latter seems to choose charity because she wishes to atone for her (perceived) sins, Nazarin's whole existence and reason for being seems to be to help others, whether they (or we) like it 

In [None]:
sample_good = "This is a very good and nice awesome movie, nice!"
sample_bad = "This is a bad and very overrated and shit movie shit, bad!"

# Score each sample with the sentiment model. One loop replaces the two
# copy-pasted statements, and tf.constant builds the one-element string batch
# directly (the original used tf.cast for that).
for sample in (sample_good, sample_bad):
  positive_proba = float(model.predict(tf.constant([sample]))[0][0])
  print(f"{sample} is: {positive_proba*100:.1f}% positive")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
This is a very good and nice awesome movie, nice! is: 98.2% positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
This is a bad and very overrated and shit movie shit, bad! is: 6.5% positive


In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs

In [None]:
# Check which TensorBoard servers are currently running
from tensorboard import notebook
notebook.list() # View open TensorBoard instances

Known TensorBoard instances:
  - port 6006: logdir logs (started 0:01:32 ago; pid 9990)


In [None]:
# Let's use embeddings from other models
import os
import tensorflow_hub as hub

os.environ["TFHUB_CACHE_DIR"] = "my_thhub_cache" # Save downloaded modules

# The original module handle was the placeholder "URL HERE universal-sentence-encoder/4",
# which cannot be resolved; use the full TF Hub URL of the Universal Sentence Encoder v4.
# NOTE(review): confirm that hub.KerasLayer works inside a Sequential model
# with the Keras version installed in this environment.
model = tf.keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                   trainable=True, dtype=tf.string, input_shape=[]),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit(train_set, validation_data=valid_set, epochs=10)

### An Encoder Decoder

In [None]:
# Mount Google Drive (Colab only); the dataset below is read from it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the English-French sentence-pair dataset
import os
from pathlib import Path
import tensorflow as tf
import pandas as pd

# Read the CSV and shorten the two text column names to "en" / "fr" in one chain
en_fr_df = pd.read_csv(
    "/content/drive/MyDrive/tymur_arduch/Data/en_fr/eng_fr.csv"
).rename(columns={"English words/sentences": "en",
                  "French words/sentences": "fr"})

In [None]:
# Preview five rows from the middle of the dataset
en_fr_df[10000:10005]

Unnamed: 0,en,fr
10000,Be very careful.,Sois très prudente !
10001,Bees make honey.,Les abeilles font du miel.
10002,Behave yourself.,Comporte-toi bien.
10003,Bite the bullet.,Serre les dents.
10004,Bite the bullet.,Serrez les dents.


In [None]:
# Split the dataset into its English and French columns
en_df = en_fr_df["en"]
fr_df = en_fr_df["fr"]

In [None]:
# Convert both columns into plain Python lists
en_list = list(en_df.values)
fr_list = list(fr_df.values)

In [None]:
# Number of sentences per language
n_en = len(en_list)
n_fr = len(fr_list)

print(f"Amount English sentences: {n_en} \nAmount French sentences: {n_fr}")

Amount English sentences: 175621 
Amount French sentences: 175621


In [None]:
import numpy as np

# Show one English sentence next to its French counterpart
f"{en_list[400]} => {fr_list[400]}"

'Leave it. => Laisse tomber !'

In [None]:
# Get the mean length of a sentence in both lists
# NOTE(review): len(i) on a string counts characters, not words, yet the value
# is used below as the vectorizers' output_sequence_length — confirm this is intended
mean_en_sentences_len = sum([len(i) for i in en_list]) / n_en
mean_fr_sentences_len = sum([len(i) for i in fr_list]) / n_fr
# Average of the two per-language means, truncated to an int
mean_sentence_length_en_fr = int((mean_fr_sentences_len + mean_en_sentences_len) / 2)
f"Mean length of a sentence {mean_sentence_length_en_fr}"

'Mean length of a sentence 33'

In [None]:
# Create two TextVectorization layers, one per language
vocab_size = 7000
max_length = mean_sentence_length_en_fr
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=max_length,
    standardize="lower_and_strip_punctuation")
text_vec_layer_fr = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=max_length,
    standardize="lower_and_strip_punctuation")
# Adapt the TextVectorization layers
text_vec_layer_en.adapt(en_list)
# Wrap every French sentence in "sos"/"eos" so both markers enter the vocabulary
text_vec_layer_fr.adapt([f"sos {s} eos" for s in fr_list]) # sos - start of sequence
# eos - end of sequence

In [None]:
# First five entries of the French vocabulary
text_vec_layer_fr.get_vocabulary()[:5]

['', '[UNK]', np.str_('sos'), np.str_('eos'), np.str_('je')]

In [None]:
# Split indices: the first 85% for training, the next 10% for validation, the rest for testing
train_split = int(n_en*0.85)
val_split = train_split + int(n_en*0.1)

train_split, val_split

(149277, 166839)

In [None]:
# Let's create the training, validation and test sets
# Encoder inputs: the raw English sentences
X_train = tf.constant(en_list[:train_split])
X_valid = tf.constant(en_list[train_split:val_split])
X_test = tf.constant(en_list[val_split:])

# Decoder inputs: the French sentences prefixed with the "sos" token
X_train_dec = tf.constant([f"sos {s}" for s in fr_list[:train_split]])
X_valid_dec = tf.constant([f"sos {s}" for s in fr_list[train_split:val_split]])
X_test_dec = tf.constant([f"sos {s}" for s in fr_list[val_split:]])

# Targets: token IDs of the French sentences followed by the "eos" token
y_train = text_vec_layer_fr([f"{s} eos" for s in fr_list[:train_split]])
y_valid = text_vec_layer_fr([f"{s} eos" for s in fr_list[train_split:val_split]])
y_test = text_vec_layer_fr([f"{s} eos" for s in fr_list[val_split:]])

In [None]:
# Vectorize a sample French sentence to inspect its token IDs
text_vec_layer_fr(["Je ne sais pas"])

<tf.Tensor: shape=(1, 33), dtype=int64, numpy=
array([[ 4,  8, 72,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]])>

In [None]:
# Check that the encoder inputs, decoder inputs and targets have the same length
len(X_train), len(X_train_dec), len(y_train)

(149277, 149277, 149277)

In [None]:
# Let's build the model with the functional API
embed_size = 128
# 2 inputs (one for the encoder and one for the decoder)
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Use TextVectorization
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_fr(decoder_inputs)

# Use 2 Embeddings (one for the encoder and one for the decoder)
# Note: despite the "_layer" suffix, these variables hold the embedding layers' outputs
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                   mask_zero=True)(encoder_input_ids)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                   mask_zero=True)(decoder_input_ids)

# Define the Encoder
encoder = tf.keras.layers.LSTM(256, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embedding_layer)
# *encoder_state collects the returned states (short- and long-term) in a list

# Define the Decoder, initialised with the encoder's final states
decoder = tf.keras.layers.LSTM(256, return_sequences=True)
decoder_outputs = decoder(decoder_embedding_layer, initial_state=encoder_state)

# Define Dense layer with softmax activation to get the word probabilities
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")(decoder_outputs)

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[output_layer])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
# NOTE(review): training is commented out, so on a fresh kernel the cells below
# run against an untrained model — confirm this is intended
# model.fit((X_train, X_train_dec), y_train, epochs=4,
#           validation_data=((X_valid, X_valid_dec), y_valid))

In [None]:
def translate(sentence_en):
  """Greedily translate an English sentence to French, one word at a time.

  On every step the model is given the English sentence plus the translation
  produced so far (prefixed with "sos"), and the most probable word at the
  current position is appended. Decoding stops at "eos" or after `max_length`
  words.

  Args:
    sentence_en: the English sentence to translate.

  Returns:
    The translated string, without the "sos"/"eos" markers.
  """
  # Look words up in the French vocabulary (the original referenced the
  # undefined name `text_vec_layer_rus`); fetch it once instead of per word.
  fr_vocabulary = text_vec_layer_fr.get_vocabulary()
  translation = ""
  # NOTE(review): `max_length` is read from the notebook globals at call time and
  # is reassigned in later cells — it must match the model's output length.
  for word_idx in range(max_length):
    X = np.array([sentence_en], dtype=object) # encoder input
    X_dec = np.array(["sos " + translation], dtype=object) # decoder input
    # probabilities over the vocabulary for the word at position word_idx
    y_proba = model.predict([X, X_dec], verbose=0)[0, word_idx]
    predicted_word_id = np.argmax(y_proba)
    predicted_word = fr_vocabulary[predicted_word_id]
    if predicted_word == "eos":
      break
    translation += " " + predicted_word
  return translation.strip()

translate("Hello my name is")

'mon nom de mon grandpère est [UNK]'

In [None]:
# Evaluate on the test set; the output lists the loss followed by the accuracy
model.evaluate((X_test, X_test_dec), y_test)

[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.1456 - loss: 3.3118


[3.637115955352783, 0.14541111886501312]

### Bidirectional RNNs

In [None]:
# Define the Bidirectional Encoder
encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_state=True))
encoder_outputs, *encoder_state = encoder(encoder_embedding_layer)
# *encoder_state collects the four returned states; per the indexing below,
# 0 & 2 are the short-term states and 1 & 3 the long-term states.
# Merge each pair with a Keras Concatenate layer: calling tf.concat on symbolic
# KerasTensors raises "A KerasTensor cannot be used as input to a TensorFlow function".
encoder_state = [tf.keras.layers.Concatenate(axis=-1)(encoder_state[::2]), # short-term (0 & 2)
                 tf.keras.layers.Concatenate(axis=-1)(encoder_state[1::2])] # long-term (1 & 3)

ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.ops`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```


### Attention Mechanisms

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Bidirectional encoder that returns its per-step outputs (return_sequences=True) as well as its states
encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_state=True, return_sequences=True))

In [None]:
# Attention over the encoder outputs, queried by the decoder outputs
# NOTE(review): decoder_outputs / encoder_outputs come from earlier cells, not
# from the encoder defined just above — confirm they are the intended tensors
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

In [None]:
# Let's build the attention model with the functional API
embed_size = 128
# 2 inputs (one for the encoder and one for the decoder)
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Use TextVectorization
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_fr(decoder_inputs)

# Use 2 Embeddings (one for the encoder and one for the decoder)
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                   mask_zero=True)(encoder_input_ids)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                   mask_zero=True)(decoder_input_ids)

# Define the Encoder
encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_state=True, return_sequences=True))
encoder_outputs, *encoder_state = encoder(encoder_embedding_layer)
# The bidirectional LSTM returns four 256-d states (0 & 2 short-term, 1 & 3
# long-term). A single LSTM takes only two initial states, so merge each pair
# into one 512-d state (Concatenate layer: tf.concat is not allowed on KerasTensors).
encoder_state = [tf.keras.layers.Concatenate(axis=-1)(encoder_state[::2]), # short-term (0 & 2)
                 tf.keras.layers.Concatenate(axis=-1)(encoder_state[1::2])] # long-term (1 & 3)

# Define the Decoder
# 512 units so that it accepts the concatenated encoder states and so that its
# outputs have the same width as the 2 x 256 encoder outputs used by attention
decoder = tf.keras.layers.LSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_embedding_layer, initial_state=encoder_state)

# Attention: decoder outputs are the queries, encoder outputs the values
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])

# Output
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")(attention_outputs)

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[output_layer])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), y_train, epochs=20,
          validation_data=((X_valid, X_valid_dec), y_valid))

In [None]:
# Implementation of trainable positional embeddings for encoder and decoder inputs
max_length = 50 # max length of the whole training set
embed_size = 128
pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)

class AddPositionalEmbedding(tf.keras.layers.Layer):
  """Adds the embedding of each position index to the matching time step of its input.

  The tf.range / tf.shape calls live inside a layer because TensorFlow
  functions cannot be applied directly to symbolic KerasTensors.
  """
  def __init__(self, position_embedding, **kwargs):
    super().__init__(**kwargs)
    self.position_embedding = position_embedding
    self.supports_masking = True

  def call(self, inputs):
    positions = tf.range(tf.shape(inputs)[1])
    return inputs + self.position_embedding(positions)

# The same position embedding is shared by the encoder and the decoder, as before
add_positions = AddPositionalEmbedding(pos_embed_layer)
# The embedded inputs are the tensors named `*_embedding_layer` in the model
# cell above (the original referenced the undefined `encoder_embeddings` /
# `decoder_embeddings` and the misspelled `batah_max_len_dec`).
batch_max_len_enc = encoder_embedding_layer.shape[1]
encoder_in = add_positions(encoder_embedding_layer)
batch_max_len_dec = decoder_embedding_layer.shape[1]
decoder_in = add_positions(decoder_embedding_layer)

NameError: name 'encoder_embeddings' is not defined

In [None]:
# Positional Encoding layer implementation
class PositionalEncoding(tf.keras.layers.Layer):
  """Adds fixed (non-trainable) sinusoidal positional encodings to its inputs.

  A table of shape (1, max_length, embed_size) is precomputed once: even
  feature indices hold sines and odd feature indices hold cosines of
  position / 10000 ** (2k / embed_size). `call` adds the first
  `tf.shape(inputs)[1]` rows of that table to the inputs.

  Args:
    max_length: number of positions to precompute.
    embed_size: size of the last axis of the table; must be even.
    dtype: dtype forwarded to the base layer and used for the stored table.
    **kwargs: forwarded to `tf.keras.layers.Layer`.
  """

  def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
    super().__init__(dtype=dtype, **kwargs)
    assert embed_size % 2 == 0, "embed_size must be even"
    # Both grids have shape (embed_size // 2, max_length).
    positions, even_dims = np.meshgrid(np.arange(max_length),
                                       2 * np.arange(embed_size // 2))
    # The sine and cosine share the same argument, so compute it once.
    angles = positions / 10_000 ** (even_dims / embed_size)
    table = np.empty((1, max_length, embed_size))
    table[0, :, ::2] = np.sin(angles).T
    table[0, :, 1::2] = np.cos(angles).T
    self.pos_encodings = tf.constant(table.astype(self.dtype))
    self.supports_masking = True

  def call(self, inputs):
    # Keep only as many positions as the current batch's axis 1 is long.
    seq_len = tf.shape(inputs)[1]
    return inputs + self.pos_encodings[:, :seq_len]

In [None]:
# Add the fixed positional encodings to both the encoder's and the decoder's
# embedded inputs (the same PositionalEncoding layer instance is used for both)
max_length = 50 # max length of the whole training set
embed_size = 128

pos_embed_layer = PositionalEncoding(max_length, embed_size)
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)

NameError: name 'encoder_embeddings' is not defined

In [None]:
# Multi-head Attention: the Transformer encoder stack
N = 2  # number of stacked encoder blocks
num_heads = 8
dropout_rate = 0.1
n_units = 128 # for the first dense layer in each feedforward block


class PaddingMask(tf.keras.layers.Layer):
  """Returns a boolean mask that is False where the token ID is 0 (padding).

  The TensorFlow ops are wrapped in a layer because symbolic Keras tensors
  cannot be passed directly to TensorFlow functions (this is the ValueError
  recorded in this notebook, which recommends exactly this wrapping).
  A new axis is inserted at position 1 so that the mask can broadcast over
  the query positions of the attention scores.
  """

  def call(self, token_ids):
    return tf.math.not_equal(token_ids, 0)[:, tf.newaxis]


encoder_pad_mask = PaddingMask()(encoder_input_ids)
Z = encoder_in
for _ in range(N):
  # Sub-layer 1: self-attention, then residual connection + layer norm
  skip = Z
  attn_layer = tf.keras.layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
  Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)
  # The result must be bound to `Z` (upper case): binding it to a different
  # name would silently drop this residual connection and normalization.
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
  # Sub-layer 2: position-wise feedforward block, then residual + layer norm
  skip = Z
  Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
  Z = tf.keras.layers.Dense(embed_size)(Z)
  Z = tf.keras.layers.Dropout(dropout_rate)(Z)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.ops`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```


In [None]:
class DecoderMasks(tf.keras.layers.Layer):
  """Builds the decoder's padding mask and causal mask from its token IDs.

  The TensorFlow ops are wrapped in a layer because symbolic Keras tensors
  cannot be passed directly to TensorFlow functions (this is the ValueError
  recorded in this notebook, which recommends exactly this wrapping).

  Returns:
    A pair (padding mask, causal mask). The padding mask is False where the
    token ID is 0 and has a new axis inserted at position 1. The causal mask
    is a square lower-triangular boolean matrix whose side is the size of
    axis 1 of `token_ids`.
  """

  def call(self, token_ids):
    pad_mask = tf.math.not_equal(token_ids, 0)[:, tf.newaxis]
    # The sequence length is read from the IDs themselves, so this cell does
    # not depend on a length variable defined in another cell.
    seq_len = tf.shape(token_ids)[1]
    causal = tf.linalg.band_part(  # creates a lower triangular matrix
        tf.ones((seq_len, seq_len), tf.bool), -1, 0)
    return pad_mask, causal


decoder_pad_mask, causal_mask = DecoderMasks()(decoder_input_ids)

ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.ops`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```


In [None]:
# Transformer decoder stack: N blocks of masked self-attention,
# cross-attention over the encoder outputs, and a feedforward block, each
# followed by a residual connection and layer normalization.
# NOTE: this cell reads `Z` as left by the encoder cell, so it must run right
# after that cell; re-running it alone would treat the decoder's own output
# as the encoder output.
encoder_outputs = Z # let's save the encoder's final outputs
Z = decoder_in # the decoder starts with its own inputs
for _ in range(N):
  # Masked self-attention: a position is attended to only if both the causal
  # mask and the padding mask allow it
  skip = Z
  attn_layer = tf.keras.layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
  Z = attn_layer(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
  # Cross-attention: queries come from the decoder, values from the encoder
  skip = Z
  attn_layer = tf.keras.layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
  Z = attn_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
  # Feedforward block (unlike the encoder's, it has no Dropout layer)
  skip = Z
  Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
  Z = tf.keras.layers.Dense(embed_size)(Z)
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

NameError: name 'Z' is not defined

In [None]:
# Project the decoder's final outputs to one softmax over the vocabulary per
# time step, then assemble, compile and train the two-input model.
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(Z)
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
# The validation targets are named `y_valid` (lower-case y), as in the
# earlier attention model's fit() call.
model.fit((X_train, X_train_dec), y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), y_valid))

NameError: name 'Z' is not defined

# Hugging Face's Transformers library

In [None]:
from transformers import pipeline

# Pin the checkpoint and revision explicitly instead of relying on the task's
# default, so that re-running the notebook loads the same model. These are the
# model and revision that the recorded output reports as the default.
classifier = pipeline(
    "sentiment-analysis",  # many other tasks are available
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
# Classify a single sentence; the recorded output is a list holding one dict
# with the predicted label and its score
result = classifier("It is a nice movie")
result

[{'label': 'POSITIVE', 'score': 0.9998711347579956}]

In [None]:
# Classify several inputs in one call (one result dict per input). In the
# recorded output the two "I am from ..." statements receive opposite labels
# -- presumably a demonstration of bias in the model.
classifier(["I am from India.", "I am from Iraq.", "Truck"])

[{'label': 'POSITIVE', 'score': 0.9896161556243896},
 {'label': 'NEGATIVE', 'score': 0.9811071157455444},
 {'label': 'NEGATIVE', 'score': 0.9639517664909363}]

In [None]:
# Load a specific checkpoint by name instead of the task's default model
model_name = "huggingface/distilbert-base-uncased-finetuned-mnli"
classifier_mnli = pipeline("text-classification", model=model_name)

Device set to use cuda:0


In [None]:
# The recorded output labels this input as a "contradiction"
classifier_mnli("She made it. She didn't")

[{'label': 'contradiction', 'score': 0.9987062215805054}]

TypeError: Cannot convert the argument `type_value`: torch.float16 to a TensorFlow DType.

# Exercises

### 8.

In [None]:
import pandas as pd
import tensorflow as tf

# Load the training and validation sets (paths are relative to the working
# directory); the "string" and "valid" columns are the ones used below
train_df = pd.read_csv("train_data.csv")
validation_df = pd.read_csv("val_data.csv")

In [None]:
# Preview the first rows of the training data
train_df.head()

Unnamed: 0.1,Unnamed: 0,string,valid
0,0,BPTVXPVXPTTVXVE,0
1,1,BPVXPVXPVXPTTVXPTTVSE,0
2,2,BNAFUCCAFUCCAFUADE,0
3,3,BNWWWWWVZUWWVZRE,0
4,4,BPTVPXVPXTVPXTTTTTTVPSE,1


In [None]:
# Encode the text: build a character-level vocabulary from the training strings
text_vec_layer = tf.keras.layers.TextVectorization(split="character") # one token per character
text_vec_layer.adapt(train_df["string"].values)

In [None]:
# Encode the training strings as integer ID sequences; in the recorded output
# shorter strings are padded with zeros
text_vec_layer(train_df["string"].values)

<tf.Tensor: shape=(18750, 49), dtype=int64, numpy=
array([[ 8,  5,  2, ...,  0,  0,  0],
       [ 8,  5,  3, ...,  0,  0,  0],
       [ 8, 23, 13, ...,  0,  0,  0],
       ...,
       [ 8, 21, 22, ...,  0,  0,  0],
       [ 8,  5,  4, ...,  0,  0,  0],
       [ 8,  2,  2, ...,  0,  0,  0]])>

In [None]:
# Inputs are the vectorized strings; targets are the "valid" column
X_train = tf.constant(text_vec_layer(train_df["string"].values))
y_train = tf.constant(train_df["valid"])
X_val = tf.constant(text_vec_layer(validation_df["string"].values))
y_val = tf.constant(validation_df["valid"])

In [None]:
# Vocabulary size; the same expression is used as the Embedding layer's
# input dimension in the model below
len(text_vec_layer.get_vocabulary())

28

In [None]:
# Binary classifier: character embeddings -> GRU -> sigmoid unit that outputs
# the estimated probability that the string is valid.
# NOTE(review): the Input is declared ragged=True, but the tensors passed to
# fit() below are the dense, zero-padded outputs of text_vec_layer (padding is
# handled by mask_zero=True) -- confirm whether ragged=True is needed.
model = tf.keras.Sequential([
  tf.keras.layers.Input([None], ragged=True),
  tf.keras.layers.Embedding(len(text_vec_layer.get_vocabulary()), 128, mask_zero=True),
  tf.keras.layers.GRU(32, dropout=0.2),
  tf.keras.layers.Dense(1, activation="sigmoid")
])

# SGD with Nesterov momentum
optimizer = tf.keras.optimizers.SGD(learning_rate=0.02, momentum = 0.95,
                                    nesterov=True)
model.compile(loss="binary_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=5,
                    validation_data=(X_val, y_val))

Epoch 1/5
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 19ms/step - accuracy: 0.8175 - loss: 0.4001 - val_accuracy: 0.9357 - val_loss: 0.1953
Epoch 2/5
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - accuracy: 0.9624 - loss: 0.1140 - val_accuracy: 0.9992 - val_loss: 0.0101
Epoch 3/5
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - accuracy: 0.9995 - loss: 0.0062 - val_accuracy: 0.9992 - val_loss: 0.0082
Epoch 4/5
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 20ms/step - accuracy: 0.9992 - loss: 0.0065 - val_accuracy: 0.9992 - val_loss: 0.0070
Epoch 5/5
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20ms/step - accuracy: 0.9984 - loss: 0.0083 - val_accuracy: 0.9992 - val_loss: 0.0075


In [None]:
# Print the model's output (a sigmoid value in [0, 1]) for two hand-written
# strings; values near 0 mean "invalid", values near 1 mean "valid".
print(model.predict(text_vec_layer(["BTXXVVSE"]))[0, 0]) # Fake
print(model.predict(text_vec_layer(["BTSSXXTVVE"]))[0, 0]) # True

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
0.060731865
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
0.98584425


### 9.

In [None]:
import numpy as np
from datetime import date

# English month names, indexed by (month number - 1)
MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def random_dates(n_dates):
    """Generate `n_dates` random dates in two textual formats.

    Dates are drawn uniformly between 1000-01-01 and 9999-12-31, both
    inclusive, using NumPy's global random state.

    Args:
        n_dates: number of dates to generate.

    Returns:
        A pair of lists (x, y) of length `n_dates`: x holds strings such as
        "April 05, 2678" and y the matching ISO strings such as "2678-04-05".
    """
    min_date = date(1000, 1, 1).toordinal()
    max_date = date(9999, 12, 31).toordinal()

    # randint's upper bound is exclusive, so add 1 to make max_date reachable.
    ordinals = np.random.randint(min_date, max_date + 1, size=n_dates)
    # int() turns NumPy integers into plain Python ints for fromordinal().
    dates = [date.fromordinal(int(ordinal)) for ordinal in ordinals]

    x = [MONTHS[dt.month - 1] + " " + dt.strftime("%d, %Y") for dt in dates]
    y = [dt.isoformat() for dt in dates]
    return x, y

In [None]:
# Generate 5,000 random (text date, ISO date) training pairs.
# NOTE(review): no NumPy seed is set in this section, so the sample presumably
# changes on every run.
string_date, num_date = random_dates(5000)

In [None]:
# Peek at the first five input strings and their ISO-format targets
string_date[:5], num_date[:5]

(['August 13, 8702',
  'August 22, 4383',
  'April 05, 2678',
  'November 24, 4824',
  'May 13, 1206'],
 ['8702-08-13', '4383-08-22', '2678-04-05', '4824-11-24', '1206-05-13'])

In [None]:
# Every character that can appear in an input date string (month-name letters,
# digits, comma and space), de-duplicated and sorted
INPUT_CHARS = "".join(sorted(set("".join(MONTHS) + "0123456789, ")))
INPUT_CHARS

' ,0123456789ADFJMNOSabceghilmnoprstuvy'

In [None]:
# Every character that can appear in an ISO-format target date
OUTPUT_CHARS = "0123456789-"

In [None]:
def string_to_ids(date, chars):
  """Map each character of `date` to its position in `chars`.

  Args:
    date: the string to encode.
    chars: the alphabet; a character's ID is its first index in `chars`.

  Returns:
    A list of integer IDs, one per character of `date`.

  Raises:
    ValueError: if a character of `date` does not occur in `chars`.
  """
  ids = []
  for symbol in date:
    ids.append(chars.index(symbol))
  return ids

In [None]:
# Encode the training pairs as lists of character IDs
X_train = [string_to_ids(date=text, chars=INPUT_CHARS) for text in string_date]
y_train = [string_to_ids(date=text, chars=OUTPUT_CHARS) for text in num_date]

# Draw a separate, smaller validation set and encode it the same way
string_date_val, num_date_val = random_dates(500)
X_val = [string_to_ids(date=text, chars=INPUT_CHARS) for text in string_date_val]
y_val = [string_to_ids(date=text, chars=OUTPUT_CHARS) for text in num_date_val]

In [None]:
# Wrap the encoded inputs (lists of IDs, one list per date string) as ragged tensors
X_train_tensor = tf.ragged.constant(X_train, ragged_rank=1)
X_val_tensor = tf.ragged.constant(X_val, ragged_rank=1)

In [None]:
# Shift every ID up by 1 so that 0 is free to serve as padding, then convert
# the ragged tensors to dense ones.
# NOTE: the names are rebound in place, so this cell presumably cannot be
# re-run without first re-running the previous cell.
X_train_tensor = (X_train_tensor + 1).to_tensor()
X_val_tensor = (X_val_tensor + 1).to_tensor()

In [None]:
# Wrap the encoded targets (lists of IDs, one list per ISO date) as ragged tensors
y_train_tensor = tf.ragged.constant(y_train, ragged_rank=1)
y_val_tensor = tf.ragged.constant(y_val, ragged_rank=1)

In [None]:
# Shift every target ID up by 1 so that 0 is free to serve as padding, then
# convert the ragged tensors to dense ones (same treatment as the inputs).
# NOTE: the names are rebound in place, so this cell presumably cannot be
# re-run without first re-running the previous cell.
y_train_tensor = (y_train_tensor + 1).to_tensor()
y_val_tensor = (y_val_tensor + 1).to_tensor()

In [None]:
embedding_size = 32
max_output_length = y_train_tensor.shape[1]

# Encoder: embeds the input characters and summarizes the whole sequence in
# the LSTM's final output vector.
# The input shape is declared with an explicit Input layer: recent Keras
# versions warn when `input_shape` is passed to a layer of a Sequential model.
encoder = tf.keras.Sequential([
    tf.keras.layers.Input(shape=[None]),
    # +1 because the IDs were shifted by one to reserve 0 for padding
    tf.keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1,
                              output_dim=embedding_size),
    tf.keras.layers.LSTM(128)
])

# Decoder: produces one softmax over the output characters (plus the padding
# ID) at every output time step.
decoder = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")
])

# The encoder's summary vector is repeated once per output position and fed
# to the decoder.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.RepeatVector(max_output_length),
    decoder
])

optimizer = tf.keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit(X_train_tensor, y_train_tensor, epochs=20,
                    validation_data=(X_val_tensor, y_val_tensor))

Epoch 1/20


  super().__init__(**kwargs)


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.2403 - loss: 2.1571 - val_accuracy: 0.4350 - val_loss: 1.5215
Epoch 2/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.4758 - loss: 1.4399 - val_accuracy: 0.3088 - val_loss: 1.9642
Epoch 3/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.4755 - loss: 1.5178 - val_accuracy: 0.6094 - val_loss: 1.0927
Epoch 4/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.6318 - loss: 1.0344 - val_accuracy: 0.6808 - val_loss: 0.8747
Epoch 5/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.6982 - loss: 0.8206 - val_accuracy: 0.7338 - val_loss: 0.7012
Epoch 6/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.6807 - loss: 0.8892 - val_accuracy: 0.6050 - val_loss: 1.1142
Epoch 7/20
[1m157/157[0m [32m

In [None]:
# Inspect one encoded validation target (IDs are shifted by 1; 0 is padding)
y_val_tensor[0]

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([ 4,  7,  1,  7, 11,  1,  8, 11,  3,  9], dtype=int32)>

In [None]:
# NOTE: `date` is rebound here (first to a string, then to the model input),
# which shadows the `date` class imported from `datetime`; the next cell reads
# this variable.
date = "April 18, 2025"
# Encode the string exactly like the training inputs: shift the IDs by 1
# (0 is reserved for padding) ...
date_ids = np.array([string_to_ids(date, INPUT_CHARS)]) + 1
# ... and right-pad with zeros to the training sequence length, because the
# encoder has no masking and was trained on zero-padded sequences.
pad_width = max(0, X_train_tensor.shape[1] - date_ids.shape[1])
date = np.pad(date_ids, [(0, 0), (0, pad_width)])
model.predict(date).argmax(-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step


array([[ 2,  2,  2,  5, 11,  1, 10, 11,  3,  8]])

In [None]:
# Predict per-position class probabilities, then keep the most likely ID at
# each output position
y_pred = model.predict(date)
predicted_ids = y_pred.argmax(axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step


In [None]:
# Lookup table from predicted ID to output character. The target IDs were
# shifted by 1 when the tensors were built (0 is padding), so index 0 maps to
# an empty string and index k maps to OUTPUT_CHARS[k - 1].
OUTPUT_CHARS_LIST = [""] + list(OUTPUT_CHARS)

In [None]:
# Translate the predicted IDs of the first (and only) sequence back to text
"".join(OUTPUT_CHARS_LIST[int(token_id)] for token_id in predicted_ids[0])

'2225-1--38'