# Exercise 9:

Train an Encoder–Decoder model that can convert a date string
from one format to another (e.g., from “April 22, 2019” to “2019-04-22”).

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from datetime import date

Let's start by creating the dataset. We will use random dates between 1000-01-01 and 9999-12-31 (format: YYYY-MM-DD).

In [5]:

MONTHS = ["January", "February", "March", "April", "May", "June", "July",
          "August", "September", "October", "November", "December"]


def random_dates(n_dates):
    """Generate `n_dates` random dates in both the input and the target format.

    Dates are drawn from 1000-01-01 through 9999-12-31, both ends included,
    using NumPy's global random state (call `np.random.seed` first for
    reproducible output).

    Args:
        n_dates: Number of dates to generate.

    Returns:
        A tuple `(X, y)` of two lists with `n_dates` strings each. `X` holds
        the dates as "<Month name> <zero-padded day>, <year>" (for example
        "April 22, 2019") and `y` holds the same dates in ISO format
        (for example "2019-04-22").
    """
    # toordinal() returns the proleptic Gregorian ordinal of a date, where
    # January 1 of year 1 has ordinal 1.
    min_date = date(1000, 1, 1).toordinal()
    max_date = date(9999, 12, 31).toordinal()

    # randint's upper bound is exclusive, so `max_date + 1` is required for
    # the last day of the range (9999-12-31) to be reachable.
    ordinals = np.random.randint(min_date, max_date + 1, size=n_dates)
    # tolist() yields plain Python ints for date.fromordinal().
    dates = [date.fromordinal(ordinal) for ordinal in ordinals.tolist()]

    X = [MONTHS[dt.month - 1] + " " + dt.strftime("%d, %Y") for dt in dates]
    y = [dt.isoformat() for dt in dates]  # YYYY-MM-DD

    return X, y

Here are a few random dates, displayed in both the input format and the output format:

In [10]:
# Fix the seed so the sample printed below is reproducible.
np.random.seed(42)

n_dates = 3
X_example, y_example = random_dates(n_dates)

# Two left-aligned, 25-character-wide columns: input string, then target.
print(f"{'Input':25s}{'Target':25s}")
print("-" * 50)
for idx in range(n_dates):
    print(f"{X_example[idx]:25s}{y_example[idx]:25s}")

Input                    Target                   
--------------------------------------------------
September 20, 7075       7075-09-20               
May 15, 8579             8579-05-15               
January 11, 7103         7103-01-11               


Let's get the list of all the possible characters in the inputs:

In [11]:
# Every distinct character that can occur in an input string: the letters of
# the month names plus digits, comma and space, joined in sorted order.
INPUT_CHARS = "".join(sorted(set("0123456789, ").union(*MONTHS)))
INPUT_CHARS

' ,0123456789ADFJMNOSabceghilmnoprstuvy'

In [12]:
# Characters that can occur in an ISO-formatted target (YYYY-MM-DD):
# the ten digits and the dash separator.
OUTPUT_CHARS = "0123456789-"

Let's write a function to convert a string to a list of character IDs.

(The `OUTPUT_CHARS` string defined in the cell above is the list of possible characters in the outputs.)

In [13]:
def date_str_to_ids(date_str, chars=INPUT_CHARS):
    """Map each character of `date_str` to its position in `chars`.

    Args:
        date_str: The string to encode.
        chars: The alphabet; a character's ID is its index in this sequence.

    Returns:
        A list with one integer ID per character of `date_str`.

    Raises:
        ValueError: If `date_str` contains a character that is not in `chars`.
    """
    return list(map(chars.index, date_str))

In [14]:
# Encode the first example input string with the input alphabet.
date_str_to_ids(X_example[0], INPUT_CHARS)

[19, 23, 31, 34, 23, 28, 21, 23, 32, 0, 4, 2, 1, 0, 9, 2, 9, 7]

In [15]:
# Encode the matching target string with the output alphabet.
date_str_to_ids(y_example[0], OUTPUT_CHARS)

[7, 0, 7, 5, 10, 0, 9, 10, 2, 0]

In [16]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    """Encode a batch of date strings as one dense tensor of character IDs.

    Args:
        date_strs: The strings to encode.
        chars: The alphabet passed on to `date_str_to_ids`.

    Returns:
        A 2-D tensor with one row per string. Character IDs are shifted up by
        one so that 0 is free to act as the padding value; rows shorter than
        the longest string in `date_strs` are padded at the end.
    """
    id_sequences = [date_str_to_ids(date_str, chars) for date_str in date_strs]
    ragged_ids = tf.ragged.constant(id_sequences, ragged_rank=1)
    # Shift every ID by 1 so that 0 is reserved for padding.
    shifted_ids = ragged_ids + 1
    # to_tensor() pads all rows to the same length; RNNs need that because
    # they cannot handle variable-length sequences in a dense batch.
    return shifted_ids.to_tensor()

def create_dataset(n_dates):
    """Build `n_dates` random (input, target) pairs as encoded tensors.

    Args:
        n_dates: Number of random dates to generate.

    Returns:
        A tuple `(inputs, targets)`: the input strings encoded with
        `INPUT_CHARS` and the target strings encoded with `OUTPUT_CHARS`.
    """
    input_strs, target_strs = random_dates(n_dates)
    encoded_inputs = prepare_date_strs(input_strs, INPUT_CHARS)
    encoded_targets = prepare_date_strs(target_strs, OUTPUT_CHARS)
    return encoded_inputs, encoded_targets

In [17]:
# Seed NumPy so the three splits are reproducible.
np.random.seed(42)

# Splits are generated in order: 10,000 training, 2,000 validation and
# 2,000 test examples.
(X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test) = [
    create_dataset(size) for size in (10000, 2000, 2000)
]

In [18]:
# First encoded training target; IDs are shifted by 1 because 0 is the padding value.
Y_train[0]

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([ 8,  1,  8,  6, 11,  1, 10, 11,  3,  1], dtype=int32)>

### Model-1: Very basic Seq2Seq model

Let's first try the simplest possible model: we feed in the input sequence, which first goes through the encoder (an embedding layer followed by a single LSTM layer), which outputs a vector, then it goes through a decoder (a single LSTM layer, followed by a dense output layer), which outputs a sequence of vectors, each representing the estimated probabilities for all possible output characters.

Since the decoder expects a sequence as input, we repeat the vector (which is output by the encoder) as many times as the longest possible output sequence.

In [22]:
# Start from a clean Keras state so re-running this cell does not stack up
# layers or graph state from a previous run.
keras.backend.clear_session()

embedding_size = 32
max_output_length = Y_train.shape[1] # 10

# Seed both libraries so weight initialisation and shuffling are reproducible.
np.random.seed(42)
tf.random.set_seed(42)

# Encoder: character IDs -> embeddings -> a single 128-dimensional vector
# (the LSTM returns only its last output).
# The inputs are dense, zero-padded tensors (prepare_date_strs() calls
# .to_tensor()), so the input layer is declared as a plain dense input of
# variable length rather than a ragged one.
encoder = keras.models.Sequential([
    keras.layers.InputLayer(shape=(None,)),
    # +1 because ID 0 is reserved for padding.
    keras.layers.Embedding(input_dim=len(INPUT_CHARS)+1, output_dim=embedding_size),
    keras.layers.LSTM(128)
])

# Decoder: one softmax over the output alphabet (+1 for the padding ID) at
# every output time step.
decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")
])

# The decoder expects a sequence, so the encoder's vector is repeated once
# per output position.
model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(max_output_length),
    decoder
])

optimizer = keras.optimizers.Nadam()

# Targets are integer character IDs, hence the sparse cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

h = model.fit(X_train, Y_train, epochs=20, validation_data=(X_valid, Y_valid))

Epoch 1/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 45ms/step - accuracy: 0.2982 - loss: 1.9901 - val_accuracy: 0.5701 - val_loss: 1.2087
Epoch 2/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 57ms/step - accuracy: 0.5911 - loss: 1.1647 - val_accuracy: 0.6970 - val_loss: 0.8390
Epoch 3/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 68ms/step - accuracy: 0.7165 - loss: 0.7564 - val_accuracy: 0.7741 - val_loss: 0.5747
Epoch 4/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 68ms/step - accuracy: 0.7493 - loss: 0.6931 - val_accuracy: 0.8189 - val_loss: 0.5089
Epoch 5/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - accuracy: 0.8438 - loss: 0.4408 - val_accuracy: 0.9000 - val_loss: 0.2987
Epoch 6/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - accuracy: 0.9165 - loss: 0.2582 - val_accuracy: 0.9569 - val_loss: 0.1679
Epoch 7/20
[1m3

Let's use the model to make some predictions, but first we need a function to convert a sequence of character IDs into a readable string.

In [23]:
def id_to_strs(ids, chars=OUTPUT_CHARS):
    """Decode sequences of character IDs back into strings.

    Args:
        ids: An iterable of ID sequences (one sequence per string).
        chars: The alphabet the IDs refer to. ID `k` (k >= 1) decodes to
            `chars[k - 1]`; ID 0 decodes to "?".

    Returns:
        A list with one decoded string per sequence in `ids`.
    """
    # Prepending "?" lines the string up with the shifted IDs: index 0 is "?".
    alphabet = "?" + chars
    decoded = []
    for sequence in ids:
        decoded.append("".join(alphabet[index] for index in sequence))
    return decoded

Now we can use the model to convert some dates

In [55]:
# Two dates to convert with the trained model.
X_new_str = ["September 17, 2009", "July 14, 1789"]
# Encode them with the default input alphabet (INPUT_CHARS).
X_new = prepare_date_strs(X_new_str)

In [56]:
# Row length after padding to the longest string in this batch.
X_new[1].shape

TensorShape([18])

In [48]:
# Keep the most likely character ID at every output position.
ids = model.predict(X_new, verbose=0).argmax(axis=-1)
print(f"{'Input Date':25s}{'Date in ISO Format':25s}")
print("-" * 50)

# Decode the predicted IDs and print them next to the original inputs.
date_str = id_to_strs(ids)
for idx in range(len(date_str)):
    print(f"{X_new_str[idx]:25s}{date_str[idx]:25s}")

Input Date               Date in ISO Format       
--------------------------------------------------
September 17, 2009       2009-09-17               
July 14, 1789            1789-07-14               


Perfect :)

However, since the model was trained on input strings of length 18 (which is the length of the longest date), it does not perform well if we try to make predictions using shorter sequences:

In [57]:
# Both strings are shorter than 18 characters, the input length used during training.
X_new_str = ["May 02, 2020", "July 14, 1789"]
# Encoding pads only up to the longest string in this batch.
X_new = prepare_date_strs(X_new_str)

In [58]:
X_new[0].shape # padded to the longest string in this batch ("July 14, 1789", 13 characters)

TensorShape([13])

In [59]:
# Keep the most likely character ID at every output position.
ids = model.predict(X_new, verbose=0).argmax(axis=-1)
print(f"{'Input Date':25s}{'Date in ISO Format':25s}")
print("-" * 50)

# Decode the predicted IDs and print them next to the original inputs.
date_str = id_to_strs(ids)
for idx in range(len(date_str)):
    print(f"{X_new_str[idx]:25s}{date_str[idx]:25s}")

Input Date               Date in ISO Format       
--------------------------------------------------
May 02, 2020             2020-01-02               
July 14, 1789            1789-09-14               


Oops! We need to ensure that we always pass sequences of the same length as during training, using padding if necessary. Let's write a little helper function for that:

In [62]:
# Width of the encoded training inputs; prediction batches are padded to it.
max_input_length = X_train.shape[1]

def prepare_date_strs_padded(date_strs):
    """Encode `date_strs` and right-pad the batch to `max_input_length`.

    Args:
        date_strs: The input date strings to encode.

    Returns:
        The tensor from `prepare_date_strs`, padded at the end of each row
        when the batch is narrower than `max_input_length`. A batch that is
        already at least that wide is returned unchanged.
    """
    encoded = prepare_date_strs(date_strs)
    missing = max_input_length - encoded.shape[1]
    if missing <= 0:
        return encoded
    # No padding on the batch axis; `missing` extra columns after each row.
    return tf.pad(encoded, [[0, 0], [0, missing]])

def convert_date_strs(date_strs):
    """Convert input date strings to output strings with the global `model`.

    Args:
        date_strs: The input date strings to convert.

    Returns:
        A list with one decoded prediction per input string.
    """
    padded_inputs = prepare_date_strs_padded(date_strs)
    probabilities = model.predict(padded_inputs, verbose=0)
    # Pick the most likely character ID at every output position.
    best_ids = np.argmax(probabilities, axis=-1)
    return id_to_strs(best_ids)

In [65]:
# The same short inputs as before, now converted through the padding helper.
X_new_str = ["May 02, 2020", "July 14, 1789"]

preds = convert_date_strs(X_new_str)

print(f"{'Input Date':25s}{'Date in ISO Format':25s}")
print("-" * 50)

for idx in range(len(preds)):
    print(f"{X_new_str[idx]:25s}{preds[idx]:25s}")

Input Date               Date in ISO Format       
--------------------------------------------------
May 02, 2020             2020-05-02               
July 14, 1789            1789-07-14               


Cool!

However, real-life sequence-to-sequence problems will usually be harder, so for the sake of completeness, let's build a more powerful model.

### Model-2: Feeding the shifted targets to the decoder (teacher forcing)

