In [1]:
import numpy as np
import random
import os
import time
from babel.dates import format_date
from faker import Faker

import tensorflow as tf
import tensorflow.keras.layers as tfl

# Dataset Generator

The data for this model is generated using the **Faker** python library. It's a simple tool that allows us to create fake data. The data is then preprocessed such that each character in an example is a one-hot vector.

In [2]:

def generate_dataset(m):
    """Generates a dataset with X, Y where X is the human readable date and Y is the machine readable ISO standard. 
    It also returns 3 dictionaries which are used to encode the dataset.
    """
    formats = ['short','medium', 'full', 'full', 'full', 'full', 'full', 'full', 'full',
            'd MMM YYY', 'd MMMM YYY', 'dd MMM YYY', 'd MMM, YYY', 'd MMMM, YYY',
            'dd, MMM YYY', 'd MM YY', 'd MMMM YYY', 'MMMM d YYY', 'MMMM d, YYY', 'dd.MM.YY']
    fake = Faker()
    Faker.seed(42)

    dataset = []
    # Create a set which will store all the unique characters. Adding
    human_vocab = set(('<unk>', '<pad>')) 
    machine_vocab = set() # set(('<pad>',)) 
    error_counter = 0

    for i in range(m):
        date = fake.date_object()
        try: 
            human_readable = format_date(date, format=random.choice(formats), locale='en_US')
            human_readable = human_readable.lower()
            human_readable = human_readable.replace(',','')
            machine_readable = date.isoformat()
            
            dataset.append((human_readable, machine_readable))
            human_vocab.update(tuple(human_readable))
            machine_vocab.update(tuple(machine_readable))

        except AttributeError as e:
            error_counter += 1
            print("Error while generating dataset, count: ", error_counter)
        
    human_vocab = dict(zip( sorted(human_vocab), list(range(len(human_vocab)))  ))
    machine_vocab = dict(zip( sorted(machine_vocab), list(range(len(machine_vocab))) ))
    machine_vocab_inv = {v:k for k,v in machine_vocab.items()}

    return dataset, human_vocab, machine_vocab, machine_vocab_inv

def preprocess_dataset(dataset, human_vocab, machine_vocab, Tx, Ty):
    """Encodes all the characters in the dataset to one-hot vectors.
    """
    X, Y = zip(*dataset) #unpack the list

    # Get index of each character as defined in the vocab
    X = np.array([string_to_index(i, Tx, human_vocab) for i in X]) 
    Y = np.array([string_to_index(i, Ty, machine_vocab) for i in Y])

    # Convert the indices to One-hot vectors
    Xoh = np.array(list(map(lambda x: tf.keras.utils.to_categorical(x, num_classes=len(human_vocab)), X)))
    Yoh = list(map(lambda x: tf.keras.utils.to_categorical(x, num_classes=len(machine_vocab)), Y))

    return X, Y, Xoh, np.array(Yoh)

def string_to_index(string, length, vocab):
    """Converts the given string to their index values defined by the vocab. It pads the string if it is less
    than the given length or truncates the string if it is above the given length.
    """
    string = string.lower()
    string = string.replace(',','')

    # truncate
    if  len(string) > length:
        string = string[:length]

    indices = []
    for x in list(string):

        index = vocab.get(x, '<unk>')
        indices.append(index)
    
    # append
    if len(indices) < length:
        indices += [vocab['<pad>']] * (length - len(indices))

    return indices


In [3]:
m_total = 30000
m = 25000
m_val = m_total - m
Tx = 30
Ty = 10

dataset, human_vocab, machine_vocab, machine_vocab_inv = generate_dataset(m_total)
X_total, Y_total, Xoh_total, Yoh_total = preprocess_dataset(dataset, human_vocab, machine_vocab, Tx, Ty)

# Training set
X, X_val = X_total[:m], X_total[m:]
Xoh, Xoh_val = Xoh_total[:m], Xoh_total[m:]
# Validation Set
Y, Y_val = Y_total[:m], Y_total[m:]
Yoh, Yoh_val = Yoh_total[:m], Yoh_total[m:]

print(X.shape, Y.shape, Xoh.shape, Yoh.shape)
print(X_val.shape, Y_val.shape, Xoh_val.shape, Yoh_val.shape)

(25000, 30) (25000, 10) (25000, 30, 37) (25000, 10, 11)
(5000, 30) (5000, 10) (5000, 30, 37) (5000, 10, 11)


# Attention Model

In a sequence-to-sequence model, the traditional approach is to run the input through a series of RNN/GRU/LSTM units which **memorizes** the entire input sequence. This is the encoding part of the network, it outputs a single vector which should then pass the entire information about the input sequence to a decoder which decodes to give the output sequence.

As we can see this of model fails when a very long sequence is given since its **hard to memorize the entire input**. Thus we use a much powerful model called **Attention Model.** In an attention model, to generate the first output word, we just need to look at the first few words of the input and we don't need to look very deep in the input sequence. Thus we can compute some **attention weights, $\alpha^{<t, t'>}$** which tells how **much attention we need to pay** for a specific input word $t'$ when generating a specific output word $t$.

![](./images/attention_network.png)

In [4]:
# These layers are defined globally because their weights are re-used druing each timestep.

repeat_vector = tfl.RepeatVector(Tx)
concatenate = tfl.Concatenate(axis = -1)
dense_layer1 = tfl.Dense(10, activation='tanh')
dense_layer2 = tfl.Dense(1, activation='relu')
softmax_layer = tfl.Softmax(axis=-1)
dot = tfl.Dot(axes=1)

n_a = 32 # number of units for the input LSTMs
n_s = 64 # number of units for the output LSTMs
output_LSTM = tfl.LSTM(n_s, return_state = True)
output_layer = tfl.Dense(len(machine_vocab), activation='softmax')

In [5]:
def attention(a, s_prev):
    """Given a single timestep/character, this function calculates how much context
    it should take from its neighbouring timesteps/characters.
    The s_prev is the output of a single LSTM, while 'a' is the output of all the input   
    timesteps(Tx). Thus s_prev is repeated Tx times. Then both are concatenated and 
    passed through 2 dense layers. 

    Finally they are given to a softmax layer to calculate the Attention Weight, alpha.
    This tells us how much amount of weight it should give each of the input activation.
    Thus its normalized using softmax such that all their weights add up to 1.

    Then we can finally apply the weights to the activations using the dot layer. This 
    gives us the amount of attention/context per activation (i.e. input timestep/char).
    """
    s_prev = repeat_vector(s_prev)
    concat = concatenate([a, s_prev])
    e = dense_layer1(concat)
    e = dense_layer2(e)
    alpha = softmax_layer(e)
    context = dot([alpha, a])

    return context

def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """The activations of all the input timesteps/characters are calculated
    using the first Bidirectional LSTM. We can then use these activations to find the
    context for a single output at time, while simultaneously finding its output
    prediction.
    """
    # Defining the inputs
    X = tfl.Input(shape=(Tx, human_vocab_size))
    s0 = tfl.Input(shape=(n_s,), name='s0')
    c0 = tfl.Input(shape=(n_s,), name='c0')
    s = s0
    c = c0
    
    outputs = []
    
    # This first LSTM calculates the activations of all the input timesteps/characters
    a = tfl.Bidirectional(tfl.LSTM(units=n_a, return_sequences=True))(X)

    # Then to predict each out the output character, we first find the context
    # using the above attention function and then pass it through the output LSTM
    # and a softmax layer to produce one output at a time.
    for t in range(Ty):
        context = attention(a, s)
        s, _, c = output_LSTM(inputs=context, initial_state=[s,c])
        out = output_layer(s)
        outputs.append(out)
    
    model = tf.keras.models.Model(inputs=[X,s0,c0], outputs=outputs)
    
    return model

In [6]:
Tx = 30
Ty = 10
n_a = 32
n_s = 64
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))

In [7]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30, 37)]     0                                            
__________________________________________________________________________________________________
s0 (InputLayer)                 [(None, 64)]         0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 30, 64)       17920       input_1[0][0]                    
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 30, 64)       0           s0[0][0]                         
                                                                 lstm[0][0]                   

In [8]:
opt = tf.keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01) # Adam(...) 
model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])

# Initializing the intitial hidden cell state (s0, usually a0 is used) and the memory cell
# state (c0)
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(Yoh.swapaxes(0,1))

s0_val = np.zeros((m_val, n_s))
c0_val = np.zeros((m_val, n_s))
outputs_val = list(Yoh_val.swapaxes(0,1))

In [11]:
root_logdir = os.path.join(os.curdir, "logs")

def get_run_id(string=None):
    """Returns the current run id and the log dire
    """
    if string:
        run_id = "run-" + string + "-" + time.strftime("%Y-%m-%d-%H-%M")
    else:
        run_id = 'run-' + time.strftime("%Y-%m-%d-%H-%M")
    return run_id, os.path.join(root_logdir, run_id)

In [12]:
run_id, run_logdir = get_run_id()
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

history = model.fit([Xoh, s0, c0],
                    outputs, 
                    epochs=50,
                    batch_size=500,
                    validation_data=([Xoh_val, s0_val, c0_val], outputs_val),
                    callbacks=[tensorboard_cb]
                    )
model.save(run_id + ".h5")

tep - loss: 0.8373 - dense_2_loss: 0.0247 - dense_2_1_loss: 0.0214 - dense_2_2_loss: 0.1564 - dense_2_3_loss: 0.1172 - dense_2_4_loss: 0.0030 - dense_2_5_loss: 0.0376 - dense_2_6_loss: 0.2107 - dense_2_7_loss: 0.0059 - dense_2_8_loss: 0.0811 - dense_2_9_loss: 0.1794 - dense_2_accuracy: 0.9908 - dense_2_1_accuracy: 0.9919 - dense_2_2_accuracy: 0.9340 - dense_2_3_accuracy: 0.9796 - dense_2_4_accuracy: 1.0000 - dense_2_5_accuracy: 0.9883 - dense_2_6_accuracy: 0.9398 - dense_2_7_accuracy: 1.0000 - dense_2_8_accuracy: 0.9819 - dense_2_9_accuracy: 0.9537 - val_loss: 0.8892 - val_dense_2_loss: 0.0279 - val_dense_2_1_loss: 0.0253 - val_dense_2_2_loss: 0.1595 - val_dense_2_3_loss: 0.1149 - val_dense_2_4_loss: 0.0028 - val_dense_2_5_loss: 0.0426 - val_dense_2_6_loss: 0.2276 - val_dense_2_7_loss: 0.0065 - val_dense_2_8_loss: 0.0966 - val_dense_2_9_loss: 0.1854 - val_dense_2_accuracy: 0.9898 - val_dense_2_1_accuracy: 0.9892 - val_dense_2_2_accuracy: 0.9326 - val_dense_2_3_accuracy: 0.9750 - val_de

In [15]:
%load_ext tensorboard
%tensorboard --logdir=./logs --port 6006 

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 39336), started 1:05:14 ago. (Use '!kill 39336' to kill it.)

In [35]:
test = ['aug 18 1997', '5th dec 1972', 'monday 10th of Jan 1992', '20th day of mar 2016', 'may 4th 1992']

s00 = np.zeros((1, n_s))
c00 = np.zeros((1, n_s))
for example in test:
    inputs = string_to_index(example, Tx, human_vocab)
    inputs = np.array(list(map(lambda x: tf.keras.utils.to_categorical(x, num_classes=len(human_vocab)), inputs))).swapaxes(0,1)
    inputs = np.swapaxes(inputs, 0, 1)
    inputs = np.expand_dims(inputs, axis=0)
    prediction = model.predict([inputs, s00, c00])
    prediction = np.argmax(prediction, axis = -1)
    output = [machine_vocab_inv[int(i)] for i in prediction]
    print("source:", example)
    print("output:", ''.join(output),"\n")

source: aug 18 1997
output: 1997-08-18 

source: 5th dec 1972
output: 1972-12-05 

source: monday 10th of Jan 1992
output: 1992-01-11 

source: 20th day of mar 2016
output: 2016-03-20 

source: may 4th 1992
output: 1992-05-04 

