In [30]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set(); sns.set_style('dark')

import os
import datetime

from sklearn.model_selection import train_test_split
import tensorflow as tf

In [31]:
from datetime import date

# Month names indexed by (month number - 1); used to spell out the month in the
# human-readable date strings.
MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def random_dates(n_dates):
    """Generate `n_dates` random dates in two textual formats.

    Dates are drawn with NumPy's global RNG from 1000-01-01 through 9999-12-31,
    both endpoints included.

    Args:
        n_dates: number of dates to generate.

    Returns:
        A pair `(x, y)` of equal-length lists of strings. `x` holds dates such
        as "September 17, 2009" (full month name, zero-padded day, year) and
        `y` holds the same dates in ISO format, e.g. "2009-09-17".
    """
    min_date = date(1000, 1, 1).toordinal()
    max_date = date(9999, 12, 31).toordinal()

    # randint's upper bound is exclusive, so pass max_date + 1 to keep the last
    # day of the range reachable.
    ordinals = np.random.randint(min_date, max_date + 1, size=n_dates)
    dates = [date.fromordinal(int(ordinal)) for ordinal in ordinals]

    x = [MONTHS[dt.month - 1] + " " + dt.strftime("%d, %Y") for dt in dates]
    y = [dt.isoformat() for dt in dates]
    return x, y

In [32]:
# Input vocabulary: every distinct character that can occur in a spelled-out
# date (month-name letters, digits, comma, space), in sorted order.
INPUT_CHARS = "".join(sorted(set("".join(MONTHS)) | set("0123456789, ")))
# Output vocabulary: the characters of an ISO date (digits and "-"), sorted.
# The characters are already distinct, so sorting the string directly is enough.
OUTPUT_CHARS = "".join(sorted("0123456789-"))
def date_str_to_ids(date_str, chars=INPUT_CHARS):
    """Encode a date string as a list of character IDs.

    Each character is replaced by its position in `chars` (by default the input
    vocabulary). A character that is missing from `chars` makes `chars.index`
    raise ValueError.
    """
    ids = []
    for ch in date_str:
        ids.append(chars.index(ch))
    return ids

In [33]:
# Encode a sample ISO date with the output vocabulary. A literal is used because
# no `y` is defined in any earlier cell shown here; "2440-01-09" is the string
# that encodes to the IDs displayed below.
date_str_to_ids("2440-01-09", OUTPUT_CHARS)


[3, 5, 5, 1, 0, 1, 2, 0, 1, 10]

In [34]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    """Turn date strings into a dense tensor of 1-based character IDs.

    Every string is encoded with `date_str_to_ids`, the IDs are shifted up by
    one so that 0 stays free as the padding token ID, and the ragged rows are
    converted to a dense tensor with `to_tensor()`.
    """
    encoded = tf.ragged.constant(
        [date_str_to_ids(date_str, chars) for date_str in date_strs],
        ragged_rank=1,
    )
    shifted = encoded + 1  # reserve ID 0 for padding
    return shifted.to_tensor()

def create_dataset(n_dates):
    """Build an (inputs, targets) pair of ID tensors for `n_dates` random dates.

    The inputs are the spelled-out dates encoded with INPUT_CHARS; the targets
    are the matching ISO dates encoded with OUTPUT_CHARS.
    """
    date_texts, iso_texts = random_dates(n_dates)
    inputs = prepare_date_strs(date_texts, INPUT_CHARS)
    targets = prepare_date_strs(iso_texts, OUTPUT_CHARS)
    return inputs, targets

# Seed NumPy's global RNG so the dates sampled by create_dataset() are reproducible.
np.random.seed(42)

# The three splits are drawn one after another from the same seeded stream, so
# the order of these calls determines which dates land in which split.
X_train, Y_train = create_dataset(10000)
X_valid, Y_valid = create_dataset(2000)
X_test, Y_test = create_dataset(2000)


In [35]:
# Sanity-check the shapes of the input and target tensors for all three splits.
X_train.shape, Y_train.shape, X_valid.shape, Y_valid.shape, X_test.shape, Y_test.shape

(TensorShape([10000, 18]),
 TensorShape([10000, 10]),
 TensorShape([2000, 18]),
 TensorShape([2000, 10]),
 TensorShape([2000, 18]),
 TensorShape([2000, 10]))

In [36]:
# Show the encoded training targets (IDs are 1-based; 0 is reserved for padding).
Y_train

<tf.Tensor: shape=(10000, 10), dtype=int32, numpy=
array([[ 9,  2,  9, ...,  1,  4,  2],
       [10,  7,  9, ...,  1,  3,  7],
       [ 9,  3,  2, ...,  1,  3,  3],
       ...,
       [11,  9,  8, ...,  1,  5,  2],
       [ 3,  3,  4, ...,  1,  4,  9],
       [ 9, 10,  5, ...,  1,  4, 11]])>

In [45]:
max_output_length = Y_train.shape[1]
embedding_size = 32

# Seed TensorFlow's global RNG before any layer is created so that weight
# initialisation starts from a fixed seed and runs are repeatable (as far as
# the backend allows).
tf.random.set_seed(42)

# Encoder: embed each input character ID, then summarise the whole sequence
# into a single vector with a GRU. The "+ 1" accounts for padding ID 0.
encoder = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1,
                           output_dim=embedding_size,
                           input_shape=[None]),
    tf.keras.layers.GRU(128)
])
# Decoder: a GRU that emits one vector per output position, followed by a
# softmax over the output vocabulary (plus the padding ID).
decoder = tf.keras.Sequential([
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation='softmax')
])

# The encoder's vector is repeated once per output position and fed to the
# decoder, which therefore always produces max_output_length characters.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.RepeatVector(max_output_length),
    decoder
])

# Targets are integer IDs, hence the sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Confirm that the per-position class scores line up with the target tensor.
print("Model output shape:", model.output.shape)
print("Target labels shape:", Y_train.shape)


Model: "sequential_41"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_39 (Sequential)  (None, 128)               63456     
                                                                 
 repeat_vector_5 (RepeatVect  (None, 10, 128)          0         
 or)                                                             
                                                                 
 sequential_40 (Sequential)  (None, 10, 12)            100620    
                                                                 
Total params: 164,076
Trainable params: 164,076
Non-trainable params: 0
_________________________________________________________________
Model output shape: (None, 10, 12)
Target labels shape: (10000, 10)


In [46]:
# Train for 10 epochs; validation_data makes Keras report loss/accuracy on the
# validation split alongside the training metrics.
history = model.fit(X_train, Y_train, epochs=10, validation_data=(X_valid, Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [47]:
# Evaluate on the held-out test split; the result is [loss, accuracy], matching
# the loss and metric passed to compile().
model.evaluate(X_test, Y_test)



[0.007861150428652763, 0.9999499917030334]

In [76]:
def ids_to_date_strs(ids, chars=OUTPUT_CHARS):
    """Decode sequences of character IDs back into strings.

    IDs are 1-based positions in `chars`; ID 0 (the padding ID) is rendered
    as "?". Returns one string per sequence in `ids`.
    """
    date_strs = []
    for id_row in ids:
        # Prepending "?" shifts the vocabulary by one so ID k maps to chars[k - 1].
        symbols = (("?" + chars)[token_id] for token_id in id_row)
        date_strs.append("".join(symbols))
    return date_strs

In [None]:
# Encode two unseen dates; the shorter string is zero-padded to the length of
# the longer one by prepare_date_strs().
X_new = prepare_date_strs(["September 17, 2009", "July 14, 1789"])

# Pick the most probable character ID at every output position, then decode.
ids = np.argmax(model.predict(X_new), axis=-1)
for date_str in ids_to_date_strs(ids):
    print(date_str)

2009-12-17
1789-08-14


array([[4, 2, 4, 2, 1, 3, 4, 1, 5, 3]], dtype=int64)