In [1]:
import numpy as np
import matplotlib.pyplot as plot
import tensorflow as tf
from tensorflow.keras import layers
from datetime import date

In [2]:
# English month names; index i holds month number i+1.
months = ["January","February","March","April","May","June","July","August",
          "September","October","November","December"]

def random_dates(n_dates,min_date=date(1000,1,1),max_date=date(9999,12,31)):
  """Draw random dates and render each one in two textual formats.

  Args:
    n_dates: number of dates to draw.
    min_date: earliest date that may be drawn (inclusive).
    max_date: latest date that may be drawn (inclusive).

  Returns:
    A pair ``(X, y)`` of equally long lists of strings. ``X[i]`` is the date
    written as ``"<Month name> DD, YYYY"`` and ``y[i]`` is the same date in
    ISO format ``"YYYY-MM-DD"``.

  Raises:
    ValueError: if ``min_date`` is later than ``max_date``.
  """
  min_ordinal = min_date.toordinal()
  max_ordinal = max_date.toordinal()
  if min_ordinal > max_ordinal:
    raise ValueError("min_date must not be later than max_date")

  # np.random.randint excludes its upper bound, so add 1 to keep max_date
  # itself reachable.
  rand_ordinals = np.random.randint(min_ordinal,max_ordinal+1,size=n_dates)
  dates = [date.fromordinal(int(x)) for x in rand_ordinals]

  # Format day and year explicitly: strftime("%Y") zero-padding differs between
  # platforms for years below 1000, which would break the fixed-width layout.
  X = ["{} {:02d}, {:04d}".format(months[dt.month-1],dt.day,dt.year)
       for dt in dates]
  y = [dt.isoformat() for dt in dates]

  return X,y

In [3]:
# Draw three random dates to inspect both text formats side by side.
X_example,y_example = random_dates(3)

In [4]:
X_example

['May 07, 3010', 'January 08, 4571', 'August 11, 5645']

In [5]:
y_example

['3010-05-07', '4571-01-08', '5645-08-11']

In [6]:
# Input vocabulary: every distinct character that occurs in a month name, plus
# the digits, the comma and the space, in sorted order.
input_chars = "".join(sorted(set("".join(months) + "0123456789, ")))

In [7]:
input_chars

' ,0123456789ADFJMNOSabceghilmnoprstuvy'

In [8]:
# Output vocabulary: the characters an ISO "YYYY-MM-DD" date is made of.
output_chars = "0123456789-"

In [9]:
def date_strs_to_ids(date_str,chars=input_chars):
  """Encode a string as a list of 1-based character ids.

  Each character is replaced by its position in `chars` plus one, which leaves
  id 0 unused by real characters. A character that is not in `chars` makes
  `chars.index` raise ValueError.
  """
  ids = []
  for ch in date_str:
    ids.append(chars.index(ch)+1)
  return ids

In [10]:
date_strs_to_ids(X_example[0])

[17, 21, 38, 1, 3, 10, 2, 1, 6, 3, 4, 3]

In [11]:
def preprocess_strs_to_tensors(dates,chars=input_chars,max_length=None):
  """Encode a batch of strings as one zero-padded 2-D tensor of character ids.

  Args:
    dates: iterable of strings to encode.
    chars: vocabulary passed on to `date_strs_to_ids`.
    max_length: optional fixed number of columns for the result. When None
      (the default) the width is the length of the longest string in `dates`,
      so it varies from batch to batch. Pass the training width at inference
      time so that padding matches what the model was trained on. Rows longer
      than `max_length` are truncated.

  Returns:
    A dense tensor with one row per string, right-padded with 0.
  """
  X = [date_strs_to_ids(dt,chars=chars) for dt in dates]
  X = tf.ragged.constant(X,ragged_rank=1)
  if max_length is None:
    return X.to_tensor()
  # A fixed column count pads (or truncates) every row to the same width.
  return X.to_tensor(shape=[None,max_length])

In [12]:
preprocess_strs_to_tensors(y_example,chars=output_chars)

<tf.Tensor: shape=(3, 10), dtype=int32, numpy=
array([[ 4,  1,  2,  1, 11,  1,  6, 11,  1,  8],
       [ 5,  6,  8,  2, 11,  1,  2, 11,  1,  9],
       [ 6,  7,  5,  6, 11,  1,  9, 11,  2,  2]], dtype=int32)>

In [13]:
preprocess_strs_to_tensors(random_dates(5)[0])

<tf.Tensor: shape=(5, 18), dtype=int32, numpy=
array([[20, 24, 32, 35, 24, 29, 22, 24, 33,  1,  5,  4,  2,  1,  4,  3,
         3, 12],
       [16, 21, 30, 36, 21, 33, 38,  1,  3,  7,  2,  1, 12,  9,  5,  6,
         0,  0],
       [17, 21, 38,  1,  4,  8,  2,  1,  7, 10, 12,  7,  0,  0,  0,  0,
         0,  0],
       [20, 24, 32, 35, 24, 29, 22, 24, 33,  1,  3,  7,  2,  1, 11,  7,
        11,  8],
       [13, 36, 25, 36, 34, 35,  1,  5, 12,  2,  1, 10, 11,  6,  9,  0,
         0,  0]], dtype=int32)>

In [14]:
def create_dataset(n_dates):
  """Generate `n_dates` random dates and return them as (inputs, targets).

  The inputs are the long-form date strings encoded with `input_chars`; the
  targets are the ISO strings encoded with `output_chars`.
  """
  input_strs,target_strs = random_dates(n_dates)
  return (preprocess_strs_to_tensors(input_strs,chars=input_chars),
          preprocess_strs_to_tensors(target_strs,chars=output_chars))

In [15]:
# Seed NumPy before drawing any data: random_dates uses np.random, so without
# this the three splits differ on every run and results cannot be reproduced.
np.random.seed(12)

# Independent random draws for training, validation and test.
X_train,y_train = create_dataset(10000)
X_val,y_val = create_dataset(2000)
X_test,y_test = create_dataset(2000)

In [16]:
X_train

<tf.Tensor: shape=(10000, 18), dtype=int32, numpy=
array([[16, 21, 30, ..., 10,  0,  0],
       [16, 21, 30, ...,  9,  0,  0],
       [19, 23, 35, ...,  7,  0,  0],
       ...,
       [15, 24, 22, ..., 12, 11,  0],
       [16, 36, 28, ...,  0,  0,  0],
       [16, 21, 30, ...,  4,  0,  0]], dtype=int32)>

In [17]:
embedding_dim = 32
# Every target row has the same width, so this is the number of decoder steps.
max_output_length = y_train.shape[1]

np.random.seed(12)
tf.random.set_seed(12)

# Encoder: embed the character ids and summarise the whole input string as the
# final LSTM state. The vocabulary size is +1 because id 0 is the padding id.
# NOTE: the embedding does not mask padding, so inputs at inference time must
# be padded to the same width as the training inputs.
encoder = tf.keras.models.Sequential([
    layers.Embedding(input_dim=len(input_chars)+1,
                     output_dim=embedding_dim,
                     input_shape=[None]),
    layers.LSTM(units=128)
],name="encoder")

# Decoder: emit one softmax distribution over output ids per time step.
decoder = tf.keras.models.Sequential([
    layers.LSTM(128,return_sequences=True),
    layers.Dense(len(output_chars)+1,activation="softmax")
],name="decoder")

# The encoder vector is repeated once per output position and fed to the
# decoder as its input sequence.
model = tf.keras.models.Sequential([
    encoder,
    layers.RepeatVector(max_output_length),
    decoder
],name="date2date_1")

# `metrics` is documented as a list; a bare string is rejected by newer Keras
# releases, so wrap it.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

model.summary()

Model: "date2date_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Sequential)        (None, 128)               83680     
                                                                 
 repeat_vector (RepeatVector  (None, 10, 128)          0         
 )                                                               
                                                                 
 decoder (Sequential)        (None, 10, 12)            133132    
                                                                 
Total params: 216,812
Trainable params: 216,812
Non-trainable params: 0
_________________________________________________________________


In [18]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          1248      
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
Total params: 83,680
Trainable params: 83,680
Non-trainable params: 0
_________________________________________________________________


In [19]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 10, 128)           131584    
                                                                 
 dense (Dense)               (None, 10, 12)            1548      
                                                                 
Total params: 133,132
Trainable params: 133,132
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train for 20 epochs with the validation split passed for monitoring; the
# object returned by fit is kept in history_1.
history_1 = model.fit(X_train,y_train,
                      epochs=20,
                      validation_data=(X_val,y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
# Held-out performance: the compiled loss followed by the accuracy metric.
model.evaluate(X_test,y_test)



[0.0036633601412177086, 0.9998499751091003]

In [22]:
new_dates = ["September 15, 1972","July 15, 1979"]
new_dates = preprocess_strs_to_tensors(new_dates)
# The model was trained on inputs zero-padded to X_train's width and its
# embedding does not mask padding, so right-pad narrower batches to that width.
n_missing = X_train.shape[1] - new_dates.shape[1]
if n_missing > 0:
  new_dates = tf.pad(new_dates,[[0,0],[0,n_missing]])
pred_dates = model.predict(new_dates)

In [23]:
# Pick the highest-scoring class id at every output position.
pred = np.argmax(pred_dates,axis=-1)

In [24]:
pred

array([[ 2, 10,  8,  3, 11,  1, 10, 11,  2,  6],
       [ 2, 10,  8, 10, 11,  1,  8, 11,  2,  6]])

In [25]:
# Ids are 1-based, so shift by one to index into output_chars.
"".join(output_chars[char_id-1] for char_id in pred[0])

'1972-09-15'

In [26]:
# Ids are 1-based, so shift by one to index into output_chars.
"".join(output_chars[char_id-1] for char_id in pred[1])

'1979-07-15'

In [27]:
def ids_to_dates(preds,chars=output_chars):
  """Print the decoded string for each row of per-position class scores.

  Args:
    preds: array whose last axis holds one score per class id; the argmax
      over that axis is taken as the predicted id at each position.
    chars: vocabulary used for decoding. Ids are 1-based indices into it;
      id 0 is the padding id and is skipped rather than decoded.

  Returns:
    None. One line is printed per row of `preds`.
  """
  pred_ids = np.argmax(preds,axis=-1)
  for row in pred_ids:
    # Decode with the `chars` argument (not the global vocabulary) and drop
    # id 0, which would otherwise wrap around to chars[-1].
    print("".join([chars[char_id-1] for char_id in row if char_id != 0]))

In [28]:
ids_to_dates(pred_dates)

1972-09-15
1979-07-15


In [29]:
new_dates = ["November 12, 2002","May 05, 2003"]
new_dates = preprocess_strs_to_tensors(new_dates)
# This batch is narrower than the training inputs (its longest string is
# shorter than the longest month name allows). The embedding does not mask
# padding, so the model only ever saw these strings followed by zeros up to
# X_train's width; right-pad to that width before predicting.
n_missing = X_train.shape[1] - new_dates.shape[1]
if n_missing > 0:
  new_dates = tf.pad(new_dates,[[0,0],[0,n_missing]])
pred_dates = model.predict(new_dates)

In [30]:
ids_to_dates(pred_dates)

2002-12-12
2003-05-05
