<a href="https://colab.research.google.com/github/Rstam59/TaskDataRepoForStudents/blob/main/Date_conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import tensorflow as tf
from datetime import date
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, TextVectorization
from tensorflow.keras.models import Model


MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def random_dates(n_dates):
    """Generate random dates, each in two textual forms.

    Dates are drawn with NumPy's global random state from 1000-01-01 through
    9999-12-31, both ends included.

    Args:
        n_dates: Number of dates to generate.

    Returns:
        A tuple ``(x, y)`` of two lists holding ``n_dates`` strings each.
        ``x`` contains dates such as "September 20, 7075"; ``y`` contains the
        same dates in ISO form, such as "7075-09-20".
    """
    min_date = date(1000, 1, 1).toordinal()
    max_date = date(9999, 12, 31).toordinal()
    # randint's upper bound is exclusive, so +1 keeps max_date itself reachable.
    ordinals = np.random.randint(min_date, max_date + 1, size=n_dates)
    dates = [date.fromordinal(int(ordinal)) for ordinal in ordinals]
    # Month names are taken from MONTHS; strftime("%B") would be locale-dependent.
    x = [f"{MONTHS[d.month - 1]} {d.strftime('%d, %Y')}" for d in dates]
    y = [d.strftime("%Y-%m-%d") for d in dates]
    return x, y

# Seed Python's `random`, NumPy and TensorFlow in one call so that both the
# generated dates and the model's weight initialisation are repeatable.
# (NumPy receives the same seed, 42, as before.)
tf.keras.utils.set_random_seed(42)
x_texts, y_texts = random_dates(10000)

# Add "startofseq" and "endofseq" markers.
# NOTE(review): the vectorizers used on these strings split on characters, so
# "startofseq " becomes 11 separate tokens and the decoder input is offset from
# the target by 11 positions rather than by one -- confirm this is intended.
decoder_inputs_texts = [f"startofseq {text}" for text in y_texts]
decoder_targets_texts = [f"{text} endofseq" for text in y_texts]


# Longest string (counted in characters) of each text collection.
max_input_len = max(map(len, x_texts))
max_decoder_input_len = max(map(len, decoder_inputs_texts))
max_decoder_target_len = max(map(len, decoder_targets_texts))

# Both vectorizers share these settings: one token per character, text left
# untouched (no standardization), integer ids as output.
char_vectorizer_options = dict(output_mode='int', split='character', standardize=None)

input_vectorizer = TextVectorization(
    output_sequence_length=max_input_len,
    **char_vectorizer_options
)
# A single decoder vectorizer serves decoder inputs and targets, so its output
# length is the longer of the two.
decoder_vectorizer = TextVectorization(
    output_sequence_length=max(max_decoder_input_len, max_decoder_target_len),
    **char_vectorizer_options
)

# Build the vocabularies from the text
input_vectorizer.adapt(x_texts)
decoder_vectorizer.adapt(decoder_inputs_texts + decoder_targets_texts)

# Turn every text collection into a tensor of token ids
encoder_input = input_vectorizer(tf.constant(x_texts))
decoder_input = decoder_vectorizer(tf.constant(decoder_inputs_texts))
decoder_target = decoder_vectorizer(tf.constant(decoder_targets_texts))

# Vocabulary sizes determine the embedding inputs and the softmax width
input_vocab_size = len(input_vectorizer.get_vocabulary())
target_vocab_size = len(decoder_vectorizer.get_vocabulary())

# One-hot encode the targets, one class per decoder vocabulary entry
decoder_target = tf.one_hot(decoder_target, depth=target_vocab_size)


latent_dim = 256

# Encoder: embed the input token ids, run them through an LSTM and keep only
# its final hidden and cell states.
encoder_inputs = Input(shape=(None,), name="encoder_inputs")
encoder_embedding = Embedding(input_vocab_size, latent_dim, mask_zero=True)
x = encoder_embedding(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(x)
encoder_states = [state_h, state_c]

# Decoder: a second LSTM, started from the encoder states, emits one softmax
# distribution over the target vocabulary per position.
decoder_inputs = Input(shape=(None,), name="decoder_inputs")
decoder_embedding = Embedding(target_vocab_size, latent_dim, mask_zero=True)
x = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True)
x = decoder_lstm(x, initial_state=encoder_states)
output_layer = Dense(target_vocab_size, activation="softmax")
decoder_outputs = output_layer(x)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()



# Train on (encoder input, decoder input) -> decoder target, holding out the
# last 20% of the samples for validation.
training_inputs = [encoder_input, decoder_input]
model.fit(
    x=training_inputs,
    y=decoder_target,
    validation_split=0.2,
    epochs=5,
    batch_size=64,
)


Epoch 1/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.5202 - loss: 1.6236 - val_accuracy: 0.7992 - val_loss: 0.5284
Epoch 2/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8274 - loss: 0.4557 - val_accuracy: 0.9016 - val_loss: 0.2649
Epoch 3/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9182 - loss: 0.2188 - val_accuracy: 0.9606 - val_loss: 0.1330
Epoch 4/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9714 - loss: 0.1083 - val_accuracy: 0.9913 - val_loss: 0.0564
Epoch 5/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9937 - loss: 0.0443 - val_accuracy: 0.9977 - val_loss: 0.0242


<keras.src.callbacks.history.History at 0x78d685b83390>

In [14]:
import tensorflow as tf
import numpy as np

def translate_date(encoder_input_data, decoder_input_data, model, decoder_vectorizer):
    """
    Decodes the model's prediction for a single sample into a date string.

    The model is run once on the given encoder and decoder inputs (there is no
    step-by-step generation), the most likely token is taken at every output
    position, and the tokens are mapped back to text through the decoder
    vocabulary.

    Args:
        encoder_input_data: Token ids of one encoder sequence (no batch axis).
        decoder_input_data: Token ids of the matching decoder input sequence
            (no batch axis).
        model: The trained seq2seq model. `model.predict` must return an array
            indexed as (batch, position, vocabulary entry).
        decoder_vectorizer: The TextVectorization layer used for the decoder;
            only its `get_vocabulary()` method is used.

    Returns:
        The decoded text with padding tokens and the "startofseq " /
        " endofseq" markers removed. Anything predicted after the first
        " endofseq" is discarded.
    """
    # The model expects batches, so give each sequence a leading batch axis of 1.
    # np.expand_dims accepts tensors, NumPy arrays and plain lists alike.
    prediction = model.predict([np.expand_dims(encoder_input_data, axis=0),
                                np.expand_dims(decoder_input_data, axis=0)])

    # Most likely vocabulary index at each position of the only batch element
    predicted_indices = np.argmax(prediction, axis=2)[0]

    # Fetch the vocabulary once instead of once per predicted token.
    vocabulary = decoder_vectorizer.get_vocabulary()

    # Index 0 is skipped: it is the padding entry of the vocabulary.
    translated_text = "".join(vocabulary[index]
                              for index in predicted_indices
                              if index != 0)

    # Remove "startofseq", then cut the text at "endofseq" so that characters
    # predicted past the end marker do not leak into the result.
    translated_text = translated_text.replace("startofseq ", "")
    translated_text = translated_text.split(" endofseq", 1)[0]

    return translated_text

In [15]:
# Translate the first sample of the dataset and show the decoded text
sample_index = 0
translated_date = translate_date(
    encoder_input[sample_index],
    decoder_input[sample_index],
    model,
    decoder_vectorizer,
)
print(translated_date)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
7075-09-20


In [17]:
# Vocabulary learned by the encoder-side vectorizer (position = token id)
input_vocabulary = input_vectorizer.get_vocabulary()

# Token ids of the first encoder sample, as a NumPy array
encoded_date = encoder_input[0].numpy()

# Map every non-padding id (anything but 0) back to its character
original_date = "".join(
    input_vocabulary[token_id] for token_id in encoded_date[encoded_date != 0]
)

# Show the raw ids next to the text they decode to
print(f"Encoded input: {encoded_date}")
print(f"Original date: {original_date}")

Encoded input: [39  4 29 22  4 23 18  4  7  2  6  8  3  2 10  8 10 15]
Original date: September 20, 7075
