# Assigment 1, Steve Veldman, 6/20/2024

Question 4: 20 points
* Build from scratch in PyTorch, using LSTM, encoder-decoder network that translates between two date formats,\
    for example January 5, 2025: 5/1/2025 ← 2025/1/5.
* Generate the training dataset and save it to file
* Implement your own Dataset class to load the data
* Train your model, demonstrate how well it does the translation.

In [1]:
from io import open

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
import datetime
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Generate Data and Save Training and Validation Sets:

In [2]:
# Generate Dataset:

# generate all dates between Janaury 1, 1582 (year the Gregorian calendar was introduced) and December 31, 2024:
# generate in format '2019-04-22'

months_30_days = [4, 6, 9, 11]
months_31_days = [1, 3, 5, 7, 8, 10, 12]

dates_numeric = []
for y in range(1582, 2025):
  year = y
  for m in range(1, 13):
    month = m
    if month in months_30_days:
      for d in range(1, 31):
        day = d
        date = datetime.date(year, month, day)
        date = str(date)
        dates_numeric.append(date)
    elif month in months_31_days:
      for d in range(1, 32):
        day = d
        date = datetime.date(year, month, day)
        date = str(date)
        dates_numeric.append(date)
    elif month == 2:
      for d in range(1, 29):
        day = d
        date = datetime.date(year, month, day)
        date = str(date)
        dates_numeric.append(date)

In [3]:
# Convert dates to format "April 22, 2019"
dates_original = []

for i in range(0, len(dates_numeric)):
  date_str = dates_numeric[i]
  date_obj = datetime.datetime.strptime(date_str, "%Y-%m-%d")
  formatted_date = date_obj.strftime("%B %d, %Y")
  formatted_date = str(formatted_date)
  dates_original.append(formatted_date)

In [4]:
# Create Train and Validate Splits:
orig_train, orig_val, num_train, num_val = train_test_split(dates_original, dates_numeric, test_size=0.2, random_state=47)

In [5]:
# Save datasets to csv file:
with open('dates_train.csv', 'w', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(['num_train', 'orig_train'])
  for i in range(len(num_train)):
    writer.writerow([num_train[i], orig_train[i]])

with open('dates_val.csv', 'w', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(['num_val', 'orig_val'])
  for i in range(len(num_val)):
    writer.writerow([num_val[i], orig_val[i]])

In [6]:
# Implement Custom Dataset Class to Capture Data as Pairs:

class Pairs(Dataset):
    def __init__(self, data_file):
        self.data = pd.read_csv(data_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        date_numeric = self.data.iloc[idx,0]
        date_original = self.data.iloc[idx,1]
        return date_original, date_numeric

In [7]:
# Load Data as Pairs:
train_pairs = Pairs('dates_train.csv')
val_pairs = Pairs('dates_val.csv')

In [8]:
train_pairs[1][0]

'June 11, 1820'

In [9]:
train_pairs[1][1]

'1820-06-11'

### Create Vocabularies and Encodings:

In [10]:
# Create Vocabularies:
numeric_vocab = set()
original_vocab = set()

for i in range(len(train_pairs)):
    numeric_vocab.update(list(train_pairs[i][1]))
    original_vocab.update(train_pairs[i][0].split())

print("Numeric vocabulary size:", len(numeric_vocab))
print("Original vocabulary size:", len(original_vocab))

Numeric vocabulary size: 11
Original vocabulary size: 486


In [11]:
# Add Start of Sequence Token ("$") and End of Sequence Token ("#"):
numeric_vocab.update(["$","#","<PAD>"])
original_vocab.update(["$","#","<PAD>"])

print("Numeric vocabulary size:", len(numeric_vocab))
print("Original vocabulary size:", len(original_vocab))

Numeric vocabulary size: 14
Original vocabulary size: 489


In [12]:
# Creating character/word to token mapping:
orig_word2token = {word: i for i, word in enumerate(original_vocab)}
num_char2token = {char: i for i, char in enumerate(numeric_vocab)}

# Creating token to character/word mapping
orig_token2word = {i: word for word, i in orig_word2token.items()}
num_token2char = {i: char for char, i in num_char2token.items()}

In [13]:
# Test tokenizations:
orig_example = "June 25, 2024"
num_example = "2024-06-25"

# Encoding
orig_encoded = np.array([orig_word2token[word] for word in orig_example.split()], dtype=np.int32)
num_encoded = np.array([num_char2token[word] for word in list(num_example)], dtype=np.int32)

print('Original date encoded:', orig_encoded)
print('Numeric date encoded:', num_encoded)

# Decoding
print('Decoded Original:', ' '.join([orig_token2word[i] for i in orig_encoded]))
print('Decoded Numeric:', ''.join([num_token2char[i] for i in num_encoded]))

Original date encoded: [320 238 390]
Numeric date encoded: [12 13 12  1  4 13  0  4 12  2]
Decoded Original: June 25, 2024
Decoded Numeric: 2024-06-25


### Implement Dataset Class for Training Encoder/Decoder Model and Create DataLoaders:

In [14]:
# Implement Custom Dataset Class for Translation:
class DateTranslationDataset(Dataset):
    def __init__(self, pairs, orig_word2token, num_char2token):
        self.pairs = pairs
        self.orig_word2token = orig_word2token
        self.num_char2token = num_char2token

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        orig, num = self.pairs[idx]
        orig_tensor = torch.tensor([self.orig_word2token[word] for word in orig.split()]
                                  + [self.orig_word2token['#']], dtype=torch.long)
        num_tensor = torch.tensor([self.num_char2token[char] for char in list(num)]
                                  + [self.num_char2token['#']], dtype=torch.long)
        return orig_tensor, num_tensor

# Custom collate function to handle padding
def collate_fn(batch):
    orig_batch, num_batch = zip(*batch)
    orig_batch_padded = pad_sequence(orig_batch, batch_first=True, padding_value=orig_word2token["<PAD>"])
    num_batch_padded = pad_sequence(num_batch, batch_first=True, padding_value=num_char2token["<PAD>"])
    return orig_batch_padded, num_batch_padded

In [15]:
# Create train and test datasets and DataLoader
train_dataset = DateTranslationDataset(train_pairs, orig_word2token, num_char2token)
val_dataset = DateTranslationDataset(val_pairs, orig_word2token, num_char2token)

batch_size = 73
translation_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                    shuffle=True,  drop_last=True, collate_fn=collate_fn)

print("Translation samples: ", len(train_dataset))
print("Translation batches: ", len(translation_dataloader))

Translation samples:  129356
Translation batches:  1772


### Create Encoder and Decoder Classes and Initialize Models:

In [16]:
# Create Encoder and Decoder Classes:

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers,
                            batch_first=True)

    def forward(self, x):
        # Reversing the sequence of indices
        x = torch.flip(x, [1])
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x)
        out, (hidden, cell) = self.lstm(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

In [17]:
# Hyperparameters
orig_vocab_size = len(orig_word2token)
num_vocab_size = len(num_char2token)
#embed_size = 489
#hidden_size = 987
embed_size = 500
hidden_size = 1000
num_layers = 1

# Initialize the models
encoder = Encoder(orig_vocab_size, 14, hidden_size, num_layers).to(device)
decoder = Decoder(num_vocab_size, embed_size, hidden_size, num_layers).to(device)

### Train Model and Evaluate:

In [18]:
# Train Model:

# Loss Function (exclude padding)
loss_fn = nn.CrossEntropyLoss(ignore_index=num_char2token["<PAD>"])

# Optimizers
encoder_optimizer = optim.AdamW(encoder.parameters())
decoder_optimizer = optim.AdamW(decoder.parameters())

# Number of epochs
num_epochs = 3

# Training Loop
encoder.train()
decoder.train()

for epoch in range(num_epochs):
    for i, (input_tensor, target_tensor) in enumerate(translation_dataloader):
        input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)
        #input_tensor, target_tensor = input_tensor, target_tensor

        # Zero gradients of both optimizers
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        target_length = target_tensor.size(1)

        # Encoder
        _, encoder_hidden, encoder_cell = encoder(input_tensor)

        # Decoder
        decoder_input = torch.full((batch_size, 1), num_char2token['$'], dtype=torch.long).to(device)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        # Randomly select a word index from the target sequence
        #random_word_index = random.randint(0, target_length - 1)

        loss = 0

        for di in range(target_length):
            logits, decoder_hidden, decoder_cell  = decoder(decoder_input, decoder_hidden, decoder_cell)
            #if di == random_word_index:
            #    loss = loss_fn(logits, target_tensor[:, di])
            #    break  # Only compute loss for the randomly selected word
            loss += loss_fn(logits, target_tensor[:,di])
            decoder_input = target_tensor[:, di].reshape(batch_size, 1)  # Teacher forcing


        # Backpropagation
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        if i % 100 == 0:  # Print loss every 10 batches
            print(f'Epoch {epoch}, Batch {i}, Loss: {loss.item() / target_length:.4f}')

Epoch 0, Batch 0, Loss: 2.6297
Epoch 0, Batch 100, Loss: 1.0863
Epoch 0, Batch 200, Loss: 0.8011
Epoch 0, Batch 300, Loss: 0.4254
Epoch 0, Batch 400, Loss: 0.1900
Epoch 0, Batch 500, Loss: 0.0314
Epoch 0, Batch 600, Loss: 0.0094
Epoch 0, Batch 700, Loss: 0.0037
Epoch 0, Batch 800, Loss: 0.0024
Epoch 0, Batch 900, Loss: 0.0013
Epoch 0, Batch 1000, Loss: 0.0008
Epoch 0, Batch 1100, Loss: 0.0006
Epoch 0, Batch 1200, Loss: 0.0005
Epoch 0, Batch 1300, Loss: 0.0004
Epoch 0, Batch 1400, Loss: 0.0003
Epoch 0, Batch 1500, Loss: 0.0003
Epoch 0, Batch 1600, Loss: 0.0003
Epoch 0, Batch 1700, Loss: 0.0002
Epoch 1, Batch 0, Loss: 0.0002
Epoch 1, Batch 100, Loss: 0.0103
Epoch 1, Batch 200, Loss: 0.0011
Epoch 1, Batch 300, Loss: 0.0007
Epoch 1, Batch 400, Loss: 0.0004
Epoch 1, Batch 500, Loss: 0.0004
Epoch 1, Batch 600, Loss: 0.0003
Epoch 1, Batch 700, Loss: 0.0003
Epoch 1, Batch 800, Loss: 0.0002
Epoch 1, Batch 900, Loss: 0.0002
Epoch 1, Batch 1000, Loss: 0.0002
Epoch 1, Batch 1100, Loss: 0.0001
Epoc

In [19]:
# Function to convert between formats:
def convert(encoder, decoder, input_date, orig_word2token, num_token2char, max_length=15):
    encoder.eval()
    decoder.eval()
    with torch.inference_mode():
        # Tokenize and encode the sentence
        input_tensor = torch.tensor([orig_word2token[word] for word in input_date.split()]
                                    + [orig_word2token["#"]], dtype=torch.long)
        input_tensor = input_tensor.view(1, -1).to(device)  # batch_first=True

        # Pass the input through the encoder
        _, encoder_hidden, encoder_cell = encoder(input_tensor)

        # Initialize the decoder input with the SOS token
        decoder_input = torch.tensor([[num_char2token["$"]]], dtype=torch.long)  # SOS
        # Initialize the hidden state of the decoder with the encoder's hidden state
        decoder_hidden, decoder_cell = encoder_hidden, encoder_cell

        # Decoding the sentence
        output_string = []
        last_char = torch.tensor([[num_char2token["$"]]]).to(device)
        for di in range(max_length):
            logits, decoder_hidden, decoder_cell = decoder(last_char, decoder_hidden, decoder_cell)
            next_token = logits.argmax(dim=1) # greedy
            last_char = torch.tensor([[next_token]]).to(device)
            if next_token.item() == num_char2token["#"]:
                break
            else:
                output_string.append(num_token2char.get(next_token.item()))

        return ''.join(output_string)

In [20]:
# Demonstrate model on sample date:
original_date = "April 26, 1992"
converted_date = convert(encoder, decoder, original_date, orig_word2token, num_token2char)
print("Translated:", converted_date)

Translated: 1992-04-26


In [21]:
# Make predictions on validation dataset:
original = [val_pairs[i][0] for i in range(len(val_pairs))]
numeric = [val_pairs[i][1] for i in range(len(val_pairs))]
converted = [convert(encoder, decoder, original[i], orig_word2token, num_token2char) for i in range(len(original))]

In [22]:
print(original[:5])
print(numeric[:5])
print(converted[:5])

['November 02, 1958', 'October 10, 1945', 'June 29, 1965', 'October 14, 1894', 'May 17, 1627']
['1958-11-02', '1945-10-10', '1965-06-29', '1894-10-14', '1627-05-17']
['1958-11-02', '1945-10-10', '1965-06-29', '1894-10-14', '1627-05-17']


In [23]:
val_accuracy = accuracy_score(numeric, converted)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 1.0


### Citations:
The following resources were referenced in creating this notebook:
* https://www.youtube.com/watch?v=QQEL7MC0u1E
* https://colab.research.google.com/drive/1GBC7eLlEM-HqKLUuMcFIQdVuYXzLoS_P?usp=sharing#scrollTo=5qk88CrMSq6E