In [1]:
# imports
import numpy as np
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader

from d_utils import *
from torchinfo import summary


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [2]:
# hyper parameters
learning_rate = 1e-3
BATCH_SIZE = 64
EPOCHS = 500
dataset_size = 30000
# max length on input sequence
Tx = 30
# length of output sequence
Ty = 10

PATIENCE = 100
model_name = f"noise_{EPOCHS}_{learning_rate}.pth"


In [3]:
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(dataset_size)

100%|██████████████████████████████████████████████████████████████████████████| 30000/30000 [00:01<00:00, 28205.79it/s]


In [4]:
dataset[:10]

[('wednesday 28 novemBeR1 962', '1962-11-28'),
 ('Mon JuL 15 1901', '1901-07-15'),
 ('8 aUG  2024', '2024-08-08'),
 ('04.02.45', '1945-02-04'),
 ('558019', '1955-08-19'),
 ('03-30-29', '1929-03-30'),
 ('19 85 JUne', '1985-06-19'),
 ('sUnDay 1 juen 1924', '1924-06-01'),
 ('mOnday octOber 7 1918', '1918-10-07'),
 ('16 maY 1965', '1965-05-16')]

In [5]:
X_org, Y_org, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)
# Xoh and Yoh is one hot representation X and Y 

In [6]:
Xoh = torch.tensor(Xoh, dtype=torch.float32)
Y = torch.tensor(Y_org, dtype=torch.long)


  Y = torch.tensor(Y_org, dtype=torch.long)


In [7]:
class dynamic_dataset(Dataset):
    def __init__(self, dataset_size):
        self.dataset_size = dataset_size
        
    def __len__(self):
        return self.dataset_size

    def __getitem__(self, idx):
        X, Y, _ = load_date()
        X = torch.tensor(string_to_int(X, Tx, human_vocab))
        Y = torch.tensor(string_to_int(Y, Ty, machine_vocab))

        Xoh = np.array(list(map(lambda x: one_hot(x, num_classes=len(human_vocab)), X))).astype('float64')
        
        return torch.tensor(Xoh, dtype=torch.float32).to(device), torch.tensor(Y, dtype=torch.long).to(device)

In [8]:
train_data = dynamic_dataset(dataset_size)

data_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
n_a = 32         # hidden units for pre_attention_lstm/encoder
n_s = 64         # hidden units for post_attention_lstm/decoder

In [10]:
# repeat vector for attention mechanism
class RepeatVector(nn.Module):
    def __init__(self, n):
        super(RepeatVector, self).__init__()
        self.n = n

    def forward(self, x):
        # x: (batch_size, features)
        return x.unsqueeze(1).repeat(1, self.n, 1)

In [11]:
class modelf(nn.Module):
    def __init__(self, Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
        super(modelf, self).__init__()

        self.Tx = Tx
        self.Ty = Ty
        self.n_a = n_a
        self.n_s = n_s
        self.human_vocab_size = human_vocab_size
        self.machine_vocab_size = machine_vocab_size
        
        # one_step_attention layers:
        self.repeator = RepeatVector(self.Tx)
        repeator = RepeatVector(self.Tx)
        self.linear1 = nn.Linear((2*self.n_a) + self.n_s, 10)
        self.linear2 = nn.Linear(10, 1)
        
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        # forward layers
        self.pre_attention_lstm = nn.LSTM(input_size=self.human_vocab_size, hidden_size=self.n_a, batch_first=True, bidirectional=True)
        self.post_attention_lstm = nn.LSTM(2* self.n_a , self.n_s, batch_first=True)
        self.output_layer = nn.Linear(self.n_s, self.machine_vocab_size)
        self.softmax_main = nn.Softmax(dim=-1)
    
    def one_step_attention(self, a, s_prev):
        # this attention mechnism for lstm
        
        # add's dimenison 1 making (1,2) to (1, 3, 2)
        s_prev = self.repeator(s_prev)
        concat = torch.cat([a, s_prev],dim=-1)
       
        e = self.linear1(concat)
        e = self.tanh(e)
        
        energies = self.linear2(e)
        energies = self.relu(energies)
        # softmax on dimension 1
        alphas = self.softmax(energies)
        # this is dot product
        context = torch.sum(alphas*a, dim=1, keepdim=True)
        
        return context


    def forward(self, X):
        
        outputs = []
        
        a, _ = self.pre_attention_lstm(X)

        batch_size = X.shape[0]
        s = torch.zeros(batch_size, n_s).to(device)
        c = torch.zeros(batch_size, n_s).to(device)

        for t in range(self.Ty):
            # one setp attention
            context = self.one_step_attention(a, s)
        
            _, (s, c) =  self.post_attention_lstm(context, (s.unsqueeze(0), c.unsqueeze(0)))
            s, c = s.squeeze(0), c.squeeze(0)
            
            out = self.output_layer(s)
            
            outputs.append(out)

        
        outputs = torch.stack(outputs, dim=1)
        return outputs

        
    

In [12]:
human_vocab_size = len(human_vocab)
machine_vocab_size = len(machine_vocab)
model = modelf(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size).to(device)


In [13]:
summary(model , input_size = Xoh.shape)

Layer (type:depth-idx)                   Output Shape              Param #
modelf                                   [30000, 10, 11]           --
├─LSTM: 1-1                              [30000, 30, 64]           24,064
├─RepeatVector: 1-2                      [30000, 30, 64]           --
├─Linear: 1-3                            [30000, 30, 10]           1,290
├─Tanh: 1-4                              [30000, 30, 10]           --
├─Linear: 1-5                            [30000, 30, 1]            11
├─ReLU: 1-6                              [30000, 30, 1]            --
├─Softmax: 1-7                           [30000, 30, 1]            --
├─LSTM: 1-8                              [30000, 1, 64]            33,280
├─Linear: 1-9                            [30000, 11]               715
├─RepeatVector: 1-10                     [30000, 30, 64]           --
├─Linear: 1-11                           [30000, 30, 10]           (recursive)
├─Tanh: 1-12                             [30000, 30, 10]        

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)



In [None]:
best_loss = float('inf')
patience_counter = 0
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for inputs, labels in data_loader:
        optimizer.zero_grad()
        out = model(inputs)

        output_flat = out.reshape(-1, out.shape[-1])
        labels_flat = labels.reshape(-1)

        loss = criterion(output_flat, labels_flat)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(out, dim=-1)
        correct += (preds == labels).all(dim=1).sum().item()  # full‐sequence matches
        total += labels.size(0)
        
    avg_loss = total_loss / data_loader.__len__()
    
    if epoch % 50 == 0 or epoch == EPOCHS-1:
        print(f"epoch:{epoch}, loss:{round(total_loss/1000, 4)}, accuracy: {correct/total*100:.4f}%")
        for param_group in optimizer.param_groups:
            print(f"Learning rate: {param_group['lr']}")

    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
        torch.save(model.state_dict(), f"dynamic_models/{model_name}")
    



  return torch.tensor(Xoh, dtype=torch.float32).to(device), torch.tensor(Y, dtype=torch.long).to(device)


epoch:0, loss:0.6919, accuracy: 0.0033%
Learning rate: 0.001


In [None]:
torch.save(model.state_dict(), f"dynamic_models/final_{model_name}")

In [None]:
def translate(model, sentence):
    model.eval()
    x_enc = string_to_int(sentence, Tx, human_vocab)
    x_enc = np.array(list(map(lambda x: np.eye(len(human_vocab))[x], x_enc)))
    X_tensor = torch.tensor(x_enc, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(X_tensor)
        if isinstance(outputs, list):
            outputs = torch.stack(outputs, dim=1)
        preds = torch.argmax(outputs, dim=-1).squeeze(0).cpu().numpy()
    return ''.join(int_to_string(preds, inv_machine_vocab))


In [None]:
ans = translate(model, "19 3 2003")
ans

In [None]:
examples = ['3 May 1979', '5 Apr 09', '20th Feb 2016', 'Wed 10 Jul 2007', '30 SEPT 2027', '1 jnu 2030']
for example in examples:
    print(f"Input: {example} -> Output: {translate(model, example)}")