In [None]:
import numpy as np
import torch
import random
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
from models.embedding import *
from models.transformer import PyTorchTransformerEncoder


# Debugging aid: autograd records the forward op of every backward function and
# raises when a backward pass produces NaN. PyTorch documents this mode as slowing
# execution, so switch it off once the model trains without numerical problems.
torch.autograd.set_detect_anomaly(True)

# Reload imported modules automatically before each cell is executed, so edits to
# the local `models` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Use the GPU whenever PyTorch reports at least one CUDA device, otherwise the CPU
device = "cuda" if torch.cuda.device_count() else "cpu"
print("Device", device)

In [None]:
# --- Optimisation -----------------------------------------------------------
n_epochs = 10        # number of passes over the training loader
lr = 1e-4            # initial learning rate handed to AdamW
batch_size = 256     # size of a training mini-batch

# --- Data -------------------------------------------------------------------
max_length = 20      # sequences are truncated to this many token columns
ignore_index = 2     # token id used as padding index by the model and skipped by the loss

# --- Model ------------------------------------------------------------------
num_layers = 3
embedding_dim = 32
heads = 4
dropout = 0.1

In [None]:
# Read the raw token-id matrices. The context manager closes each .npz archive
# (and its file handle) as soon as the array has been read into memory.
with np.load("small_vocab_en.npz") as archive_en:
    sequences_en = archive_en["data"]
with np.load("small_vocab_fr.npz") as archive_fr:
    sequences_fr = archive_fr["data"]

# Vocabulary size = largest token id + 1, the same convention NPZSequencesDataset
# uses for vocab_in_size / vocab_out_size (assumes token ids are 0-based).
vocab_size_en = int(sequences_en.max()) + 1
vocab_size_fr = int(sequences_fr.max()) + 1

In [None]:
class NPZSequencesDataset(Dataset):
    """
    Paired source/target token sequences loaded from two ``.npz`` archives.

    Both arrays are read in full, truncated to ``max_length`` columns and split
    by row: the leading ``trainingset_size`` fraction forms the training split,
    the remaining rows form the held-out split.
    """
    def __init__(
        self,
        input_seqs_filename: str,
        output_seqs_filename: str,
        key="data",
        max_length=50,
        trainingset_size: float = 0.9,
        train=True
    ):
        """
        Load both archives and keep either the training or the held-out rows.

        :param input_seqs_filename: Path of the .npz file holding the source sequences
        :param output_seqs_filename: Path of the .npz file holding the target sequences
        :param key: Name of the array stored inside both archives
        :param max_length: Maximum number of columns kept per sequence
        :param trainingset_size: Fraction of rows assigned to the training split
        :param train: If True keep the leading training rows, otherwise the trailing rows
        :raises ValueError: If the two archives hold a different number of rows
        """
        # Context managers close the archives once the arrays are in memory
        with np.load(input_seqs_filename) as archive:
            source_seqs = archive[key][:, :max_length]
        with np.load(output_seqs_filename) as archive:
            target_seqs = archive[key][:, :max_length]

        # Validate BEFORE splitting: slicing both arrays with one shared limit can
        # yield equally long splits even when the files themselves do not match.
        if source_seqs.shape[0] != target_seqs.shape[0]:
            raise ValueError("Number of samples of source and target sequences must be equal")

        # The split point comes from the data this instance loaded, not from any
        # notebook-level variable, so the class works for arbitrary files.
        limit = int(source_seqs.shape[0] * trainingset_size)
        self.source_seqs = source_seqs[:limit] if train else source_seqs[limit:]
        self.target_seqs = target_seqs[:limit] if train else target_seqs[limit:]

        # Largest token id in the kept split plus one, stored as plain Python ints
        self.vocab_in_size = int(self.source_seqs.max()) + 1
        self.vocab_out_size = int(self.target_seqs.max()) + 1

    def __len__(self):
        """
        Return the number of sequence pairs in this split.

        :return: Number of rows of the source array (equal to that of the target array)
        """
        return self.source_seqs.shape[0]

    def __getitem__(self, idx):
        """
        Return one source/target pair.

        :param idx: Row index of the pair inside this split
        :return: Tuple ``(source_sequence, target_sequence)`` of numpy arrays
        """
        return self.source_seqs[idx], self.target_seqs[idx]

In [None]:
# Training split: reshuffled on every pass; the incomplete final batch is dropped
# so that every training batch holds exactly `batch_size` sequences.
dataset_train = NPZSequencesDataset("small_vocab_en.npz", "small_vocab_fr.npz", max_length=max_length)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, drop_last=True)

# Held-out split: evaluation must see every sample exactly once, so nothing is
# dropped and no shuffling is applied (drop_last=True would silently discard up
# to 63 samples, or all of them if the split is smaller than one batch).
dataset_test = NPZSequencesDataset("small_vocab_en.npz", "small_vocab_fr.npz", max_length=max_length, train=False)
dataloader_test = DataLoader(dataset_test, batch_size=64, shuffle=False, drop_last=False)

# Fetch a single training batch for the shape and loss sanity checks further down
input_seqs, target_seqs = next(iter(dataloader_train))
input_seqs = input_seqs.to(device)
target_seqs = target_seqs.to(torch.long).to(device)
input_seqs.shape, target_seqs.shape

In [None]:
load_from_checkpoint = False
checkpoint_file = "transformer_temp2.pt"

# Transformer model; vocabulary sizes are taken from the training split
model = PyTorchTransformerEncoder(
    embedding_type=EmbeddingType.POS_LEARNED,
    src_vocab_size=dataset_train.vocab_in_size,
    trg_vocab_size=dataset_train.vocab_out_size,
    embedding_dim=embedding_dim,
    num_layers=num_layers,
    heads=heads,
    dropout=dropout,
    src_pad_idx=ignore_index,
    trg_pad_idx=ignore_index,
    device=device
).to(device)

# Optimizer over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 after every scheduler step.
# `step_size` is documented as an int, so pass 1 rather than the float 1.0.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Loss function; targets equal to `ignore_index` do not contribute.
# NOTE(review): NLLLoss expects log-probabilities — assumes the model ends in a
# log-softmax; confirm in models.transformer.
criterion = torch.nn.NLLLoss(ignore_index=ignore_index)

# Optionally restore model and optimizer state from a checkpoint
if load_from_checkpoint:
    # NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_file, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# Run the feature sequences of the sample batch through the model.
# The last token position of every source sequence is sliced off, so the model
# receives one position fewer than the loaded sequences contain.
# NOTE(review): the second positional argument is passed as None; its meaning
# (presumably a mask) is defined in models.transformer — confirm there.
output = model(input_seqs[:, :-1], None)

In [None]:
# Best-scoring class per sequence position: topv holds the scores, topi the class ids
topv, topi = torch.topk(output, k=1, dim=2)
output.shape, topi.shape, topv.shape

In [None]:
# Sanity check of the loss on the sample batch.
# Output position i is scored against target token i+1, the same alignment the
# training and evaluation loops use.
n_positions = min(output.size(1), target_seqs.size(1) - 1)

# Start from a tensor so that `.item()` below works even if every term is skipped
loss = torch.zeros((), device=output.device)
for i in range(n_positions):
    _loss = criterion(output[:, i, :], target_seqs[:, i + 1])
    # The per-position loss can be NaN (e.g. when every target at this position
    # equals the ignored index); such positions are left out of the sum.
    if not _loss.isnan():
        loss = loss + _loss

# Average over the positions that were actually summed
loss.item() / max(n_positions, 1)

In [None]:
history = []       # summed per-position loss of every training batch
accuracies = []    # token-level accuracy (0..1) of every training batch
print_every = 1    # log after every `print_every`-th epoch

model.train()

for epoch in range(n_epochs):
    ##############################
    #    TRANSFORMER TRAINING    #
    ##############################

    # Per-epoch statistics used for the log line at the end of the epoch
    epoch_losses = []
    epoch_accuracies = []

    # Get a batch of training data
    for input_seqs, target_seqs in dataloader_train:
        # Set gradients of all model parameters to zero
        optimizer.zero_grad()

        input_seqs = input_seqs.to(device)
        target_seqs = target_seqs.to(torch.long).to(device)

        # Run the input sequences through the model; output is indexed as
        # (batch, position, class) below
        output = model(input_seqs[:, :-1], None)

        # Output position i is scored against target token i+1
        n_positions = min(output.size(1), target_seqs.size(1) - 1)

        # Sum the loss over all sequence positions
        loss = torch.zeros((), device=output.device)
        for i in range(n_positions):
            _loss = criterion(output[:, i, :], target_seqs[:, i + 1])
            # The per-position loss can be NaN (e.g. when every target at this
            # position equals the ignored index); skip those positions.
            if not _loss.isnan():
                loss = loss + _loss

        # Token-level accuracy: correct predictions / non-padding target tokens
        with torch.no_grad():
            predictions = output[:, :n_positions, :].argmax(dim=-1)
            targets = target_seqs[:, 1:n_positions + 1]
            valid = targets != ignore_index
            n_valid = int(valid.sum())
            n_correct = int((predictions[valid] == targets[valid]).sum())
        accuracy = n_correct / n_valid if n_valid else 0.0

        history.append(loss.item())
        accuracies.append(accuracy)
        epoch_losses.append(loss.item() / max(n_positions, 1))
        epoch_accuracies.append(accuracy)

        ######################
        #   WEIGHTS UPDATE   #
        ######################

        # Compute gradient
        loss.backward()

        # Update the model weights
        optimizer.step()

    # Log once per epoch. A separate name is used for the current learning rate
    # so the `lr` hyperparameter of the notebook is not overwritten.
    if not epoch % print_every and epoch_losses:
        current_lr = scheduler.get_last_lr()[0]
        mean_loss = sum(epoch_losses) / len(epoch_losses)
        mean_accuracy = sum(epoch_accuracies) / len(epoch_accuracies)
        print(f"LOSS after epoch {epoch}", mean_loss, "LR", current_lr, "ACCURACY", mean_accuracy)

    # Adjust the learning rate
    scheduler.step()

In [None]:
model.eval()  # Set the model to evaluation mode
total_correct = 0   # correctly predicted non-padding target tokens
total_samples = 0   # all non-padding target tokens seen

with torch.no_grad():
    # Evaluate on the held-out split, not on the training data
    for input_seqs, target_seqs in dataloader_test:
        # Move batch data to the device; dropping the first target column makes
        # output position i line up with target column i
        input_seqs = input_seqs.to(device)
        target_seqs = target_seqs[:, 1:].to(torch.long).to(device)

        # Forward pass, called exactly as in the training loop
        outputs = model(input_seqs[:, :-1], None)

        # Predicted class per position; outputs is indexed as (batch, position, class)
        predictions = outputs.argmax(dim=-1)

        # Compare only positions present in both tensors and ignore padding targets
        n_positions = min(predictions.size(1), target_seqs.size(1))
        predictions = predictions[:, :n_positions]
        targets = target_seqs[:, :n_positions]
        valid = targets != ignore_index

        total_correct += int((predictions[valid] == targets[valid]).sum())
        total_samples += int(valid.sum())

# Compute the accuracy (NaN if the loader produced no non-padding tokens)
accuracy = total_correct / total_samples if total_samples else float("nan")

# Print the accuracy
print(f"Accuracy on the test dataset: {accuracy:.4f}")