# Assigment 4, Steve Veldman, 7/12/2024
(This notebook is part 1, containing my answer to question 1 of this assignment)

Question 1: 60 points \
Finish Transformer implementation in PyTorch that was left unfinished:
* Implement Decoder
* Connect Decoder and Encoder
* Try to train it on some translation task

In [1]:
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
#from transformers import AutoTokenizer
from torch import nn
from transformers import AutoConfig
import torch
from math import sqrt
import torch.nn.functional as F
from torch.utils.data import Dataset
import torch.optim as optim
from sklearn.metrics import accuracy_score

## Housekeeping: Import Data, Create Vocabularies and Encodings, Instantiate Dataset/Dataloader:
For this problem, I will use the Date Translation dataset that I generated for assignment 1 & 2.

In [4]:
# Implement Custom "Pairs" Class to Capture Data as Pairs for Establishing Vocabulary and Tokens:
class Pairs(Dataset):
  def __init__(self, data_file):
      self.data = pd.read_csv(data_file)

  def __len__(self):
      return len(self.data)

  def __getitem__(self, idx):
      date_numeric = self.data.iloc[idx,0]
      date_original = self.data.iloc[idx,1]
      return date_original, date_numeric

# Load Data as Pairs:
train_pairs = Pairs('dates_train.csv')
val_pairs = Pairs('dates_val.csv')

In [5]:
train_pairs[1][0]

'June 11, 1820'

In [6]:
train_pairs[1][1]

'1820-06-11'

In [7]:
# Create Vocabularies:
numeric_vocab = set()
original_vocab = set()

for i in range(len(train_pairs)):
    numeric_vocab.update(list(train_pairs[i][1]))
    original_vocab.update(train_pairs[i][0].split())

# Add Start of Sequence Token ("$") and End of Sequence Token ("#"):
numeric_vocab.update(["$","#","<PAD>"])
original_vocab.update(["$","#","<PAD>"])

# Creating character/word to token mapping:
orig_word2token = {word: i for i, word in enumerate(original_vocab)}
num_char2token = {char: i for i, char in enumerate(numeric_vocab)}

# Creating token to character/word mapping
orig_token2word = {i: word for word, i in orig_word2token.items()}
num_token2char = {i: char for char, i in num_char2token.items()}

print("Numeric vocabulary size:", len(numeric_vocab))
print("Original vocabulary size:", len(original_vocab))

Numeric vocabulary size: 14
Original vocabulary size: 489


In [8]:
# Implement Custom Dataset Class for Translation:
class DateTranslationDataset(Dataset):
    def __init__(self, pairs, orig_word2token, num_char2token):
        self.pairs = pairs
        self.orig_word2token = orig_word2token
        self.num_char2token = num_char2token

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        orig, num = self.pairs[idx]
        orig_tensor = torch.tensor([self.orig_word2token[word] for word in orig.split()]
                                  + [self.orig_word2token['#']], dtype=torch.long)
        num_tensor = torch.tensor([self.num_char2token[char] for char in list(num)]
                                  + [self.num_char2token['#']], dtype=torch.long)
        return orig_tensor, num_tensor

In [9]:
# Create train and test datasets and DataLoader
train_dataset = DateTranslationDataset(train_pairs, orig_word2token, num_char2token)
val_dataset = DateTranslationDataset(val_pairs, orig_word2token, num_char2token)

batch_size = 73
translation_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  drop_last=True)

print("Translation samples: ", len(train_dataset))
print("Translation batches: ", len(translation_dataloader))

Translation samples:  129356
Translation batches:  1772


In [10]:
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,  drop_last=True)

In [11]:
# Create Custom Config Parameters:
class Config():
    def __init__(self, hidden_size, num_attention_heads, num_hidden_layers, input_vocab_size, output_vocab_size):
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size
        self.intermediate_size = hidden_size*4
        self.hidden_dropout_prob = 0.1
        self.input_max_position_embeddings = 5
        self.output_max_position_embeddings = 12

config = Config(hidden_size=128, num_attention_heads=4, num_hidden_layers=2, input_vocab_size=len(original_vocab), output_vocab_size=len(numeric_vocab))

## The Encoder

#### Scaled dot-product attention
Note: In order to make sure tensor shapes and other outputs were correct, I followed the steps from the example code even when not strictly necessary for defining the functions/classes.

In [12]:
def tokenize_input(input_date):
  input_ids = [orig_word2token[word] for word in input_date.split()]
  return torch.tensor(input_ids).unsqueeze(0)

In [13]:
input_date = "June 25, 2024"

In [14]:
input_ids = tokenize_input(input_date)
input_ids

tensor([[440, 348, 281]])

In [15]:
encoder_token_emb = nn.Embedding(config.input_vocab_size, config.hidden_size)
encoder_token_emb

Embedding(489, 128)

In [16]:
inputs_embeds = encoder_token_emb(input_ids)
inputs_embeds.size()

torch.Size([1, 3, 128])

In [17]:
query = key = value = inputs_embeds
dim_k = key.size(-1)
scores = torch.bmm(query, key.transpose(1,2)) / sqrt(dim_k)
scores.size()

torch.Size([1, 3, 3])

In [18]:
weights = F.softmax(scores, dim=-1)
weights.sum(dim=-1)

tensor([[1., 1., 1.]], grad_fn=<SumBackward1>)

In [19]:
attn_outputs = torch.bmm(weights, value)
attn_outputs.shape

torch.Size([1, 3, 128])

In [20]:
def scaled_dot_product_attention(query, key, value):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

#### Multi-headed attention

In [21]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

In [22]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

In [23]:
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(inputs_embeds)
attn_output.size()

torch.Size([1, 3, 128])

### The Feed-Forward Layer

In [24]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

In [25]:
feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_outputs)
ff_outputs.size()

torch.Size([1, 3, 128])

### Adding Layer Normalization

In [26]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Apply layer normalization and then copy input into query, key, value
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection
        x = x + self.attention(hidden_state)
        #print("Shape of x after self-attention:", x.shape)
        # Apply feed-forward layer with a skip connection
        x = x + self.feed_forward(self.layer_norm_2(x))
        #print("Shape of x after feed-forward:", x.shape)
        return x

In [27]:
encoder_layer = TransformerEncoderLayer(config)
inputs_embeds.shape, encoder_layer(inputs_embeds).size()

(torch.Size([1, 3, 128]), torch.Size([1, 3, 128]))

### Positional Embeddings

In [28]:
class Enc_Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.input_vocab_size,
                                             config.hidden_size)
        self.position_embeddings = nn.Embedding(config.input_max_position_embeddings,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        # Create position IDs for input sequence
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [29]:
embedding_layer = Enc_Embeddings(config)
embedding_layer(input_ids).size()

torch.Size([1, 3, 128])

In [30]:
class TransformerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Enc_Embeddings(config)
        self.layers = nn.ModuleList([TransformerEncoderLayer(config)
                                     for _ in range(config.num_hidden_layers)])

    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x

In [31]:
encoder = TransformerEncoder(config)
encoder(input_ids).size()

torch.Size([1, 3, 128])

## The Decoder

In [32]:
#seq_len = input_ids.size(-1)
mask = torch.tril(torch.ones(10, 10)).unsqueeze(0)
mask[0]

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [33]:
#scores.masked_fill(mask == 0, -float("inf"))

In [34]:
def masked_scaled_dot_product_attention(query, key, value):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights.bmm(value)

In [35]:
class MaskedAttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = masked_scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

In [36]:
class MaskedMultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [MaskedAttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

In [37]:
class CrossAttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state, enc_output):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(enc_output), self.v(enc_output))
        return attn_outputs

In [38]:
class MultiHeadCrossAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [CrossAttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state, enc_output):
        x = torch.cat([h(hidden_state, enc_output) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

In [39]:

class TransformerDecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_3 = nn.LayerNorm(config.hidden_size)
        self.self_attention = MaskedMultiHeadAttention(config)
        self.self_enc_dec_attention = MultiHeadCrossAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x, enc_output):
        # Apply layer normalization and then copy input into query, key, value
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection
        x = x + self.self_attention(hidden_state)
        #print("Shape of x after self-attention:", x.shape)
        # Apply cross-attention
        x = x + self.self_enc_dec_attention(self.layer_norm_2(x), enc_output)
        #print("Shape of x after cross-attention:", x.shape)
        # Apply feed-forward layer with a skip connection
        x = x + self.feed_forward(self.layer_norm_3(x))
        #print("Shape of x after feed-forward:", x.shape)
        return x


In [40]:
class Dec_Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.output_vocab_size,
                                             config.hidden_size)
        self.position_embeddings = nn.Embedding(config.output_max_position_embeddings,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        # Create position IDs for input sequence
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [41]:
class TransformerDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Dec_Embeddings(config)
        self.layers = nn.ModuleList([TransformerDecoderLayer(config)
                                     for _ in range(config.num_hidden_layers)])

    def forward(self, x, encoder_output):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x, encoder_output)
        return x

In [42]:
class TransformerSeq2Seq(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.decoder = TransformerDecoder(config)

    def forward(self, encoder_input_ids, decoder_input_ids):
        encoder_output = self.encoder(encoder_input_ids)
        decoder_output = self.decoder(decoder_input_ids, encoder_output)
        return decoder_output

    def train_transformer(self, dataloader, optimizer, criterion, epochs=10):
      for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (src, tgt) in enumerate(dataloader):
          optimizer.zero_grad()
          output = self(src, tgt[:, :-1]) # Exclude last token in target

          # Reshape output to [batch_size * seq_len, vocab_size]
          output = output.view(-1, output.size(-1))

          # Reshape target to [batch_size * seq_len]
          tgt = tgt[:, 1:].reshape(-1)

          #loss = criterion(output.reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1)) # Shift target by one position
          loss = criterion(output, tgt)

          loss.backward()
          optimizer.step()
          total_loss += loss.item()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(dataloader)}')

## Instantiate the Transformer and Train:

In [43]:
transformer = TransformerSeq2Seq(config)

In [44]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

TransformerSeq2Seq(
  (encoder): TransformerEncoder(
    (embeddings): Enc_Embeddings(
      (token_embeddings): Embedding(489, 128)
      (position_embeddings): Embedding(5, 128)
      (layer_norm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (layer_norm_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (layer_norm_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attention): MultiHeadAttention(
          (heads): ModuleList(
            (0-3): 4 x AttentionHead(
              (q): Linear(in_features=128, out_features=32, bias=True)
              (k): Linear(in_features=128, out_features=32, bias=True)
              (v): Linear(in_features=128, out_features=32, bias=True)
            )
          )
          (output_linear): Linear(in_features=128, out_features=128, bias=True)
        )
        (feed_forward): FeedFo

In [45]:
transformer.train_transformer(translation_dataloader, optimizer, criterion, epochs=5)

Epoch 1/5, Loss: 0.7982889625409925
Epoch 2/5, Loss: 0.3513216957887193
Epoch 3/5, Loss: 0.15056730914255498
Epoch 4/5, Loss: 0.06379561357380945
Epoch 5/5, Loss: 0.033492572245589596


In [51]:
transformer.train_transformer(translation_dataloader, optimizer, criterion, epochs=5)

Epoch 1/5, Loss: 0.020468875809029192
Epoch 2/5, Loss: 0.01398254228972895
Epoch 3/5, Loss: 0.010390567351708138
Epoch 4/5, Loss: 0.008132720167682144
Epoch 5/5, Loss: 0.006571304087526308


In [57]:
transformer.train_transformer(translation_dataloader, optimizer, criterion, epochs=5)

Epoch 1/5, Loss: 0.005512983855026609
Epoch 2/5, Loss: 0.004839316803090553
Epoch 3/5, Loss: 0.003949055943161322
Epoch 4/5, Loss: 0.003743307198947273
Epoch 5/5, Loss: 0.003215595563496153


In [63]:
transformer.train_transformer(translation_dataloader, optimizer, criterion, epochs=5)

Epoch 1/5, Loss: 0.0030687263382065265
Epoch 2/5, Loss: 0.002789252189441225
Epoch 3/5, Loss: 0.0023378077679537482
Epoch 4/5, Loss: 0.002250565965013703
Epoch 5/5, Loss: 0.002056198958536952


In [64]:
def predict_and_target(transformer, dataloader):
  all_targets = []
  all_predictions = []

  for batch_idx, (src, tgt) in enumerate(dataloader):
    # Use the model to generate predictions
    with torch.inference_mode():
      output = transformer(src, tgt[:, :-1]) # Exclude last token in target
    # Get the predicted token IDs
    predicted_ids = torch.argmax(output, dim=-1)

    # Convert predicted and target tokens to strings
    #targets = [''.join(str(token_id.item()) for token_id in seq) for seq in tgt[:, 1:-1]]
    #predictions = [''.join(str(token_id.item()) for token_id in seq) for seq in predicted_ids]

    all_targets.extend(tgt[:, 1:-1].tolist())
    all_predictions.extend(predicted_ids[:, :-1].tolist())

  return all_targets, all_predictions

In [65]:
targets, predictions = predict_and_target(transformer, translation_dataloader)

print(targets[:3])
print(predictions[:3])

[[0, 10, 5, 7, 10, 9, 7, 3, 6], [12, 6, 12, 7, 9, 3, 7, 3, 12], [5, 10, 13, 7, 10, 6, 7, 9, 5]]
[[12, 10, 5, 7, 10, 9, 7, 3, 6], [12, 6, 12, 7, 9, 3, 7, 3, 12], [5, 10, 13, 7, 10, 6, 7, 9, 5]]


In [66]:
target_strings = [''.join(map(str, target)) for target in targets]
prediction_strings = [''.join(map(str, prediction)) for prediction in predictions]

In [67]:
train_accuracy = accuracy_score(target_strings, prediction_strings)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.5076146448560561


In [68]:
val_targets, val_predictions = predict_and_target(transformer, validation_dataloader)
val_target_strings = [''.join(map(str, target)) for target in val_targets]
val_prediction_strings = [''.join(map(str, prediction)) for prediction in val_predictions]

val_accuracy = accuracy_score(val_target_strings, val_prediction_strings)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.5109001515198367


While the training and test accuracy is currently at ~50%, this calcualtion is being done based on the percentage of dates that it is getting completely correct. From examining a small sample, it looks like the model is consistently getting the majority of these