https://www.datacamp.com/tutorial/building-a-transformer-with-py-torch

The code below follows the step-by-step tutorial linked above, which builds a basic Transformer model following the architecture from the 'Attention Is All You Need' research paper. The transformer is trained on the IMDB dataset from Hugging Face.

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt


In [2]:
from datasets import load_dataset

# Load the IMDB dataset through the Hugging Face `datasets` library.
# Later cells index it by split name: 'train', 'test' and 'unsupervised'.
imdb = load_dataset("imdb")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hyperparameters and runtime configuration shared by the cells below.
BATCH_SIZE = 32  # number of samples per DataLoader batch
LR = 5e-5  # learning rate handed to the Adam optimizer
EPOCHS = 10  # NOTE(review): the training cell defines its own `epochs = 1` and never reads this constant
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # run on the GPU when one is available
block_size = 512  # NOTE(review): only referenced by the commented-out get_batch helper

In [4]:
""" 
------------------ Preprocessing ------------------

1. Split data into training/validation datasets
2. Text preprocessing to get it into a nice format
    - Remove trailing whitespace, fix any encoding issues, and lowercase the text for uncased models (models that do not distinguish character casing)
3. Tokenize the data
    - pick tokenizers for the specific model or architecture being used
        - Word Pieces (BERT)
        - Byte-Pair encoding (GPT)
        - SentencePiece (T5)
4. Depending on the task, may need specific inputs from the tokenizer output
     - Classification: input ids, attention masks, label
     - translation: input ids, decoder input ids, labels
     - text generation: just input ids and maybe a prompt

5. Create a dataset or a data loader object (PyTorch) if the data is already in dataset form (for example, loaded from Hugging Face)
"""

# The dataset already ships pre-split. There is no dedicated validation split,
# so later cells use the 'test' split for validation.
# No extra cleaning is applied here; the raw text is tokenized as-is.

# Import the tokenizer loader from Hugging Face `transformers`.
from transformers import AutoTokenizer


# Pretrained "bert-base-uncased" tokenizer; its `vocab_size` is later used to
# size the model's embedding tables.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [5]:
tokenizer(imdb['train'][0]['text'])

# Returns a dict-like encoding of the first training review with:
#   input_ids: the integer ids representing the tokens in the text
#   token_type_ids: which sequence each token belongs to (matters when more than one sequence is encoded)
#   attention_mask: whether each position should be attended to or masked out
#                   (presumably 1 = real token, 0 = padding — confirm against the tokenizer docs)

{'input_ids': [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107, 2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007, 2014, 3689, 383

In [6]:
def tokenization(example):
    """Tokenize the ``"text"`` field of a dataset example (or batch of examples).

    Sequences are truncated and padded to a fixed length of 512 tokens so the
    results can later be stacked into equally sized batches.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(example["text"], **encode_options)


# Tokenize every split and expose the model inputs as torch tensors so the
# splits can be loaded straight into PyTorch DataLoaders.
_TENSOR_COLUMNS = ("input_ids", "attention_mask", "token_type_ids", "label")


def _tokenize_split(split_name):
    """Return the named IMDB split, tokenized and formatted as torch tensors."""
    split = imdb[split_name].map(tokenization, batched=True)
    split.set_format(type="torch", columns=list(_TENSOR_COLUMNS))
    return split


train_data = _tokenize_split('train')

# The dataset's 'test' split doubles as the validation set.
val_data = _tokenize_split('test')

# NOTE(review): the IMDB 'unsupervised' split is believed to be unlabeled
# (label == -1) — confirm before computing a loss or accuracy on test_data.
test_data = _tokenize_split('unsupervised')


Map: 100%|██████████| 25000/25000 [00:16<00:00, 1554.02 examples/s]
Map: 100%|██████████| 25000/25000 [00:15<00:00, 1638.31 examples/s]
Map: 100%|██████████| 50000/50000 [00:35<00:00, 1403.22 examples/s]


In [7]:

# Wrap each split in a DataLoader for mini-batching. Only the training split
# is reshuffled; validation and test batches keep the dataset order.
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=val_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# # data loading
# def get_batch(split):
#     # generate a small batch of data of inputs x and targets y
#     data = train_data if split == 'train' else val_data
#     ix = torch.randint(len(data) - block_size, (BATCH_SIZE,))
#     x = torch.stack([data[i:i+block_size] for i in ix])
#     y = torch.stack([data[i+1:i+block_size+1] for i in ix])
#     x, y = x.to(device), y.to(device)
#     return x, y

# Assuming trainloader is a PyTorch DataLoader



In [14]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each projected with their own linear
    layer, split into ``num_heads`` heads of size ``d_k = d_model // num_heads``,
    attended independently, then concatenated and projected once more.

    Args:
        d_model: Size of the model (embedding) dimension.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Each head gets an equal slice of the model dimension.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model  # Model's dimension
        self.num_heads = num_heads  # Number of attention heads
        self.d_k = d_model // num_heads  # Dimension of each head's key, query, and value

        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model)  # Query transformation
        self.W_k = nn.Linear(d_model, d_model)  # Key transformation
        self.W_v = nn.Linear(d_model, d_model)  # Value transformation
        self.W_o = nn.Linear(d_model, d_model)  # Output transformation

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            Q, K, V: Per-head tensors whose last two dimensions are
                (sequence length, ``d_k``).
            mask: Optional tensor broadcastable to the attention scores;
                positions where ``mask == 0`` receive (effectively) zero
                attention weight.
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Fill masked positions with the most negative finite value of the
            # score dtype. A hard-coded -1e9 is not representable in float16
            # and breaks half-precision / autocast runs.
            fill_value = torch.finfo(attn_scores.dtype).min
            attn_scores = attn_scores.masked_fill(mask == 0, fill_value)

        # Normalise over the key dimension to obtain attention probabilities.
        attn_probs = torch.softmax(attn_scores, dim=-1)

        # Weighted sum of the values.
        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: (batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() yields a non-contiguous tensor, so make it contiguous before view().
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Attend from ``Q`` over ``K``/``V`` and return a (batch, q_len, d_model) tensor."""
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads and apply output transformation
        return self.W_o(self.combine_heads(attn_output))

In [15]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network used after the attention sub-layers.

    Expands the last dimension from ``d_model`` to ``d_ff``, applies a ReLU,
    and projects back down to ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply expand -> ReLU -> project to ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [16]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Even embedding columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)``, where ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: Embedding dimension (may be even or odd).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()

        # Table of shape (max_seq_length, d_model), filled in below.
        pe = torch.zeros(max_seq_length, d_model)
        # Column vector of position indices 0 .. max_seq_length - 1.
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        # One frequency per even column index.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term

        # Sine goes to the even columns, cosine to the odd columns.
        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even columns,
        # so trim the angle table to fit instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer: part of the module's state (and moved by
        # .to(device)) but not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``."""
        return x + self.pe[:, :x.size(1)]

In [17]:
class EncoderLayer(nn.Module):
    """Single Transformer encoder block.

    Runs multi-head self-attention followed by the position-wise feed-forward
    network. Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        # Self-attention over the layer input, using `num_heads` heads.
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # Feed-forward network applied after the attention sub-layer.
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # One LayerNorm per sub-layer, applied after the residual connection.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Return the contextualised version of ``x``; ``mask`` is forwarded to the attention."""
        attended = self.self_attn(x, x, x, mask)
        normed = self.norm1(x + self.dropout(attended))
        transformed = self.feed_forward(normed)
        return self.norm2(normed + self.dropout(transformed))

In [18]:
class DecoderLayer(nn.Module):
    """Single Transformer decoder block.

    Three sub-layers run in sequence, each wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``:

    1. self-attention over the decoder input,
    2. cross-attention from the decoder state over the encoder output,
    3. the position-wise feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # One LayerNorm per sub-layer.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run one decoder block.

        Args:
            x: The input to the decoder layer.
            enc_output: Encoder output; supplies the keys and values for the
                cross-attention step.
            src_mask: Mask applied in cross-attention to ignore parts of the
                encoder output.
            tgt_mask: Mask applied in self-attention to ignore parts of the
                decoder input.

        Returns:
            The processed tensor produced by the final add-and-normalise step.
        """
        # 1) Self-attention on the target sequence, then add & normalise.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        state = self.norm1(x + self.dropout(self_attended))

        # 2) Cross-attention: queries from the decoder, keys/values from the encoder.
        cross_attended = self.cross_attn(state, enc_output, enc_output, src_mask)
        state = self.norm2(state + self.dropout(cross_attended))

        # 3) Feed-forward network, then the last add & normalise.
        transformed = self.feed_forward(state)
        return self.norm3(state + self.dropout(transformed))

In [44]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer with an optional encoder-only classification mode.

    In generation mode (``classification=False``) the full encoder-decoder
    stack is used and the output has ``tgt_vocab_size`` logits per target
    position. In classification mode only the encoder is run and the
    representation of the first token is mapped to ``num_classes`` logits.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, num_classes=None, classification=False):
        """
        Args:
            src_vocab_size: Source vocabulary size.
            tgt_vocab_size: Target vocabulary size.
            d_model: The dimensionality of the model's embeddings.
            num_heads: Number of attention heads in the multi-head attention mechanism.
            num_layers: Number of layers for both the encoder and the decoder.
            d_ff: Dimensionality of the inner layer in the feed-forward network.
            max_seq_length: Maximum sequence length for positional encoding.
            dropout: Dropout rate for regularization.
            num_classes: Number of output classes; required when ``classification`` is True.
            classification: When True, ``forward`` returns class logits computed
                from the encoder output instead of running the decoder.

        Raises:
            ValueError: If ``classification`` is True but ``num_classes`` is None.
        """
        super().__init__()
        # Fail early with a clear message instead of an obscure error from nn.Linear.
        if classification and num_classes is None:
            raise ValueError("num_classes must be provided when classification=True")

        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)  # Embedding layer for the source sequence.
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)  # Embedding layer for the target sequence.
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)  # Positional encoding component.

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])   # A list of encoder layers.
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])   # A list of decoder layers.
        self.classification = classification
        # The output head maps to class logits or to target-vocabulary logits.
        out_dim = num_classes if classification else tgt_vocab_size
        self.fc = nn.Linear(d_model, out_dim)
        self.dropout = nn.Dropout(dropout)  # Dropout layer

    def generate_mask(self, src, tgt=None):
        """Build the attention masks for the source and (optionally) target sequences.

        Token id 0 is treated as padding. The source mask hides padded source
        positions. The target mask combines the target padding mask with a
        lower-triangular ("no peek") mask so a position cannot attend to later
        positions.

        Returns:
            ``(src_mask, tgt_mask)``; ``tgt_mask`` is None when ``tgt`` is None.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        if tgt is None:
            return src_mask, None

        seq_length = tgt.size(1)
        # Build the causal mask on the same device as `tgt`; a CPU mask cannot
        # be combined with a CUDA padding mask.
        nopeak_mask = torch.tril(torch.ones(1, seq_length, seq_length, device=tgt.device)).bool()
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3) & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt=None):
        """Run the model.

        Args:
            src: Source token ids, shape (batch, src_len).
            tgt: Target token ids, shape (batch, tgt_len). Required in
                generation mode; ignored by the classification head.

        Returns:
            Classification mode: logits of shape (batch, num_classes).
            Generation mode: logits of shape (batch, tgt_len, tgt_vocab_size).

        Raises:
            ValueError: If ``tgt`` is None while the model is in generation mode.
        """
        if not self.classification and tgt is None:
            raise ValueError("tgt is required when classification=False")

        src_mask, tgt_mask = self.generate_mask(src, tgt)

        enc_output = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        if self.classification:
            # Classify from the encoder representation of the first token.
            return self.fc(enc_output[:, 0, :])

        # Generation: decode the target sequence against the encoder output.
        # (`tgt` is a tensor, so it is checked against None above; using its
        # truth value directly would raise for multi-element tensors.)
        dec_output = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc(dec_output)

In [None]:
""" 
------------------ Training Loop ------------------

Training is simple: pass each batch of data to the transformer, making sure the batch dimensions match what the model can handle.
Get the outputs, calculate the loss, backpropagate it, and step the optimizer to update the weights in the model.

The model outputs are logits; predicted labels are obtained from them (argmax) and can be used to compute accuracy and other metrics.
"""

# Parameters based on IMDB dataset
src_vocab_size = tokenizer.vocab_size  # 30522 for 'bert-base-uncased'
tgt_vocab_size = tokenizer.vocab_size  # Same as source for classification
d_model = 512  # Standard model dimension
num_heads = 8  # Multi-head attention heads
num_layers = 6  # Number of encoder/decoder layers
d_ff = 2048  # Feed-forward network size
max_seq_length = 512  # Max length of sequences
dropout = 0.1  # Dropout rate

# Resolve the compute device once so the model and every batch (training AND
# validation) are guaranteed to live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model (encoder-only classification head with 2 classes)
transformer_model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, num_classes=2, classification=True)
transformer_model.to(device)


# Set up the optimizer
optimizer = torch.optim.Adam(transformer_model.parameters(), lr=LR)
loss_fn = torch.nn.CrossEntropyLoss()

# ---------- Issues/to do  ---------- #

# The train loader and how this unpacks data
# the batching of the data to be trained and the forward pass parameters sent to the transformer
# Create a test loop too




import torch
import torch.nn.functional as F

# Training loop
epochs = 1
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(epochs):
    transformer_model.train()  # Set model to training mode (enables dropout)

    total_train_loss = 0
    total_train_correct = 0
    total_train_samples = 0

    # Get the batch of data from the training split data loader
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Only the token ids and labels are used; move them to the model's device
        src = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        # Forward pass: (batch, num_classes) logits
        outputs = transformer_model(src)

        # Calculate the loss
        train_loss = loss_fn(outputs, labels)
        total_train_loss += train_loss.item()

        # Get predicted labels (argmax over logits)
        predicted_labels = outputs.argmax(dim=1)

        # Accumulate accuracy statistics (correct predictions / total samples)
        total_train_correct += (predicted_labels == labels).sum().item()
        total_train_samples += labels.size(0)

        # Backward pass
        train_loss.backward()
        optimizer.step()

    # Calculate average training loss (mean of per-batch losses) and accuracy for this epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_accuracy = total_train_correct / total_train_samples

    # Now for validation
    transformer_model.eval()  # Set model to evaluation mode (disables dropout)
    total_val_loss = 0
    total_val_correct = 0
    total_val_samples = 0

    with torch.no_grad():  # Disable gradient calculation for validation
        for batch in val_dataloader:
            # Validation batches must be moved to the model's device too;
            # leaving them on the CPU fails as soon as the model is on CUDA.
            src = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = transformer_model(src)

            # Calculate the loss
            val_loss = loss_fn(outputs, labels)
            total_val_loss += val_loss.item()

            # Get predicted labels (argmax over logits)
            predicted_labels = outputs.argmax(dim=1)

            # Accumulate accuracy statistics (correct predictions / total samples)
            total_val_correct += (predicted_labels == labels).sum().item()
            total_val_samples += labels.size(0)

    # Calculate average validation loss and accuracy for this epoch
    avg_val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = total_val_correct / total_val_samples

    # Store the losses and accuracies
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    train_accuracies.append(train_accuracy)
    val_accuracies.append(val_accuracy)

    # Print epoch stats
    print(f'Epoch [{epoch+1}/{epochs}] | '
          f'Train Loss: {avg_train_loss:.4f} | Train Accuracy: {train_accuracy*100:.2f}% | '
          f'Val Loss: {avg_val_loss:.4f} | Val Accuracy: {val_accuracy*100:.2f}%')



In [None]:
# Function to plot the training metrics
def plot_metrics(train_losses, val_losses, train_accuracies, val_accuracies):
    """Plot per-epoch loss and accuracy curves side by side.

    Args:
        train_losses: Average training loss, one value per epoch.
        val_losses: Average validation loss, one value per epoch.
        train_accuracies: Training accuracy, one value per epoch.
        val_accuracies: Validation accuracy, one value per epoch.

    The x axis is numbered from epoch 1 and its length is taken from
    ``train_losses``. Every point is drawn with a marker so that a run with a
    single epoch still shows up (a marker-less line through one point is
    invisible).
    """
    epochs = range(1, len(train_losses) + 1)

    plt.figure(figsize=(12, 5))

    # Left panel: training & validation loss
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label="Training Loss", color='blue', marker='o')
    plt.plot(epochs, val_losses, label="Validation Loss", color='orange', marker='o')
    plt.title("Training and Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    # Right panel: training & validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, label="Training Accuracy", color='green', marker='o')
    plt.plot(epochs, val_accuracies, label="Validation Accuracy", color='red', marker='o')
    plt.title("Training and Validation Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()

    # Show the plot
    plt.tight_layout()
    plt.show()

# Render the loss and accuracy curves collected during training.
plot_metrics(
    train_losses=train_losses,
    val_losses=val_losses,
    train_accuracies=train_accuracies,
    val_accuracies=val_accuracies,
)