https://www.datacamp.com/tutorial/building-a-transformer-with-py-torch

The code below follows the step-by-step tutorial linked above, which builds a basic transformer model following the architecture described in the 'Attention Is All You Need' research paper. The transformer is trained on the IMDB dataset from Hugging Face.

In [1]:
""" 
Still have issues with the training loop and data handling. When running training, nothing is output, perhaps due to going through a batch with the 
training data loader being too big, not sure. 

Check out the YT video currently being watched to try and change training, or use Hugging Face. Would like to write at least one full training loop; the issue is most likely in how the data is being 
handled and batched.

"""

' \nStill have issues with the training loop and data handling. When running training, nothing is output, perhaps due to going through a batch with the \ntraining data loader being too big, not sure. \n\nCheck out the YT video currently being watched to try and change training, or use Hugging Face. Would like to write at least one full training loop; the issue is most likely in how the data is being \nhandled and batched.\n\n'

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt


In [3]:
from datasets import load_dataset

# Load the "imdb" dataset through Hugging Face `datasets`; later cells index the
# result by split name (e.g. imdb['train']) and read each example's 'text' field.
imdb = load_dataset("imdb")


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Training configuration shared by the cells below.
BATCH_SIZE = 32  # batch size handed to the data loaders
LR = 5e-5        # learning rate given to the optimizer
EPOCHS = 10

# NOTE(review): presumably the maximum sequence length in tokens (it matches
# max_seq_length used when the model is built) -- confirm.
block_size = 512

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [5]:
""" 
------------------ Preprocessing ------------------

1. Split data into training/validation datasets
2. Text preprocessing to get it into a nice format
    - Remove trailing whitespace, fix any encoding issues, lowercase the text for uncased models that ignore the casing of characters
3. Tokenize the data
    - pick the tokenizer for the specific model or architecture being used
        - WordPiece (BERT)
        - Byte-Pair Encoding (GPT)
        - SentencePiece (T5)
4. Depending on the task, may need specific inputs from the tokenizer output
     - Classification: input ids, attention masks, label
     - translation: input ids, decoder input ids, labels
     - text generation: just input ids and maybe a prompt

5. Create a dataset or a data loader object (PyTorch) if already in dataset form (example loaded from Hugging Face)
"""

# Dataset already split into training data, no validation for this small dataset
# No need to remove or clean dataset as it is already fine from hugging face

# Import tokenizer from huggingface
from transformers import AutoTokenizer


# Load the pretrained tokenizer published under the name "distilbert-base-uncased".
# Later cells call it on raw text and read its `vocab_size` when sizing the model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
# Tokenize the first training review to inspect what the tokenizer returns.
tokenizer(imdb['train'][0]['text'])

# The returned mapping contains:
#   input_ids: the integer ids representing the tokens in the text
#   token_type_ids: which sequence each token belongs to, when there is more than one
#     NOTE(review): the output captured below is truncated, so it is unclear whether
#     this tokenizer actually returns this field -- confirm.
#   attention_mask: whether each position should be attended to or masked out

{'input_ids': [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107, 2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007, 2014, 3689, 383

In [None]:
# def tokenization(example):
#     # Tokenizes the dataset, typically works with 'text' field for input text
#     return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)


# # Create the mappings and format for the data splits so they can be loaded into a PyTorch data loader
# train_data = imdb['train'].map(tokenization, batched=True)
# train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"])

# val_data = imdb['test'].map(tokenization, batched=True)
# val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"])

# test_data = imdb['unsupervised'].map(tokenization, batched=True)
# test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"])

# # Define a DataLoader for batching during training
# train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE)
# test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE)


def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples, with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Run the tokenizer over the dataset in batched mode.
tokenized_imdb = imdb.map(preprocess_function, batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map: 100%|██████████| 25000/25000 [00:14<00:00, 1745.08 examples/s]
Map: 100%|██████████| 25000/25000 [00:15<00:00, 1611.14 examples/s]
Map: 100%|██████████| 50000/50000 [00:31<00:00, 1584.99 examples/s]


In [None]:
from transformers import DataCollatorWithPadding
import evaluate


# Collator built from the tokenizer so each batch can be padded on the fly.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)    # dynamically pad the tokens
accuracy = evaluate.load("accuracy")    # accuracy metric from Hugging Face


In [None]:
def compute_metrics(eval_pred):
    """Apply the accuracy metric to the validation predictions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the class axis at position 1; ``labels``
            holds the reference class ids.

    Returns:
        The result of ``accuracy.compute`` for the arg-maxed predictions.
    """
    # numpy is not imported at module level in this notebook, so import it
    # locally; otherwise the np.argmax call below raises NameError.
    import numpy as np

    predictions, labels = eval_pred
    # Collapse per-class scores to the index of the highest-scoring class.
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [12]:
# Class index -> label name, and the inverse lookup derived from it.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: index for index, name in id2label.items()}

In [None]:
""" 
------------------ Training Loop ------------------

Training is simple: pass each batch of data to the transformer, checking that the batch dimensions match what the model can handle.
Get the outputs, calculate the loss, backpropagate the loss and step the optimizer to update the weights in the model.

The model outputs are logits; the predicted labels are obtained from them (argmax) and can be used to show accuracy measurements, etc.
"""

# Model hyperparameters for the IMDB run
src_vocab_size = tokenizer.vocab_size  # vocabulary size reported by the tokenizer
tgt_vocab_size = tokenizer.vocab_size  # target side reuses the source vocabulary
d_model = 512          # model dimension
num_heads = 8          # attention heads
num_layers = 6         # encoder/decoder layers
d_ff = 2048            # feed-forward network size
max_seq_length = 512   # longest sequence length
dropout = 0.1          # dropout rate

# Build the model in classification mode with two output classes, then move it
# to the GPU when one is available.
transformer_model = Transformer(
    src_vocab_size,
    tgt_vocab_size,
    d_model,
    num_heads,
    num_layers,
    d_ff,
    max_seq_length,
    dropout,
    num_classes=2,
    classification=True,
)
transformer_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


# Cross-entropy loss, and an Adam optimizer over every model parameter
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer_model.parameters(), lr=LR)

# ---------- Issues/to do  ---------- #

# - The train loader and how it unpacks data
# - The batching of the training data and the forward-pass parameters sent to the transformer
# - Validation of the data as well
# - Create a test loop too




import torch
import torch.nn.functional as F

# Training loop
#
# Each epoch makes one pass over `train_dataloader` with gradient updates, then
# one pass over `val_dataloader` without gradients, and records the average loss
# and the accuracy of both passes so they can be plotted afterwards.
#
# NOTE(review): `train_dataloader` and `val_dataloader` are only defined in
# commented-out code earlier in this notebook; they must exist, and yield dict
# batches holding 'input_ids' and 'label' tensors, before this cell is run.
epochs = 1
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# Resolve the device once instead of on every batch; this is the same expression
# used when the model was moved to its device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for epoch in range(epochs):
    transformer_model.train()  # Set model to training mode

    total_train_loss = 0
    total_train_correct = 0
    total_train_samples = 0

    # Get the batch of data from the training split data loader
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Move only the tensors that are actually used onto the model's device
        src = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = transformer_model(src)

        # Calculate the loss
        train_loss = loss_fn(outputs, labels)
        total_train_loss += train_loss.item()

        # Get predicted labels (argmax over logits)
        predicted_labels = outputs.argmax(dim=1)

        # Calculate accuracy (correct predictions / total samples)
        correct_predictions = (predicted_labels == labels).sum().item()
        total_train_correct += correct_predictions
        total_train_samples += labels.size(0)

        # Backward pass
        train_loss.backward()
        optimizer.step()

    # Calculate average training loss and accuracy for this epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_accuracy = total_train_correct / total_train_samples

    # Now for validation
    transformer_model.eval()  # Set model to evaluation mode
    total_val_loss = 0
    total_val_correct = 0
    total_val_samples = 0

    with torch.no_grad():  # Disable gradient calculation for validation
        for batch in val_dataloader:
            # Validation inputs must sit on the same device as the model too;
            # leaving them on the CPU fails once the model is on the GPU.
            src = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = transformer_model(src)

            # Calculate the loss
            val_loss = loss_fn(outputs, labels)
            total_val_loss += val_loss.item()

            # Get predicted labels (argmax over logits)
            predicted_labels = outputs.argmax(dim=1)

            # Calculate accuracy (correct predictions / total samples)
            correct_predictions = (predicted_labels == labels).sum().item()
            total_val_correct += correct_predictions
            total_val_samples += labels.size(0)

    # Calculate average validation loss and accuracy for this epoch
    avg_val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = total_val_correct / total_val_samples

    # Store the losses and accuracies
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    train_accuracies.append(train_accuracy)
    val_accuracies.append(val_accuracy)

    # Print epoch stats
    print(f'Epoch [{epoch+1}/{epochs}] | '
          f'Train Loss: {avg_train_loss:.4f} | Train Accuracy: {train_accuracy*100:.2f}% | '
          f'Val Loss: {avg_val_loss:.4f} | Val Accuracy: {val_accuracy*100:.2f}%')



In [None]:
# Function to plot the training metrics
def plot_metrics(train_losses, val_losses, train_accuracies, val_accuracies):
    """Plot per-epoch loss and accuracy curves in two side-by-side panels.

    Args:
        train_losses: Training loss, one value per epoch.
        val_losses: Validation loss, one value per epoch.
        train_accuracies: Training accuracy, one value per epoch.
        val_accuracies: Validation accuracy, one value per epoch.

    The x axis runs from 1 to ``len(train_losses)``, so the other three
    sequences are expected to have that same length.
    """
    epochs = range(1, len(train_losses) + 1)

    # Plot Training & Validation Loss
    plt.figure(figsize=(12, 5))

    # Plot Loss
    # marker='o' keeps every data point visible; without it a single-epoch run
    # (one point per curve) draws nothing, since a line needs two points.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label="Training Loss", color='blue', marker='o')
    plt.plot(epochs, val_losses, label="Validation Loss", color='orange', marker='o')
    plt.title("Training and Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    # Plot Accuracy
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, label="Training Accuracy", color='green', marker='o')
    plt.plot(epochs, val_accuracies, label="Validation Accuracy", color='red', marker='o')
    plt.title("Training and Validation Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()

    # Show the plot
    plt.tight_layout()
    plt.show()

plot_metrics(train_losses, val_losses, train_accuracies, val_accuracies)