# **Artificial Neural Networks and Deep Learning**

---

## **Lecture 7: Image Augmentation and Image Retrieval**

<img src="https://drive.google.com/uc?export=view&id=1idXlqipXxn5yDono2mWxxZ94TxSkjiZ4" width="500"/>

## 🌐 **Google Drive Connection**

In [1]:
# Mount Google Drive at /gdrive and switch the working directory to the lecture folder
from google.colab import drive
drive.mount("/gdrive")
# The literal contains backslash-escaped spaces; presumably needed because the path is
# interpolated into the %cd magic below rather than used as a Python path
current_dir = "/gdrive/My\\ Drive/Colab\\ Notebooks/[2025-2026]\\ AN2DL/Lecture\\ 7"
%cd $current_dir

ValueError: mount failed

## ⚙️ **Libraries Import**

In [None]:
# Set seed for reproducibility
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import logging
import random
import numpy as np

# Set seeds for random number generators in NumPy and Python
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch
torch.manual_seed(SEED)
from torch import nn
from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter
import torchvision
from torchvision.transforms import v2 as transforms
from torch.utils.data import TensorDataset, DataLoader
!pip install torchview
from torchview import draw_graph

# TensorBoard configuration and working directories
logs_dir = "tensorboard"
# Stop any TensorBoard process left over from a previous run before loading the extension
!pkill -f tensorboard
%load_ext tensorboard
# Directory where fit() stores model checkpoints
!mkdir -p models

# Select the computing device: GPU when available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    # NOTE(review): cuDNN benchmark mode auto-tunes kernels and may make runs
    # non-deterministic despite the fixed seeds — confirm this trade-off is intended
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import cv2
import copy
import shutil
from itertools import product
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from PIL import Image
import matplotlib.gridspec as gridspec
import requests
from io import BytesIO

# Configure plot display settings
sns.set(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline

## ⏳ **Data Loading**

In [None]:
# Set environment variables for Animals dataset
# (environment variables are used so the shell commands below can expand them)
os.environ["ANIMALS_DATASET_NAME"] = "animals.zip"
# NOTE(review): despite the "_URL" suffix this looks like a Google Drive file id passed to gdown — confirm
os.environ["ANIMALS_DATASET_URL"] = "1nlTR-mwPLc05vxaOncUhXu82NR8gbx63"

# Check if Animals dataset exists, download and unzip if not
# (only the presence of the zip archive is checked, not of the extracted folder)
if not os.path.exists(os.environ["ANIMALS_DATASET_NAME"]):
    print("Downloading Animals dataset...")
    !gdown -q ${ANIMALS_DATASET_URL} -O ${ANIMALS_DATASET_NAME}
    print("Animals dataset downloaded!")

    print("Unzipping Animals dataset...")
    !unzip -o ${ANIMALS_DATASET_NAME}
    print("Animals dataset unzipped!")
else:
    print("Animals dataset already downloaded and unzipped. Using cached data.")

# Set environment variables for Items dataset
os.environ["ITEMS_DATASET_NAME"] = "items.zip"
# NOTE(review): same as above — presumably a Google Drive file id rather than a URL
os.environ["ITEMS_DATASET_URL"] = "1tcDVgQYuMnISgFCjaxinXSryB0CAZYHP"

# Check if Items dataset exists, download and unzip if not
# (only the presence of the zip archive is checked, not of the extracted folder)
if not os.path.exists(os.environ["ITEMS_DATASET_NAME"]):
    print("Downloading Items dataset...")
    !gdown -q ${ITEMS_DATASET_URL} -O ${ITEMS_DATASET_NAME}
    print("Items dataset downloaded!")

    print("Unzipping Items dataset...")
    !unzip -o ${ITEMS_DATASET_NAME}
    print("Items dataset unzipped!")
else:
    print("Items dataset already downloaded and unzipped. Using cached data.")

In [None]:
def load_images_from_folder(folder):
    """
    Load and preprocess images from a specified folder.

    Each readable image is scaled to [0, 1] float32, converted from BGR to RGB,
    centre-cropped to a square and resized to 224x224. Entries that OpenCV
    cannot decode (non-image files, sub-directories, corrupt files) are skipped.

    Args:
        folder (str): Path to the folder containing images

    Returns:
        np.ndarray: Array of preprocessed images with shape (N, 224, 224, C).
            If no image could be loaded, an empty array of shape (0,) is returned.
    """
    images = []

    # os.listdir returns entries in arbitrary order; sort them so the sample
    # order (and any seeded split computed from it) is the same on every run
    for filename in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, filename))

        # cv2.imread returns None instead of raising when a file cannot be read,
        # so this check must happen before any arithmetic on the result
        if img is None:
            continue

        # Normalize image pixel values to a float range [0, 1]
        img = (img / 255).astype(np.float32)

        # Convert image from BGR to RGB by reversing the channel axis
        img = img[..., ::-1]

        # Centre-crop to a square whose side is the shorter image dimension
        dim = min(img.shape[:-1])
        img = img[(img.shape[0] - dim) // 2:(img.shape[0] + dim) // 2,
                  (img.shape[1] - dim) // 2:(img.shape[1] + dim) // 2, :]

        # Resize the image to 224x224 pixels
        img = cv2.resize(img, (224, 224))

        images.append(img)

    return np.array(images)

# Folders extracted from the two dataset archives
animals_path = 'animals/'
items_path = 'items/'

# Read and preprocess every image of each class
animals = load_images_from_folder(animals_path)
items = load_images_from_folder(items_path)

# Report how many samples each class contributes
print(f"Loaded {len(animals)} animal images")
print(f"Loaded {len(items)} item images")

## 🔎 **Exploration and Data Analysis**

In [None]:
# How many samples to preview for each class
num_img = 10

# One 2-row figure per collection: items first, then animals
for gallery in (items, animals):
    fig, axes = plt.subplots(2, num_img // 2, figsize=(20, 9))
    for i in range(num_img):
        # Consecutive samples fill a column top-to-bottom before moving right
        col, row = divmod(i, 2)
        ax = axes[row, col]
        ax.imshow(np.clip(gallery[i], 0, 1))  # Clip to the valid display range
        ax.axis('off')
    plt.tight_layout()
    plt.show()

## 🔄 **Data Preprocessing**

In [None]:
# Stack both collections into a single array, animals first
X = np.concatenate((animals, items), axis=0)

# Matching labels as a column vector (N, 1): 1 for 'animals', 0 for 'items'
y = np.concatenate((np.ones(len(animals)), np.zeros(len(items))), axis=0).reshape(-1, 1)

# Hold out 30 stratified samples as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=30, stratify=y, random_state=SEED
)

# Carve a validation set of the same size as the test set out of the remainder
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=len(X_test), stratify=y_train_val, random_state=SEED
)

# Report the size of every split
print(f"Training Data Shape: {X_train.shape}")
print(f"Training Label Shape: {y_train.shape}")
print(f"Validation Data Shape: {X_val.shape}")
print(f"Validation Label Shape: {y_val.shape}")
print(f"Test Data Shape: {X_test.shape}")
print(f"Test Label Shape: {y_test.shape}")

In [None]:
# Channels-first input shape (C, H, W) derived from the channels-last training array (N, H, W, C)
input_shape = tuple(X_train.shape[axis] for axis in (3, 1, 2))

# Number of distinct labels present in the training set
num_classes = np.unique(y_train).size

print("Input Shape:", input_shape)
print("Number of Classes:", num_classes)

In [None]:
# Wrap each split in a TensorDataset (no augmentation yet):
# images go from (N, H, W, C) to (N, C, H, W), labels from (N, 1) floats to (N,) int64
train_ds, val_ds, test_ds = (
    TensorDataset(
        torch.from_numpy(features).permute(0, 3, 1, 2),
        torch.from_numpy(labels).squeeze().long(),
    )
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
# Define the batch size (samples per batch, shared by the train/val/test loaders)
BATCH_SIZE = 32

In [None]:
def make_loader(ds, batch_size, shuffle, drop_last):
    """
    Build a DataLoader for ``ds`` with the notebook's standard performance settings.

    Args:
        ds (Dataset): PyTorch Dataset object
        batch_size (int): Number of samples per batch
        shuffle (bool): Whether to shuffle data at each epoch
        drop_last (bool): Whether to drop last incomplete batch

    Returns:
        DataLoader: Configured DataLoader instance
    """
    # Worker count follows the CPU count but is clamped to the range [2, 4]
    available_cores = os.cpu_count() or 2
    worker_count = min(4, available_cores)
    if worker_count < 2:
        worker_count = 2

    # Settings that do not depend on the arguments
    performance_options = {
        "num_workers": worker_count,
        "pin_memory": True,  # Faster GPU transfer
        "pin_memory_device": "cuda" if torch.cuda.is_available() else "",
        "prefetch_factor": 4,  # Load 4 batches ahead
    }

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        **performance_options,
    )

In [None]:
# One loader per split; only the training loader reshuffles, none drops the last batch
train_loader, val_loader, test_loader = (
    make_loader(split, batch_size=BATCH_SIZE, shuffle=reshuffle, drop_last=False)
    for split, reshuffle in ((train_ds, True), (val_ds, False), (test_ds, False))
)

In [None]:
# Get one batch from the training data loader and print its tensor shapes as a sanity check
for xb, yb in train_loader:
    print("Features batch shape:", xb.shape)
    print("Labels batch shape:", yb.shape)
    break  # Stop after getting one batch

## 🧮 **Network Parameters**

In [None]:
# Training schedule
LEARNING_RATE = 1e-3       # Step size handed to the Adam optimizer
EPOCHS = 1000              # Maximum number of training epochs
PATIENCE = 100             # Passed to fit() as the early-stopping patience

# Regularisation
DROPOUT_RATE = 0.2         # Dropout probability
L1_LAMBDA = 0              # L1 penalty
L2_LAMBDA = 0              # L2 penalty

# Set up loss function
criterion = nn.CrossEntropyLoss()

# Print the defined parameters
print("Epochs:", EPOCHS)
print("Batch Size:", BATCH_SIZE)
print("Learning Rate:", LEARNING_RATE)
print("Dropout Rate:", DROPOUT_RATE)
print("L1 Penalty:", L1_LAMBDA)
print("L2 Penalty:", L2_LAMBDA)

## 🛠️ **Build the Model**

In [None]:
class SimpleCNN(nn.Module):
    """
    Small convolutional classifier.

    Layout:
    - four 3x3 'same'-padded convolutions widening the channels 16 -> 32 -> 64 -> 128
    - a ReLU after every convolution and a 2x2 max-pool after the first three
    - a head made of flatten, dropout and a single linear layer producing the logits
    """
    def __init__(self, input_shape=(3, 224, 224), num_classes=2, dropout_rate=0.2):
        super().__init__()

        def conv3x3(channels_in, channels_out):
            """3x3 convolution that keeps the spatial size."""
            return nn.Conv2d(channels_in, channels_out, kernel_size=3, padding='same')

        def halve():
            """2x2 max-pool that halves height and width."""
            return nn.MaxPool2d(kernel_size=2, stride=2)

        # Blocks 0-2: convolution, activation, down-sampling
        self.conv0, self.relu0, self.mp0 = conv3x3(input_shape[0], 16), nn.ReLU(), halve()
        self.conv1, self.relu1, self.mp1 = conv3x3(16, 32), nn.ReLU(), halve()
        self.conv2, self.relu2, self.mp2 = conv3x3(32, 64), nn.ReLU(), halve()

        # Block 3: convolution and activation only (no pooling)
        self.conv3, self.relu3 = conv3x3(64, 128), nn.ReLU()

        # Probe the feature extractor with a single all-zero sample to learn
        # how many values reach the linear layer
        with torch.no_grad():
            probe = self._forward_features(torch.zeros(1, *input_shape))
            feature_count = probe.numel()

        # Classification head
        self.classifier_head = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(dropout_rate),
            nn.Linear(feature_count, num_classes)
        )

    def _forward_features(self, x):
        """Run the input through the convolutional blocks only."""
        stages = (
            (self.conv0, self.relu0, self.mp0),
            (self.conv1, self.relu1, self.mp1),
            (self.conv2, self.relu2, self.mp2),
            (self.conv3, self.relu3),
        )
        for stage in stages:
            for layer in stage:
                x = layer(x)
        return x

    def forward(self, x):
        """Run the full network and return the class logits."""
        return self.classifier_head(self._forward_features(x))

In [None]:
# Build the CNN, then place it on the selected computing device (CPU/GPU)
vanilla_model = SimpleCNN(
    input_shape=input_shape,
    num_classes=num_classes,
    dropout_rate=DROPOUT_RATE,
)
vanilla_model = vanilla_model.to(device)

# Textual layer-by-layer summary followed by a rendered computation graph
summary(vanilla_model, input_size=input_shape)
model_graph = draw_graph(vanilla_model, input_size=(BATCH_SIZE, *input_shape), expand_nested=True)
model_graph.visual_graph

In [None]:
# TensorBoard writer for this experiment; the model graph is traced with one random sample
experiment_name = "vanilla_cnn"
writer = SummaryWriter(f"./{logs_dir}/{experiment_name}")
x = torch.randn(1, *input_shape).to(device)
writer.add_graph(vanilla_model, x)

# Adam optimizer; L2 regularization is applied through its weight decay
optimizer = torch.optim.Adam(
    params=vanilla_model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=L2_LAMBDA,
)

# Gradient scaler for mixed precision training, active only on GPU
scaler = torch.amp.GradScaler(enabled=device.type == 'cuda')

## 🧠 **Model Training**

In [None]:
# Initialize best model tracking variables
# (updated after each experiment when its validation F1 beats the stored value)
best_model = None
best_performance = float('-inf')  # -inf so that any real score counts as an improvement

In [None]:
def train_one_epoch(model, train_loader, criterion, optimizer, scaler, device, l1_lambda=0, l2_lambda=0):
    """
    Perform one complete training epoch through the entire training dataset.

    Args:
        model (nn.Module): The neural network model to train
        train_loader (DataLoader): PyTorch DataLoader containing training data batches
        criterion (nn.Module): Loss function (e.g., CrossEntropyLoss, MSELoss)
        optimizer (torch.optim): Optimization algorithm (e.g., Adam, SGD)
        scaler (GradScaler): PyTorch's gradient scaler for mixed precision training
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)
        l1_lambda (float): Lambda for L1 regularization
        l2_lambda (float): Lambda for L2 regularization

    Returns:
        tuple: (average_loss, f1 score) - Training loss (including any L1/L2
        penalty) averaged over the samples actually processed, and the weighted
        F1 score for this epoch

    Raises:
        ValueError: If the loader yields no samples
    """
    model.train()  # Set model to training mode

    use_amp = device.type == 'cuda'
    running_loss = 0.0
    samples_seen = 0
    all_predictions = []
    all_targets = []

    # Iterate through training batches
    for inputs, targets in train_loader:
        # Move data to device (GPU/CPU)
        inputs, targets = inputs.to(device), targets.to(device)

        # Clear gradients from previous step
        optimizer.zero_grad(set_to_none=True)

        # Forward pass with mixed precision (if CUDA available)
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            logits = model(inputs)
            loss = criterion(logits, targets)

            # Add the L1 / L2 penalties only when requested, so the default
            # (both lambdas 0) does not sweep over every parameter per batch
            if l1_lambda:
                loss = loss + l1_lambda * sum(p.abs().sum() for p in model.parameters())
            if l2_lambda:
                loss = loss + l2_lambda * sum(p.pow(2).sum() for p in model.parameters())

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate metrics
        batch_size = inputs.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size
        all_predictions.append(logits.argmax(dim=1).cpu().numpy())
        all_targets.append(targets.cpu().numpy())

    if samples_seen == 0:
        raise ValueError("train_loader yielded no samples; cannot compute epoch metrics")

    # Average over the samples actually processed: with drop_last=True this is
    # smaller than len(train_loader.dataset)
    epoch_loss = running_loss / samples_seen
    epoch_f1 = f1_score(
        np.concatenate(all_targets),
        np.concatenate(all_predictions),
        average='weighted'
    )

    return epoch_loss, epoch_f1

In [None]:
def validate_one_epoch(model, val_loader, criterion, device):
    """
    Perform one complete validation epoch through the entire validation dataset.

    Args:
        model (nn.Module): The neural network model to evaluate
        val_loader (DataLoader): PyTorch DataLoader containing validation data batches
        criterion (nn.Module): Loss function used to calculate validation loss
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)

    Returns:
        tuple: (average_loss, f1 score) - Validation loss averaged over the
        samples actually processed, and the weighted F1 score for this epoch

    Raises:
        ValueError: If the loader yields no samples

    Note:
        This function automatically sets the model to evaluation mode and disables
        gradient computation for efficiency during validation.
    """
    model.eval()  # Set model to evaluation mode

    use_amp = device.type == 'cuda'
    running_loss = 0.0
    samples_seen = 0
    all_predictions = []
    all_targets = []

    # Disable gradient computation for validation
    with torch.no_grad():
        for inputs, targets in val_loader:
            # Move data to device
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass with mixed precision (if CUDA available)
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                logits = model(inputs)
                loss = criterion(logits, targets)

            # Accumulate metrics
            batch_size = inputs.size(0)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size
            all_predictions.append(logits.argmax(dim=1).cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    if samples_seen == 0:
        raise ValueError("val_loader yielded no samples; cannot compute epoch metrics")

    # Average over the samples actually processed (matters when drop_last=True)
    epoch_loss = running_loss / samples_seen
    epoch_f1 = f1_score(
        np.concatenate(all_targets),
        np.concatenate(all_predictions),
        average='weighted'
    )

    return epoch_loss, epoch_f1

In [None]:
def log_metrics_to_tensorboard(writer, epoch, train_loss, train_f1, val_loss, val_f1, model):
    """
    Send one epoch's metrics and parameter histograms to TensorBoard.

    Args:
        writer (SummaryWriter): TensorBoard SummaryWriter object for logging
        epoch (int): Current epoch number, used as the step of every entry
        train_loss (float): Training loss for this epoch
        train_f1 (float): Training f1 score for this epoch
        val_loss (float): Validation loss for this epoch
        val_f1 (float): Validation f1 score for this epoch
        model (nn.Module): Model whose trainable weights and gradients are logged

    Note:
        Histograms are written only for trainable, non-empty parameters;
        gradient histograms are additionally skipped when the gradient is
        missing, empty or contains non-finite values.
    """
    # Scalar curves, in the order they are written
    scalar_series = (
        ('Loss/Training', train_loss),
        ('Loss/Validation', val_loss),
        ('F1/Training', train_f1),
        ('F1/Validation', val_f1),
    )
    for tag, value in scalar_series:
        writer.add_scalar(tag, value, epoch)

    # Weight and gradient distributions of every trainable parameter
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        if param.numel() > 0:
            writer.add_histogram(f'{name}/weights', param.data, epoch)

        grad = param.grad
        if grad is None or grad.numel() == 0:
            continue
        if torch.isfinite(grad).all():
            writer.add_histogram(f'{name}/gradients', grad.data, epoch)

In [None]:
def fit(model, train_loader, val_loader, epochs, criterion, optimizer, scaler, device,
        l1_lambda=0, l2_lambda=0, patience=0, evaluation_metric="val_f1", mode='max',
        restore_best_weights=True, writer=None, verbose=10, experiment_name=""):
    """
    Train the neural network model on the training data and validate on the validation data.

    When ``patience > 0`` the monitored metric drives early stopping and the
    weights of every improving epoch are checkpointed to
    ``models/<experiment_name>_model.pt``. When ``patience == 0`` the final
    weights are saved to the same path after the last epoch.

    Args:
        model (nn.Module): The neural network model to train
        train_loader (DataLoader): PyTorch DataLoader containing training data batches
        val_loader (DataLoader): PyTorch DataLoader containing validation data batches
        epochs (int): Number of training epochs
        criterion (nn.Module): Loss function (e.g., CrossEntropyLoss, MSELoss)
        optimizer (torch.optim): Optimization algorithm (e.g., Adam, SGD)
        scaler (GradScaler): PyTorch's gradient scaler for mixed precision training
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)
        l1_lambda (float): L1 regularization coefficient (default: 0)
        l2_lambda (float): L2 regularization coefficient (default: 0)
        patience (int): Number of epochs to wait for improvement before early stopping (default: 0)
        evaluation_metric (str): Key of the history entry to monitor for early stopping (default: "val_f1")
        mode (str): 'max' for maximizing the metric, any other value minimizes it (default: 'max')
        restore_best_weights (bool): Whether to restore model weights from best epoch (default: True)
        writer (SummaryWriter, optional): TensorBoard SummaryWriter object for logging;
            it is closed before returning (default: None)
        verbose (int, optional): Frequency of printing training progress, 0 disables it (default: 10)
        experiment_name (str, optional): Experiment name for saving models (default: "")

    Returns:
        tuple: (model, training_history) - Trained model and metrics history

    Raises:
        KeyError: If early stopping is enabled and ``evaluation_metric`` is not
            one of the tracked history keys
    """

    # Initialize metrics tracking
    training_history = {
        'train_loss': [], 'val_loss': [],
        'train_f1': [], 'val_f1': []
    }

    # Reject an unknown metric before training instead of after the first epoch
    if patience > 0 and evaluation_metric not in training_history:
        raise KeyError(
            f"Unknown evaluation_metric {evaluation_metric!r}; "
            f"expected one of {sorted(training_history)}"
        )

    # Single location used for both checkpointing and restoring
    checkpoint_path = "models/" + experiment_name + '_model.pt'

    # Early stopping state (only used if patience is set)
    patience_counter = 0
    best_metric = float('-inf') if mode == 'max' else float('inf')
    best_epoch = 0  # 0 means no checkpoint has been written during this call

    print(f"Training {epochs} epochs...")

    # Main training loop: iterate through epochs
    for epoch in range(1, epochs + 1):

        # Forward pass through training data, compute gradients, update weights
        train_loss, train_f1 = train_one_epoch(
            model, train_loader, criterion, optimizer, scaler, device, l1_lambda, l2_lambda
        )

        # Evaluate model on validation data without updating weights
        val_loss, val_f1 = validate_one_epoch(
            model, val_loader, criterion, device
        )

        # Store metrics for plotting and analysis
        training_history['train_loss'].append(train_loss)
        training_history['val_loss'].append(val_loss)
        training_history['train_f1'].append(train_f1)
        training_history['val_f1'].append(val_f1)

        # Write metrics to TensorBoard for visualization
        if writer is not None:
            log_metrics_to_tensorboard(
                writer, epoch, train_loss, train_f1, val_loss, val_f1, model
            )

        # Print progress every N epochs or on first epoch
        if verbose > 0:
            if epoch % verbose == 0 or epoch == 1:
                print(f"Epoch {epoch:3d}/{epochs} | "
                    f"Train: Loss={train_loss:.4f}, F1 Score={train_f1:.4f} | "
                    f"Val: Loss={val_loss:.4f}, F1 Score={val_f1:.4f}")

        # Early stopping logic: monitor metric and save best model
        if patience > 0:
            current_metric = training_history[evaluation_metric][-1]
            is_improvement = (current_metric > best_metric) if mode == 'max' else (current_metric < best_metric)

            if is_improvement:
                best_metric = current_metric
                best_epoch = epoch
                torch.save(model.state_dict(), checkpoint_path)
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping triggered after {epoch} epochs.")
                    break

    # Restore best model weights if early stopping was used
    if restore_best_weights and patience > 0:
        if best_epoch > 0:
            model.load_state_dict(torch.load(checkpoint_path))
            print(f"Best model restored from epoch {best_epoch} with {evaluation_metric} {best_metric:.4f}")
        else:
            # No epoch improved on the initial value (e.g. NaN metric or epochs=0):
            # a file at checkpoint_path, if any, belongs to an earlier run
            print("No improving epoch was recorded; keeping the current weights.")

    # Save final model if no early stopping
    if patience == 0:
        torch.save(model.state_dict(), checkpoint_path)

    # Close TensorBoard writer
    if writer is not None:
        writer.close()

    return model, training_history

In [None]:
%%time
# Train model and track training history
vanilla_model, vanilla_history = fit(
    model=vanilla_model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    criterion=criterion,
    optimizer=optimizer,
    scaler=scaler,
    device=device,
    writer=writer,
    verbose=1,
    experiment_name="vanilla_cnn",
    patience=PATIENCE
)

# fit() restores the weights of the best epoch, so the score that describes the
# returned model is the best validation F1, not the one of the last epoch
vanilla_best_val_f1 = max(vanilla_history['val_f1'])

# Update best model if current performance is superior
if vanilla_best_val_f1 > best_performance:
    best_model = vanilla_model
    best_performance = vanilla_best_val_f1

# Calculate and print the final validation F1 score
final_val_f1 = round(vanilla_best_val_f1 * 100, 2)
print(f'Final validation F1 score: {final_val_f1}%')

In [None]:
# @title Plot History
# Two side-by-side panels: loss on the left, F1 score on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))

curve_colour = '#ff7f0e'
panel_specs = (
    (ax1, 'train_loss', 'Training loss', 'val_loss', 'Validation loss', 'Categorical Crossentropy'),
    (ax2, 'train_f1', 'Training F1', 'val_f1', 'Validation F1', 'F1 Score'),
)

for axis, train_key, train_label, val_key, val_label, panel_title in panel_specs:
    # Training curve is faint and dashed, validation curve solid
    axis.plot(vanilla_history[train_key], label=train_label, alpha=0.3, color=curve_colour, linestyle='--')
    axis.plot(vanilla_history[val_key], label=val_label, alpha=0.9, color=curve_colour)
    axis.set_title(panel_title)
    axis.legend()
    axis.grid(alpha=0.3)

# Adjust the layout and display the plot
plt.tight_layout()
plt.subplots_adjust(right=0.85)
plt.show()

## 💪 **Image Augmentation**

In [None]:
# Define the URL for the image
url = "https://static.wikia.nocookie.net/jujutsu-kaisen/images/6/66/Gojo_reveals_his_Six_Eyes_%28Anime%29.png/revision/latest?cb=20201114064149"

# Send a GET request to the URL and retrieve the image content.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting PIL fail
# later while trying to decode an error page
response = requests.get(url, timeout=30)
response.raise_for_status()

# Load the image and normalise pixel values to [0, 1]
img = np.array(Image.open(BytesIO(response.content))) / 255

# Make the image squared with a centre crop along the longer side
dim = min(img.shape[:-1])
img = img[(img.shape[0]-dim)//2:(img.shape[0]+dim)//2,
          (img.shape[1]-dim)//2:(img.shape[1]+dim)//2, :]

# Display the image using matplotlib
plt.figure(figsize=(4,4))
plt.imshow(img)
plt.axis('off')
plt.show()

### **Geometric - Random Flip**

In [None]:
# Random horizontal and vertical flips, each applied with probability 0.5
augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
])

# The transforms are applied to a PIL image rebuilt from the float array
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four panels: the original followed by three random draws
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)
ax1, ax2, ax3, ax4 = [fig.add_subplot(gs[slot]) for slot in range(4)]

# Leftmost panel: untouched image
ax1.imshow(img)

# Remaining panels: an independent augmentation of the same image each time
for panel in (ax2, ax3, ax4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    panel.imshow(np.clip(augmented_img, 0., 1.))

for panel in (ax1, ax2, ax3, ax4):
    panel.axis('off')

# Show the figure with all images
plt.show()

### **Geometric - Random Translation**

In [None]:
# Random shift of up to 20% of the image size along each axis (no rotation, no scaling)
augmentation = transforms.Compose([
    transforms.RandomAffine(degrees=0, translate=(0.2, 0.2), scale=None),
])

# The transforms are applied to a PIL image rebuilt from the float array
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four panels: the original followed by three random draws
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)
ax1, ax2, ax3, ax4 = [fig.add_subplot(gs[slot]) for slot in range(4)]

# Leftmost panel: untouched image
ax1.imshow(img)

# Remaining panels: an independent augmentation of the same image each time
for panel in (ax2, ax3, ax4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    panel.imshow(np.clip(augmented_img, 0., 1.))

for panel in (ax1, ax2, ax3, ax4):
    panel.axis('off')

# Show the figure with all images
plt.show()

### **Geometric - Random Rotation**

In [None]:
# Random rotation with degrees=72 (no translation, no scaling)
augmentation = transforms.Compose([
    transforms.RandomAffine(degrees=72, translate=None, scale=None),
])

# The transforms are applied to a PIL image rebuilt from the float array
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four panels: the original followed by three random draws
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)
ax1, ax2, ax3, ax4 = [fig.add_subplot(gs[slot]) for slot in range(4)]

# Leftmost panel: untouched image
ax1.imshow(img)

# Remaining panels: an independent augmentation of the same image each time
for panel in (ax2, ax3, ax4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    panel.imshow(np.clip(augmented_img, 0., 1.))

for panel in (ax1, ax2, ax3, ax4):
    panel.axis('off')

# Show the figure with all images
plt.show()

### **Geometric - Random Zoom**

In [None]:
# Random zoom with a scale factor drawn from [0.8, 1.2] (no rotation, no translation)
augmentation = transforms.Compose([
    transforms.RandomAffine(degrees=0, translate=None, scale=(0.8, 1.2)),
])

# The transforms are applied to a PIL image rebuilt from the float array
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four panels: the original followed by three random draws
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)
ax1, ax2, ax3, ax4 = [fig.add_subplot(gs[slot]) for slot in range(4)]

# Leftmost panel: untouched image
ax1.imshow(img)

# Remaining panels: an independent augmentation of the same image each time
for panel in (ax2, ax3, ax4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    panel.imshow(np.clip(augmented_img, 0., 1.))

for panel in (ax1, ax2, ax3, ax4):
    panel.axis('off')

# Show the figure with all images
plt.show()

### **Geometric - All Together**

In [None]:
# All geometric augmentations combined: random flips followed by a random
# affine transform (rotation, translation and zoom)
augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomAffine(degrees=72, translate=(0.2, 0.2), scale=(0.8, 1.2)),
])

# The transforms are applied to a PIL image rebuilt from the float array
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four panels: the original followed by three random draws
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)
ax1, ax2, ax3, ax4 = [fig.add_subplot(gs[slot]) for slot in range(4)]

# Leftmost panel: untouched image
ax1.imshow(img)

# Remaining panels: an independent augmentation of the same image each time
for panel in (ax2, ax3, ax4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    panel.imshow(np.clip(augmented_img, 0., 1.))

for panel in (ax1, ax2, ax3, ax4):
    panel.axis('off')

# Show the figure with all images
plt.show()

In [None]:
# Same combined geometric augmentation as before, but the affine transform
# is given fill=128 for the pixels that fall outside the transformed image
augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomAffine(degrees=72, translate=(0.2, 0.2), scale=(0.8, 1.2), fill=128),
])

# Build an 8-bit PIL image for the transforms (the * 255 rescaling presumes
# img holds floats in [0, 1] -- TODO confirm)
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four equally wide panels: the original plus three samples
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)

# Leftmost panel: the unmodified image
ax1 = fig.add_subplot(gs[0])
ax1.imshow(img)
ax1.axis('off')

# Remaining panels: run the pipeline three times, one random draw per panel
sample_axes = []
for column in range(1, 4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    sample_ax = fig.add_subplot(gs[column])
    sample_ax.imshow(np.clip(augmented_img, 0., 1.))
    sample_ax.axis('off')
    sample_axes.append(sample_ax)
ax2, ax3, ax4 = sample_axes

# Render the figure
plt.show()

### **Photometric - Random Brightness**

In [None]:
# Photometric augmentation: jitter the brightness only (brightness=0.5)
augmentation = transforms.Compose([
    transforms.ColorJitter(brightness=0.5),
])

# Build an 8-bit PIL image for the transforms (the * 255 rescaling presumes
# img holds floats in [0, 1] -- TODO confirm)
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four equally wide panels: the original plus three samples
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)

# Leftmost panel: the unmodified image
ax1 = fig.add_subplot(gs[0])
ax1.imshow(img)
ax1.axis('off')

# Remaining panels: run the pipeline three times, one random draw per panel
sample_axes = []
for column in range(1, 4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    sample_ax = fig.add_subplot(gs[column])
    sample_ax.imshow(np.clip(augmented_img, 0., 1.))
    sample_ax.axis('off')
    sample_axes.append(sample_ax)
ax2, ax3, ax4 = sample_axes

# Render the figure
plt.show()

### **Photometric - Random Contrast**

In [None]:
# Photometric augmentation: jitter the contrast only (contrast=0.75)
augmentation = transforms.Compose([
    transforms.ColorJitter(contrast=0.75),
])

# Build an 8-bit PIL image for the transforms (the * 255 rescaling presumes
# img holds floats in [0, 1] -- TODO confirm)
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four equally wide panels: the original plus three samples
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)

# Leftmost panel: the unmodified image
ax1 = fig.add_subplot(gs[0])
ax1.imshow(img)
ax1.axis('off')

# Remaining panels: run the pipeline three times, one random draw per panel
sample_axes = []
for column in range(1, 4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    sample_ax = fig.add_subplot(gs[column])
    sample_ax.imshow(np.clip(augmented_img, 0., 1.))
    sample_ax.axis('off')
    sample_axes.append(sample_ax)
ax2, ax3, ax4 = sample_axes

# Render the figure
plt.show()

### **Photometric - Random Saturation**

In [None]:
# Photometric augmentation: jitter the saturation only (saturation=0.75)
augmentation = transforms.Compose([
    transforms.ColorJitter(saturation=0.75),
])

# Build an 8-bit PIL image for the transforms (the * 255 rescaling presumes
# img holds floats in [0, 1] -- TODO confirm)
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four equally wide panels: the original plus three samples
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)

# Leftmost panel: the unmodified image
ax1 = fig.add_subplot(gs[0])
ax1.imshow(img)
ax1.axis('off')

# Remaining panels: run the pipeline three times, one random draw per panel
sample_axes = []
for column in range(1, 4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    sample_ax = fig.add_subplot(gs[column])
    sample_ax.imshow(np.clip(augmented_img, 0., 1.))
    sample_ax.axis('off')
    sample_axes.append(sample_ax)
ax2, ax3, ax4 = sample_axes

# Render the figure
plt.show()

 ### **All Together**

In [None]:
# Full pipeline: random flips, then colour jitter (brightness, contrast,
# saturation), then a random affine (rotation, translation and zoom)
augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.5, contrast=0.75, saturation=0.75),
    transforms.RandomAffine(degrees=72, translate=(0.2, 0.2), scale=(0.8, 1.2)),
])

# Build an 8-bit PIL image for the transforms (the * 255 rescaling presumes
# img holds floats in [0, 1] -- TODO confirm)
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four equally wide panels: the original plus three samples
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)

# Leftmost panel: the unmodified image
ax1 = fig.add_subplot(gs[0])
ax1.imshow(img)
ax1.axis('off')

# Remaining panels: run the pipeline three times, one random draw per panel
sample_axes = []
for column in range(1, 4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    sample_ax = fig.add_subplot(gs[column])
    sample_ax.imshow(np.clip(augmented_img, 0., 1.))
    sample_ax.axis('off')
    sample_axes.append(sample_ax)
ax2, ax3, ax4 = sample_axes

# Render the figure
plt.show()

## 💪 **Advanced Image Augmentation**

In [None]:
# Random erasing (cutout): a rectangle is always erased (p=1.0) and filled
# with value=0; scale/ratio bound the rectangle's area fraction and aspect ratio
# NOTE(review): the original note here said RandomErasing works on tensors,
# not PIL images, yet a PIL image is what gets passed below -- confirm that
# the installed torchvision v2 actually erases PIL inputs
augmentation = transforms.Compose([
    transforms.RandomErasing(p=1.0, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0),
])

# Build an 8-bit PIL image for the transforms (the * 255 rescaling presumes
# img holds floats in [0, 1] -- TODO confirm)
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four equally wide panels: the original plus three samples
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)

# Leftmost panel: the unmodified image
ax1 = fig.add_subplot(gs[0])
ax1.imshow(img)
ax1.axis('off')

# Remaining panels: run the pipeline three times, one random draw per panel
sample_axes = []
for column in range(1, 4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    sample_ax = fig.add_subplot(gs[column])
    sample_ax.imshow(np.clip(augmented_img, 0., 1.))
    sample_ax.axis('off')
    sample_axes.append(sample_ax)
ax2, ax3, ax4 = sample_axes

# Render the figure
plt.show()

In [None]:
# Random erasing (cutout) with value='random': a rectangle is always erased
# (p=1.0) and filled with random values instead of a constant
# NOTE(review): the original note here said RandomErasing works on tensors,
# not PIL images, yet a PIL image is what gets passed below -- confirm that
# the installed torchvision v2 actually erases PIL inputs
augmentation = transforms.Compose([
    transforms.RandomErasing(p=1.0, scale=(0.02, 0.33), value='random'),
])

# Build an 8-bit PIL image for the transforms (the * 255 rescaling presumes
# img holds floats in [0, 1] -- TODO confirm)
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four equally wide panels: the original plus three samples
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)

# Leftmost panel: the unmodified image
ax1 = fig.add_subplot(gs[0])
ax1.imshow(img)
ax1.axis('off')

# Remaining panels: run the pipeline three times, one random draw per panel
sample_axes = []
for column in range(1, 4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    sample_ax = fig.add_subplot(gs[column])
    sample_ax.imshow(np.clip(augmented_img, 0., 1.))
    sample_ax.axis('off')
    sample_axes.append(sample_ax)
ax2, ax3, ax4 = sample_axes

# Render the figure
plt.show()

In [None]:
# Multi-cutout: chain k_cutout independent RandomErasing transforms, each
# erasing one small rectangle (scale=(0.01, 0.033)) filled with random values
# NOTE(review): the original note here said RandomErasing works on tensors,
# not PIL images, yet a PIL image is what gets passed below -- confirm that
# the installed torchvision v2 actually erases PIL inputs
k_cutout = 5
erasers = [
    transforms.RandomErasing(p=1.0, scale=(0.01, 0.033), value='random')
    for _ in range(k_cutout)
]
augmentation = transforms.Compose(erasers)

# Build an 8-bit PIL image for the transforms (the * 255 rescaling presumes
# img holds floats in [0, 1] -- TODO confirm)
img_pil = Image.fromarray((img * 255).astype(np.uint8))

# One row of four equally wide panels: the original plus three samples
fig = plt.figure(constrained_layout=True, figsize=(12, 3))
gs = gridspec.GridSpec(1, 4, figure=fig, width_ratios=[1, 1, 1, 1], wspace=0.1)

# Leftmost panel: the unmodified image
ax1 = fig.add_subplot(gs[0])
ax1.imshow(img)
ax1.axis('off')

# Remaining panels: run the pipeline three times, one random draw per panel
sample_axes = []
for column in range(1, 4):
    augmented_img = np.array(augmentation(img_pil)) / 255
    sample_ax = fig.add_subplot(gs[column])
    sample_ax.imshow(np.clip(augmented_img, 0., 1.))
    sample_ax.axis('off')
    sample_axes.append(sample_ax)
ax2, ax3, ax4 = sample_axes

# Render the figure
plt.show()

## 🛠️ **Build the Model with Augmentation**

In [None]:
# Dataset wrapper that converts and (optionally) augments images on access
class AugmentedDataset(torch.utils.data.Dataset):
    """
    Dataset that turns stored images into float tensors and augments them lazily.

    Every access rebuilds the sample from the raw array: the image is rescaled
    to 8-bit, wrapped in a PIL image, converted with the transforms v2 recipe
    (ToImage followed by ToDtype(torch.float32, scale=True)) and finally passed
    through the optional user transform.

    Args:
        data (np.ndarray): Input images with shape (N, H, W, C)
        labels (np.ndarray): Labels with shape (N,)
        transform (callable, optional): Extra transform applied to the image
            tensor; skipped when falsy
    """
    def __init__(self, data, labels, transform=None):
        self.data, self.labels = data, labels
        self.transform = transform

        # PIL -> tensor conversion shared by every sample (v2 guidelines)
        conversion_steps = [
            transforms.ToImage(),
            transforms.ToDtype(torch.float32, scale=True)
        ]
        self.to_tensor = transforms.Compose(conversion_steps)

    def __len__(self):
        # One sample per stored image
        return len(self.data)

    def __getitem__(self, idx):
        raw_image = self.data[idx]
        label = self.labels[idx]

        # Rescale to 8-bit and go through PIL before the tensor conversion
        # (the * 255 factor presumes images are floats in [0, 1] -- TODO confirm)
        as_uint8 = (raw_image * 255).astype(np.uint8)
        image_tensor = self.to_tensor(Image.fromarray(as_uint8))

        # Augmentation is applied on the tensor, only when a transform was given
        if self.transform:
            image_tensor = self.transform(image_tensor)

        target = torch.tensor(label, dtype=torch.long)
        return image_tensor, target

In [None]:
# Training-time augmentation (transforms v2): random horizontal flip followed
# by a zero-filled random erasing
# NOTE(review): p=1.0 means a rectangle is erased from every training sample;
# confirm this is intended rather than a probabilistic erasing
train_augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomErasing(p=1.0, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0),
])

# Wrap the three splits; only the training split receives the augmentation
train_aug_ds = AugmentedDataset(X_train, y_train.squeeze(), transform=train_augmentation)
val_aug_ds = AugmentedDataset(X_val, y_val.squeeze(), transform=None)
test_aug_ds = AugmentedDataset(X_test, y_test.squeeze(), transform=None)

# Build the loaders: same batch size everywhere, last partial batch kept,
# shuffling enabled for training only
loader_options = dict(batch_size=BATCH_SIZE, drop_last=False)
train_aug_loader = make_loader(train_aug_ds, shuffle=True, **loader_options)
val_aug_loader = make_loader(val_aug_ds, shuffle=False, **loader_options)
test_aug_loader = make_loader(test_aug_ds, shuffle=False, **loader_options)

In [None]:
# Build a fresh SimpleCNN for the augmented experiment, then place it on the
# selected computing device (CPU/GPU)
aug_model = SimpleCNN(input_shape, num_classes, dropout_rate=DROPOUT_RATE)
aug_model = aug_model.to(device)

# Print the layer-by-layer summary and render the computation graph
summary(aug_model, input_size=input_shape)
graph_input_size = (BATCH_SIZE,) + input_shape
model_graph = draw_graph(aug_model, input_size=graph_input_size, expand_nested=True)
model_graph.visual_graph

In [None]:
# TensorBoard writer for this experiment; the graph is traced with one random
# dummy input shaped like a single sample
experiment_name = "augmented_cnn"
writer = SummaryWriter(f"./{logs_dir}/{experiment_name}")
x = torch.randn(1, *input_shape[:3]).to(device)
writer.add_graph(aug_model, x)

# Adam optimizer; weight_decay supplies the L2 regularization
optimizer = torch.optim.Adam(
    aug_model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=L2_LAMBDA,
)

# Gradient scaler for mixed precision, active only when running on CUDA
use_amp = device.type == 'cuda'
scaler = torch.amp.GradScaler(enabled=use_amp)

### 🧠 **Train the Model with Augmentation**

In [None]:
%%time
# Train model with augmentation and track training history
aug_model, aug_history = fit(
    model=aug_model,
    train_loader=train_aug_loader,
    val_loader=val_aug_loader,
    epochs=EPOCHS,
    criterion=criterion,
    optimizer=optimizer,
    scaler=scaler,
    device=device,
    writer=writer,
    verbose=1,
    experiment_name="augmented_cnn",
    patience=PATIENCE
)

# Update best model if current performance is superior
if aug_history['val_f1'][-1] > best_performance:
    best_model = aug_model
    best_performance = aug_history['val_f1'][-1]

# Calculate and print the final validation F1 score
final_val_f1 = round(max(aug_history['val_f1']) * 100, 2)
print(f'Final validation F1 score: {final_val_f1}%')

In [None]:
# @title Plot History
# Two side-by-side panels: loss on the left, F1 score on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))

# (axis, history key suffix, legend tag, panel title)
history_panels = (
    (ax1, 'loss', 'loss', 'Categorical Crossentropy'),
    (ax2, 'f1', 'F1', 'F1 Score'),
)
for axis, key, tag, panel_title in history_panels:
    # Training curve is faded and dashed; validation curve is solid
    axis.plot(aug_history[f'train_{key}'], label=f'Training {tag}',
              alpha=0.3, color='#ff7f0e', linestyle='--')
    axis.plot(aug_history[f'val_{key}'], label=f'Validation {tag}',
              alpha=0.9, color='#ff7f0e')
    axis.set_title(panel_title)
    axis.legend()
    axis.grid(alpha=0.3)

# Tighten the layout, leave a margin on the right, then display
plt.tight_layout()
plt.subplots_adjust(right=0.85)
plt.show()

In [None]:
# Mirror the TensorBoard log directory from the notebook's working directory
# into /content so the Colab TensorBoard extension can read it
!rsync -a $current_dir"/"$logs_dir/ "/content/"$logs_dir/

# Launch the TensorBoard interface on the mirrored logs
%tensorboard --logdir "/content/"$logs_dir

## 🕹️ **Use the Model - Make Inference**

In [None]:
# Switch to evaluation mode before inference so that train-only behaviour
# (e.g. dropout) is disabled, consistent with the TTA cells further down
best_model.eval()

# Collect predictions and ground truth labels using the (non-augmented) test loader
test_preds, test_targets = [], []
with torch.no_grad():  # Disable gradient computation for inference
    for xb, yb in test_aug_loader:
        xb = xb.to(device)

        # Forward pass: the predicted class is the arg-max over the logits
        logits = best_model(xb)
        preds = logits.argmax(dim=1).cpu().numpy()

        # Store batch results
        test_preds.append(preds)
        test_targets.append(yb.numpy())

# Combine all batches into single arrays
test_preds = np.concatenate(test_preds)
test_targets = np.concatenate(test_targets)

In [None]:
# Calculate overall test metrics (precision/recall/F1 are class-weighted averages)
test_acc = accuracy_score(test_targets, test_preds)
test_prec = precision_score(test_targets, test_preds, average='weighted')
test_rec = recall_score(test_targets, test_preds, average='weighted')
test_f1 = f1_score(test_targets, test_preds, average='weighted')
print(f"Accuracy over the test set: {test_acc:.4f}")
print(f"Precision over the test set: {test_prec:.4f}")
print(f"Recall over the test set: {test_rec:.4f}")
print(f"F1 score over the test set: {test_f1:.4f}")

# Generate confusion matrix for detailed error analysis
cm = confusion_matrix(test_targets, test_preds)

# Visualise confusion matrix; each cell is annotated with its integer count
plt.figure(figsize=(8, 7))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=['Item', 'Animal'],
            yticklabels=['Item', 'Animal'],
            cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
# The title previously contained a mis-encoded en dash; it is restored here
plt.title('Confusion Matrix – Test Set')
plt.tight_layout()
plt.show()

## **🔄 Test Time Augmentation (TTA)**

In [None]:
# Baseline pass: evaluate the test set as-is to have a reference for TTA
print("Computing baseline test performance (no augmentation)...")

best_model.eval()

# Per-batch results are gathered here and merged after the loop
prob_batches, pred_batches, target_batches = [], [], []

with torch.no_grad():
    for xb, yb in test_aug_loader:
        xb = xb.to(device)

        # Class probabilities come from a softmax over the logits; the hard
        # prediction is the arg-max of the same logits
        logits = best_model(xb)
        probs = torch.softmax(logits, dim=1)
        preds = logits.argmax(dim=1)

        prob_batches.append(probs.cpu().numpy())
        pred_batches.append(preds.cpu().numpy())
        target_batches.append(yb.numpy())

# Merge the batches into single arrays
test_probs_baseline = np.concatenate(prob_batches)
test_preds_baseline = np.concatenate(pred_batches)
test_targets_baseline = np.concatenate(target_batches)

# Reference metrics without any test-time augmentation
baseline_acc = accuracy_score(test_targets_baseline, test_preds_baseline)
baseline_f1 = f1_score(test_targets_baseline, test_preds_baseline, average='weighted')

print(f"\nBaseline Performance (No TTA):")
print(f"  Accuracy: {baseline_acc:.4f}")
print(f"  F1 Score: {baseline_f1:.4f}")

In [None]:
# Define a deterministic horizontal flip for TTA: with p=1.0 the "random"
# flip is applied on every call
horizontal_flip = transforms.Compose([
    transforms.RandomHorizontalFlip(p=1.0)  # Always flip (p=1.0)
])

print("\nApplying Test Time Augmentation with horizontal flip...")

In [None]:
# Step 1: the un-flipped probabilities were already computed by the baseline pass
print("Step 1/3: Using original predictions...")
probs_original = test_probs_baseline

# Step 2: run the same test loader again, flipping every batch horizontally
print("Step 2/3: Computing predictions with horizontal flip...")

best_model.eval()

flipped_batches = []

with torch.no_grad():
    for xb, yb in test_aug_loader:
        xb = xb.to(device)

        # Flip the whole batch, then score it
        xb_flipped = horizontal_flip(xb)
        logits_flipped = best_model(xb_flipped)
        probs_flipped = torch.softmax(logits_flipped, dim=1)

        flipped_batches.append(probs_flipped.cpu().numpy())

# Merge the batches; the loader is not shuffled, so rows line up with the
# baseline probabilities
test_probs_flipped = np.concatenate(flipped_batches)

print(f"Predictions computed for {len(test_probs_flipped)} flipped test images")

In [None]:
# Step 3: ensemble the two views by averaging their class probabilities
print("Step 3/3: Averaging soft labels from both predictions...")

summed_probs = probs_original + test_probs_flipped
probs_tta = summed_probs / 2.0

# The TTA prediction is the most probable class after averaging
preds_tta = probs_tta.argmax(axis=1)

print(f"\nTTA ensemble complete:")
print(f"  Original predictions shape: {probs_original.shape}")
print(f"  Flipped predictions shape:  {test_probs_flipped.shape}")
print(f"  Averaged predictions shape: {probs_tta.shape}")

In [None]:
# Score the TTA predictions against the same targets used for the baseline
# (precision/recall/F1 are class-weighted averages)
weighted = dict(average='weighted')
tta_acc = accuracy_score(test_targets_baseline, preds_tta)
tta_prec = precision_score(test_targets_baseline, preds_tta, **weighted)
tta_rec = recall_score(test_targets_baseline, preds_tta, **weighted)
tta_f1 = f1_score(test_targets_baseline, preds_tta, **weighted)

# Framed report
rule = "="*60
print("\n" + rule)
print("TEST TIME AUGMENTATION RESULTS")
print(rule)
print(f"TTA Accuracy:  {tta_acc:.4f}")
print(f"TTA Precision: {tta_prec:.4f}")
print(f"TTA Recall:    {tta_rec:.4f}")
print(f"TTA F1 Score:  {tta_f1:.4f}")
print(rule)

### 📊 **Performance Comparison: Baseline vs TTA**

In [None]:
# Absolute gains of TTA over the baseline; the table also shows them as a
# percentage of the baseline value
acc_gain = tta_acc - baseline_acc
f1_gain = tta_f1 - baseline_f1

comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score'],
    'Baseline': [f'{baseline_acc:.4f}', f'{baseline_f1:.4f}'],
    'TTA (Horizontal Flip)': [f'{tta_acc:.4f}', f'{tta_f1:.4f}'],
    'Improvement': [
        f'{acc_gain:.4f} ({acc_gain/baseline_acc*100:+.2f}%)',
        f'{f1_gain:.4f} ({f1_gain/baseline_f1*100:+.2f}%)'
    ]
})

# Framed report
rule = "="*80
print("\n")
print(rule)
print("PERFORMANCE COMPARISON: BASELINE VS TTA")
print(rule)
print(comparison_df.to_string(index=False))
print(rule)

In [None]:
# Confusion matrix of the TTA predictions against the test targets
cm_tta = confusion_matrix(test_targets_baseline, preds_tta)

# Heatmap with integer counts in each cell and the class names on both axes
class_names = ['Item', 'Animal']
bold_axis_label = dict(fontsize=12, fontweight='bold')

plt.figure(figsize=(8, 7))
sns.heatmap(cm_tta, annot=True, fmt='d',
            xticklabels=class_names,
            yticklabels=class_names,
            cmap='Greens')
plt.xlabel('Predicted labels', **bold_axis_label)
plt.ylabel('True labels', **bold_axis_label)
plt.title('Confusion Matrix - Test Set with TTA', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 🪄 **Image Retrieval**

In [None]:
# Embedding extractor built on top of an already trained classifier
class EmbeddingModel(nn.Module):
    """Turn a trained classifier into a feature extractor.

    The wrapper works on a deep copy of ``base_model``, so the model passed in
    is left untouched. On the copy, ``classifier_head`` is cut down to its
    first layer only (presumably the Flatten layer -- verify against the base
    model definition), so the output is the flattened feature vector rather
    than class logits.
    """
    def __init__(self, base_model):
        super().__init__()
        # Work on a private copy so the caller's model keeps its full head
        backbone = copy.deepcopy(base_model)
        # Keep only the first layer of the classification head
        first_head_layer = backbone.classifier_head[0]
        backbone.classifier_head = nn.Sequential(first_head_layer)
        self.base_model = backbone

    def forward(self, x):
        # Feature extraction is delegated to the base model's own
        # _forward_features, then passed through the truncated head
        feature_maps = self.base_model._forward_features(x)
        return self.base_model.classifier_head(feature_maps)

# Wrap the best model as a feature extractor, place it on the computing
# device and switch it to evaluation mode
embedding = EmbeddingModel(best_model)
embedding = embedding.to(device)
embedding.eval()

# Show the resulting architecture
print(embedding)

In [None]:
# Define the conversion transform following v2 guidelines
to_tensor = transforms.Compose([
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True)
])

# Index of the query image inside X
index = 10

# Prepare the query image: 8-bit PIL -> float tensor with a leading batch dimension
image_np = X[index]
image_pil = Image.fromarray((image_np * 255).astype(np.uint8))
image_tensor = to_tensor(image_pil).unsqueeze(0).to(device)

# Compute the features of the query image using the embedding model
with torch.no_grad():
    image_features = embedding(image_tensor).cpu().numpy()

# Display the query image
plt.figure(figsize=(4, 4))
plt.imshow(image_np)
plt.xticks([])
plt.yticks([])
plt.show()

# Extract features from the entire dataset, 32 images at a time.
# The per-image conversion lives inside a comprehension so that its loop
# variable does not overwrite the notebook-level `img` / `img_pil` that the
# augmentation demo cells rely on.
all_features = []
with torch.no_grad():
    for start in range(0, len(X), 32):
        batch_tensor = torch.stack([
            to_tensor(Image.fromarray((sample * 255).astype(np.uint8)))
            for sample in X[start:start + 32]
        ]).to(device)
        all_features.append(embedding(batch_tensor).cpu().numpy())

dataset_features = np.concatenate(all_features, axis=0)

# Mean squared difference between the query features and every dataset
# feature vector (image_features broadcasts over the dataset rows)
distances = np.mean(np.square(dataset_features - image_features), axis=-1)

# Sort images by increasing distance (most similar first); the query image is
# itself part of X, so it is expected to rank first
ordered_indices = distances.argsort()
ordered_images = X[ordered_indices]

# Display the top 10 most similar images
num_img = 10
fig, axes = plt.subplots(1, num_img, figsize=(20, 20))
for ax, similar_image in zip(axes, ordered_images[:num_img]):
    ax.imshow(similar_image)
    ax.set_xticks([])
    ax.set_yticks([])
plt.tight_layout()
plt.show()

### **Use Pretrained Models as Image Search Engine**

In [None]:
# Load EfficientNetB0 with its default pretrained weights
pretrained_weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
pretrained_model = torchvision.models.efficientnet_b0(weights=pretrained_weights)

# Replace the classifier with an identity so the network outputs features,
# then place it on the computing device in evaluation mode
pretrained_model.classifier = nn.Identity()
pretrained_model = pretrained_model.to(device)
pretrained_model.eval()

# Print the layer-by-layer summary and render the computation graph
summary(pretrained_model, input_size=input_shape)
graph_input_size = (BATCH_SIZE,) + input_shape
model_graph = draw_graph(pretrained_model, input_size=graph_input_size, expand_nested=True, depth=6)
model_graph.visual_graph

In [None]:
# Get the preprocessing transforms bundled with the selected EfficientNetB0
# weights and print them for inspection
efficientnet_transform = pretrained_weights.transforms()
print(efficientnet_transform)

In [None]:
# Index of the query image inside X
index = 10

# Prepare the query image: 8-bit PIL -> EfficientNet preprocessing -> batch of one
image_np = X[index]
image_pil = Image.fromarray((image_np * 255).astype(np.uint8))
image_tensor = efficientnet_transform(image_pil).unsqueeze(0).to(device)

# Compute the features of the query image using the pre-trained EfficientNetB0 model
with torch.no_grad():
    image_features = pretrained_model(image_tensor).cpu().numpy()

# Display the selected query image
plt.figure(figsize=(4, 4))
plt.imshow(image_np)
plt.xticks([])
plt.yticks([])
plt.show()

# Extract features from all images in the dataset, 32 images at a time.
# The per-image preprocessing lives inside a comprehension so that its loop
# variable does not overwrite the notebook-level `img` / `img_pil` that the
# augmentation demo cells rely on.
all_features = []
with torch.no_grad():
    for start in range(0, len(X), 32):
        batch_tensor = torch.stack([
            efficientnet_transform(Image.fromarray((sample * 255).astype(np.uint8)))
            for sample in X[start:start + 32]
        ]).to(device)
        all_features.append(pretrained_model(batch_tensor).cpu().numpy())

# Concatenate features from all batches into a single array
dataset_features = np.concatenate(all_features, axis=0)

# Mean squared difference between the query features and every dataset
# feature vector. This is the squared Euclidean distance divided by the
# feature dimension, so the resulting ranking is the same.
distances = np.mean(np.square(dataset_features - image_features), axis=-1)

# Sort the dataset images by increasing distance (smallest distance means
# most similar); the query image is itself part of X, so it is expected to
# rank first
ordered_indices = distances.argsort()
ordered_images = X[ordered_indices]

# Display the top 10 most similar images to the query image
num_img = 10
fig, axes = plt.subplots(1, num_img, figsize=(20, 20))
for ax, similar_image in zip(axes, ordered_images[:num_img]):
    ax.imshow(similar_image)
    ax.set_xticks([])
    ax.set_yticks([])
plt.tight_layout()
plt.show()

#  
<img src="https://airlab.deib.polimi.it/wp-content/uploads/2019/07/airlab-logo-new_cropped.png" width="350">

##### Connect with us:
- <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/81/LinkedIn_icon.svg/2048px-LinkedIn_icon.svg.png" width="14"> **LinkedIn:**  [AIRLab Polimi](https://www.linkedin.com/company/airlab-polimi/)
- <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/9/95/Instagram_logo_2022.svg/800px-Instagram_logo_2022.svg.png" width="14"> **Instagram:** [airlab_polimi](https://www.instagram.com/airlab_polimi/)

##### Contributors:
- **Eugenio Lomurno**: eugenio.lomurno@polimi.it
- **Alberto Archetti**: alberto.archetti@polimi.it
- **Roberto Basla**: roberto.basla@polimi.it
- **Carlo Sgaravatti**: carlo.sgaravatti@polimi.it

```
   Copyright 2025 Eugenio Lomurno, Alberto Archetti, Roberto Basla, Carlo Sgaravatti

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
```