# **Artificial Neural Networks and Deep Learning**

---

## **Lecture 8: Object Localisation and Class Activation Maps**

<img src="https://drive.google.com/uc?export=view&id=14qXmXmQHVwDxXJ3DiVhNmMOcnpA6QMiq" width="500"/>

## 🌐 **Google Drive Connection**

In [None]:
# Mount Google Drive in the Colab runtime and switch to the lecture folder
from google.colab import drive
drive.mount("/gdrive")
# Spaces are backslash-escaped inside the string because the value is
# substituted verbatim into the %cd line magic below.
current_dir = "/gdrive/My\\ Drive/Colab\\ Notebooks/[2025-2026]\\ AN2DL/Lecture\\ 8"
%cd $current_dir

## ⚙️ **Libraries Import**

In [None]:
# Set seed for reproducibility
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import logging
import random
import numpy as np

# Set seeds for random number generators
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch
torch.manual_seed(SEED)
from torch import nn
from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter
import torchvision
from torchvision.transforms import v2 as transforms
from torch.utils.data import TensorDataset, DataLoader, Dataset

# Install and import torchview for model visualisation
!pip install -q torchview
from torchview import draw_graph

# Configure device and set seeds for CUDA if available
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    # Ask cuDNN for deterministic kernels and disable its autotuner so that
    # repeated runs with the same seed are more reproducible
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    device = torch.device("cpu")

# Setup directories for models and logs
logs_dir = "tensorboard"  # root folder for the TensorBoard runs written during training
# Stop any TensorBoard process left over from a previous session before loading the extension
!pkill -f tensorboard
%load_ext tensorboard
# Checkpoints are saved under ./models by the training loops
!mkdir -p models

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import cv2
import csv
import scipy
from PIL import Image
from xml.dom import minidom
import pandas as pd
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Configure plot display settings
sns.set(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline

## 📥 **Download Dataset**

In [None]:
# Download training images if not already present
# The archive name and Drive identifier are exported as environment variables
# so that the `!` shell commands below can reference them as ${...}.
os.environ["TRAINING_DATASET_NAME"] = "cats_dogs_images_train.zip"
# NOTE(review): despite the _URL suffix this looks like a Google Drive file ID — confirm
os.environ["TRAINING_DATASET_URL"] = "1_fGNrYZxs0yzIJQfUmUWHrWnRisVEYaY"

# The zip file itself is the "already downloaded" marker: extraction only
# happens in the same branch as the download.
if not os.path.exists(os.environ["TRAINING_DATASET_NAME"]):
    print("Downloading training images...")
    !gdown -q ${TRAINING_DATASET_URL} -O ${TRAINING_DATASET_NAME}
    print("Training images downloaded!")

    print("Unzipping training images...")
    !unzip -q -o ${TRAINING_DATASET_NAME}
    print("Training images unzipped!")
else:
    print("Training images already available.")

In [None]:
# Download bounding boxes annotations if not already present
# Name and Drive identifier are exported as environment variables so that the
# `!gdown` shell command below can reference them as ${...}.
os.environ["TRAINING_DATASET_BOUNDING_BOXES_NAME"] = "cats_dogs_images_boxes.csv"
# NOTE(review): despite the _URL suffix this looks like a Google Drive file ID — confirm
os.environ["TRAINING_DATASET_BOUNDING_BOXES_URL"] = "1visBcJA_F9oUOAOTNq6R-MTzkFBXa2LY"

# The CSV is downloaded as-is (no extraction step); skip if it already exists
if not os.path.exists(os.environ["TRAINING_DATASET_BOUNDING_BOXES_NAME"]):
    print("Downloading bounding boxes annotations...")
    !gdown -q ${TRAINING_DATASET_BOUNDING_BOXES_URL} -O ${TRAINING_DATASET_BOUNDING_BOXES_NAME}
    print("Bounding boxes annotations downloaded!")
else:
    print("Bounding boxes annotations already available.")

In [None]:
# Download test images if not already present
# Name and Drive identifier are exported as environment variables so that the
# `!` shell commands below can reference them as ${...}.
os.environ["TEST_DATASET_NAME"] = "cats_dogs_images_test.zip"
# NOTE(review): despite the _URL suffix this looks like a Google Drive file ID — confirm
os.environ["TEST_DATASET_URL"] = "1RFJwHLkLdj3RVq-xkYtP_8uLkj5K-obn"

# The zip file itself is the "already downloaded" marker: extraction only
# happens in the same branch as the download.
if not os.path.exists(os.environ["TEST_DATASET_NAME"]):
    print("Downloading test images...")
    !gdown -q ${TEST_DATASET_URL} -O ${TEST_DATASET_NAME}
    print("Test images downloaded!")

    print("Unzipping test images...")
    !unzip -q -o ${TEST_DATASET_NAME}
    print("Test images unzipped!")
else:
    print("Test images already available.")

In [None]:
# Download test images (multiple targets) if not already present
# Name and Drive identifier are exported as environment variables so that the
# `!` shell commands below can reference them as ${...}.
os.environ["TEST_MULTIPLE_DATASET_NAME"] = "multiple_cats_dogs_images_test.zip"
# NOTE(review): despite the _URL suffix this looks like a Google Drive file ID — confirm
os.environ["TEST_MULTIPLE_DATASET_URL"] = "198qHfig8EwdbSmO1ubUiaHrrgVaFf8gx"

# The zip file itself is the "already downloaded" marker: extraction only
# happens in the same branch as the download.
if not os.path.exists(os.environ["TEST_MULTIPLE_DATASET_NAME"]):
    print("Downloading test (multiple targets) images...")
    !gdown -q ${TEST_MULTIPLE_DATASET_URL} -O ${TEST_MULTIPLE_DATASET_NAME}
    print("Test images downloaded!")

    print("Unzipping test (multiple targets) images...")
    !unzip -q -o ${TEST_MULTIPLE_DATASET_NAME}
    print("Test images (multiple targets) unzipped!")
else:
    print("Test images (multiple targets) already available.")

<img src="https://drive.google.com/uc?export=view&id=15T4O0D_r2AF3M1FzHaqf1z2y5NXV43t2" width="900"/>

## ⚙️ **Configuration**

In [None]:
# Per-channel mean and standard deviation used to normalise inputs for the
# ImageNet pre-trained backbones
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Hyperparameters shared by the training runs in this notebook
IMG_SIZE = 256          # side length of the square images, in pixels
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
PATIENCE = 10           # early-stopping patience, in epochs
EPOCHS = 1000           # upper bound on the number of epochs

# Single summary block; the emitted text is one line per setting
print(
    "Configuration:\n"
    f"  Image size: {IMG_SIZE}x{IMG_SIZE}\n"
    f"  Batch size: {BATCH_SIZE}\n"
    f"  Learning rate: {LEARNING_RATE}\n"
    f"  Patience: {PATIENCE}\n"
    f"  Max epochs: {EPOCHS}"
)

## 🔧 **Data Loading Functions**

In [None]:
def load_images_from_folder(folder, img_dim):
    """
    Read every decodable image in a folder as a square RGB array.

    Files are visited in sorted filename order. Each image is centre-cropped
    to a square, resized to ``img_dim`` x ``img_dim`` and converted from
    OpenCV's BGR channel order to RGB. Files that OpenCV cannot decode are
    skipped silently.

    Args:
        folder: Path to folder containing images
        img_dim: Side length of the output square images

    Returns:
        Numpy array stacking all preprocessed images
    """
    loaded = []

    for name in sorted(os.listdir(folder)):
        frame = cv2.imread(os.path.join(folder, name))
        if frame is None:
            # Not an image (or unreadable): ignore the file
            continue

        # Largest centred square that fits inside the frame
        side = min(frame.shape[:-1])
        rows = slice((frame.shape[0] - side) // 2, (frame.shape[0] + side) // 2)
        cols = slice((frame.shape[1] - side) // 2, (frame.shape[1] + side) // 2)
        square = frame[rows, cols]

        # Fixed-size RGB output
        resized = cv2.resize(square, (img_dim, img_dim))
        loaded.append(cv2.cvtColor(resized, cv2.COLOR_BGR2RGB))

    return np.array(loaded)


def preprocess_dataset(image_dir='cats_dogs_images', image_size=(256, 256),
                       csv_path='cats_dogs_images_boxes.csv'):
    """
    Preprocess dataset with bounding box annotations.

    Each CSV row is read positionally: column 0 is the image file name,
    column 3 the integer class label and columns 4-7 the box corners
    (x1, y1, x2, y2) in pixels of the original image. Boxes are normalised
    by the original width/height, so they stay valid after the image is
    resized to ``image_size``.

    Args:
        image_dir: Directory containing images
        image_size: Target image size (width, height)
        csv_path: Path to the CSV file holding labels and bounding boxes

    Returns:
        Tuple of (labels, bounding_boxes, images) as numpy arrays

    Raises:
        ValueError: If no image listed in the CSV could be loaded
    """
    df = pd.read_csv(csv_path)

    def process_row(row):
        """Process a single row from the CSV file."""
        img_path = row[0]
        label = int(row[3])
        x1, y1, x2, y2 = map(float, row[4:8])

        # Load image; rows pointing to missing/unreadable files are dropped
        img = cv2.imread(os.path.join(image_dir, img_path))
        if img is None:
            return None

        # Normalise bounding box coordinates to [0, 1] using original dimensions
        orig_h, orig_w = img.shape[:2]
        bbox = [x1 / orig_w, y1 / orig_h, x2 / orig_w, y2 / orig_h]

        # Resize image and convert colour space (OpenCV loads BGR)
        img = cv2.resize(img, image_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        return label, bbox, img

    # Process all rows in parallel (executor.map preserves row order)
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_row, df.itertuples(index=False)))

    # Filter out rows whose image could not be read
    results = [r for r in results if r is not None]
    if not results:
        # Without this guard zip(*results) fails with an opaque unpacking error
        raise ValueError(
            f"No images could be loaded from '{image_dir}' using '{csv_path}'"
        )

    labels, boxes, img_list = zip(*results)

    return np.array(labels), np.array(boxes), np.array(img_list)


print("Data loading functions defined.")

## 📦 **Dataset Class**

In [None]:
class CustomDataset(Dataset):
    """
    Custom Dataset for object localisation and classification.

    Supports both regression (bounding boxes) and classification (labels).
    When bounding boxes and an augmentation are both supplied, the image and
    its box are transformed together so geometric augmentations (e.g. a
    horizontal flip) keep the box aligned with the image content.
    """

    def __init__(self, images, labels=None, boxes=None, augmentation=None):
        """
        Initialise the dataset.

        Args:
            images: Numpy array of images, indexed as (N, H, W, C)
            labels: Optional numpy array of class labels
            boxes: Optional numpy array of bounding boxes, normalised
                (x1, y1, x2, y2) per sample
            augmentation: Optional torchvision transforms for data augmentation.
                When ``boxes`` is given this must be a ``torchvision.transforms.v2``
                transform, because it is called with (image, boxes).
        """
        # Convert images to channel-first float tensors scaled to [0, 1]
        self.images = torch.from_numpy(images).float().permute(0, 3, 1, 2) / 255.0

        # Convert labels and boxes to tensors if provided
        self.labels = torch.from_numpy(labels).long() if labels is not None else None
        self.boxes = torch.from_numpy(boxes).float() if boxes is not None else None

        self.augmentation = augmentation

        # ImageNet normalisation for pre-trained models
        self.normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)

    def __len__(self):
        """Return the total number of samples."""
        return len(self.images)

    def _augment_with_box(self, img, box):
        """Apply the augmentation jointly to an image and its normalised box."""
        # Local import: torchvision is already a dependency of this file
        from torchvision import tv_tensors

        # v2 transforms work on pixel coordinates, so de-normalise first
        height, width = img.shape[-2:]
        to_pixels = torch.tensor([width, height, width, height], dtype=box.dtype)
        wrapped_box = tv_tensors.BoundingBoxes(
            (box * to_pixels).unsqueeze(0),
            format="XYXY",
            canvas_size=(height, width),
        )

        aug_img, aug_box = self.augmentation(tv_tensors.Image(img), wrapped_box)

        # Back to plain tensors and normalised coordinates (the canvas may
        # have been resized by the augmentation)
        aug_img = aug_img.as_subclass(torch.Tensor)
        out_h, out_w = aug_img.shape[-2:]
        to_unit = torch.tensor([out_w, out_h, out_w, out_h], dtype=box.dtype)
        aug_box = aug_box.as_subclass(torch.Tensor).squeeze(0) / to_unit

        return aug_img, aug_box

    def __getitem__(self, idx):
        """
        Get a single sample.

        Args:
            idx: Index of the sample

        Returns:
            The normalised image alone, or a tuple (image[, label][, box])
            containing whichever of labels/boxes were provided
        """
        img = self.images[idx].clone()
        box = self.boxes[idx] if self.boxes is not None else None

        # Apply augmentation if provided; with boxes present the box must be
        # transformed together with the image, otherwise a flipped image
        # would be paired with an unflipped target.
        if self.augmentation:
            if box is None:
                img = self.augmentation(img)
            else:
                img, box = self._augment_with_box(img, box)

        # Apply ImageNet normalisation
        img = self.normalize(img)

        # Build return tuple based on available data
        items = [img]
        if self.labels is not None:
            items.append(self.labels[idx])
        if box is not None:
            items.append(box)

        # Return tuple if multiple items, else single item
        return tuple(items) if len(items) > 1 else items[0]


print("CustomDataset class defined.")

## 📊 **Load and Prepare Data**

In [None]:
# Preprocess dataset with bounding boxes and labels
print("Processing images and annotations...")
labels, boxes, img_list = preprocess_dataset()

# Shuffle the data
# The three arrays are zipped into per-sample triples so that one shuffle keeps
# image, box and label aligned; `random` was seeded with SEED at import time.
combined = list(zip(img_list, boxes, labels))
random.shuffle(combined)
img_list, boxes, labels = zip(*combined)
# zip(*...) yields tuples, so convert back to numpy arrays
img_list, boxes, labels = np.array(img_list), np.array(boxes), np.array(labels)

print(f"Total images loaded: {len(img_list)}")
print(f"Image shape: {img_list[0].shape}")
print(f"Number of classes: {len(np.unique(labels))}")

In [None]:
# Map each integer class index to its readable name
num_to_labels = {0: 'cat', 1: 'dog'}

# Report how many samples (and which share of the dataset) fall in each class
print("\nClass distribution:")
for class_idx in num_to_labels:
    class_name = num_to_labels[class_idx]
    count = np.sum(labels == class_idx)
    print(f"  {class_name}: {count} images ({count/len(labels)*100:.1f}%)")

In [None]:
def visualize_samples_with_boxes(images, labels, boxes, class_names, num_samples=6):
    """
    Visualise sample images with bounding boxes and labels.

    Samples are drawn at random without replacement and laid out on a grid
    with three columns; the number of rows grows with ``num_samples`` and any
    unused cell is hidden. With the default of 6 samples the layout is 2x3.

    Args:
        images: Array of images, each indexed as (H, W, C)
        labels: Array of class labels
        boxes: Array of bounding boxes (normalised x1, y1, x2, y2)
        class_names: Dictionary mapping class indices to names
        num_samples: Number of samples to display (capped at len(images))
    """
    num_samples = min(num_samples, len(images))
    if num_samples <= 0:
        # Nothing to draw
        return

    # Select random samples
    indices = np.random.choice(len(images), num_samples, replace=False)

    # Size the grid from the sample count instead of assuming exactly 6 cells
    n_cols = 3
    n_rows = -(-num_samples // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    plt.suptitle("Sample Images with Bounding Boxes",
                fontsize=18, fontweight='bold')

    for ax, idx in zip(axes, indices):
        # Copy so the rectangle is not drawn onto the caller's array
        img = images[idx].copy()
        label = labels[idx]
        h, w, _ = img.shape

        # Draw bounding box (green), scaling normalised corners to pixels
        x1, y1, x2, y2 = boxes[idx]
        cv2.rectangle(img,
                     (int(x1*w), int(y1*h)),
                     (int(x2*w), int(y2*h)),
                     (0, 255, 0), 3)

        # Display image
        ax.imshow(img)
        ax.set_title(f"{class_names[label]}",
                    fontsize=14, fontweight='bold')
        ax.axis('off')

    # Hide grid cells that received no image
    for ax in axes[num_samples:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()


# Visualise sample images with bounding boxes
# (random samples are drawn inside the function via np.random)
print("Visualising sample images with bounding boxes...")
visualize_samples_with_boxes(img_list, labels, boxes, num_to_labels, num_samples=6)

## ✂️ **Split Dataset**

In [None]:
# Hold out 20% of the samples for validation; images, labels and boxes are
# split together so they stay aligned, and class proportions are preserved
print("\nSplitting data...")
(X_train, X_val,
 y_train, y_val,
 box_train, box_val) = train_test_split(
    img_list,
    labels,
    boxes,
    stratify=labels,
    random_state=SEED,
    test_size=0.2,
)

print("Dataset split complete:")
print(f"  Training:   {len(X_train)} images")
print(f"  Validation: {len(X_val)} images")

## 🎨 **Create DataLoaders**

In [None]:
# Define augmentation for training
# Only a random horizontal flip (50% probability) is used; validation data is
# left un-augmented.
train_augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5)
])

print("Creating DataLoaders...")

In [None]:
# Create DataLoaders for bounding box regression
# NOTE(review): train_augmentation contains a horizontal flip; the box targets
# must be mirrored together with the image for the regression targets to stay
# valid — confirm CustomDataset transforms image and box jointly.
train_box_loader = DataLoader(
    CustomDataset(X_train, boxes=box_train, augmentation=train_augmentation),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

# Validation: no augmentation and a fixed order
val_box_loader = DataLoader(
    CustomDataset(X_val, boxes=box_val),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

print(f"Regression DataLoaders created:")
print(f"  Training batches:   {len(train_box_loader)}")
print(f"  Validation batches: {len(val_box_loader)}")

In [None]:
# Create DataLoaders for classification
# Same images as the regression loaders, but each sample carries a class label
# instead of a bounding box.
train_cls_loader = DataLoader(
    CustomDataset(X_train, labels=y_train, augmentation=train_augmentation),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

# Validation: no augmentation and a fixed order
val_cls_loader = DataLoader(
    CustomDataset(X_val, labels=y_val),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

print(f"\nClassification DataLoaders created:")
print(f"  Training batches:   {len(train_cls_loader)}")
print(f"  Validation batches: {len(val_cls_loader)}")

## 📏 **Metrics**

In [None]:
def spearman_rho(predictions, targets):
    """
    Compute Spearman's rank correlation coefficient.

    Spearman's rho measures the monotonic relationship between
    predicted and target values, suitable for regression evaluation.
    Both inputs are flattened to 1D before ranking. Tied values receive the
    average of the ranks they span, which is the standard definition; without
    it, ties would be ordered arbitrarily and inflate or deflate the score.

    Args:
        predictions: Predicted values (tensor or numpy array)
        targets: Ground truth values (tensor or numpy array)

    Returns:
        Zero-dimensional tensor holding Spearman's rho (0 when either input
        is constant, because of the epsilon in the denominator)
    """
    # Convert to tensors if needed
    if isinstance(predictions, np.ndarray):
        predictions = torch.from_numpy(predictions)
    if isinstance(targets, np.ndarray):
        targets = torch.from_numpy(targets)

    # Flatten to 1D
    predictions = predictions.float().flatten()
    targets = targets.float().flatten()

    def rank(x):
        """Compute 1-based ranks, assigning tied elements their average rank."""
        _, inverse, counts = torch.unique(
            x, sorted=True, return_inverse=True, return_counts=True
        )
        # For each distinct value: first and last ordinal rank of its tie group
        last = torch.cumsum(counts, dim=0)
        first = last - counts + 1
        average = (first + last).float() / 2
        return average[inverse]

    rank_pred = rank(predictions)
    rank_target = rank(targets)

    # Compute Pearson correlation of ranks
    diff_pred = rank_pred - torch.mean(rank_pred)
    diff_target = rank_target - torch.mean(rank_target)

    covariance = torch.mean(diff_pred * diff_target)
    std_pred = torch.sqrt(torch.mean(diff_pred ** 2))
    std_target = torch.sqrt(torch.mean(diff_target ** 2))

    # Epsilon avoids division by zero when one side has no rank variation
    return covariance / (std_pred * std_target + 1e-8)


print("Spearman's Rho metric function defined.")

## 🏗️ **Build Models**

### **Bounding Box Regressor**

In [None]:
class BoxRegressorModel(nn.Module):
    """
    EfficientNetB0-based regressor for a single bounding box.

    The ImageNet pre-trained backbone is frozen and its classification head
    is swapped for a dropout + linear layer emitting 4 values, interpreted as
    the normalised corners (x1, y1, x2, y2).
    """

    def __init__(self, dropout_rate=0.5):
        """
        Build the frozen backbone and the trainable regression head.

        Args:
            dropout_rate: Dropout probability applied before the linear head
        """
        super().__init__()

        # Pre-trained EfficientNetB0
        pretrained = torchvision.models.EfficientNet_B0_Weights.IMAGENET1K_V1
        self.backbone = torchvision.models.efficientnet_b0(weights=pretrained)

        # Freeze every existing parameter: the backbone acts as a fixed
        # feature extractor
        self.backbone.requires_grad_(False)

        # Width of the features entering the original classification layer
        head_inputs = self.backbone.classifier[1].in_features

        # New head created after freezing, so its parameters remain trainable
        regression_head = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(head_inputs, 4)
        )
        self.backbone.classifier = regression_head

    def forward(self, x):
        """
        Predict one bounding box per input image.

        Args:
            x: Input tensor of shape (batch, 3, height, width)

        Returns:
            Predicted bounding boxes of shape (batch, 4)
        """
        predicted_boxes = self.backbone(x)
        return predicted_boxes


print("BoxRegressorModel defined.")

In [None]:
# Instantiate bounding box regressor
box_model = BoxRegressorModel().to(device)

print("\nBox Regressor Model Summary:")
print("="*60)
summary(box_model, input_size=(3, IMG_SIZE, IMG_SIZE))
print("="*60)

In [None]:
# Visualise model architecture
model_graph = draw_graph(
    box_model,
    input_size=(BATCH_SIZE, 3, IMG_SIZE, IMG_SIZE),
    expand_nested=True,
    depth=6
)
model_graph.visual_graph

In [None]:
def train_box_epoch(model, loader, criterion, optimizer, scaler):
    """
    Run one training epoch of the bounding box regressor.

    Args:
        model: Regressor model
        loader: Training DataLoader yielding (image, box) batches
        criterion: Loss function (MSE)
        optimizer: Optimiser
        scaler: Mixed precision gradient scaler

    Returns:
        Training loss averaged over the samples of the dataset
    """
    model.train()

    # Mixed precision is only switched on when running on CUDA
    use_amp = device.type == 'cuda'
    weighted_loss = 0.0

    for images, targets in loader:
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad(set_to_none=True)

        # Forward pass (autocast region)
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(images)
            batch_loss = criterion(outputs, targets)

        # Scaled backward pass, optimiser step, then scale-factor update
        scaler.scale(batch_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight by batch size so a smaller final batch is averaged correctly
        weighted_loss += batch_loss.item() * images.size(0)

    return weighted_loss / len(loader.dataset)


def val_box_epoch(model, loader, criterion):
    """
    Evaluate the bounding box regressor on a validation loader.

    Args:
        model: Regressor model
        loader: Validation DataLoader yielding (image, box) batches
        criterion: Loss function (MSE)

    Returns:
        Tuple of (average_loss, spearman_correlation)
    """
    model.eval()

    use_amp = device.type == 'cuda'
    weighted_loss = 0.0
    pred_chunks, target_chunks = [], []

    with torch.no_grad():
        for images, targets in loader:
            images = images.to(device)
            targets = targets.to(device)

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(images)
                batch_loss = criterion(outputs, targets)

            weighted_loss += batch_loss.item() * images.size(0)

            # Keep every batch so the correlation covers the full set
            pred_chunks.append(outputs.cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

    # Spearman's rho over all predictions and targets at once
    rho = spearman_rho(np.concatenate(pred_chunks), np.concatenate(target_chunks))
    mean_loss = weighted_loss / len(loader.dataset)

    return mean_loss, float(rho)

def fit_box_regressor(model, train_loader, val_loader, epochs, criterion, optimizer, scaler,
                      patience=10, experiment_name="box_regressor"):
    """
    Complete training loop for bounding box regressor with early stopping.

    The model is checkpointed whenever the validation loss improves; training
    stops after ``patience`` consecutive epochs without improvement, and the
    best checkpoint is restored before returning. If no epoch ever improves
    (e.g. ``epochs=0`` or a NaN loss), the model is returned as-is.

    Args:
        model: Regressor model
        train_loader: Training DataLoader
        val_loader: Validation DataLoader
        epochs: Maximum number of epochs
        criterion: Loss function
        optimizer: Optimiser
        scaler: Mixed precision scaler
        patience: Early stopping patience
        experiment_name: Name for saving model and logs

    Returns:
        Tuple of (trained_model, history_dict)
    """
    checkpoint_path = f"models/{experiment_name}_best.pt"
    # Do not rely on the directory having been created by an earlier cell
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    # Initialise history
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_spearman': []
    }

    best_loss = float('inf')
    patience_counter = 0
    best_epoch = 0

    print(f"Starting training: {experiment_name}")
    print("="*60)

    writer = SummaryWriter(f"./{logs_dir}/{experiment_name}")
    try:
        for epoch in range(1, epochs + 1):
            # Training phase
            train_loss = train_box_epoch(model, train_loader, criterion, optimizer, scaler)

            # Validation phase
            val_loss, val_spearman = val_box_epoch(model, val_loader, criterion)

            # Record history
            history['train_loss'].append(train_loss)
            history['val_loss'].append(val_loss)
            history['val_spearman'].append(val_spearman)

            # Log to TensorBoard
            writer.add_scalar('Loss/Train', train_loss, epoch)
            writer.add_scalar('Loss/Val', val_loss, epoch)
            writer.add_scalar('Spearman/Val', val_spearman, epoch)

            # Print progress
            if epoch % 5 == 0 or epoch == 1:
                print(f"Epoch {epoch:3d}/{epochs} | "
                      f"Train MSE: {train_loss:.4f} | "
                      f"Val MSE: {val_loss:.4f} | "
                      f"Val Spearman: {val_spearman:.4f}")

            # Early stopping logic
            if val_loss < best_loss:
                best_loss = val_loss
                best_epoch = epoch
                torch.save(model.state_dict(), checkpoint_path)
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"\nEarly stopping triggered at epoch {epoch}")
                    break
    finally:
        # Flush and release the event file even if training is interrupted
        writer.close()

    print("="*60)
    print("Training finished!")

    if best_epoch == 0:
        # No checkpoint was written, so there is nothing to restore
        print("No improving epoch was recorded; model weights left unchanged.")
        return model, history

    # Restore best weights (map_location keeps this working across devices)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print(f"Best model restored from epoch {best_epoch}")
    print(f"  Val MSE: {best_loss:.4f}")
    print(f"  Val Spearman: {history['val_spearman'][best_epoch-1]:.4f}")

    return model, history


print("Regression training functions defined.")

### 🚀 **Train Bounding Box Regressor**

In [None]:
%%time
# Setup training for bounding box regressor
box_optimizer = torch.optim.Adam(box_model.parameters(), lr=LEARNING_RATE)
box_criterion = nn.MSELoss()
# Gradient scaling is only active on CUDA; on CPU the scaler is a pass-through
scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))

# Train the model
# fit_box_regressor returns the model with its best-validation weights restored
box_model, box_history = fit_box_regressor(
    model=box_model,
    train_loader=train_box_loader,
    val_loader=val_box_loader,
    epochs=EPOCHS,
    criterion=box_criterion,
    optimizer=box_optimizer,
    scaler=scaler,
    patience=PATIENCE,
    experiment_name="efficientnet_box_regressor"
)

### 📈 **Plot Regression Results**

In [None]:
# Plot training history for bounding box regressor
# Left panel: MSE loss curves; right panel: validation Spearman's rho
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))

# Plot MSE Loss
# Training curve is drawn faint and dashed so the validation curve stands out
ax1.plot(box_history['train_loss'], alpha=0.3, color='#4D61E2', linestyle='--', label='Training')
ax1.plot(box_history['val_loss'], alpha=0.8, color='#4D61E2', label='Validation')
ax1.set_title('Bounding Box Regression Loss (MSE)', fontsize=14, fontweight='bold')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend(loc='upper right')
ax1.grid(alpha=0.3)

# Plot Spearman's Rho
ax2.plot(box_history['val_spearman'], alpha=0.8, color='#408537', label='Validation Spearman')
ax2.set_title("Spearman's Rho Correlation", fontsize=14, fontweight='bold')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Rho')
ax2.legend(loc='lower right')
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

### 👁️ **Visualise Bounding Box Predictions**

In [None]:
# Visualise predictions on validation set
box_model.eval()

# Get a batch from validation loader
# (the loader is not shuffled, so this is always the first validation batch)
val_iter = iter(val_box_loader)
imgs, gt_boxes = next(val_iter)
imgs = imgs.to(device)

# Generate predictions
with torch.no_grad():
    pred_boxes = box_model(imgs).cpu().numpy()

# Visualise 10 sample predictions
# (fewer if the batch holds fewer than 10 images), on a 2x5 grid
plt.figure(figsize=(20, 10))

for i in range(min(10, len(imgs))):
    ax = plt.subplot(2, 5, i + 1)

    # Denormalise image for visualisation
    # Undo the ImageNet normalisation applied by the dataset: x * std + mean
    img_np = imgs[i].cpu().permute(1, 2, 0).numpy()
    img_np = img_np * np.array(IMAGENET_STD) + np.array(IMAGENET_MEAN)
    img_np = np.clip(img_np, 0, 1)

    # Convert to uint8 for drawing
    img_copy = (img_np * 255).astype(np.uint8).copy()
    h, w, _ = img_copy.shape

    # Draw ground truth bounding box (green)
    # Boxes are normalised, so scale them by the image width/height
    gx1, gy1, gx2, gy2 = gt_boxes[i].numpy()
    cv2.rectangle(img_copy,
                  (int(gx1*w), int(gy1*h)),
                  (int(gx2*w), int(gy2*h)),
                  (0, 255, 0), 4)

    # Draw predicted bounding box (red)
    px1, py1, px2, py2 = pred_boxes[i]
    cv2.rectangle(img_copy,
                  (int(px1*w), int(py1*h)),
                  (int(px2*w), int(py2*h)),
                  (255, 0, 0), 4)

    ax.imshow(img_copy)
    ax.axis('off')

plt.suptitle("Bounding Box Predictions on Validation Set (Green: Ground Truth | Red: Prediction)",
            fontsize=18, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

### **Classifier**

In [None]:
class ClassifierModel(nn.Module):
    """
    EfficientNetB0-based image classifier.

    The ImageNet pre-trained backbone starts frozen and its head is replaced
    by a dropout + linear layer producing one logit per class. In this
    notebook the classes are cat (0) and dog (1).
    """

    def __init__(self, num_classes=2, dropout_rate=0.5):
        """
        Build the frozen backbone and the trainable classification head.

        Args:
            num_classes: Number of output classes
            dropout_rate: Dropout probability applied before the linear head
        """
        super().__init__()

        # Pre-trained EfficientNetB0
        pretrained = torchvision.models.EfficientNet_B0_Weights.IMAGENET1K_V1
        self.backbone = torchvision.models.efficientnet_b0(weights=pretrained)

        # Freeze every existing parameter for the initial training phase
        self.backbone.requires_grad_(False)

        # Width of the features entering the original classification layer
        head_inputs = self.backbone.classifier[1].in_features

        # New head created after freezing, so its parameters remain trainable
        classification_head = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(head_inputs, num_classes)
        )
        self.backbone.classifier = classification_head

    def forward(self, x):
        """
        Compute class logits for a batch of images.

        Args:
            x: Input tensor of shape (batch, 3, height, width)

        Returns:
            Class logits of shape (batch, num_classes)
        """
        logits = self.backbone(x)
        return logits


print("ClassifierModel defined.")

In [None]:
# Instantiate classifier
cls_model = ClassifierModel().to(device)

print("\nClassifier Model Summary:")
print("="*60)
summary(cls_model, input_size=(3, IMG_SIZE, IMG_SIZE))
print("="*60)

In [None]:
# Visualise classifier architecture
model_graph = draw_graph(
    cls_model,
    input_size=(BATCH_SIZE, 3, IMG_SIZE, IMG_SIZE),
    expand_nested=True,
    depth=6
)
model_graph.visual_graph

In [None]:
def train_cls_epoch(model, loader, criterion, optimizer, scaler):
    """
    Train classifier for one epoch.

    Accuracy is computed from the total number of correct predictions over
    the whole epoch, so a smaller final batch does not get the same weight
    as a full one (as happens when averaging per-batch accuracies).

    Args:
        model: Classifier model
        loader: Training DataLoader yielding (image, label) batches
        criterion: Loss function (CrossEntropy)
        optimizer: Optimiser
        scaler: Mixed precision scaler

    Returns:
        Tuple of (average_loss, accuracy), both averaged per sample
    """
    model.train()
    loss_sum = 0.0
    correct = 0

    for img, lbl in loader:
        img, lbl = img.to(device), lbl.to(device)

        optimizer.zero_grad(set_to_none=True)

        # Forward pass with mixed precision (enabled on CUDA only)
        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            out = model(img)
            loss = criterion(out, lbl)

        # Backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight the batch loss by its size; count correct predictions
        loss_sum += loss.item() * img.size(0)
        correct += (out.argmax(dim=1) == lbl).sum().item()

    n_samples = len(loader.dataset)
    return loss_sum / n_samples, correct / n_samples


def val_cls_epoch(model, loader, criterion):
    """
    Validate classifier for one epoch.

    Accuracy is computed from the total number of correct predictions over
    the whole set, so a smaller final batch does not get the same weight as
    a full one (as happens when averaging per-batch accuracies).

    Args:
        model: Classifier model
        loader: Validation DataLoader yielding (image, label) batches
        criterion: Loss function (CrossEntropy)

    Returns:
        Tuple of (average_loss, accuracy), both averaged per sample
    """
    model.eval()
    loss_sum = 0.0
    correct = 0

    with torch.no_grad():
        for img, lbl in loader:
            img, lbl = img.to(device), lbl.to(device)

            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                out = model(img)
                loss = criterion(out, lbl)

            # Weight the batch loss by its size; count correct predictions
            loss_sum += loss.item() * img.size(0)
            correct += (out.argmax(dim=1) == lbl).sum().item()

    n_samples = len(loader.dataset)
    return loss_sum / n_samples, correct / n_samples

def fit_classifier(model, train_loader, val_loader, epochs, criterion, optimizer, scaler,
                   patience=10, experiment_name="classifier"):
    """
    Complete training loop for classifier with early stopping.

    The model is checkpointed whenever the validation accuracy improves;
    training stops after ``patience`` consecutive epochs without improvement,
    and the best checkpoint is restored before returning. If no epoch ever
    improves (e.g. ``epochs=0``), the model is returned as-is.

    Args:
        model: Classifier model
        train_loader: Training DataLoader
        val_loader: Validation DataLoader
        epochs: Maximum number of epochs
        criterion: Loss function
        optimizer: Optimiser
        scaler: Mixed precision scaler
        patience: Early stopping patience
        experiment_name: Name for saving model and logs

    Returns:
        Tuple of (trained_model, history_dict)
    """
    checkpoint_path = f"models/{experiment_name}_best.pt"
    # Do not rely on the directory having been created by an earlier cell
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    # Initialise history
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': []
    }

    best_acc = float('-inf')
    patience_counter = 0
    best_epoch = 0

    print(f"Starting training: {experiment_name}")
    print("="*60)

    writer = SummaryWriter(f"./{logs_dir}/{experiment_name}")
    try:
        for epoch in range(1, epochs + 1):
            # Training phase
            train_loss, train_acc = train_cls_epoch(model, train_loader, criterion, optimizer, scaler)

            # Validation phase
            val_loss, val_acc = val_cls_epoch(model, val_loader, criterion)

            # Record history
            history['train_loss'].append(train_loss)
            history['val_loss'].append(val_loss)
            history['train_acc'].append(train_acc)
            history['val_acc'].append(val_acc)

            # Log to TensorBoard
            writer.add_scalar('Loss/Train', train_loss, epoch)
            writer.add_scalar('Accuracy/Train', train_acc, epoch)
            writer.add_scalar('Loss/Val', val_loss, epoch)
            writer.add_scalar('Accuracy/Val', val_acc, epoch)

            # Print progress
            if epoch % 5 == 0 or epoch == 1:
                print(f"Epoch {epoch:3d}/{epochs} | "
                      f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
                      f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

            # Early stopping logic (monitors validation accuracy)
            if val_acc > best_acc:
                best_acc = val_acc
                best_epoch = epoch
                torch.save(model.state_dict(), checkpoint_path)
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"\nEarly stopping triggered at epoch {epoch}")
                    break
    finally:
        # Flush and release the event file even if training is interrupted
        writer.close()

    print("="*60)
    print("Training finished!")

    if best_epoch == 0:
        # No checkpoint was written, so there is nothing to restore
        print("No improving epoch was recorded; model weights left unchanged.")
        return model, history

    # Restore best weights (map_location keeps this working across devices)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print(f"Best model restored from epoch {best_epoch}")
    print(f"  Val Accuracy: {best_acc:.4f}")

    return model, history

print("Classification training functions defined.")

### 🚀 **Train Classifier**

In [None]:
%%time
# Setup training for classifier
cls_optimizer = torch.optim.Adam(cls_model.parameters(), lr=LEARNING_RATE)
cls_criterion = nn.CrossEntropyLoss()

# Train the model
# NOTE: `scaler` is the GradScaler created in the box-regressor training cell,
# so that cell must have been run first.
cls_model, cls_history = fit_classifier(
    model=cls_model,
    train_loader=train_cls_loader,
    val_loader=val_cls_loader,
    epochs=EPOCHS,
    criterion=cls_criterion,
    optimizer=cls_optimizer,
    scaler=scaler,
    patience=PATIENCE,
    experiment_name="efficientnet_classifier"
)

### 📈 **Plot Classification Results**

In [None]:
# Plot training history for classifier
# Left panel: cross-entropy loss; right panel: accuracy
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))

# Plot loss
# Training curve is drawn faint and dashed so the validation curve stands out
ax1.plot(cls_history['train_loss'], alpha=0.3, color='#ff7f0e', linestyle='--', label='Train')
ax1.plot(cls_history['val_loss'], alpha=0.8, color='#ff7f0e', label='Val')
ax1.set_title('Cross-Entropy Loss', fontsize=14, fontweight='bold')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend(loc='upper right')
ax1.grid(alpha=0.3)

# Plot accuracy
ax2.plot(cls_history['train_acc'], alpha=0.3, color='#ff7f0e', linestyle='--', label='Train')
ax2.plot(cls_history['val_acc'], alpha=0.8, color='#ff7f0e', label='Val')
ax2.set_title('Classification Accuracy', fontsize=14, fontweight='bold')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend(loc='lower right')
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

### 🔧 **Fine-Tuning**

In [None]:
def unfreeze_and_finetune(model, num_blocks=3):
    """
    Unfreeze the last blocks of the EfficientNet features for fine-tuning.

    Sets ``requires_grad = True`` on every parameter found in
    ``model.backbone.features[-num_blocks:]`` and prints a summary of the
    total and trainable parameter counts. The model is modified in place.

    Args:
        model: Classifier model exposing ``backbone.features``
        num_blocks: Number of trailing feature blocks to unfreeze (default 3)

    Returns:
        The same model instance with the selected blocks unfrozen

    Raises:
        ValueError: If ``num_blocks`` is smaller than 1
    """
    if num_blocks < 1:
        # features[-0:] would select every block, so reject it explicitly
        raise ValueError(f"num_blocks must be >= 1, got {num_blocks}")

    # Unfreeze the trailing blocks of the feature extractor
    for param in model.backbone.features[-num_blocks:].parameters():
        param.requires_grad = True

    print(f"Last {num_blocks} blocks of backbone unfrozen for fine-tuning")

    # Count trainable parameters after unfreezing
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    # Guard the ratio against a model without any parameters
    trainable_pct = trainable_params / total_params * 100 if total_params else 0.0

    print("\nParameter breakdown after unfreezing:")
    print(f"  Total parameters:     {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")
    print(f"  Trainable percentage: {trainable_pct:.1f}%")

    return model


# Apply unfreezing; cls_model is rebound to the model returned by the helper
cls_model = unfreeze_and_finetune(cls_model)

In [None]:
%%time
# Setup fine-tuning with lower learning rate
ft_optimizer = torch.optim.Adam(cls_model.parameters(), lr=1e-4)
ft_criterion = nn.CrossEntropyLoss()

print("\nStarting fine-tuning phase...")

# Fine-tune the model
cls_model, ft_history = fit_classifier(
    model=cls_model,
    train_loader=train_cls_loader,
    val_loader=val_cls_loader,
    epochs=EPOCHS,
    criterion=ft_criterion,
    optimizer=ft_optimizer,
    scaler=scaler,
    patience=5,
    experiment_name="efficientnet_finetuned"
)

### 📈 **Plot Fine-Tuning Results**

In [None]:
# Fine-tuning curves: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))

# One row per panel: axis, train key, val key, title, y-label, legend position
panel_specs = (
    (ax1, 'train_loss', 'val_loss', 'Cross-Entropy Loss (Fine-Tuned)', 'Loss', 'upper right'),
    (ax2, 'train_acc', 'val_acc', 'Classification Accuracy (Fine-Tuned)', 'Accuracy', 'lower right'),
)

for axis, train_key, val_key, panel_title, y_label, legend_loc in panel_specs:
    # Training curve is faint and dashed, validation curve is solid
    axis.plot(ft_history[train_key], alpha=0.3, color='#2ca02c', linestyle='--', label='Train')
    axis.plot(ft_history[val_key], alpha=0.8, color='#2ca02c', label='Val')
    axis.set_title(panel_title, fontsize=14, fontweight='bold')
    axis.set_xlabel('Epoch')
    axis.set_ylabel(y_label)
    axis.legend(loc=legend_loc)
    axis.grid(alpha=0.3)

plt.tight_layout()
plt.show()

### 📊 **Confusion Matrix**

In [None]:
def plot_confusion_matrix(model, loader, class_names):
    """
    Compute and plot the confusion matrix of a classifier.

    The model is switched to eval mode, run over every batch of the loader
    without gradient tracking, and the argmax predictions are compared with
    the targets. The matrix is drawn as an annotated seaborn heatmap.

    Args:
        model: Trained classifier
        loader: DataLoader yielding (images, labels) batches
        class_names: Dictionary mapping class indices to names; its keys fix
            the row/column order of the matrix and its values label the ticks
    """
    model.eval()
    all_preds = []
    all_targets = []

    # Collect predictions
    with torch.no_grad():
        for img, lbl in loader:
            out = model(img.to(device))
            all_preds.extend(out.argmax(dim=1).cpu().numpy())
            all_targets.extend(lbl.cpu().numpy())

    # Pin the label order to the dictionary so the tick labels always line up
    # with the rows/columns, even when a class never occurs in the data
    class_ids = list(class_names.keys())
    tick_labels = list(class_names.values())
    cm = confusion_matrix(all_targets, all_preds, labels=class_ids)

    # Plot
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                cbar_kws={'label': 'Count'})
    plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
    plt.ylabel('True Label', fontsize=12, fontweight='bold')
    plt.title('Confusion Matrix (Fine-Tuned Model)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()


# Generate confusion matrix on the validation loader with the current cls_model
print("Generating confusion matrix...")
plot_confusion_matrix(cls_model, val_cls_loader, num_to_labels)

## 🧪 **Test Set Inference**

In [None]:
# Load test images
# The folder is given relative to the current working directory
test_path = 'localization_test/'
print(f"Loading test images from {test_path}...")
# X_test_raw is reused by the later inference and visualisation cells
X_test_raw = load_images_from_folder(test_path, IMG_SIZE)

print(f"Loaded {len(X_test_raw)} test images")

In [None]:
# Prepare test data transformations
# Pipeline: convert to image tensor -> float32 with scale=True -> normalise with
# the IMAGENET_MEAN / IMAGENET_STD constants
transform_test = transforms.Compose([
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

# Apply transformations
# Each raw array is wrapped in a PIL image, transformed, and the results are
# stacked into a single batch tensor that is moved to the selected device
X_test_tensor = torch.stack([
    transform_test(Image.fromarray(img)) for img in X_test_raw
]).to(device)

print(f"Test tensor shape: {X_test_tensor.shape}")

In [None]:
# Run inference on test set
print("\nRunning inference on test set...")

# Both models are put in eval mode before the forward passes
cls_model.eval()
box_model.eval()

# The whole test tensor is passed in one call, without gradient tracking
with torch.no_grad():
    # Classification predictions
    cls_logits = cls_model(X_test_tensor)
    # Softmax over dim 1 turns the logits into per-class probabilities
    cls_probs = torch.softmax(cls_logits, dim=1).cpu().numpy()

    # Bounding box predictions
    box_preds = box_model(X_test_tensor).cpu().numpy()

print(f"Inference complete on {len(X_test_raw)} test images")

### 👁️ **Visualise Combined Results**

In [None]:
# Visualise combined classification and localisation results
num_img = len(X_test_raw)
fig, axes = plt.subplots(2, 5, figsize=(20, 9))
plt.suptitle("Combined Inference: Classification + Localisation",
            fontsize=18, fontweight='bold')

# Switch every panel off up front so slots without an image stay blank
# instead of showing an empty framed axis
for ax in axes.flat:
    ax.axis('off')

for i in range(min(10, num_img)):
    ax = axes[i // 5, i % 5]
    img = X_test_raw[i].copy()  # copy so the box is not drawn on the stored image
    h, w, _ = img.shape

    # Draw predicted bounding box (green)
    # The four values are scaled by the image width/height, i.e. they are
    # presumably normalised (x1, y1, x2, y2) corners - confirm against box_model
    bx = box_preds[i]
    cv2.rectangle(img,
                  (int(bx[0]*w), int(bx[1]*h)),
                  (int(bx[2]*w), int(bx[3]*h)),
                  (0, 255, 0), 3)

    # Get predicted label and confidence (as a percentage)
    lbl_idx = np.argmax(cls_probs[i])
    conf = cls_probs[i][lbl_idx] * 100

    ax.imshow(img)
    ax.set_title(f"{num_to_labels[lbl_idx]}: {conf:.1f}%",
                fontsize=13, fontweight='bold')

plt.tight_layout()
plt.show()

## 🔥 **Class Activation Maps (CAM)**

In [None]:
# @title 👁️ **Visualise CAM**
def visualize_cam(model, images_tensor, original_images, target_class=None):
    """
    Visualise Class Activation Maps (CAM).

    CAM highlights the regions that are most important for classification.
    The output of ``model.backbone.features[-1]`` is captured with a forward
    hook and combined with the weights of ``model.backbone.classifier[1]``.

    Layout: exactly 10 images are shown on a 2x5 grid; any other number of
    images shows at most the first 5 on a single row.

    Args:
        model: Trained classifier
        images_tensor: Preprocessed images tensor
        original_images: Original images for visualisation
        target_class: Optional specific class to visualise (if None, uses predicted class)
    """
    model.eval()

    # Hook the last feature layer
    target_layer = model.backbone.features[-1]
    activations = []

    def hook_fn(module, input, output):
        """Save activations from forward pass."""
        activations.append(output)

    handle = target_layer.register_forward_hook(hook_fn)
    try:
        # Generate predictions
        with torch.no_grad():
            outputs = model(images_tensor)
    finally:
        # Detach the hook even if the forward pass fails, otherwise it would
        # stay registered on the model for every later call
        handle.remove()

    preds = outputs.argmax(dim=1).cpu().numpy()

    # Get weights from final linear layer and the captured activations
    fc_weights = model.backbone.classifier[1].weight.detach().cpu().numpy()
    acts = activations[0].cpu().numpy()

    # Choose the grid from the number of input images
    num_total_imgs = len(images_tensor)
    num_cols = 5
    if num_total_imgs == 10:
        num_rows, plot_count, figsize = 2, 10, (20, 9)
    else:
        num_rows, plot_count, figsize = 1, min(5, num_total_imgs), (20, 4)

    fig, axes = plt.subplots(num_rows, num_cols, figsize=figsize)
    # With 5 columns subplots always returns an array; flatten for 1D indexing
    axes = axes.flatten()

    title_suffix = f"Class: {num_to_labels[target_class]}" if target_class is not None else "Predicted Class"
    plt.suptitle(f'Class Activation Maps - {title_suffix}',
                fontsize=18, fontweight='bold')

    for i in range(plot_count):
        ax = axes[i]

        # Use target class or predicted class
        c_idx = target_class if target_class is not None else preds[i]

        # Weighted sum of the feature maps over the channel axis
        cam = np.tensordot(fc_weights[c_idx], acts[i], axes=1).astype(np.float32)

        # Resize and normalise
        cam = cv2.resize(cam, (IMG_SIZE, IMG_SIZE))
        cam = np.maximum(cam, 0)  # ReLU activation
        cam = cam / (cam.max() + 1e-8)  # epsilon avoids division by zero

        # Display original image with CAM overlay
        ax.imshow(original_images[i])
        ax.imshow(cam, cmap='turbo', alpha=0.5)
        ax.axis('off')
        ax.set_title(f"{num_to_labels[c_idx]}", fontsize=13, fontweight='bold')

    # Remove the panels that received no image
    for unused_ax in axes[plot_count:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


# Generate CAMs for predicted classes (target_class=None selects the prediction)
print("Generating Class Activation Maps...")
visualize_cam(cls_model, X_test_tensor, X_test_raw, target_class=None)

In [None]:
# @title 📦 **Bounding Box from CAM**
def get_bbox_from_heatmap(heatmap, threshold_percent=0.2):
    """
    Derive a bounding box from a heatmap by thresholding it.

    The heatmap is binarised at ``threshold_percent`` times its maximum value
    and the bounding rectangle of the largest external contour is returned.

    Args:
        heatmap: 2D array (H, W) normalised to [0,1]
        threshold_percent: Fraction (0-1) of the heatmap maximum used as cut-off

    Returns:
        Tuple (x, y, w, h) or None if no contours found
    """
    # Binarise: pixels above the cut-off become 255, everything else 0
    cutoff = threshold_percent * np.max(heatmap)
    mask = cv2.threshold(heatmap, cutoff, 255, cv2.THRESH_BINARY)[1]
    mask = mask.astype(np.uint8)

    # Outer contours only
    found, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Guard clause: nothing exceeded the threshold
    if len(found) == 0:
        return None

    # Keep the contour enclosing the largest area
    biggest = max(found, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(biggest)
    return x, y, w, h

def visualize_cam_with_bbox(model, images_tensor, original_images):
    """
    Visualise CAM and extract bounding boxes from activation maps.

    For up to 10 images, the CAM of the predicted class is computed from the
    output of ``model.backbone.features[-1]`` and the weights of
    ``model.backbone.classifier[1]``. A box is extracted from each CAM with
    ``get_bbox_from_heatmap`` (threshold 0.3) and drawn in green.

    Args:
        model: Trained classifier
        images_tensor: Preprocessed images tensor
        original_images: Original images for visualisation
    """
    model.eval()

    # Hook setup
    target_layer = model.backbone.features[-1]
    activations = []

    def hook_fn(module, input, output):
        """Save activations from forward pass."""
        activations.append(output)

    handle = target_layer.register_forward_hook(hook_fn)
    try:
        # Generate predictions
        with torch.no_grad():
            outputs = model(images_tensor)
    finally:
        # Detach the hook even if the forward pass fails
        handle.remove()

    preds = outputs.argmax(dim=1).cpu().numpy()
    fc_weights = model.backbone.classifier[1].weight.detach().cpu().numpy()
    acts = activations[0].cpu().numpy()

    # Visualise
    num_imgs = min(10, len(images_tensor))
    fig, axes = plt.subplots(2, 5, figsize=(20, 9))
    plt.suptitle("CAM-based Bounding Box Extraction",
                fontsize=18, fontweight='bold')

    # Switch every panel off up front so slots without an image stay blank
    for ax in axes.flat:
        ax.axis('off')

    for i in range(num_imgs):
        ax = axes[i // 5, i % 5]
        c_idx = preds[i]

        # Compute CAM: weighted sum of the feature maps over the channel axis
        cam = np.tensordot(fc_weights[c_idx], acts[i], axes=1).astype(np.float32)
        cam = cv2.resize(cam, (IMG_SIZE, IMG_SIZE))
        cam = np.maximum(cam, 0)
        cam = cam / (cam.max() + 1e-8)  # epsilon avoids division by zero

        # Extract bounding box from CAM; draw on a copy of the original image
        img_copy = original_images[i].copy()
        bbox = get_bbox_from_heatmap(cam, threshold_percent=0.3)

        # Draw bounding box if found
        if bbox is not None:
            x, y, w, h = bbox
            cv2.rectangle(img_copy, (x, y), (x + w, y + h), (0, 255, 0), 3)

        # Display
        ax.imshow(img_copy)
        ax.imshow(cam, cmap='turbo', alpha=0.4)
        ax.set_title(f"{num_to_labels[c_idx]}", fontsize=13, fontweight='bold')

    plt.tight_layout()
    plt.show()


# Extract bounding boxes from CAMs of the test images
print("Extracting bounding boxes from CAMs...")
visualize_cam_with_bbox(cls_model, X_test_tensor, X_test_raw)

In [None]:
# @title 👁️ **Visualise CAM Channel Decomposition**
def visualize_cam_channels(model, image_tensor, original_image, target_class=None, top_n=6):
    """
    Visualise individual feature map channels and their weighted contribution to CAM.

    This function shows how the Class Activation Map is computed as a weighted
    sum of feature maps from the last convolutional layer. The figure is a
    single row: original image, the ``top_n`` channels with the largest
    positive class weight, and the final CAM overlay. The final CAM sums all
    channels, matching ``visualize_cam``.

    Args:
        model: Trained classifier
        image_tensor: Preprocessed image tensor (single image)
        original_image: Original image for visualisation
        target_class: Target class index (if None, uses predicted class)
        top_n: Number of top weighted channels to display
    """
    model.eval()

    # Hook the last feature layer
    target_layer = model.backbone.features[-1]
    activations = []

    def hook_fn(module, input, output):
        """Save activations from forward pass."""
        activations.append(output)

    handle = target_layer.register_forward_hook(hook_fn)
    try:
        # Generate prediction
        with torch.no_grad():
            outputs = model(image_tensor)
    finally:
        # Detach the hook even if the forward pass fails
        handle.remove()

    pred_class = outputs.argmax(dim=1).item()

    # Get weights from final linear layer
    fc_weights = model.backbone.classifier[1].weight.detach().cpu().numpy()

    # Get activations
    acts = activations[0].cpu().numpy()[0]  # Shape: (num_channels, H, W)

    # Use target class or predicted class
    c_idx = target_class if target_class is not None else pred_class

    # Get weights for target class
    weights = fc_weights[c_idx]

    # Rank the positively weighted channels, largest weight first
    channel_importance = sorted(
        ((idx, w) for idx, w in enumerate(weights) if w > 0),
        key=lambda item: item[1],
        reverse=True,
    )

    # Select top N channels (there may be fewer than top_n positive ones)
    top_channels = channel_importance[:top_n]

    # Compute final CAM as the weighted sum over all channels
    cam = np.tensordot(weights, acts, axes=1).astype(np.float32)
    cam = cv2.resize(cam, (IMG_SIZE, IMG_SIZE))
    cam = np.maximum(cam, 0)
    cam = cam / (cam.max() + 1e-8)  # epsilon avoids division by zero

    # Create visualisation - single row
    fig, axes = plt.subplots(1, top_n + 2, figsize=(22, 4))

    # Title
    fig.suptitle(f'CAM Channel Decomposition - Class: {num_to_labels[c_idx]}',
                fontsize=16, fontweight='bold', y=1.02)

    # Blank every panel first so slots without a channel show no empty frame
    for ax in axes:
        ax.axis('off')

    # Plot original image
    axes[0].imshow(original_image)
    axes[0].set_title('Original Image', fontsize=11, fontweight='bold')

    # Resize and rectify the selected feature maps
    all_maps = [
        np.maximum(cv2.resize(acts[channel_idx], (IMG_SIZE, IMG_SIZE)), 0)
        for channel_idx, _ in top_channels
    ]

    if all_maps:
        # Shared colour range so the channel panels are directly comparable
        vmin = min(m.min() for m in all_maps)
        vmax = max(m.max() for m in all_maps)

        # Plot feature maps with consistent scaling
        for ax, feature_map, (channel_idx, weight) in zip(axes[1:], all_maps, top_channels):
            ax.imshow(feature_map, cmap='jet', vmin=vmin, vmax=vmax)
            ax.set_title(f'Channel {channel_idx}\nw={weight:.4f}',
                        fontsize=10, fontweight='bold')

    # Plot final CAM
    axes[-1].imshow(original_image)
    axes[-1].imshow(cam, cmap='turbo', alpha=0.6)
    axes[-1].set_title('Final CAM\n(Weighted Sum)',
                      fontsize=11, fontweight='bold')

    # Add equation text with proper formula: w × Channel
    equation_parts = [f'w{rank} × Ch{channel_idx}'
                      for rank, (channel_idx, _) in enumerate(top_channels, start=1)]
    equation_text = ' + '.join(equation_parts + ['...']) + ' = CAM'
    fig.text(0.5, -0.05, equation_text, ha='center', fontsize=12,
            fontweight='bold', style='italic')

    plt.tight_layout()
    plt.show()

# Visualise CAM channels for test images
print("Visualising CAM channel decomposition...")

# Select a test image (index 8 is hard-coded, so at least 9 test images are needed)
test_idx = 8
# Slicing with a range keeps the leading batch dimension
test_img_tensor = X_test_tensor[test_idx:test_idx+1]
test_img_orig = X_test_raw[test_idx]

# target_class=None selects the predicted class; 8 channels are displayed
visualize_cam_channels(cls_model, test_img_tensor, test_img_orig, target_class=None, top_n=8)

### **Multiple Targets CAM**

In [None]:
# Load test (multiple targets) images
test_path_multiple = 'multiple_cats_dogs_images_test/'
# Report the folder that is actually being loaded
print(f"Loading test (multiple targets) images from {test_path_multiple}...")
X_test_multiple_raw = load_images_from_folder(test_path_multiple, IMG_SIZE)

print(f"Loaded {len(X_test_multiple_raw)} test images")

# Apply transformations (same transform_test pipeline as the single-target test set)
X_test_multiple_tensor = torch.stack([
    transform_test(Image.fromarray(img)) for img in X_test_multiple_raw
]).to(device)

print(f"Test tensor (multiple targets) shape: {X_test_multiple_tensor.shape}")

# Run inference on test set
print("Running inference on test set (multiple targets)...")

cls_model.eval()

with torch.no_grad():
    # Classification predictions
    cls_multiple_logits = cls_model(X_test_multiple_tensor)

print(f"Inference complete on {len(X_test_multiple_raw)} test images (multiple targets)")

In [None]:
#@title **🐱 Generating CAM visualisations for class Cat**
# Force the CAM to use class index 0 instead of each image's predicted class
print("Generating Class Activation Maps...")
visualize_cam(cls_model, X_test_multiple_tensor, X_test_multiple_raw, target_class=0)

In [None]:
#@title **🐶 Generating CAM visualisations for class Dog**
# Force the CAM to use class index 1 instead of each image's predicted class
print("Generating Class Activation Maps...")
visualize_cam(cls_model, X_test_multiple_tensor, X_test_multiple_raw, target_class=1)

## 🎯 **Grad-CAM**

In [None]:
# @title 👁️ **Visualise Grad-CAM**
class GradCAM:
    """
    Grad-CAM (Gradient-weighted Class Activation Mapping) implementation.

    Grad-CAM uses gradients flowing into the final convolutional layer
    to produce a coarse localisation map highlighting important regions.

    The instance registers a forward and a full backward hook on the target
    layer. Call ``cleanup()`` when done, or use the instance as a context
    manager (``with GradCAM(model, layer) as cam: ...``) to have the hooks
    removed automatically.
    """

    def __init__(self, model, target_layer):
        """
        Initialise Grad-CAM.

        Args:
            model: Trained classifier
            target_layer: Target convolutional layer for CAM generation
        """
        self.model = model
        self.gradients = None
        self.activations = None

        # Register forward and backward hooks, keeping the handles for cleanup
        self.handles = [
            target_layer.register_forward_hook(self.save_activation),
            target_layer.register_full_backward_hook(self.save_gradient),
        ]

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Remove the hooks on exit; exceptions are not suppressed."""
        self.cleanup()
        return False

    def save_activation(self, module, input, output):
        """Save activations from forward pass."""
        self.activations = output

    def save_gradient(self, module, grad_input, grad_output):
        """Save gradients from backward pass."""
        self.gradients = grad_output[0]

    def cleanup(self):
        """Remove registered hooks; calling it more than once is harmless."""
        for h in self.handles:
            h.remove()
        self.handles.clear()

    def __call__(self, x, class_idx=None):
        """
        Generate Grad-CAM heatmap.

        Args:
            x: Input tensor (single image)
            class_idx: Target class index (if None, uses predicted class)

        Returns:
            Tuple of (normalised_cam, class_index)

        Raises:
            RuntimeError: If the hooks captured no activations or gradients
                during this call
        """
        self.model.eval()
        self.model.zero_grad()

        # Forget tensors from a previous call so stale data is never reused
        self.activations = None
        self.gradients = None

        # Forward pass
        output = self.model(x)
        if class_idx is None:
            class_idx = output.argmax(dim=1).item()

        # Backward pass for target class (index 0: the single image in the batch)
        output[0, class_idx].backward()

        if self.activations is None or self.gradients is None:
            raise RuntimeError(
                "Grad-CAM hooks captured no activations/gradients; check that "
                "the hooks are still registered and that gradients reach the "
                "target layer (e.g. the input requires grad)."
            )

        # Compute importance weights (average gradients over the spatial axes)
        grads = self.gradients.detach().cpu().numpy()[0]
        acts = self.activations.detach().cpu().numpy()[0]
        weights = grads.mean(axis=(1, 2))

        # Generate weighted CAM: sum of the feature maps over the channel axis
        cam = np.tensordot(weights, acts, axes=1).astype(np.float32)

        # Apply ReLU and resize to the input's spatial size (cv2 takes (W, H))
        cam = np.maximum(cam, 0)
        cam = cv2.resize(cam, (x.shape[3], x.shape[2]))

        # Normalise; the epsilon avoids division by zero
        return cam / (cam.max() + 1e-8), class_idx


print("Generating Grad-CAM visualisations...")

# Initialise Grad-CAM on last feature layer
gcam = GradCAM(cls_model, cls_model.backbone.features[-1])

try:
    fig, axes = plt.subplots(2, 5, figsize=(20, 9))
    plt.suptitle("Grad-CAM: Predicted Class Visualisation",
                fontsize=18, fontweight='bold')

    # Switch every panel off up front so slots without an image stay blank
    for ax in axes.flat:
        ax.axis('off')

    for i in range(min(10, len(X_test_raw))):
        # Prepare single image tensor with gradients enabled
        img_t = X_test_tensor[i:i+1]
        img_t.requires_grad = True

        # Generate Grad-CAM heatmap for the predicted class
        mask, c_idx = gcam(img_t)

        ax = axes[i // 5, i % 5]
        ax.imshow(X_test_raw[i])
        ax.imshow(mask, cmap='jet', alpha=0.6)
        ax.set_title(num_to_labels[c_idx], fontsize=13, fontweight='bold')

    plt.tight_layout()
    plt.show()
finally:
    # Cleanup: always detach the hooks, otherwise a failed run would leave
    # them registered on cls_model for the following cells
    gcam.cleanup()

In [None]:
#@title 📦 **Bounding Box from Grad-CAM**
def visualize_gradcam_with_bbox(model, images_tensor, original_images):
    """
    Visualise Grad-CAM and extract bounding boxes.

    For up to 10 images, a Grad-CAM heatmap of the predicted class is computed
    on ``model.backbone.features[-1]``. A box is extracted from each heatmap
    with ``get_bbox_from_heatmap`` (threshold 0.3) and drawn in green.

    Args:
        model: Trained classifier
        images_tensor: Preprocessed images tensor
        original_images: Original images for visualisation
    """
    # Initialise Grad-CAM
    gcam_tool = GradCAM(model, model.backbone.features[-1])

    try:
        num_imgs = min(10, len(images_tensor))
        fig, axes = plt.subplots(2, 5, figsize=(20, 9))
        plt.suptitle("Grad-CAM-based Bounding Box Extraction",
                    fontsize=18, fontweight='bold')

        # Switch every panel off up front so slots without an image stay blank
        for ax in axes.flat:
            ax.axis('off')

        for i in range(num_imgs):
            ax = axes[i // 5, i % 5]

            # Prepare image with gradients
            img_t = images_tensor[i:i+1]
            img_t.requires_grad = True

            # Generate Grad-CAM heatmap
            heatmap, pred_c = gcam_tool(img_t)

            # Extract bounding box; draw on a copy of the original image
            img_copy = original_images[i].copy()
            bbox = get_bbox_from_heatmap(heatmap, threshold_percent=0.3)

            # Draw bounding box if found
            if bbox is not None:
                x, y, w, h = bbox
                cv2.rectangle(img_copy, (x, y), (x + w, y + h), (0, 255, 0), 3)

            # Display
            ax.imshow(img_copy)
            ax.imshow(heatmap, cmap='jet', alpha=0.4)
            ax.set_title(f"{num_to_labels[pred_c]}", fontsize=13, fontweight='bold')
    finally:
        # Cleanup: detach the hooks even if an image fails
        gcam_tool.cleanup()

    plt.tight_layout()
    plt.show()


# Extract bounding boxes from Grad-CAMs of the test images
print("Extracting bounding boxes from Grad-CAMs...")
visualize_gradcam_with_bbox(cls_model, X_test_tensor, X_test_raw)

In [None]:
#@title 👁️ **Visualise Grad-CAM Channel Decomposition**
def visualize_gradcam_channels(model, image_tensor, original_image, target_class=None, top_n=6):
    """
    Visualise individual feature map channels and their gradient-weighted contribution to Grad-CAM.

    This function shows how Grad-CAM is computed as a gradient-weighted
    sum of feature maps from the last convolutional layer. The figure is a
    single row: original image, the ``top_n`` channels with the largest
    positive weight, and the final Grad-CAM overlay (sum over all channels).

    Args:
        model: Trained classifier
        image_tensor: Preprocessed image tensor (single image, requires_grad=True)
        original_image: Original image for visualisation
        target_class: Target class index (if None, uses predicted class)
        top_n: Number of top weighted channels to display

    Raises:
        RuntimeError: If the hooks captured no activations or gradients
    """
    model.eval()

    # Hook setup
    target_layer = model.backbone.features[-1]
    activations = []
    gradients = []

    def forward_hook(module, input, output):
        """Save activations from forward pass."""
        activations.append(output)

    def backward_hook(module, grad_input, grad_output):
        """Save gradients from backward pass."""
        gradients.append(grad_output[0])

    handle_forward = target_layer.register_forward_hook(forward_hook)
    handle_backward = target_layer.register_full_backward_hook(backward_hook)
    try:
        # Forward pass
        model.zero_grad()
        image_tensor.requires_grad = True
        output = model(image_tensor)

        # Get predicted class
        pred_class = output.argmax(dim=1).item()
        c_idx = target_class if target_class is not None else pred_class

        # Backward pass
        output[0, c_idx].backward()
    finally:
        # Remove hooks even if the forward or backward pass fails
        handle_forward.remove()
        handle_backward.remove()

    if not activations or not gradients:
        raise RuntimeError(
            "Grad-CAM hooks captured no activations/gradients; check that "
            "gradients reach the target layer."
        )

    # Get activations and gradients
    acts = activations[0].cpu().detach().numpy()[0]  # Shape: (num_channels, H, W)
    grads = gradients[0].cpu().numpy()[0]

    # Compute importance weights (global average pooling of gradients)
    weights = np.mean(grads, axis=(1, 2))

    # Rank the positively weighted channels, largest weight first
    channel_importance = sorted(
        ((idx, w) for idx, w in enumerate(weights) if w > 0),
        key=lambda item: item[1],
        reverse=True,
    )

    # Select top N channels (there may be fewer than top_n positive ones)
    top_channels = channel_importance[:top_n]

    # Compute final Grad-CAM as the weighted sum over all channels
    cam = np.tensordot(weights, acts, axes=1).astype(np.float32)
    cam = cv2.resize(cam, (IMG_SIZE, IMG_SIZE))
    cam = np.maximum(cam, 0)  # ReLU
    cam = cam / (cam.max() + 1e-8)  # epsilon avoids division by zero

    # Create visualisation - single row
    fig, axes = plt.subplots(1, top_n + 2, figsize=(22, 4))

    # Title
    fig.suptitle(f'Grad-CAM Channel Decomposition - Class: {num_to_labels[c_idx]}',
                fontsize=16, fontweight='bold', y=1.02)

    # Blank every panel first so slots without a channel show no empty frame
    for ax in axes:
        ax.axis('off')

    # Plot original image
    axes[0].imshow(original_image)
    axes[0].set_title('Original Image', fontsize=11, fontweight='bold')

    # Resize and rectify the selected feature maps
    all_maps = [
        np.maximum(cv2.resize(acts[channel_idx], (IMG_SIZE, IMG_SIZE)), 0)
        for channel_idx, _ in top_channels
    ]

    if all_maps:
        # Shared colour range so the channel panels are directly comparable
        vmin = min(m.min() for m in all_maps)
        vmax = max(m.max() for m in all_maps)

        # Plot feature maps with consistent scaling
        for ax, feature_map, (channel_idx, weight) in zip(axes[1:], all_maps, top_channels):
            ax.imshow(feature_map, cmap='jet', vmin=vmin, vmax=vmax)
            ax.set_title(f'Channel {channel_idx}\nα={weight:.4f}',
                        fontsize=10, fontweight='bold')

    # Plot final Grad-CAM
    axes[-1].imshow(original_image)
    axes[-1].imshow(cam, cmap='jet', alpha=0.6)
    axes[-1].set_title('Final Grad-CAM\n(Weighted Sum)',
                      fontsize=11, fontweight='bold')

    # Add equation text with proper formula: α × Channel
    equation_parts = [f'α{rank} × Ch{channel_idx}'
                      for rank, (channel_idx, _) in enumerate(top_channels, start=1)]
    equation_text = ' + '.join(equation_parts + ['...']) + ' = Grad-CAM'
    fig.text(0.5, -0.05, equation_text, ha='center', fontsize=12,
            fontweight='bold', style='italic')

    plt.tight_layout()
    plt.show()

# Visualise Grad-CAM channels for test images
print("Visualising Grad-CAM channel decomposition...")

# Select a test image (index 8 is hard-coded, so at least 9 test images are needed)
test_idx = 8
# clone() copies the slice, so gradients are enabled on the copy rather than
# on a view of X_test_tensor
test_img_tensor = X_test_tensor[test_idx:test_idx+1].clone()
test_img_tensor.requires_grad = True
test_img_orig = X_test_raw[test_idx]

# target_class=None selects the predicted class; 8 channels are displayed
visualize_gradcam_channels(cls_model, test_img_tensor, test_img_orig, target_class=None, top_n=8)

In [None]:
#@title 🔍 **Multi-Layer Grad-CAM Comparison**
# Compare Grad-CAM at different layers
layers_to_viz = [
    cls_model.backbone.features[-5],
    cls_model.backbone.features[-3],
    cls_model.backbone.features[-1]
]
layer_names = ["Layer -5 (Early)", "Layer -3 (Mid)", "Layer -1 (Late)"]

num_imgs = min(10, len(X_test_raw))
# squeeze=False keeps axes two-dimensional even when a single image is shown,
# so axes[row, col] indexing below always works
fig, axes = plt.subplots(num_imgs, 4, figsize=(16, 4 * num_imgs), squeeze=False)

plt.suptitle("Multi-Layer Grad-CAM Comparison",
            fontsize=18, fontweight='bold', y=0.995)

for idx in range(num_imgs):
    # Column 0: Original image
    axes[idx, 0].imshow(X_test_raw[idx])
    axes[idx, 0].set_title("Original", fontsize=12, fontweight='bold')
    axes[idx, 0].axis('off')

    # Columns 1-3: Grad-CAMs for different layers
    for l_idx, layer in enumerate(layers_to_viz):
        # A fresh GradCAM per layer so only one pair of hooks is active at a time
        gcam_tool = GradCAM(cls_model, layer)
        try:
            img_t = X_test_tensor[idx:idx+1]
            img_t.requires_grad = True

            mask, pred_c = gcam_tool(img_t)
        finally:
            # Detach the hooks even if this layer/image combination fails
            gcam_tool.cleanup()

        ax = axes[idx, l_idx + 1]
        ax.imshow(X_test_raw[idx])
        ax.imshow(mask, cmap='jet', alpha=0.6)
        ax.set_title(f"{layer_names[l_idx]}\n{num_to_labels[pred_c]}",
                    fontsize=12, fontweight='bold')
        ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
#@title **🐱 Generating Grad-CAM visualisations for class Cat**

# Choose target class
TARGET_CLASS = 0  # 0 = 'cat', 1 = 'dog'

# Initialise Grad-CAM on last feature layer
gcam = GradCAM(cls_model, cls_model.backbone.features[-1])

try:
    # Single row of 5 panels: only the first 5 images are shown
    fig, axes = plt.subplots(1, 5, figsize=(20, 4))
    plt.suptitle(f"Grad-CAM: Forced Class '{num_to_labels[TARGET_CLASS]}', first 5 images",
                fontsize=18, fontweight='bold')

    # Switch every panel off up front so slots without an image stay blank
    for ax in axes:
        ax.axis('off')

    for i in range(min(5, len(X_test_multiple_raw))):
        # Prepare single image tensor with gradients enabled
        img_t = X_test_multiple_tensor[i:i+1]
        img_t.requires_grad = True

        # Generate Grad-CAM heatmap for the forced class, not the prediction
        mask, c_idx = gcam(img_t, class_idx=TARGET_CLASS)

        ax = axes[i]
        ax.imshow(X_test_multiple_raw[i])
        ax.imshow(mask, cmap='jet', alpha=0.6)
        ax.set_title(num_to_labels[c_idx], fontsize=13, fontweight='bold')

    plt.tight_layout()
    plt.show()
finally:
    # Cleanup: always detach the hooks, even if an image fails
    gcam.cleanup()

In [None]:
#@title **🐶 Generating Grad-CAM visualisations for class Dog**

# Choose target class
TARGET_CLASS = 1  # 0 = 'cat', 1 = 'dog'

# Initialise Grad-CAM on last feature layer
gcam = GradCAM(cls_model, cls_model.backbone.features[-1])

try:
    # Single row of 5 panels: only the first 5 images are shown
    fig, axes = plt.subplots(1, 5, figsize=(20, 4))
    plt.suptitle(f"Grad-CAM: Forced Class '{num_to_labels[TARGET_CLASS]}', first 5 images",
                fontsize=18, fontweight='bold')

    # Switch every panel off up front so slots without an image stay blank
    for ax in axes:
        ax.axis('off')

    for i in range(min(5, len(X_test_multiple_raw))):
        # Prepare single image tensor with gradients enabled
        img_t = X_test_multiple_tensor[i:i+1]
        img_t.requires_grad = True

        # Generate Grad-CAM heatmap for the forced class, not the prediction
        mask, c_idx = gcam(img_t, class_idx=TARGET_CLASS)

        ax = axes[i]
        ax.imshow(X_test_multiple_raw[i])
        ax.imshow(mask, cmap='jet', alpha=0.6)
        ax.set_title(num_to_labels[c_idx], fontsize=13, fontweight='bold')

    plt.tight_layout()
    plt.show()
finally:
    # Cleanup: always detach the hooks, even if an image fails
    gcam.cleanup()

#  
<img src="https://airlab.deib.polimi.it/wp-content/uploads/2019/07/airlab-logo-new_cropped.png" width="350">

##### Connect with us:
- <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/81/LinkedIn_icon.svg/2048px-LinkedIn_icon.svg.png" width="14"> **LinkedIn:**  [AIRLab Polimi](https://www.linkedin.com/company/airlab-polimi/)
- <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/9/95/Instagram_logo_2022.svg/800px-Instagram_logo_2022.svg.png" width="14"> **Instagram:** [airlab_polimi](https://www.instagram.com/airlab_polimi/)

##### Contributors:
- **Eugenio Lomurno**: eugenio.lomurno@polimi.it
- **Alberto Archetti**: alberto.archetti@polimi.it
- **Roberto Basla**: roberto.basla@polimi.it
- **Carlo Sgaravatti**: carlo.sgaravatti@polimi.it

```
   Copyright 2025 Eugenio Lomurno, Alberto Archetti, Roberto Basla, Carlo Sgaravatti

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
```