## Cell 1: Setup and Installations

In [1]:
# Install necessary libraries
!pip install torch pandas scikit-learn datasets huggingface_hub torchmetrics tqdm -q

# --- 1. Imports and Setup ---
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np
from datasets import load_dataset
from tqdm.notebook import tqdm

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/983.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m983.0/983.2 kB[0m [31m40.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h

## Cell 2: Load and Explore the SuSy Datase

In [2]:
# --- 2. Load the Dataset from Hugging Face ---
print("Loading the SuSy dataset...")
dataset = load_dataset("HPAI-BSC/SuSy-Dataset", name="susy_dataset")

# Convert the Hugging Face Dataset to a pandas DataFrame for easier exploration
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

print("Dataset loaded successfully.")

Loading the SuSy dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train.zip:   0%|          | 0.00/15.2G [00:00<?, ?B/s]

data/val.zip:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

data/test.zip:   0%|          | 0.00/6.02G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully.


In [3]:
# Display the first few rows of the training data
print("\n--- First 5 Rows ---")
print(train_df.head())

# Get a summary of the data types and non-null values
print("\n--- Data Info ---")
train_df.info()

# Get statistical summary of the features
print("\n--- Statistical Summary ---")
print(train_df.describe())

# Check the balance of the target variable
# CORRECTED: Changed 'signal' to the correct column name, 'label'
print("\n--- Class Distribution (Target: label) ---")
print(train_df['label'].value_counts())


--- First 5 Rows ---
                                               image  label
0  {'bytes': None, 'path': 'zip://train/coco/0000...      0
1  {'bytes': None, 'path': 'zip://train/coco/0000...      0
2  {'bytes': None, 'path': 'zip://train/coco/0000...      0
3  {'bytes': None, 'path': 'zip://train/coco/0000...      0
4  {'bytes': None, 'path': 'zip://train/coco/0000...      0

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14451 entries, 0 to 14450
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   14451 non-null  object
 1   label   14451 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 225.9+ KB

--- Statistical Summary ---
              label
count  14451.000000
mean       2.640855
std        1.790788
min        0.000000
25%        1.000000
50%        3.000000
75%        4.000000
max        5.000000

--- Class Distribution (Target: label) ---
label
0    2967
2    2967
5    2967
4    2

## Cell 3: Data Preprocessing

In [4]:
from torchvision import transforms

def get_transforms(image_size=224):
    """Returns a dictionary of data transforms for training and validation."""
    # These are standard normalization values for models pretrained on ImageNet
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]

    # Transformations for the training set (includes data augmentation)
    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std)
    ])

    # Transformations for the validation/test set (no augmentation)
    val_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std)
    ])

    return {'train': train_transforms, 'val': val_transforms}

In [5]:
# Get the dictionary of transforms
image_transforms = get_transforms()

# Define functions to apply the transforms to a batch of data
def apply_train_transforms(batch):
    # Use the 'image' column from your dataset
    batch['pixel_values'] = [image_transforms['train'](img.convert("RGB")) for img in batch['image']]
    return batch

def apply_val_transforms(batch):
    batch['pixel_values'] = [image_transforms['val'](img.convert("RGB")) for img in batch['image']]
    return batch

# Set the transforms for the datasets
dataset['train'].set_transform(apply_train_transforms)
dataset['test'].set_transform(apply_val_transforms)

# Define a custom collate function to correctly stack tensors and labels
def collate_fn(examples):
    pixel_values = torch.stack([example['pixel_values'] for example in examples])
    # Use the 'label' column from your dataset
    labels = torch.tensor([example['label'] for example in examples], dtype=torch.long)
    return {'pixel_values': pixel_values, 'labels': labels}

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(dataset['test'], batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(f"Data preprocessing complete.")
print(f"Created Train DataLoader with {len(train_loader)} batches of size {batch_size}")
print(f"Created Test DataLoader with {len(test_loader)} batches of size {batch_size}")

Data preprocessing complete.
Created Train DataLoader with 452 batches of size 32
Created Test DataLoader with 174 batches of size 32


## Cell 4: Model Architecture (MLP)

In [6]:
import torch.nn as nn
from torchvision import models

# --- 4. Define the Neural Network for Image Data ---
# We use a pretrained ResNet50, which is excellent for image classification.

def build_model(num_classes, pretrained=True):
    """Builds a ResNet50 model for transfer learning."""

    # Use the current recommended way to load pretrained weights
    weights = models.ResNet50_Weights.DEFAULT if pretrained else None
    model = models.resnet50(weights=weights)

    # Freeze all the parameters in the pre-trained layers
    for param in model.parameters():
        param.requires_grad = False

    # Get the number of input features for the final fully connected layer
    num_ftrs = model.fc.in_features

    # Replace the final layer to match the number of classes in your dataset
    model.fc = nn.Linear(num_ftrs, num_classes)

    return model

# --- Instantiate the model ---
# Automatically determine the number of classes from the dataset information
num_classes = dataset['train'].features['label'].num_classes
print(f"Dataset has {num_classes} classes.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = build_model(num_classes=num_classes).to(device)

print("\nModel architecture defined and moved to device:", device)
# To see the final layer, you can print the model's fully connected layer
print("\nFinal Classifier Layer:")
print(model.fc)

Dataset has 6 classes.
Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:01<00:00, 83.5MB/s]



Model architecture defined and moved to device: cuda

Final Classifier Layer:
Linear(in_features=2048, out_features=6, bias=True)


## Cell 5: Training and Validation Engine

In [None]:
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall, MulticlassF1Score

# --- 5. Setup for Training ---

# CORRECTED: Use CrossEntropyLoss for multi-class classification.
# This loss function combines LogSoftmax and NLLLoss, so the model's output should be raw logits.
criterion = nn.CrossEntropyLoss()

# Optimizer (this remains the same)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# CORRECTED: Use Multiclass metrics instead of Binary metrics.
# We specify the number of classes and set average='macro' to calculate the metric for each class
# and find their unweighted mean. This is a good practice for class-imbalanced datasets.
num_classes = dataset['train'].features['label'].num_classes
accuracy = MulticlassAccuracy(num_classes=num_classes, average='macro').to(device)
precision_metric = MulticlassPrecision(num_classes=num_classes, average='macro').to(device)
recall = MulticlassRecall(num_classes=num_classes, average='macro').to(device)
f1_score = MulticlassF1Score(num_classes=num_classes, average='macro').to(device)


def train_one_epoch(model, loader, optimizer, criterion):
    """Performs one training epoch."""
    model.train()
    running_loss = 0.0
    progress_bar = tqdm(loader, desc="Training")

    # CORRECTED: Iterate through the batch dictionary
    for batch in progress_bar:
        # Unpack the dictionary by key
        inputs = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        progress_bar.set_postfix(loss=f"{loss.item():.4f}")

    return running_loss / len(loader)

def validate(model, loader, criterion):
    """Performs validation."""
    model.eval()
    running_loss = 0.0
    accuracy.reset()
    precision_metric.reset()
    recall.reset()
    f1_score.reset()

    with torch.no_grad():
        # CORRECTED: Iterate through the batch dictionary
        for batch in tqdm(loader, desc="Validating"):
            # Unpack the dictionary by key
            inputs = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            # Update metrics (no sigmoid needed for CrossEntropyLoss)
            accuracy.update(outputs, labels)
            precision_metric.update(outputs, labels)
            recall.update(outputs, labels)
            f1_score.update(outputs, labels)

    avg_loss = running_loss / len(loader)
    acc = accuracy.compute()
    prec = precision_metric.compute()
    rec = recall.compute()
    f1 = f1_score.compute()

    return avg_loss, acc, prec, rec, f1

## Cell 6: Training Execution

In [None]:
# --- 6. Main Training Loop ---
num_epochs = 5
best_val_f1 = 0.0

for epoch in range(num_epochs):
    print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")

    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1} Training Loss: {train_loss:.4f}")

    val_loss, val_acc, val_prec, val_rec, val_f1 = validate(model, test_loader, criterion)
    print(f"Epoch {epoch+1} Validation Loss: {val_loss:.4f}")
    print(f"Validation -> Accuracy: {val_acc:.4f}, Precision: {val_prec:.4f}, Recall: {val_rec:.4f}, F1-Score: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "best_susy_model.pth")
        print(f"New best model saved with F1-Score: {best_val_f1:.4f}")

print("\n--- Training Finished ---")


--- Epoch 1/5 ---


Training:   0%|          | 0/452 [00:00<?, ?it/s]

Epoch 1 Training Loss: 1.0246


Validating:   0%|          | 0/174 [00:00<?, ?it/s]

Epoch 1 Validation Loss: 0.8315
Validation -> Accuracy: 0.6425, Precision: 0.6633, Recall: 0.6425, F1-Score: 0.6431
New best model saved with F1-Score: 0.6431

--- Epoch 2/5 ---


Training:   0%|          | 0/452 [00:00<?, ?it/s]

Epoch 2 Training Loss: 0.7287


Validating:   0%|          | 0/174 [00:00<?, ?it/s]

Epoch 2 Validation Loss: 0.7736
Validation -> Accuracy: 0.6656, Precision: 0.6776, Recall: 0.6656, F1-Score: 0.6674
New best model saved with F1-Score: 0.6674

--- Epoch 3/5 ---


Training:   0%|          | 0/452 [00:00<?, ?it/s]

Epoch 3 Training Loss: 0.6572


Validating:   0%|          | 0/174 [00:00<?, ?it/s]

Epoch 3 Validation Loss: 0.7771
Validation -> Accuracy: 0.6849, Precision: 0.6843, Recall: 0.6849, F1-Score: 0.6796
New best model saved with F1-Score: 0.6796

--- Epoch 4/5 ---


Training:   0%|          | 0/452 [00:00<?, ?it/s]