# 1. Data & Model Loading

This notebook prepares the data and models used for the subsequent optimisation pipeline. This is to emulate a non-compressed model training and evaluation process, where the model is adapted to a specific dataset and then exported for further compression for embedded deployment.

The process is defined as such:
* A Torch dataset (already split into train and val) and model are loaded. Those must be specialized for classification tasks, but are agnostic
of the modality.
* The model's classification head is adapted to the number of classes in the dataset, trained on the training set while freezing the backbone, and evaluated on the validation set.
* The whole model (backbone + classification head) is then adapted to the dataset by unfreezing all layers and fine-tuning the full model on the training set with a lower learning rate, again evaluating on the validation set after each epoch.
* The adapted model's weights are then exported as a Torch state dict (`state_dict`) for later use in the optimisation pipeline.

2 models are exported:
* An image MobileNetV2 model with a classification head adapted to the CIFAR-10 dataset.
* An audio YAMNet model with a classification head adapted to the ESC-50 dataset.

## Setup

### General

In [None]:
from typing import Callable, Literal
from itertools import islice

import os
import time

import torch
import torchvision
from torch.optim import Optimizer
from torch.amp import GradScaler, autocast

import logging
from tqdm import tqdm

import psutil
# Probe for NVML (GPU telemetry). The import and the initialisation are guarded
# separately: if `import pynvml` itself fails, the name `pynvml` is unbound, so
# an `except (ImportError, pynvml.NVMLError)` clause would raise NameError while
# being evaluated instead of falling back gracefully.
try:
    import pynvml
except ImportError:
    pynvml = None  # keep the name bound; every use below is gated on pynvml_available
    pynvml_available = False
else:
    try:
        pynvml.nvmlInit()
        pynvml_available = True
    except pynvml.NVMLError:
        # The package is importable but NVML could not be initialised.
        pynvml_available = False
print(f"pynvml available: {pynvml_available}")

# --- Compute device and the autocast dtype that goes with it ---
# bfloat16 is only selected when CUDA is available; CPU runs stay in float32.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")
DTYPE = torch.float32 if DEVICE != "cuda" else torch.bfloat16

# --- Root folders (relative to the notebook) for datasets and models ---
BASE_DATA_DIR = "../data"
BASE_MODEL_DIR = "../models"
IMAGE_DATA_DIR = os.path.join(BASE_DATA_DIR, "image")
MODEL_BASELINE_DIR = os.path.join(BASE_MODEL_DIR, "baseline")

# --- CIFAR-10: one sub-folder per split, each holding a pickled dataset ---
CIFAR10_DIR = os.path.join(IMAGE_DATA_DIR, "cifar10")

CIFAR10_TRAIN_DIR = os.path.join(CIFAR10_DIR, "train")
CIFAR10_VAL_DIR = os.path.join(CIFAR10_DIR, "val")
CIFAR10_TEST_DIR = os.path.join(CIFAR10_DIR, "test")

CIFAR10_TRAIN_PT_FILE = os.path.join(CIFAR10_TRAIN_DIR, "data.pt")
CIFAR10_VAL_PT_FILE = os.path.join(CIFAR10_VAL_DIR, "data.pt")
CIFAR10_TEST_PT_FILE = os.path.join(CIFAR10_TEST_DIR, "data.pt")

# --- Where the adapted MobileNetV2 baseline weights are written ---
MOBILENETV2_CIFAR10_BASELINE_PT_FILE = os.path.join(MODEL_BASELINE_DIR, "mobilenetv2_cifar10.pt")

# --- Logging: DEBUG level, timestamped records sent to a stream handler ---
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)

logger = logging.getLogger(__name__)

pynvml available: True
Using device: cuda


### Utils

In [2]:
def _get_system_stats_msg(device: str) -> str:
    """Build a one-line, ``" | "``-separated summary of current resource usage.

    CPU utilisation and RAM usage (from ``psutil``) are always reported. GPU
    utilisation and GPU memory are appended only when ``device`` is ``"cuda"``,
    NVML was initialised successfully and CUDA is available.

    Args:
        device: Device string the caller is running on (``"cuda"`` enables the
            GPU section).

    Returns:
        The formatted statistics message. If an NVML query fails, a
        ``GPU Stats Error`` segment carrying the error is appended instead of
        the GPU figures.
    """
    gib = 1024**3

    cpu_pct = psutil.cpu_percent()
    ram = psutil.virtual_memory()
    segments = [
        f"CPU Usage: {cpu_pct:.2f}%",
        f"RAM Usage: {ram.used / gib:.1f}/{ram.total / gib:.1f}GB ({ram.percent:.1f}%)",
    ]

    gpu_reporting = device == "cuda" and pynvml_available and torch.cuda.is_available()
    if gpu_reporting:
        try:
            # The CUDA device index is used directly as the NVML device index.
            index = torch.cuda.current_device()
            nvml_device = pynvml.nvmlDeviceGetHandleByIndex(index)
            utilisation = pynvml.nvmlDeviceGetUtilizationRates(nvml_device).gpu

            vram = pynvml.nvmlDeviceGetMemoryInfo(nvml_device)
            vram_pct = vram.used / vram.total * 100
            util_segment = f"GPU {index} Util: {utilisation:.2f}%"
            mem_segment = f"GPU {index} Mem: {vram.used / gib:.1f}/{vram.total / gib:.1f}GB ({vram_pct:.1f}%)"
            segments.extend([util_segment, mem_segment])
        except pynvml.NVMLError as e:
            segments.append(f"GPU Stats Error: {e}")

    return " | ".join(segments)

### Model train

In [3]:
# Helper function to run one pass (train, validation, or evaluation)
def _run_one_pass(
    model: torch.nn.Module,
    loader: torch.utils.data.DataLoader,
    criterion: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    device: str,
    use_amp: bool,
    dtype: torch.dtype,
    is_train_pass: bool,
    optimizer: Optimizer | None = None,
    scaler: GradScaler | None = None,
    desc: str = "Processing",
    enable_timing: bool = False
) -> dict[str, float]:
    if is_train_pass:
        model.train()
        if optimizer is None:
            raise ValueError("Optimizer must be provided for training pass.")
    else:
        model.eval()

    total_loss = 0.0
    correct_preds = 0
    total_samples_processed = 0
    
    batch_times = []
    
    gpu_handle = None
    if device == "cuda" and pynvml_available:
        try:
            gpu_id = torch.cuda.current_device()
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        except pynvml.NVMLError as e:
            logger.warning(f"Could not get GPU handle: {e}")
            gpu_handle = None

    pbar = tqdm(loader, desc=desc)
    
    context_manager = torch.no_grad() if not is_train_pass else torch.enable_grad()
    with context_manager:
        for inputs, labels in pbar:
            inputs, labels = inputs.to(device), labels.to(device)

            batch_start_time = 0.0 
            if enable_timing:
                if device == "cuda": torch.cuda.synchronize()
                batch_start_time = time.perf_counter()

            if is_train_pass and optimizer:
                optimizer.zero_grad(set_to_none=True)

            with autocast(device_type=device, enabled=(use_amp and device == "cuda"), dtype=dtype if device == "cuda" else None):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            
            if is_train_pass and optimizer: 
                if scaler and use_amp and device == "cuda":
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    loss.backward()
                    optimizer.step()
            
            current_batch_duration = 0.0
            if enable_timing:
                if device == "cuda": torch.cuda.synchronize()
                batch_end_time = time.perf_counter()
                current_batch_duration = batch_end_time - batch_start_time
                batch_times.append(current_batch_duration)

            total_loss += loss.item() * inputs.size(0) 
            _, predicted = outputs.max(1)
            correct_preds += predicted.eq(labels).sum().item()
            total_samples_processed += labels.size(0)

            # --- Live stats for tqdm postfix ---
            postfix_stats = {"loss": f"{loss.item():.4f}"}
            if not is_train_pass: 
                current_acc = correct_preds / total_samples_processed if total_samples_processed > 0 else 0
                postfix_stats["acc"] = f"{current_acc:.4f}"

            if enable_timing and current_batch_duration > 0:
                samples_this_batch = inputs.size(0)
                batch_throughput = samples_this_batch / current_batch_duration
                postfix_stats["samples/s"] = f"{batch_throughput:.1f}" # samples per second
            
            # System stats
            postfix_stats["cpu"] = f"{psutil.cpu_percent():.1f}%"
            
            vm = psutil.virtual_memory()
            ram_used_gb = vm.used / (1024**3)
            ram_total_gb = vm.total / (1024**3)
            postfix_stats["ram"] = f"{ram_used_gb:.1f}/{ram_total_gb:.1f}GB ({vm.percent:.1f}%)"

            if device == "cuda" and pynvml_available and gpu_handle:
                try:
                    gpu_util = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle).gpu
                    mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
                    vram_used_gb = mem_info.used / (1024**3)
                    vram_total_gb = mem_info.total / (1024**3)
                    gpu_mem_percent = mem_info.used / mem_info.total * 100
                    postfix_stats["gpu_util"] = f"{gpu_util:.1f}%"
                    postfix_stats["gpu_mem"] = f"{vram_used_gb:.1f}/{vram_total_gb:.1f}GB ({gpu_mem_percent:.1f}%)"
                except pynvml.NVMLError:
                    postfix_stats["gpu_util"] = "N/A"
                    postfix_stats["gpu_mem"] = "N/A"
            
            pbar.set_postfix(**postfix_stats)
            # --- End live stats ---

    avg_loss = total_loss / total_samples_processed if total_samples_processed > 0 else 0
    accuracy = correct_preds / total_samples_processed if total_samples_processed > 0 else 0
    
    results = {"avg_loss": avg_loss, "accuracy": accuracy, "total_samples_processed": float(total_samples_processed)}
    
    if enable_timing and batch_times: 
        total_inference_time = sum(batch_times)
        num_timed_batches = len(batch_times)
        results["total_inference_time"] = total_inference_time
        results["num_timed_batches"] = float(num_timed_batches)
        
        results["avg_time_per_batch"] = total_inference_time / num_timed_batches if num_timed_batches > 0 else 0
        results["avg_time_per_sample"] = total_inference_time / total_samples_processed if total_samples_processed > 0 else 0
        results["samples_per_second"] = total_samples_processed / total_inference_time if total_inference_time > 0 else 0
    else: 
        results["total_inference_time"] = 0.0
        results["num_timed_batches"] = 0.0
        results["avg_time_per_batch"] = 0.0
        results["avg_time_per_sample"] = 0.0
        results["samples_per_second"] = 0.0

    return results

# Training loop function
def train_loop(
    model: torch.nn.Module,
    train_loader: torch.utils.data.DataLoader,
    val_loader: torch.utils.data.DataLoader,
    epochs: int,
    criterion: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    optimizer: Optimizer,
    device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
    use_amp: bool = True, 
    dtype: torch.dtype = torch.bfloat16 
):
    model.to(device)
    
    scaler = GradScaler(enabled=(use_amp and device == "cuda")) 

    for epoch in range(epochs):
        # Train step
        train_results = _run_one_pass(
            model=model,
            loader=train_loader,
            criterion=criterion,
            device=device,
            use_amp=use_amp,
            dtype=dtype,
            is_train_pass=True,
            optimizer=optimizer,
            scaler=scaler,
            desc=f"Epoch {epoch+1}/{epochs} [Training]",
            enable_timing=True
        )
        
        # Validation step
        val_results = _run_one_pass(
            model=model,
            loader=val_loader,
            criterion=criterion,
            device=device,
            use_amp=use_amp,
            dtype=dtype,
            is_train_pass=False,
            desc=f"Epoch {epoch+1}/{epochs} [Validation]",
            enable_timing=True
        )
        
        stats_msg = _get_system_stats_msg(device)
        train_samples_per_second = train_results.get("samples_per_second", 0.0)
        train_accuracy = train_results.get("accuracy", 0.0)
        val_samples_per_second = val_results.get("samples_per_second", 0.0)

        tqdm.write(
            f"Epoch {epoch+1}/{epochs}, "
            f"Train Loss: {train_results['avg_loss']:.4f}, Train Acc: {train_accuracy:.4f}, Train Throughput: {train_samples_per_second:.2f} samples/s | "
            f"Val Loss: {val_results['avg_loss']:.4f}, Val Acc: {val_results['accuracy']:.4f}, Val Throughput: {val_samples_per_second:.2f} samples/s | "
            f"{stats_msg}"
        )


# Function to adapt a classification model to a new dataset as such:
# * Take an already-loaded pretrained model (e.g. from torchvision or torchaudio)
# * Replace the final layer to match the number of classes in the new dataset
# * Train the head of the model with the backbone frozen on the new dataset
# * After training the head, unfreeze the backbone and fine-tune the entire model
# * Data augmentation is not performed here; it comes from the transforms attached to the datasets passed in
def adapt_model_head_to_dataset(
    model: torch.nn.Module,
    num_classes: int,
    train_dataset: torch.utils.data.Dataset,
    val_dataset: torch.utils.data.Dataset,
    batch_size: int = 32,
    shuffle: bool = True,
    num_workers: int = 4,
    pin_memory: bool = True,
    head_train_epochs: int = 10,
    fine_tune_epochs: int = 5,
    optimizer_cls: type[Optimizer] = torch.optim.Adam,
    head_train_lr: float = 0.001,
    fine_tune_lr: float = 0.0001,
    device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
    use_amp: bool = True,
    dtype: torch.dtype = torch.bfloat16
) -> torch.nn.Module:
    """Replace a model's classification head and train it in two stages.

    Stage 1 trains only the head (``fc`` or ``classifier``) with every other
    parameter frozen; stage 2 unfreezes all parameters and fine-tunes the full
    model. A fresh optimizer of type ``optimizer_cls`` is built for each stage.

    Args:
        model: Model exposing either an ``fc`` layer or a ``classifier`` that is
            a ``Sequential`` or a ``Linear``. It is modified in place.
        num_classes: Output size of the new head.
        train_dataset: Dataset used for training in both stages.
        val_dataset: Dataset used for per-epoch validation (never shuffled).
        batch_size: Batch size of both data loaders.
        shuffle: Whether the training loader shuffles.
        num_workers: Worker count of both data loaders.
        pin_memory: ``pin_memory`` flag of both data loaders.
        head_train_epochs: Epochs for stage 1.
        fine_tune_epochs: Epochs for stage 2.
        optimizer_cls: Optimizer class, instantiated as ``optimizer_cls(params, lr=...)``.
        head_train_lr: Learning rate for stage 1.
        fine_tune_lr: Learning rate for stage 2.
        device: ``"cpu"`` or ``"cuda"``.
        use_amp: Forwarded to the training loop.
        dtype: Autocast dtype forwarded to the training loop.

    Returns:
        The same ``model`` object, adapted and trained.

    Raises:
        AttributeError: If no replaceable head is found.
    """
    # --- Swap the head for a Linear layer with `num_classes` outputs ---
    classifier = getattr(model, "classifier", None)
    if hasattr(model, "fc"):
        model.fc = torch.nn.Linear(model.fc.in_features, num_classes)
    elif isinstance(classifier, torch.nn.Sequential):
        # Common case for models like MobileNetV2, EfficientNet
        if hasattr(classifier[-1], "in_features"):
            classifier[-1] = torch.nn.Linear(classifier[-1].in_features, num_classes)
        else:
            # Last module is not Linear-like: fall back to the last Linear in the block.
            logger.warning("Warning: Classifier final layer replacement might be inexact.")
            linear_slots = [
                idx for idx, layer in enumerate(classifier)
                if isinstance(layer, torch.nn.Linear)
            ]
            if not linear_slots:
                raise AttributeError("Could not find a final Linear layer in classifier")
            last_linear = linear_slots[-1]
            classifier[last_linear] = torch.nn.Linear(classifier[last_linear].in_features, num_classes)
    elif isinstance(classifier, torch.nn.Linear):  # e.g. some ViT models
        model.classifier = torch.nn.Linear(classifier.in_features, num_classes)
    else:
        raise AttributeError("""Model does not have "fc" or a known "classifier" structure to replace.""")

    model.to(device)

    # --- Data loaders (validation is never shuffled) ---
    def _make_loader(dataset: torch.utils.data.Dataset, shuffle_batches: bool) -> torch.utils.data.DataLoader:
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle_batches,
            num_workers=num_workers,
            pin_memory=pin_memory,
        )

    train_loader = _make_loader(train_dataset, shuffle)
    val_loader = _make_loader(val_dataset, False)

    criterion = torch.nn.CrossEntropyLoss()

    # --- Stage 1: train the head only ---
    logger.info("Training head of the model with backbone frozen...")
    model.requires_grad_(False)

    if hasattr(model, "fc"):
        head = model.fc
    elif hasattr(model, "classifier"):
        head = model.classifier  # the whole classifier block becomes trainable
    else:  # Should not happen due to checks above
        raise AttributeError("Final layer not found for unfreezing.")
    head.requires_grad_(True)

    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer_head = optimizer_cls(trainable_params, lr=head_train_lr)

    train_loop(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=head_train_epochs,
        criterion=criterion,
        optimizer=optimizer_head,
        device=device,
        use_amp=use_amp,
        dtype=dtype,
    )

    # --- Stage 2: fine-tune every parameter ---
    logger.info("Fine-tuning full model...")
    model.requires_grad_(True)

    optimizer_finetune = optimizer_cls(model.parameters(), lr=fine_tune_lr)

    train_loop(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=fine_tune_epochs,
        criterion=criterion,
        optimizer=optimizer_finetune,
        device=device,
        use_amp=use_amp,
        dtype=dtype,
    )

    return model

## Evaluation

In [4]:
# Function to evaluate a model on a test dataset and return the mean accuracy
def eval_model(
    model: torch.nn.Module,
    test_dataset: torch.utils.data.Dataset,
    batch_size: int = 32,
    criterion: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = torch.nn.CrossEntropyLoss(),
    device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
    use_amp: bool = True,
    dtype: torch.dtype = torch.bfloat16,
    num_warmup_batches: int = 5,
    num_workers: int = 4,
    pin_memory: bool = True
) -> float:
    """Evaluate ``model`` on ``test_dataset`` and report accuracy and throughput.

    A few untimed warmup batches are run first (only when the loader holds more
    batches than ``num_warmup_batches``), then the whole dataset is evaluated
    in a timed pass. Loss, accuracy, throughput and system statistics are
    written with ``tqdm.write``.

    Args:
        model: Model to evaluate; moved to ``device`` and put in eval mode.
        test_dataset: Dataset yielding ``(inputs, labels)`` samples.
        batch_size: Batch size of the evaluation loader.
        criterion: Loss callable applied to ``(outputs, labels)``.
        device: ``"cpu"`` or ``"cuda"``.
        use_amp: Enable autocast (effective on CUDA only).
        dtype: Autocast dtype used on CUDA.
        num_warmup_batches: Number of forward-only warmup batches; ``0`` disables warmup.
        num_workers: Worker count of the loader.
        pin_memory: ``pin_memory`` flag of the loader.

    Returns:
        Accuracy over the dataset, as a fraction of correctly classified samples.
    """
    model.to(device)
    model.eval()

    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

    on_cuda = device == "cuda"

    # --- Warmup: forward passes only, results discarded ---
    wants_warmup = num_warmup_batches > 0 and len(test_loader) > num_warmup_batches
    if wants_warmup:
        logger.info(f"Starting warmup for {num_warmup_batches} batches...")
        first_batches = islice(test_loader, num_warmup_batches)
        with torch.no_grad():
            for inputs, _ in tqdm(first_batches, total=num_warmup_batches, desc="[Warmup]"):
                inputs = inputs.to(device)
                with autocast(device_type=device, enabled=(use_amp and on_cuda), dtype=dtype if on_cuda else None):
                    model(inputs)
        # Make sure the warmup kernels have finished before the timed pass starts.
        if on_cuda:
            torch.cuda.synchronize()
        logger.info("Warmup complete.")

    # --- Timed evaluation over the full loader ---
    eval_results = _run_one_pass(
        model=model,
        loader=test_loader,
        criterion=criterion,
        device=device,
        use_amp=use_amp,
        dtype=dtype,
        is_train_pass=False,
        desc="[Evaluation]",
        enable_timing=True,  # required for the throughput figures below
    )

    eval_accuracy = eval_results["accuracy"]
    avg_eval_loss = eval_results["avg_loss"]

    rate = eval_results.get("samples_per_second", 0)
    per_batch_s = eval_results.get("avg_time_per_batch", 0)
    per_sample_s = eval_results.get("avg_time_per_sample", 0)

    throughput_msg = " | ".join([
        f"Throughput: {rate:.2f} samples/sec",
        f"Avg Batch Time: {per_batch_s*1000:.2f} ms",
        f"Avg Sample Time: {per_sample_s*1000:.2f} ms",
    ])

    stats_msg = _get_system_stats_msg(device)

    for line in (
        f"Evaluation Complete: Avg Loss: {avg_eval_loss:.4f}, Accuracy: {eval_accuracy:.4f}",
        throughput_msg,
        f"System Stats: {stats_msg}",
    ):
        tqdm.write(line)

    return eval_accuracy

# Image Export

In [5]:
# Pretrained MobileNetV2 (torchvision's default weights)
mobilnet_v2 = torchvision.models.mobilenet_v2(
    weights=torchvision.models.MobileNet_V2_Weights.DEFAULT
)

# Tail shared by both pipelines: tensor conversion, then normalisation with the
# ImageNet channel means / standard deviations.
_to_normalized_tensor = [
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
]

# Training pipeline: random crop/flip/colour augmentation at 224x224
image_train_cifar10_mobilenetV2_transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.RandomResizedCrop(224),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        *_to_normalized_tensor,
    ]
)

# Validation / test pipeline: deterministic resize to 256 then centre crop to 224
image_val_cifar10_mobilenetV2_transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.Resize(256),
        torchvision.transforms.CenterCrop(224),
        *_to_normalized_tensor,
    ]
)

# CIFAR-10 datasets
#
# The three splits are cached as pickled dataset objects. NOTE: torch.save /
# torch.load(weights_only=False) rely on pickle, which can execute arbitrary
# code on load -- only load cache files produced by this notebook.

# torchvision creates the dataset `root` folders when downloading, but nothing
# creates the "val" folder, so make sure every cache folder exists up front.
for _cache_file in (CIFAR10_TRAIN_PT_FILE, CIFAR10_VAL_PT_FILE, CIFAR10_TEST_PT_FILE):
    os.makedirs(os.path.dirname(_cache_file), exist_ok=True)

# The train/val caches are (re)built together whenever either one is missing.
# (The guard must look at the TRAIN file, not the test file.)
if (not os.path.exists(CIFAR10_TRAIN_PT_FILE) or
    not os.path.exists(CIFAR10_VAL_PT_FILE)):
    logger.info("Training and/or validation dataset does not exist, creating, splitting and saving...")
    full_train_augmented = torchvision.datasets.CIFAR10(
        root=CIFAR10_TRAIN_DIR,
        train=True,
        transform=image_train_cifar10_mobilenetV2_transform,
        download=True
    )
    # Second view over the same images, but with the deterministic validation
    # transform: a Subset produced by random_split shares its parent's
    # transform, so validating on it directly would apply the random training
    # augmentations to the validation samples.
    full_train_plain = torchvision.datasets.CIFAR10(
        root=CIFAR10_TRAIN_DIR,
        train=True,
        transform=image_val_cifar10_mobilenetV2_transform,
        download=False  # already fetched by the dataset above
    )
    train_val_split_generator = torch.Generator().manual_seed(42069)  # For reproducibility
    train_dataset, _val_split = torch.utils.data.random_split(full_train_augmented, [45000, 5000],
                                                              generator=train_val_split_generator)
    # Same held-out indices, deterministic preprocessing.
    val_dataset = torch.utils.data.Subset(full_train_plain, _val_split.indices)
    torch.save(train_dataset, CIFAR10_TRAIN_PT_FILE)
    torch.save(val_dataset, CIFAR10_VAL_PT_FILE)
else:
    # Caches written before the validation-transform fix still carry the
    # training augmentations; delete the train/val cache files to regenerate.
    logger.info("Loading existing training and validation datasets...")
    train_dataset = torch.load(CIFAR10_TRAIN_PT_FILE, weights_only=False)
    val_dataset = torch.load(CIFAR10_VAL_PT_FILE, weights_only=False)

if not os.path.exists(CIFAR10_TEST_PT_FILE):
    logger.info("Test dataset does not exist, creating and saving...")
    test_dataset = torchvision.datasets.CIFAR10(
        root=CIFAR10_TEST_DIR,
        train=False,
        transform=image_val_cifar10_mobilenetV2_transform,
        download=True
    )
    torch.save(test_dataset, CIFAR10_TEST_PT_FILE)
else:
    logger.info("Loading existing test dataset...")
    test_dataset = torch.load(CIFAR10_TEST_PT_FILE, weights_only=False)

2025-06-10 11:31:23,636 - __main__ - INFO - Loading existing training and validation datasets...
2025-06-10 11:31:25,267 - __main__ - INFO - Loading existing test dataset...


In [6]:
# Seed torch's global RNG so that the new head's initialisation and the batch
# shuffling are repeatable after Restart & Run All (presumably the random
# augmentations too, as far as they draw from torch's RNG -- TODO confirm).
# Some CUDA kernels are non-deterministic, so small run-to-run differences can remain.
torch.manual_seed(42069)  # same value as the train/val split generator

# Adapt the MobileNetV2 model to CIFAR-10 dataset.
# NOTE: the model is modified in place, so `adapted_model` and `mobilnet_v2`
# refer to the same object afterwards.
adapted_model = adapt_model_head_to_dataset(
    model=mobilnet_v2,
    num_classes=10,  # CIFAR-10 has 10 classes
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=64,  # Adjust batch size as needed
    head_train_epochs=5,  # Train head for fewer epochs
    fine_tune_epochs=3,  # Fine-tune for fewer epochs
    optimizer_cls=torch.optim.Adam,  # Use Adam optimizer
    head_train_lr=0.001,  # Learning rate for head training
    fine_tune_lr=0.0001,  # Learning rate for fine-tuning
    use_amp=True,  # Use mixed precision training
    device=DEVICE,
    dtype=DTYPE
)

2025-06-10 11:31:26,662 - __main__ - INFO - Training head of the model with backbone frozen...
Epoch 1/5 [Training]: 100%|██████████| 704/704 [00:35<00:00, 19.97it/s, cpu=4.8%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=35.0%, loss=0.8872, ram=6.0/30.9GB (22.8%), samples/s=360.9]  
Epoch 1/5 [Validation]: 100%|██████████| 79/79 [00:04<00:00, 18.48it/s, acc=0.5332, cpu=3.7%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=33.0%, loss=1.1550, ram=5.9/30.9GB (22.6%), samples/s=1300.1] 


Epoch 1/5, Train Loss: 1.5391, Train Acc: 0.4722, Train Throughput: 3829.16 samples/s | Val Loss: 1.3585, Val Acc: 0.5332, Val Throughput: 4957.31 samples/s | CPU Usage: 10.10% | RAM Usage: 5.7/30.9GB (21.9%) | GPU 0 Util: 33.00% | GPU 0 Mem: 3.3/24.0GB (13.9%)


Epoch 2/5 [Training]: 100%|██████████| 704/704 [00:33<00:00, 20.83it/s, cpu=3.1%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=45.0%, loss=0.9769, ram=5.9/30.9GB (22.6%), samples/s=1043.3] 
Epoch 2/5 [Validation]: 100%|██████████| 79/79 [00:04<00:00, 18.69it/s, acc=0.5356, cpu=0.0%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=38.0%, loss=1.5044, ram=5.9/30.9GB (22.6%), samples/s=1368.3] 


Epoch 2/5, Train Loss: 1.3754, Train Acc: 0.5198, Train Throughput: 4051.54 samples/s | Val Loss: 1.3059, Val Acc: 0.5356, Val Throughput: 5606.48 samples/s | CPU Usage: 12.90% | RAM Usage: 5.7/30.9GB (21.9%) | GPU 0 Util: 36.00% | GPU 0 Mem: 3.3/24.0GB (13.9%)


Epoch 3/5 [Training]: 100%|██████████| 704/704 [00:34<00:00, 20.19it/s, cpu=11.4%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=46.0%, loss=1.6875, ram=5.9/30.9GB (22.7%), samples/s=1016.5]
Epoch 3/5 [Validation]: 100%|██████████| 79/79 [00:04<00:00, 18.82it/s, acc=0.5504, cpu=3.4%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=35.0%, loss=1.5875, ram=5.9/30.9GB (22.6%), samples/s=1342.7] 


Epoch 3/5, Train Loss: 1.3402, Train Acc: 0.5316, Train Throughput: 3916.17 samples/s | Val Loss: 1.2706, Val Acc: 0.5504, Val Throughput: 5125.20 samples/s | CPU Usage: 10.00% | RAM Usage: 5.7/30.9GB (22.0%) | GPU 0 Util: 26.00% | GPU 0 Mem: 3.3/24.0GB (13.9%)


Epoch 4/5 [Training]: 100%|██████████| 704/704 [00:34<00:00, 20.22it/s, cpu=2.9%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=42.0%, loss=1.6812, ram=6.0/30.9GB (22.9%), samples/s=948.9]  
Epoch 4/5 [Validation]: 100%|██████████| 79/79 [00:04<00:00, 18.72it/s, acc=0.5504, cpu=7.1%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=37.0%, loss=1.6696, ram=5.9/30.9GB (22.7%), samples/s=1313.1] 


Epoch 4/5, Train Loss: 1.3324, Train Acc: 0.5344, Train Throughput: 3806.95 samples/s | Val Loss: 1.2721, Val Acc: 0.5504, Val Throughput: 4978.24 samples/s | CPU Usage: 9.70% | RAM Usage: 5.7/30.9GB (22.0%) | GPU 0 Util: 29.00% | GPU 0 Mem: 3.3/24.0GB (13.9%)


Epoch 5/5 [Training]: 100%|██████████| 704/704 [00:34<00:00, 20.27it/s, cpu=6.1%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=37.0%, loss=0.7859, ram=5.9/30.9GB (22.6%), samples/s=1078.3] 
Epoch 5/5 [Validation]: 100%|██████████| 79/79 [00:04<00:00, 18.91it/s, acc=0.5576, cpu=3.6%, gpu_mem=3.3/24.0GB (13.9%), gpu_util=36.0%, loss=1.7557, ram=6.0/30.9GB (22.8%), samples/s=1250.0] 
2025-06-10 11:34:41,294 - __main__ - INFO - Fine-tuning full model...


Epoch 5/5, Train Loss: 1.3202, Train Acc: 0.5364, Train Throughput: 3891.03 samples/s | Val Loss: 1.2601, Val Acc: 0.5576, Val Throughput: 4682.43 samples/s | CPU Usage: 11.00% | RAM Usage: 5.7/30.9GB (22.0%) | GPU 0 Util: 36.00% | GPU 0 Mem: 3.3/24.0GB (13.9%)


Epoch 1/3 [Training]: 100%|██████████| 704/704 [00:35<00:00, 19.93it/s, cpu=3.1%, gpu_mem=5.9/24.0GB (24.5%), gpu_util=69.0%, loss=1.1902, ram=6.0/30.9GB (23.1%), samples/s=167.1]  
Epoch 1/3 [Validation]: 100%|██████████| 79/79 [00:03<00:00, 22.86it/s, acc=0.7220, cpu=3.4%, gpu_mem=5.9/24.0GB (24.5%), gpu_util=31.0%, loss=1.0942, ram=6.0/30.9GB (22.9%), samples/s=1206.8] 


Epoch 1/3, Train Loss: 1.0154, Train Acc: 0.6437, Train Throughput: 1898.05 samples/s | Val Loss: 0.7955, Val Acc: 0.7220, Val Throughput: 6938.35 samples/s | CPU Usage: 11.10% | RAM Usage: 5.8/30.9GB (22.2%) | GPU 0 Util: 31.00% | GPU 0 Mem: 5.9/24.0GB (24.5%)


Epoch 2/3 [Training]: 100%|██████████| 704/704 [00:35<00:00, 20.04it/s, cpu=3.2%, gpu_mem=5.8/24.0GB (24.2%), gpu_util=65.0%, loss=1.1772, ram=6.0/30.9GB (22.9%), samples/s=467.4]  
Epoch 2/3 [Validation]: 100%|██████████| 79/79 [00:04<00:00, 18.94it/s, acc=0.7524, cpu=3.4%, gpu_mem=5.8/24.0GB (24.2%), gpu_util=35.0%, loss=1.0018, ram=6.0/30.9GB (22.8%), samples/s=1208.2] 


Epoch 2/3, Train Loss: 0.7797, Train Acc: 0.7271, Train Throughput: 1902.91 samples/s | Val Loss: 0.7145, Val Acc: 0.7524, Val Throughput: 6454.56 samples/s | CPU Usage: 10.30% | RAM Usage: 5.8/30.9GB (22.2%) | GPU 0 Util: 35.00% | GPU 0 Mem: 5.8/24.0GB (24.2%)


Epoch 3/3 [Training]: 100%|██████████| 704/704 [00:35<00:00, 20.03it/s, cpu=4.9%, gpu_mem=5.8/24.0GB (24.2%), gpu_util=68.0%, loss=0.7917, ram=6.0/30.9GB (22.9%), samples/s=483.0]  
Epoch 3/3 [Validation]: 100%|██████████| 79/79 [00:04<00:00, 18.71it/s, acc=0.7780, cpu=6.3%, gpu_mem=5.8/24.0GB (24.2%), gpu_util=33.0%, loss=0.5619, ram=6.0/30.9GB (22.9%), samples/s=1136.7]  


Epoch 3/3, Train Loss: 0.7031, Train Acc: 0.7539, Train Throughput: 1910.49 samples/s | Val Loss: 0.6464, Val Acc: 0.7780, Val Throughput: 6329.79 samples/s | CPU Usage: 10.10% | RAM Usage: 5.8/30.9GB (22.2%) | GPU 0 Util: 33.00% | GPU 0 Mem: 5.8/24.0GB (24.2%)


In [8]:
# Evaluate the adapted model on the held-out CIFAR-10 test set
# (the validation split was already used after every training epoch).
test_accuracy = eval_model(
    model=adapted_model,
    test_dataset=test_dataset,
    batch_size=64,  # Adjust batch size as needed
    device=DEVICE,
    use_amp=True,
    dtype=DTYPE  # same device-dependent dtype as used for training
)
logger.info(f"Test accuracy of the adapted MobileNetV2 model on CIFAR-10: {test_accuracy:.4f}")

2025-06-10 11:36:47,799 - __main__ - INFO - Starting warmup for 5 batches...
[Warmup]: 100%|██████████| 5/5 [00:00<00:00,  9.03it/s]
2025-06-10 11:36:48,431 - __main__ - INFO - Warmup complete.
[Evaluation]: 100%|██████████| 157/157 [00:03<00:00, 52.01it/s, acc=0.9011, cpu=3.6%, gpu_mem=5.8/24.0GB (24.3%), gpu_util=39.0%, loss=0.0931, ram=6.1/30.9GB (23.5%), samples/s=681.8]  
2025-06-10 11:36:51,454 - __main__ - INFO - Test accuracy of the adapted MobileNetV2 model on CIFAR-10: 0.9011


Evaluation Complete: Avg Loss: 0.2962, Accuracy: 0.9011
Throughput: 8391.41 samples/sec | Avg Batch Time: 7.59 ms | Avg Sample Time: 0.12 ms
System Stats: CPU Usage: 11.90% | RAM Usage: 5.9/30.9GB (22.8%) | GPU 0 Util: 39.00% | GPU 0 Mem: 5.8/24.0GB (24.3%)


In [None]:
# Export the adapted model's weights (state dict, not the pickled module).
# The target folder is not created anywhere else, so create it first; otherwise
# torch.save fails on a fresh checkout.
os.makedirs(os.path.dirname(MOBILENETV2_CIFAR10_BASELINE_PT_FILE), exist_ok=True)
torch.save(adapted_model.state_dict(), MOBILENETV2_CIFAR10_BASELINE_PT_FILE)