# Data & Model Loading

## Setup

### General

In [2]:
from typing import Callable, Literal

from PIL import Image

import torch
import torchvision
import torchaudio
from torch.optim import Optimizer
from torch.amp import GradScaler, autocast

import logging
from tqdm import tqdm

import psutil
# pynvml is optional: GPU statistics are only reported when the module can be
# imported AND NVML initialises successfully.
try:
    import pynvml
except ImportError:
    # Handled separately from NVMLError on purpose: when the import itself
    # fails, the name `pynvml` is unbound, so `pynvml.NVMLError` cannot be
    # referenced in the same except clause.
    pynvml_available = False
else:
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        pynvml_available = False
    else:
        pynvml_available = True
print(f"pynvml available: {pynvml_available}")

pynvml available: True


In [3]:
# Root logging setup: emit everything from DEBUG upwards through a stream
# handler, prefixing each record with timestamp, logger name and level.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Logger used for this notebook's own messages.
logger = logging.getLogger(__name__)

### Training loops

In [None]:
# Training loop function
def train_loop(
    model: torch.nn.Module,
    train_loader: torch.utils.data.DataLoader,
    val_loader: torch.utils.data.DataLoader,
    epochs: int,
    criterion: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    optimizer: Optimizer,
    device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
    use_amp: bool = True,
    dtype: torch.dtype = torch.bfloat16
):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Every epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``. A one-line summary (mean per-batch
    train/validation loss, validation accuracy, CPU/RAM usage and, when
    available, GPU utilisation) is then emitted through ``tqdm.write``.

    Args:
        model: Network to train. It is moved to ``device`` in place.
        train_loader: Yields ``(inputs, labels)`` batches used for optimisation.
        val_loader: Yields ``(inputs, labels)`` batches used for evaluation.
        epochs: Number of train + validation passes to run.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss.
        optimizer: Optimizer already bound to the parameters to update.
        device: Target device. Besides the plain ``"cpu"`` / ``"cuda"``
            strings, an indexed string such as ``"cuda:0"`` or a
            ``torch.device`` is accepted.
        use_amp: Enable autocast and gradient scaling. Only takes effect when
            the target device is a CUDA device.
        dtype: Autocast dtype used on CUDA devices.

    Returns:
        None. Progress and metrics are reported via tqdm only.

    Note:
        Validation accuracy takes the argmax over dimension 1 of the model
        output and compares it with ``labels``; this assumes
        classification-style outputs with class-index labels.
        Reported losses are averaged over batches, not over samples.
    """
    # Normalise once so "cuda", "cuda:0" and torch.device("cuda") all behave
    # the same; autocast/GradScaler need the bare device *type*.
    target = torch.device(device)
    device_type = target.type
    on_cuda = device_type == "cuda"
    amp_enabled = use_amp and on_cuda
    autocast_dtype = dtype if on_cuda else None

    model.to(target)

    scaler = GradScaler(device_type, enabled=amp_enabled)

    for epoch in range(epochs):
        # Train step
        model.train()
        train_loss = 0.0
        train_batches = 0
        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Training]")
        for inputs, labels in train_pbar:
            inputs, labels = inputs.to(target), labels.to(target)

            optimizer.zero_grad(set_to_none=True)

            with autocast(device_type=device_type, enabled=amp_enabled, dtype=autocast_dtype):
                outputs = model(inputs)
                loss: torch.Tensor = criterion(outputs, labels)

            # When the scaler is disabled, scale() returns the loss unchanged,
            # step() simply calls optimizer.step() and update() is a no-op, so
            # one code path serves both the AMP and the full-precision case.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the scalar once: every .item() forces a device-to-host sync.
            batch_loss = loss.item()
            train_loss += batch_loss
            train_batches += 1
            train_pbar.set_postfix(loss=batch_loss)

        # Batches are counted in the loop (instead of len(loader)) so that an
        # empty loader, or one without a length, does not raise here.
        avg_train_loss = train_loss / train_batches if train_batches > 0 else 0.0

        # Validation step
        model.eval()
        val_loss = 0.0
        val_batches = 0
        correct = 0
        total_samples = 0
        val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Validation]")
        with torch.no_grad():
            for inputs, labels in val_pbar:
                inputs, labels = inputs.to(target), labels.to(target)
                with autocast(device_type=device_type, enabled=amp_enabled, dtype=autocast_dtype):
                    outputs: torch.Tensor = model(inputs)
                    loss_val: torch.Tensor = criterion(outputs, labels)

                batch_val_loss = loss_val.item()
                val_loss += batch_val_loss
                val_batches += 1
                predicted = outputs.argmax(dim=1)
                correct += predicted.eq(labels).sum().item()
                total_samples += labels.size(0)
                val_pbar.set_postfix(loss=batch_val_loss, acc=correct/total_samples if total_samples > 0 else 0)

        avg_val_loss = val_loss / val_batches if val_batches > 0 else 0.0
        val_accuracy = correct / total_samples if total_samples > 0 else 0

        # System stats reporting
        cpu_usage = psutil.cpu_percent()
        ram_usage = psutil.virtual_memory().percent
        stats_msg = f"CPU Usage: {cpu_usage:.2f}% | RAM Usage: {ram_usage:.2f}%"

        if on_cuda and pynvml_available and torch.cuda.is_available():
            try:
                # Report the GPU that was actually requested; fall back to the
                # current CUDA device when no index was given.
                # NOTE(review): NVML and CUDA device indices can differ (e.g.
                # under CUDA_VISIBLE_DEVICES) - confirm on multi-GPU hosts.
                gpu_id = target.index if target.index is not None else torch.cuda.current_device()
                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
                gpu_util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_mem_usage = mem_info.used / mem_info.total * 100
                stats_msg += f" | GPU {gpu_id} Util: {gpu_util:.2f}% | GPU {gpu_id} Mem: {gpu_mem_usage:.2f}%"
            except pynvml.NVMLError as e:
                stats_msg += f" | GPU Stats Error: {e}"

        tqdm.write(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f} | {stats_msg}")


def _replace_classification_head(model: torch.nn.Module, num_classes: int) -> torch.nn.Module:
    """Swap the model's final linear layer for a new ``num_classes``-way layer.

    Returns the module (``model.fc`` or ``model.classifier``) whose parameters
    make up the trainable head. Raises ``AttributeError`` when no supported
    head structure is found.
    """
    if hasattr(model, 'fc'):
        model.fc = torch.nn.Linear(model.fc.in_features, num_classes)
        return model.fc

    classifier = getattr(model, 'classifier', None)

    if isinstance(classifier, torch.nn.Linear):  # e.g. some ViT models
        model.classifier = torch.nn.Linear(classifier.in_features, num_classes)
        return model.classifier

    if isinstance(classifier, torch.nn.Sequential):
        # Common case for models like MobileNetV2, EfficientNet
        if len(classifier) > 0 and hasattr(classifier[-1], 'in_features'):
            classifier[-1] = torch.nn.Linear(classifier[-1].in_features, num_classes)
            return classifier

        # The last entry is not a Linear layer: fall back to the last Linear
        # found anywhere in the block, which may not be the output layer.
        logger.warning("Warning: Classifier final layer replacement might be inexact.")
        for idx in reversed(range(len(classifier))):
            if isinstance(classifier[idx], torch.nn.Linear):
                classifier[idx] = torch.nn.Linear(classifier[idx].in_features, num_classes)
                return classifier
        raise AttributeError("Could not find a final Linear layer in classifier")

    raise AttributeError("Model does not have 'fc' or a known 'classifier' structure to replace.")


def adapt_model_head_to_dataset(
    model: torch.nn.Module,
    num_classes: int,
    train_dataset: torch.utils.data.Dataset,
    val_dataset: torch.utils.data.Dataset,
    batch_size: int = 32,
    shuffle: bool = True,
    num_workers: int = 4,
    pin_memory: bool = True,
    head_train_epochs: int = 10,
    fine_tune_epochs: int = 5,
    optimizer_cls: type[Optimizer] = torch.optim.Adam,
    head_train_lr: float = 0.001,
    fine_tune_lr: float = 0.0001,
    device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu",
    use_amp: bool = True,
    dtype: torch.dtype = torch.bfloat16
) -> torch.nn.Module:
    """Adapt a classification model to a new dataset in two phases.

    1. Replace the final linear layer (``model.fc`` or the last layer of
       ``model.classifier``) so it outputs ``num_classes`` logits.
    2. Train only the head (``model.fc`` or the whole ``model.classifier``
       block) with every other parameter frozen.
    3. Unfreeze all parameters and fine-tune the full model.

    Both phases use ``torch.nn.CrossEntropyLoss``, a fresh ``optimizer_cls``
    instance and ``train_loop``. No augmentation is applied here; any random
    transformations must already be part of ``train_dataset``.

    Args:
        model: Model to adapt. It is modified in place and also returned.
        num_classes: Number of output classes of the new head (must be >= 1).
        train_dataset: Dataset used for optimisation in both phases.
        val_dataset: Dataset used for per-epoch validation (never shuffled).
        batch_size: Batch size of both data loaders.
        shuffle: Whether the training loader shuffles its samples.
        num_workers: Worker process count of both data loaders.
        pin_memory: Request pinned host memory in the loaders; only applied
            when ``device`` is a CUDA device.
        head_train_epochs: Epochs for the frozen-backbone phase.
        fine_tune_epochs: Epochs for the full fine-tuning phase.
        optimizer_cls: Optimizer class, called as ``optimizer_cls(params, lr=...)``.
        head_train_lr: Learning rate for the head-training phase.
        fine_tune_lr: Learning rate for the fine-tuning phase.
        device: Device passed to ``model.to`` and ``train_loop``.
        use_amp: Forwarded to ``train_loop``.
        dtype: Forwarded to ``train_loop``.

    Returns:
        The same ``model`` instance after both training phases.

    Raises:
        ValueError: If ``num_classes`` is smaller than 1.
        AttributeError: If the model exposes neither ``fc`` nor a supported
            ``classifier`` structure.
    """
    # Validate before touching the model so a bad call leaves it unmodified.
    if num_classes < 1:
        raise ValueError(f"num_classes must be a positive integer, got {num_classes}")

    # Replace the final layer
    head = _replace_classification_head(model, num_classes)

    model.to(device)

    # Pinned host memory only helps host-to-GPU copies; skip it for CPU runs.
    use_pinned_memory = pin_memory and torch.device(device).type == "cuda"

    # DataLoaders
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=shuffle,
                                               num_workers=num_workers,
                                               pin_memory=use_pinned_memory)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             pin_memory=use_pinned_memory)

    criterion = torch.nn.CrossEntropyLoss()

    # --- Train the head ---
    logger.info("Training head of the model with backbone frozen...")
    # Freeze everything, then re-enable gradients for the head only
    # (model.fc, or every parameter in the classifier block).
    for param in model.parameters():
        param.requires_grad = False
    for param in head.parameters():
        param.requires_grad = True

    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer_head = optimizer_cls(trainable_params, lr=head_train_lr)

    train_loop(model, train_loader, val_loader, head_train_epochs, criterion, optimizer_head, device, use_amp, dtype)

    # --- Fine-tune the entire model ---
    logger.info("Fine-tuning full model...")
    # Unfreeze the backbone
    for param in model.parameters():
        param.requires_grad = True

    optimizer_finetune = optimizer_cls(model.parameters(), lr=fine_tune_lr)

    train_loop(model, train_loader, val_loader, fine_tune_epochs, criterion, optimizer_finetune, device, use_amp, dtype)

    return model

## Summary

### Image Processing Model
* Task: Image classification
* Model: MobileNetV2
* Dataset: CIFAR-10

In [5]:
# Pretrained MobileNetV2 (torchvision's default weight set)
mobilnet_v2 = torchvision.models.mobilenet_v2(
    weights=torchvision.models.MobileNet_V2_Weights.DEFAULT
)

# Training pipeline: random 224px crop, flip and colour jitter, followed by
# tensor conversion and normalisation with the ImageNet means/stds.
image_train_cifar10_mobilenetV2_transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.RandomResizedCrop(224),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ColorJitter(
            brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1
        ),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
        ),
    ]
)

# Validation pipeline: deterministic resize + centre crop, same normalisation.
image_val_cifar10_mobilenetV2_transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.Resize(256),
        torchvision.transforms.CenterCrop(224),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
        ),
    ]
)

# CIFAR-10 splits (downloaded on first use), each with its own transform
train_dataset = torchvision.datasets.CIFAR10(
    "../data/image/cifar10/train",
    train=True,
    download=True,
    transform=image_train_cifar10_mobilenetV2_transform,
)
val_dataset = torchvision.datasets.CIFAR10(
    "../data/image/cifar10/test",
    train=False,
    download=True,
    transform=image_val_cifar10_mobilenetV2_transform,
)

In [6]:
# Two-phase adaptation of MobileNetV2 to CIFAR-10:
# first train the new 10-class head, then fine-tune the whole network.
adapted_model = adapt_model_head_to_dataset(
    model=mobilnet_v2,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    num_classes=10,                  # CIFAR-10 label count
    batch_size=64,                   # tune to the available memory
    optimizer_cls=torch.optim.Adam,  # same optimizer class for both phases
    head_train_epochs=5,             # phase 1: head only
    head_train_lr=1e-3,
    fine_tune_epochs=3,              # phase 2: full model
    fine_tune_lr=1e-4,
)

2025-06-09 22:35:56,537 - __main__ - INFO - Training head of the model with backbone frozen...
Epoch 1/5 [Training]:   0%|          | 0/782 [00:00<?, ?it/s]


TypeError: autocast.__init__() missing 1 required positional argument: 'device_type'

In [None]:
# Export the adapted model (weights only, as a state dict)
from pathlib import Path

export_path = Path("../models/base/mobilenet_v2_cifar10.pth")
# torch.save does not create missing directories; make sure the target folder
# exists so a finished training run is not lost to a failed save.
export_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(adapted_model.state_dict(), export_path)

### Audio Processing Model
* Task: Audio classification
* Model: YAMNet
* Dataset: ESC-50