# Lab 2.1.2: Dataset Pipeline - SOLUTIONS

This notebook contains complete solutions for the Dataset Pipeline exercises.

---

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import numpy as np
from PIL import Image

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

---

## Exercise Solution: MixupDataset

Mixup is a data augmentation technique that creates new training samples by linearly interpolating between two randomly chosen samples — both their inputs and, through the loss, their labels. This encourages the model to behave linearly between training examples, which tends to improve generalization.

In [None]:
class MixupDataset(Dataset):
    """
    Dataset wrapper that applies Mixup augmentation.

    Mixup creates new training samples by combining two images:
        mixed_image = lambda * image1 + (1 - lambda) * image2

    The label is returned as a tuple (label1, label2, lambda) so the loss
    function can compute the weighted combination.

    All random draws use PyTorch's RNG instead of NumPy's global RNG.
    PyTorch gives every DataLoader worker process its own torch seed, while
    NumPy's global state may be duplicated across workers, which would make
    several workers produce the same partners and the same lambdas.
    ``torch.manual_seed`` therefore controls this dataset's randomness.

    Paper: "mixup: Beyond Empirical Risk Minimization" (Zhang et al., 2018)

    Args:
        dataset: Base dataset to wrap; indexing it must return (image, label)
        alpha: Mixup alpha parameter for Beta distribution (default: 0.2).
            A value <= 0 disables mixing (lambda is fixed to 1.0).
    """

    def __init__(self, dataset: Dataset, alpha: float = 0.2):
        self.dataset = dataset
        self.alpha = alpha

    def __len__(self) -> int:
        return len(self.dataset)

    def _sample_lambda(self) -> float:
        """Draw the mixing coefficient from Beta(alpha, alpha) as a Python float."""
        if self.alpha > 0:
            # float64 keeps more resolution near 0 and 1, where small alphas
            # concentrate most of the probability mass.
            concentration = torch.tensor(float(self.alpha), dtype=torch.float64)
            beta = torch.distributions.Beta(concentration, concentration)
            return float(beta.sample())
        return 1.0  # No mixup when alpha <= 0

    def __getitem__(self, idx: int):
        """
        Return (mixed_image, (label1, label2, lambda)) for sample ``idx``.

        ``label1`` belongs to sample ``idx``; ``label2`` belongs to a partner
        drawn uniformly at random from the whole dataset (it may be ``idx``).
        """
        # Get first sample
        img1, label1 = self.dataset[idx]

        # Get random second sample
        idx2 = int(torch.randint(len(self.dataset), (1,)).item())
        img2, label2 = self.dataset[idx2]

        # Small alpha (e.g. 0.2) pushes lambda towards 0 or 1;
        # alpha = 1.0 makes lambda uniform on [0, 1].
        lam = self._sample_lambda()

        # Mix images
        mixed_img = lam * img1 + (1 - lam) * img2

        return mixed_img, (label1, label2, lam)


def mixup_criterion(criterion, pred, labels):
    """
    Compute mixed loss for mixup training.

    Two forms of ``lambda`` are supported:

    * a scalar (Python/NumPy number or one-element tensor) shared by the whole
      batch, as produced by batch-level mixup;
    * a tensor with one value per sample, which is what a DataLoader yields
      when it collates the ``(label1, label2, lambda)`` tuples returned by a
      per-sample mixup dataset. Here each sample's loss is weighted by its own
      lambda and the batch mean is returned. This matches criteria that average
      over the batch (the default for PyTorch loss modules).

    Args:
        criterion: Base loss function (e.g., CrossEntropyLoss)
        pred: Model predictions, batch dimension first
        labels: Tuple of (label1, label2, lambda)

    Returns:
        Weighted loss

    Raises:
        ValueError: If a per-sample lambda tensor is empty or its length does
            not match the batch size of ``pred``.
    """
    label1, label2, lam = labels

    if torch.is_tensor(lam) and lam.numel() != 1:
        # Collated lambdas are typically float64 on the CPU; align them with the
        # predictions so the arithmetic below neither upcasts nor crosses devices.
        weights = lam.reshape(-1).to(device=pred.device, dtype=pred.dtype)
        if weights.numel() == 0 or weights.numel() != pred.shape[0]:
            raise ValueError(
                f"expected one lambda per sample ({pred.shape[0]}), got {weights.numel()}"
            )
        # Evaluating the criterion on one-sample slices gives per-sample losses
        # without assuming the criterion exposes a `reduction` option.
        per_sample = [
            w * criterion(pred[i:i + 1], label1[i:i + 1])
            + (1 - w) * criterion(pred[i:i + 1], label2[i:i + 1])
            for i, w in enumerate(weights)
        ]
        return torch.stack([loss.reshape(-1).mean() for loss in per_sample]).mean()

    if torch.is_tensor(lam):
        # One-element tensor: use it as a 0-dim scalar living next to `pred`.
        lam = lam.reshape(()).to(device=pred.device, dtype=pred.dtype)

    return lam * criterion(pred, label1) + (1 - lam) * criterion(pred, label2)

In [None]:
# Create a simple test dataset
class SimpleDataset(Dataset):
    """
    In-memory toy dataset of random 3x32x32 tensors with random integer labels.

    Args:
        num_samples: Number of (image, label) pairs to generate
        num_classes: Labels are drawn from [0, num_classes)
    """

    def __init__(self, num_samples=100, num_classes=10):
        self.num_samples, self.num_classes = num_samples, num_classes
        # Everything is drawn once up front, so indexing the same position
        # twice returns the same sample.
        self.images = torch.randn((num_samples, 3, 32, 32))
        self.labels = torch.randint(low=0, high=num_classes, size=(num_samples,))

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx].item()  # plain Python int rather than a 0-dim tensor
        return image, label

# Test MixupDataset: wrap the toy dataset and inspect what the wrapper returns.
base_dataset = SimpleDataset(num_samples=100)
mixup_dataset = MixupDataset(base_dataset, alpha=0.2)

print("=== Testing MixupDataset ===")

# Get a single sample: a mixed image plus the (label1, label2, lambda) target tuple.
mixed_img, (label1, label2, lam) = mixup_dataset[0]
print(f"Mixed image shape: {mixed_img.shape}")
print(f"Labels: {label1}, {label2}")
print(f"Lambda: {lam:.3f}")

# Test with DataLoader: the target tuple is unpacked into three batch-level
# values, so each sample in the batch carries its own lambda.
loader = DataLoader(mixup_dataset, batch_size=4, shuffle=True)
batch_img, (batch_l1, batch_l2, batch_lam) = next(iter(loader))
print(f"\nBatch images: {batch_img.shape}")
print(f"Batch labels1: {batch_l1}")
print(f"Batch labels2: {batch_l2}")
print(f"Batch lambdas: {batch_lam}")

---

## Alternative: Batch Mixup

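A more efficient approach is to apply mixup at the batch level rather than in the dataset. This is cleaner and allows you to use standard datasets. Note that this variant draws a single λ for the whole batch and mixes each sample with another sample from the same batch (chosen by a random permutation).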

In [None]:
def mixup_data(x, y, alpha=0.2):
    """
    Mix every sample of a batch with another sample of the same batch.

    One mixing coefficient is drawn for the whole batch, and partners are
    assigned by a random permutation of the batch indices.

    Args:
        x: Input images (B, C, H, W)
        y: Labels (B,)
        alpha: Beta(alpha, alpha) parameter; alpha <= 0 disables mixing

    Returns:
        mixed_x: lam * x + (1 - lam) * x[permutation]
        y_a: Labels of the original samples (``y`` itself)
        y_b: Labels of the partner samples
        lam: Mixing coefficient
    """
    # A single lambda shared by the whole batch.
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0

    # Sample i is paired with sample perm[i].
    perm = torch.randperm(x.size(0))
    partner_x = x[perm, :]

    mixed_x = lam * x + (1 - lam) * partner_x
    return mixed_x, y, y[perm], lam


# Example training loop with batch mixup
def train_with_mixup(model, train_loader, criterion, optimizer, alpha=0.2):
    """
    Run one training epoch with batch-level mixup augmentation.

    Batches are moved to the notebook-level ``device`` before mixing.

    Args:
        model: Module to train; it is switched to training mode
        train_loader: Iterable yielding (images, labels) batches
        criterion: Loss function called as criterion(outputs, labels)
        optimizer: Optimizer stepping the model's parameters
        alpha: Mixup alpha parameter forwarded to ``mixup_data``

    Returns:
        Mean loss over the batches seen, or 0.0 if the loader yielded none.
    """
    model.train()
    total_loss = 0.0
    # Count batches while iterating instead of calling len(train_loader):
    # loaders over iterable-style datasets may not define a length, and an
    # empty loader would otherwise cause a division by zero.
    num_batches = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Apply mixup
        mixed_images, labels_a, labels_b, lam = mixup_data(images, labels, alpha)

        optimizer.zero_grad()
        outputs = model(mixed_images)

        # Compute mixed loss: the same lambda that blended the inputs weights
        # the two label sets.
        loss = lam * criterion(outputs, labels_a) + (1 - lam) * criterion(outputs, labels_b)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        return 0.0
    return total_loss / num_batches


print("Batch Mixup function defined!")

# Test: mix a random batch of 8 images and show both label sets with the
# batch-wide lambda that weights them.
x = torch.randn(8, 3, 32, 32)
y = torch.randint(0, 10, (8,))
mixed_x, y_a, y_b, lam = mixup_data(x, y, alpha=0.2)
print(f"Original: {x.shape}, Labels: {y}")
print(f"Mixed: {mixed_x.shape}, Lambda: {lam:.3f}")
print(f"Labels A: {y_a}")
print(f"Labels B: {y_b}")

---

## Challenge Solution: Cutmix

Cutmix is similar to Mixup but instead of blending entire images, it cuts a patch from one image and pastes it onto another.

In [None]:
def rand_bbox(size, lam):
    """
    Pick a random axis-aligned box whose unclipped area is about (1 - lam)
    of the image.

    The box is centred on a uniformly random pixel and then clipped to the
    image, so the returned box can be smaller than requested.

    Args:
        size: Image size (B, C, H, W)
        lam: Mixing coefficient

    Returns:
        Bounding box coordinates (bbx1, bby1, bbx2, bby2)
    """
    height, width = size[2], size[3]

    # Scaling both sides by sqrt(1 - lam) scales the area by (1 - lam).
    side_ratio = np.sqrt(1.0 - lam)
    half_w = int(width * side_ratio) // 2
    half_h = int(height * side_ratio) // 2

    # Random centre; x is drawn before y.
    center_x = np.random.randint(width)
    center_y = np.random.randint(height)

    # Clip each edge to the image bounds.
    left = np.clip(center_x - half_w, 0, width)
    top = np.clip(center_y - half_h, 0, height)
    right = np.clip(center_x + half_w, 0, width)
    bottom = np.clip(center_y + half_h, 0, height)

    return left, top, right, bottom


def cutmix_data(x, y, alpha=1.0):
    """
    Paste one rectangular patch from a partner image into every image of a batch.

    The same box is used for the whole batch; partners are assigned by a
    random permutation of the batch indices. The input ``x`` is not modified.

    Args:
        x: Input images (B, C, H, W)
        y: Labels (B,)
        alpha: Beta(alpha, alpha) parameter; alpha <= 0 requests no patch

    Returns:
        mixed_x: Copy of ``x`` with the patch region taken from the partners
        y_a: Labels of the original samples (``y`` itself)
        y_b: Labels of the partner samples
        lam: Fraction of each image that still comes from the original,
            recomputed from the clipped box area
    """
    # Requested share of the original image that should survive.
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0

    # Sample i receives its patch from sample perm[i].
    perm = torch.randperm(x.size(0))

    left, top, right, bottom = rand_bbox(x.size(), lam)

    # Work on a copy so the caller's batch stays intact.
    mixed_x = x.clone()
    mixed_x[:, :, top:bottom, left:right] = x[perm, :, top:bottom, left:right]

    # Clipping can shrink the box, so derive lambda from the area actually pasted.
    patch_area = (right - left) * (bottom - top)
    image_area = x.size(2) * x.size(3)
    lam = 1 - (patch_area / image_area)

    return mixed_x, y, y[perm], lam


# Test Cutmix: run it on a random batch of 4 images; the printed lambda is the
# value returned by cutmix_data (recomputed from the pasted box area).
print("=== Testing Cutmix ===")
x = torch.randn(4, 3, 32, 32)
y = torch.randint(0, 10, (4,))
mixed_x, y_a, y_b, lam = cutmix_data(x, y, alpha=1.0)
print(f"Mixed images shape: {mixed_x.shape}")
print(f"Lambda: {lam:.3f}")

---

## Advanced: Prefetching DataLoader

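For better GPU utilization, we can wrap a DataLoader in a helper that loads batches and moves them to the target device in a background thread, so that data loading and transfer overlap with computation.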

In [None]:
from threading import Event, Thread
from queue import Full, Queue

class PrefetchLoader:
    """
    DataLoader wrapper that prefetches batches in a background thread.

    This overlaps data loading with GPU computation for better utilization.

    Every call to ``iter()`` gets its own queue, stop flag and worker thread,
    so the loader can be iterated for any number of epochs and an iteration
    may be abandoned early without leaking batches into the next one.
    Exceptions raised while loading are re-raised in the consuming thread.

    Args:
        loader: Base DataLoader (or any iterable of batches) to wrap
        device: Device to prefetch to (``torch.device`` or device string)
        prefetch_count: Number of batches to keep ready
    """

    # Tags of the (tag, payload) items the worker hands to the consumer.
    _BATCH = "batch"
    _ERROR = "error"
    _DONE = "done"

    # How often (seconds) a worker blocked on a full queue re-checks the stop flag.
    _POLL_INTERVAL = 0.1
    # Upper bound (seconds) on waiting for the worker when an iteration ends.
    _JOIN_TIMEOUT = 1.0

    def __init__(self, loader, device, prefetch_count=2):
        self.loader = loader
        self.device = torch.device(device)  # also accepts strings such as "cuda:0"
        self.prefetch_count = prefetch_count
        # Kept as an attribute for compatibility; replaced on every iteration.
        self.queue = Queue(maxsize=prefetch_count)
        # Dedicated copy stream so transfers need not serialize with compute.
        self.stream = (
            torch.cuda.Stream(device=self.device) if self.device.type == 'cuda' else None
        )

    def __len__(self):
        return len(self.loader)

    def __iter__(self):
        """Yield device-resident batches produced by a fresh background worker."""
        batch_queue = Queue(maxsize=self.prefetch_count)
        stop_event = Event()
        worker = Thread(
            target=self._prefetch_worker,
            args=(batch_queue, stop_event),
            daemon=True,
        )
        self.queue = batch_queue
        self._prefetch_thread = worker
        worker.start()

        try:
            while True:
                tag, payload = batch_queue.get()
                if tag == self._DONE:
                    return
                if tag == self._ERROR:
                    raise payload
                if self.stream is not None:
                    # The tensors were allocated on the copy stream but will be
                    # used on the consumer's current stream; tell the caching
                    # allocator so their memory is not recycled too early.
                    self._record_stream(payload)
                yield payload
        finally:
            # Runs on normal completion, on error, and when the consumer stops
            # early (generator closed): release the worker and wait briefly.
            stop_event.set()
            worker.join(timeout=self._JOIN_TIMEOUT)

    def _prefetch_worker(self, batch_queue, stop_event):
        """Background thread: load batches, move them to the device, enqueue them."""
        try:
            for batch in self.loader:
                if stop_event.is_set():
                    return
                # Transfer to GPU in stream
                if self.stream is not None:
                    with torch.cuda.stream(self.stream):
                        batch = self._to_device(batch)
                    # Make sure the copy has finished before the batch is handed over.
                    self.stream.synchronize()
                else:
                    batch = self._to_device(batch)

                if not self._put(batch_queue, (self._BATCH, batch), stop_event):
                    return
        except BaseException as exc:
            # Forward every failure; otherwise the consumer would wait forever
            # on a queue that nobody fills.
            self._put(batch_queue, (self._ERROR, exc), stop_event)
            return

        self._put(batch_queue, (self._DONE, None), stop_event)  # Signal end

    def _put(self, batch_queue, item, stop_event):
        """Enqueue ``item`` unless the consumer has gone away; return success."""
        while not stop_event.is_set():
            try:
                batch_queue.put(item, timeout=self._POLL_INTERVAL)
                return True
            except Full:
                continue
        return False

    def _to_device(self, batch):
        """
        Move batch to device.

        Tensors are moved; lists and tuples are processed element-wise and
        returned as lists; dict values are processed recursively; anything
        else is returned unchanged.
        """
        if isinstance(batch, torch.Tensor):
            return batch.to(self.device, non_blocking=True)
        if isinstance(batch, dict):
            return {key: self._to_device(value) for key, value in batch.items()}
        if isinstance(batch, (list, tuple)):
            return [self._to_device(x) for x in batch]
        return batch

    def _record_stream(self, batch):
        """Mark every CUDA tensor in ``batch`` as used by the current stream."""
        if isinstance(batch, torch.Tensor):
            if batch.is_cuda:
                batch.record_stream(torch.cuda.current_stream(batch.device))
        elif isinstance(batch, dict):
            for value in batch.values():
                self._record_stream(value)
        elif isinstance(batch, (list, tuple)):
            for item in batch:
                self._record_stream(item)


# Example usage: only prints a hint; no PrefetchLoader is constructed here.
print("PrefetchLoader defined!")
print("Usage: prefetch_loader = PrefetchLoader(train_loader, device)")

---

## Performance Comparison

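Here's a summary of different augmentation strategies. The improvement figures are rough, indicative accuracy gains; actual results depend on the dataset, model, and training setup: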

| Augmentation | Type | Typical Improvement | Overhead |
|--------------|------|---------------------|----------|
| Standard (flip, crop, jitter) | Spatial/Color | Baseline | Low |
| Mixup | Interpolation | +1-2% | Minimal |
| Cutmix | Region-based | +1-2% | Minimal |
| RandAugment | Automated | +2-3% | Medium |
| AutoAugment | Learned | +2-3% | High (training) |

**Recommendation for DGX Spark:**
- Use standard augmentation + Mixup/Cutmix
- Consider RandAugment for larger models
- Batch-level augmentation is more efficient than dataset-level

In [None]:
# Cleanup
import gc

# Collect unreachable Python objects first so that any tensors they still hold
# are released, and only then ask PyTorch to give back its unused cached GPU
# memory. In the opposite order, memory freed by the collection would stay in
# the allocator's cache.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Cleanup complete!")