# Task 7.2 Solution: Transfer Learning Project

**Module:** 7 - Computer Vision  
**Type:** Solution Notebook

---

This notebook contains solutions for achieving >90% accuracy on CIFAR-100 using transfer learning.

In [None]:
import torch
import torch.nn as nn
import numpy as np
from typing import Tuple

# Select the compute device once: CUDA when a GPU is visible, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

## Solution: Achieving >90% Accuracy on CIFAR-100

Key strategies:
1. Use a larger pretrained model (EfficientNet-B3 or ConvNeXt)
2. Train on the full dataset (not a subset)
3. Longer training (50+ epochs)
4. Advanced techniques: Mixup, Label Smoothing, Gradual Unfreezing

In [None]:
# Hyperparameters for the >90% CIFAR-100 run, gathered in a single dict so
# the whole recipe can be printed and inspected in one place.
training_config = {
    'model': 'efficientnet_b3',      # Larger model for better capacity
    'dataset_subset': None,          # Full dataset (50,000 images)
    'epochs': 50,
    'batch_size': 32,
    'base_lr': 1e-4,
    'weight_decay': 0.1,
    'warmup_epochs': 5,
    'label_smoothing': 0.1,
    'mixup_alpha': 0.2,
    'strategy': 'gradual_unfreeze',
}

# Banner first, then one indented "name: value" line per setting.
print("Configuration for >90% accuracy on CIFAR-100:", "=" * 50, sep="\n")
for key, value in training_config.items():
    print(f"  {key}: {value}")

## Mixup Augmentation Implementation

Mixup creates virtual training examples by taking convex combinations of pairs of images and their labels, with the mixing weight λ drawn from a Beta(α, α) distribution.

In [None]:
def mixup_data(
    x: torch.Tensor, y: torch.Tensor, alpha: float = 0.2
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, float]:
    """
    Apply Mixup augmentation to one batch.

    Each sample is blended with a randomly chosen partner from the same batch:
    x_new = lambda * x_i + (1 - lambda) * x_j
    The labels are not blended here; both label sets are returned so the loss
    can be combined with the same weights:
    loss = lambda * loss(pred, y_i) + (1 - lambda) * loss(pred, y_j)

    Args:
        x: Input batch; the first dimension is the batch dimension
            (e.g. images shaped [B, C, H, W]).
        y: Labels, indexed along the same batch dimension as x.
        alpha: Beta(alpha, alpha) concentration used to sample lambda.
            Values <= 0 disable mixing (lambda is fixed to 1.0).

    Returns:
        mixed_x: Mixed inputs, same shape as x.
        y_a: The original labels (y itself).
        y_b: The labels of the partner samples (y permuted).
        lam: Mixing coefficient as a plain Python float.
    """
    # Always hand back a built-in float: the previous code returned an int
    # when mixing was disabled and a NumPy scalar otherwise.
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0

    # Random pairing of every sample with another sample of the same batch.
    batch_size = x.size(0)
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]

    return mixed_x, y_a, y_b, lam


def mixup_criterion(
    criterion: nn.Module,
    pred: torch.Tensor,
    y_a: torch.Tensor,
    y_b: torch.Tensor,
    lam: float,
) -> torch.Tensor:
    """
    Blend the losses against both label sets of a mixed-up batch.

    Args:
        criterion: Callable invoked as criterion(pred, labels).
        pred: Model outputs for the mixed inputs.
        y_a: First label set.
        y_b: Second label set (labels of the mixing partners).
        lam: Weight given to the y_a loss; the y_b loss gets 1 - lam.

    Returns:
        lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b


# Demonstration: mix a small random batch and show how the labels are paired.
print("Mixup Augmentation Demo", "=" * 50, sep="\n")
x = torch.randn(4, 3, 32, 32)
y = torch.arange(4)  # labels 0, 1, 2, 3 so the shuffle is easy to read

mixed_x, y_a, y_b, lam = mixup_data(x, y, alpha=0.2)
print(f"Original labels: {y.tolist()}")
print(f"y_a (first set): {y_a.tolist()}")
print(f"y_b (shuffled): {y_b.tolist()}")
print(f"Lambda (mixing coefficient): {lam:.3f}")

## Label Smoothing Implementation

Label smoothing prevents overconfidence by softening the target distribution.

In [None]:
class LabelSmoothingCrossEntropy(nn.Module):
    """
    Cross-entropy loss with label smoothing.

    Instead of hard labels [0, 0, 1, 0], uses soft labels:
    [eps/K, eps/K, 1-eps+eps/K, eps/K]
    where eps is the smoothing factor and K the number of classes, so the
    soft labels always sum to 1.

    This prevents overconfidence and improves generalization.

    Args:
        smoothing: Probability mass eps spread uniformly over all classes.
            0.0 gives ordinary cross-entropy.

    Raises:
        ValueError: If smoothing is outside [0, 1].
    """

    def __init__(self, smoothing: float = 0.1):
        super().__init__()
        if not 0.0 <= smoothing <= 1.0:
            raise ValueError(f"smoothing must be in [0, 1], got {smoothing}")
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """
        Compute the batch-mean smoothed cross-entropy.

        Args:
            pred: Raw logits; classes are on the last dimension and the
                scatter below indexes dimension 1, i.e. shape [B, K].
            target: Integer class indices, shape [B].

        Returns:
            Scalar tensor: mean over the batch of the per-sample loss.
        """
        num_classes = pred.size(-1)

        # Log softmax for numerical stability
        log_probs = torch.log_softmax(pred, dim=-1)

        # Every class gets eps/K; the true class gets (1 - eps) on top of
        # that. Writing only (1 - eps) there would drop eps/K of mass and
        # leave the target distribution summing to 1 - eps/K.
        uniform_mass = self.smoothing / num_classes
        smooth_labels = torch.full_like(log_probs, uniform_mass)
        smooth_labels.scatter_(
            1, target.unsqueeze(1), self.confidence + uniform_mass
        )

        # Cross-entropy between the soft labels and the predictions
        loss = -torch.sum(smooth_labels * log_probs, dim=-1)
        return loss.mean()


# Demonstration: smoothed loss on random logits for four samples.
criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
pred = torch.randn(4, 100)  # 100 classes for CIFAR-100
target = torch.arange(0, 100, 25)  # class indices 0, 25, 50, 75

loss = criterion(pred, target)
print(f"Label Smoothing Loss: {loss.item():.4f}")

## Gradual Unfreezing Strategy

Gradually unfreeze layers during training, starting with the layers closest to the classifier (the "top" of the network) and working back toward the input.

In [None]:
def gradual_unfreeze(model: nn.Module, epoch: int, total_epochs: int, num_stages: int = 4):
    """
    Gradually unfreeze model layers during training.

    Training is split into num_stages equal stages. At stage s (1-based) the
    classifier head plus the last s/num_stages fraction of the backbone
    layers (rounded down) are trainable; everything else is frozen. The final
    stage therefore always unfreezes the entire backbone, and a backbone with
    fewer layers than stages starts with only the head trainable.

    Every call resets requires_grad on all parameters, so it can be invoked
    at the start of each epoch. A one-line summary is printed.

    Args:
        model: Model whose backbone layers are the children of
            model.features; if there is no 'features' attribute, all direct
            children except the last are used (the last child is assumed to
            be the head). The head is looked up as 'classifier', then 'fc'.
        epoch: Current epoch (0-based).
        total_epochs: Total training epochs
        num_stages: Number of unfreezing stages

    Raises:
        ValueError: If num_stages is less than 1.
    """
    if num_stages < 1:
        raise ValueError(f"num_stages must be >= 1, got {num_stages}")

    # Get all backbone layers, ordered from input side to output side
    if hasattr(model, 'features'):
        layers = list(model.features.children())
    else:
        layers = list(model.children())[:-1]  # Exclude classifier

    # At least one epoch per stage, so schedules shorter than num_stages
    # do not divide by zero.
    epochs_per_stage = max(1, total_epochs // num_stages)
    current_stage = max(0, min(epoch // epochs_per_stage, num_stages - 1))

    # Multiply before dividing: dividing first truncated the per-stage share,
    # so the last stage could leave layers frozen, and a share of 0 made the
    # slice below select every layer.
    layers_to_unfreeze = (current_stage + 1) * len(layers) // num_stages

    # Freeze all layers first
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the head (always trainable); 'classifier' wins over 'fc'
    for head_name in ('classifier', 'fc'):
        if hasattr(model, head_name):
            for param in getattr(model, head_name).parameters():
                param.requires_grad = True
            break

    # Unfreeze the last N layers. An explicit start index is used because
    # layers[-0:] would be the whole list rather than an empty one.
    first_unfrozen = len(layers) - layers_to_unfreeze
    for layer in layers[first_unfrozen:]:
        for param in layer.parameters():
            param.requires_grad = True

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    percent = 100 * trainable / total if total else 0.0

    print(f"Stage {current_stage + 1}/{num_stages}: {trainable:,}/{total:,} params trainable ({percent:.1f}%)")


# Demonstration: descriptive banner only; no model is built or trained here.
print("Gradual Unfreezing Strategy", "=" * 50, sep="\n")
print(
    "\nThis strategy trains classifier first, then progressively",
    "unfreezes deeper layers for fine-grained adaptation.",
    sep="\n",
)

## Complete Training Loop

In [None]:
def train_epoch_with_mixup(
    model: nn.Module,
    dataloader,
    optimizer,
    criterion,
    device: torch.device,
    mixup_alpha: float = 0.2
) -> float:
    """
    Train one epoch with Mixup augmentation.

    For every batch: move it to `device`, mix it with mixup_data, run the
    forward pass on the mixed inputs, compute the Mixup-weighted loss with
    mixup_criterion, then backpropagate and step the optimizer.

    Args:
        model: Model to train; it is switched to training mode.
        dataloader: Iterable yielding (images, labels) batches.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss callable invoked as criterion(outputs, labels).
        device: Device the batches are moved to.
        mixup_alpha: Beta(alpha, alpha) parameter forwarded to mixup_data.

    Returns:
        Average of the per-batch losses for the epoch, or 0.0 when the
        dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)

        # Apply Mixup
        mixed_images, y_a, y_b, lam = mixup_data(images, labels, mixup_alpha)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(mixed_images)

        # Mixup loss
        loss = mixup_criterion(criterion, outputs, y_a, y_b, lam)

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # An empty dataloader previously crashed with ZeroDivisionError here.
    if num_batches == 0:
        return 0.0

    return total_loss / num_batches


# Confirmation message followed by the four-step recipe, one item per line.
print("Training functions defined!")
print(
    "\nTo achieve >90% on CIFAR-100:",
    "1. Use EfficientNet-B3 or ConvNeXt-Tiny as backbone",
    "2. Train for 50+ epochs with gradual unfreezing",
    "3. Apply Mixup (alpha=0.2) and Label Smoothing (0.1)",
    "4. Use cosine annealing LR schedule with warmup",
    sep="\n",
)

## Summary

Key techniques for achieving >90% on CIFAR-100:

1. **Larger Model**: EfficientNet-B3 has more capacity than ResNet-18
2. **Mixup**: Creates virtual training samples, reduces overfitting
3. **Label Smoothing**: Prevents overconfident predictions
4. **Gradual Unfreezing**: Starts from pretrained weights, adapts carefully
5. **Longer Training**: 50+ epochs with a warmup + cosine-annealing LR schedule

Expected results:
- ResNet-18 baseline: ~75-80%
- With techniques above: ~90-92%

In [None]:
# Cleanup: release memory held by this notebook session
import gc
gc.collect()  # run a garbage-collection pass over unreachable Python objects
if torch.cuda.is_available():
    # Only meaningful with a GPU: ask PyTorch to release its cached CUDA memory
    torch.cuda.empty_cache()
print("Cleanup complete!")