<a href="https://colab.research.google.com/github/PETEROA/AutoML/blob/main/Basic_Distill.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook 2: Basic Knowledge Distillation

This notebook implements and compares different knowledge distillation techniques.
- Load teacher and student models
- Implement multiple distillation loss functions
- Train students with different strategies
- Compare: Vanilla training vs. Distillation
- Measure accuracy retention and speedup

Outputs:
- Distilled student models
- Comparison metrics
- Best distillation strategies for NAS

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Check what's in your Google Drive root
print("Files in MyDrive:")
print(os.listdir('/content/drive/MyDrive'))
print("\n" + "="*60)

# Check if automl-distill folder exists
if os.path.exists('/content/drive/MyDrive/AutoML'):
    print("\n✓ automl-distill folder found!")
    print("\nFiles in automl-distill:")
    print(os.listdir('/content/drive/MyDrive/AutoML'))
else:
    print("\n✗ automl-distill folder NOT found")
    print("You need to create it first")

Files in MyDrive:
['Colab Notebooks', 'Peter', 'Untitled Diagram.drawio', 'T.pdf', 'INTERNATIONAL PASSPORT.pdf', 'Pricilia_Agida.docx', 'BIS_Coursework 2.docx', 'PETER OWAN AGIDA-converted (1).pdf', 'PETER OWAN AGIDA-converted.pdf', '4070 .gdoc', 'CW 3 G4 REPORT.docx', 'FIN_ SPONSORSHIP_.docx', 'NG_CA_EN.docx', 'Lodger_Agreement (Peter Agida).docx', 'Copy of #csquad budget template.xlsx', 'Coach Ticket EURHM942.pdf', 'RSA slides.pptx', 'pres12345.mp4', 'Diagrammatic problem (1).pptx', 'Diagrammatic problem.pptx', 'Diagrammatic problem.gslides', 'Peter+Agida+-+CV.docx', 'BUILDING A MACHINE LEARNING POWERED APPLICATION.gdoc', 'lab.db', 'ASIA_social sector_AI Saturdays.jpeg', 'TEAM MANDELA : Project Proposal.gdoc', 'Resume_1.pdf', 'Untitled document (12).gdoc', 'ML Engineer (1).pdf', 'ML Engineer.pdf', 'Peter_N_CV.gdoc', 'Untitled document (11).gdoc', 'PAPERS REVIEW.gdoc', 'SAFE RL.gdoc', 'SAFE RL.pdf', 'TOTAL CONCERN ASSET LTD.pdf', 'TOTAL CONCERN ASSET LTD.gdoc', 'Untitled document (10)

In [None]:
!pip install torch torchvision transformers -q
!pip install matplotlib seaborn pandas numpy tqdm -q

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Subset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import time
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Add project path BEFORE importing utils
import sys
sys.path.append('/content/drive/MyDrive/AutoML')  # Adjust to your p

# Import utilities (adjust path for Colab if needed)
try:
    from Utils import *
except:
    print("Note: utils.py not found, defining functions inline")

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Setup directories
# For Colab, use: OUTPUT_DIR = Path('/content/drive/MyDrive/AutoML/outputs')
# For local/Claude: OUTPUT_DIR = Path('/mnt/user-data/outputs')
OUTPUT_DIR = Path('/content/outputs')  # Change this based on your setup
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

MODELS_DIR = OUTPUT_DIR / 'models'
MODELS_DIR.mkdir(exist_ok=True)

RESULTS_DIR = OUTPUT_DIR / 'results'
RESULTS_DIR.mkdir(exist_ok=True)

Using device: cpu


In [None]:
# Load model profiles from Notebook 1
profile_file = OUTPUT_DIR / 'all_model_profiles.json'

if profile_file.exists():
    with open(profile_file, 'r') as f:
        all_profiles = json.load(f)
    print("✓ Loaded profiles from Notebook 1")
    print(f"  Vision models: {list(all_profiles.get('vision_models', {}).keys())}")
    print(f"  Language models: {list(all_profiles.get('language_models', {}).keys())}")
else:
    print("⚠ Profile file not found - proceeding without it")
    print("  (Profiling data is optional for this notebook)")
    all_profiles = {'vision_models': {}, 'language_models': {}}

print("\n✓ Ready to proceed with distillation training")

⚠ Profile file not found - proceeding without it
  (Profiling data is optional for this notebook)

✓ Ready to proceed with distillation training


In [None]:
## 2. Setup Dataset (CIFAR-10 for quick experiments)
def get_cifar10_dataloaders(batch_size=32, num_train_samples=5000, num_val_samples=1000):
    """
    Get CIFAR-10 dataloaders
    Using subset for faster experiments
    """
    # Data augmentation for training
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    # Download datasets
    trainset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform_train
    )

    testset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform_test
    )

    # Create subsets for faster training
    train_indices = torch.randperm(len(trainset))[:num_train_samples]
    val_indices = torch.randperm(len(testset))[:num_val_samples]

    train_subset = Subset(trainset, train_indices)
    val_subset = Subset(testset, val_indices)

    # Create dataloaders
    trainloader = DataLoader(
        train_subset, batch_size=batch_size, shuffle=True, num_workers=2
    )

    valloader = DataLoader(
        val_subset, batch_size=batch_size, shuffle=False, num_workers=2
    )

    print(f"Train samples: {len(train_subset)}, Val samples: {len(val_subset)}")

    return trainloader, valloader

# Get dataloaders
trainloader, valloader = get_cifar10_dataloaders(
    batch_size=64,
    num_train_samples=10000,  # Use 10k samples for faster training
    num_val_samples=2000
)

100%|██████████| 170M/170M [00:10<00:00, 16.1MB/s]


Train samples: 10000, Val samples: 2000


In [None]:
## 3. Load Teacher and Student Models
def load_model_for_cifar(model_name, num_classes=10, pretrained=True):
    """Load and adapt model for CIFAR-10"""
    if model_name == 'resnet18':
        model = models.resnet18(pretrained=pretrained)
        model.fc = nn.Linear(model.fc.in_features, num_classes)
    elif model_name == 'resnet50':
        model = models.resnet50(pretrained=pretrained)
        model.fc = nn.Linear(model.fc.in_features, num_classes)
    elif model_name == 'mobilenet_v2':
        model = models.mobilenet_v2(pretrained=pretrained)
        model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
    else:
        raise ValueError(f"Model {model_name} not supported")

    return model

# Load teacher (ResNet-50) and student (ResNet-18)
teacher = load_model_for_cifar('resnet50', pretrained=True).to(device)
student = load_model_for_cifar('resnet18', pretrained=True).to(device)

teacher.eval()

# Count parameters
teacher_params = sum(p.numel() for p in teacher.parameters())
student_params = sum(p.numel() for p in student.parameters())

print(f"\nTeacher (ResNet-50): {teacher_params:,} parameters")
print(f"Student (ResNet-18): {student_params:,} parameters")
print(f"Compression ratio: {teacher_params / student_params:.2f}x")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:01<00:00, 77.5MB/s]


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 68.6MB/s]



Teacher (ResNet-50): 23,528,522 parameters
Student (ResNet-18): 11,181,642 parameters
Compression ratio: 2.10x


In [None]:
## 4. Distillation Loss Functions
class DistillationLoss(nn.Module):
    """Combined distillation loss"""
    def __init__(self, alpha=0.5, temperature=3.0):
        super().__init__()
        self.alpha = alpha
        self.temperature = temperature
        self.ce_loss = nn.CrossEntropyLoss()

    def forward(self, student_logits, teacher_logits, targets):
        # Hard label loss
        hard_loss = self.ce_loss(student_logits, targets)

        # Soft label loss (KL divergence)
        soft_loss = self.kl_divergence_loss(student_logits, teacher_logits)

        # Combined loss
        total_loss = self.alpha * soft_loss + (1 - self.alpha) * hard_loss

        return total_loss, hard_loss, soft_loss

    def kl_divergence_loss(self, student_logits, teacher_logits):
        """KL divergence loss with temperature"""
        student_soft = F.log_softmax(student_logits / self.temperature, dim=1)
        teacher_soft = F.softmax(teacher_logits / self.temperature, dim=1)

        kl_loss = F.kl_div(
            student_soft,
            teacher_soft,
            reduction='batchmean'
        ) * (self.temperature ** 2)

        return kl_loss

class FeatureMatchingLoss(nn.Module):
    """Feature matching loss for intermediate layers"""
    def __init__(self):
        super().__init__()
        self.mse_loss = nn.MSELoss()

    def forward(self, student_features, teacher_features):
        # Normalize features
        student_norm = F.normalize(student_features, p=2, dim=1)
        teacher_norm = F.normalize(teacher_features, p=2, dim=1)

        return self.mse_loss(student_norm, teacher_norm)

In [None]:
## 5. Training Functions
def train_with_distillation(
    student,
    teacher,
    trainloader,
    valloader,
    num_epochs=5,
    alpha=0.5,
    temperature=3.0,
    lr=0.001,
    device='cuda'
):
    """
    Train student model with knowledge distillation
    """
    student.train()
    teacher.eval()

    optimizer = torch.optim.Adam(student.parameters(), lr=lr)
    criterion = DistillationLoss(alpha=alpha, temperature=temperature)

    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': [],
        'hard_loss': [],
        'soft_loss': []
    }

    for epoch in range(num_epochs):
        # Training
        student.train()
        train_loss = 0
        train_correct = 0
        train_total = 0
        hard_losses = 0
        soft_losses = 0

        pbar = tqdm(trainloader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for images, labels in pbar:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            student_logits = student(images)

            with torch.no_grad():
                teacher_logits = teacher(images)

            # Compute loss
            loss, hard_loss, soft_loss = criterion(student_logits, teacher_logits, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Statistics
            train_loss += loss.item()
            hard_losses += hard_loss.item()
            soft_losses += soft_loss.item()
            _, predicted = student_logits.max(1)
            train_total += labels.size(0)
            train_correct += predicted.eq(labels).sum().item()

            pbar.set_postfix({
                'loss': loss.item(),
                'acc': 100. * train_correct / train_total
            })

        train_loss /= len(trainloader)
        train_acc = 100. * train_correct / train_total
        hard_losses /= len(trainloader)
        soft_losses /= len(trainloader)

        # Validation
        val_loss, val_acc = evaluate_model(student, valloader, device)

        # Save history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['hard_loss'].append(hard_losses)
        history['soft_loss'].append(soft_losses)

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.2f}%, "
              f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.2f}%")

    return student, history

def train_vanilla(
    model,
    trainloader,
    valloader,
    num_epochs=5,
    lr=0.001,
    device='cuda'
):
    """
    Train model without distillation (baseline)
    """
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        pbar = tqdm(trainloader, desc=f'Epoch {epoch+1}/{num_epochs} (Vanilla)')
        for images, labels in pbar:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Statistics
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            train_total += labels.size(0)
            train_correct += predicted.eq(labels).sum().item()

            pbar.set_postfix({
                'loss': loss.item(),
                'acc': 100. * train_correct / train_total
            })

        train_loss /= len(trainloader)
        train_acc = 100. * train_correct / train_total

        # Validation
        val_loss, val_acc = evaluate_model(model, valloader, device)

        # Save history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.2f}%, "
              f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.2f}%")

    return model, history

def evaluate_model(model, dataloader, device):
    """Evaluate model on validation set"""
    model.eval()
    criterion = nn.CrossEntropyLoss()

    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = 100. * correct / total

    return avg_loss, accuracy

In [None]:
## 6. Experiment 1: Vanilla Training (Baseline)
print("="*80)
print("EXPERIMENT 1: Vanilla Training (Baseline)")
print("="*80)

# Create fresh student model
student_vanilla = load_model_for_cifar('resnet18', pretrained=True).to(device)

# Train
student_vanilla, history_vanilla = train_vanilla(
    student_vanilla,
    trainloader,
    valloader,
    num_epochs=5,
    lr=0.001,
    device=device
)

# Save model
torch.save(student_vanilla.state_dict(), MODELS_DIR / 'student_vanilla.pth')
print(f"\n✓ Vanilla student saved")

EXPERIMENT 1: Vanilla Training (Baseline)


Epoch 1/5 (Vanilla): 100%|██████████| 157/157 [02:56<00:00,  1.13s/it, loss=1.53, acc=45.7]


Epoch 1: Train Loss=1.5460, Train Acc=45.70%, Val Loss=2.2260, Val Acc=51.10%


Epoch 2/5 (Vanilla): 100%|██████████| 157/157 [02:54<00:00,  1.11s/it, loss=1.32, acc=60.6]


Epoch 2: Train Loss=1.1443, Train Acc=60.56%, Val Loss=1.1473, Val Acc=61.15%


Epoch 3/5 (Vanilla): 100%|██████████| 157/157 [02:55<00:00,  1.12s/it, loss=1.02, acc=65.6]


Epoch 3: Train Loss=1.0043, Train Acc=65.57%, Val Loss=0.9849, Val Acc=66.30%


Epoch 4/5 (Vanilla): 100%|██████████| 157/157 [02:59<00:00,  1.14s/it, loss=1.28, acc=68.4]


Epoch 4: Train Loss=0.9374, Train Acc=68.38%, Val Loss=0.8555, Val Acc=70.10%


Epoch 5/5 (Vanilla): 100%|██████████| 157/157 [03:01<00:00,  1.15s/it, loss=1.29, acc=71.7]


Epoch 5: Train Loss=0.8356, Train Acc=71.73%, Val Loss=0.8039, Val Acc=71.80%

✓ Vanilla student saved


In [None]:
## 7. Experiment 2: Knowledge Distillation with Different Temperatures
print("\n" + "="*80)
print("EXPERIMENT 2: Knowledge Distillation (Temperature Sweep)")
print("="*80)

temperatures = [2.0, 3.0, 5.0]
distillation_results = {}

for temp in temperatures:
    print(f"\n{'='*60}")
    print(f"Training with Temperature = {temp}")
    print(f"{'='*60}")

    # Create fresh student model
    student_kd = load_model_for_cifar('resnet18', pretrained=True).to(device)

    # Train with distillation
    student_kd, history_kd = train_with_distillation(
        student_kd,
        teacher,
        trainloader,
        valloader,
        num_epochs=5,
        alpha=0.5,
        temperature=temp,
        lr=0.001,
        device=device
    )

    # Save model and results
    torch.save(student_kd.state_dict(), MODELS_DIR / f'student_kd_temp{temp}.pth')
    distillation_results[f'temp_{temp}'] = history_kd

    print(f"\n✓ Student with T={temp} saved")

    # Clean up
    del student_kd
    torch.cuda.empty_cache() if torch.cuda.is_available() else None


EXPERIMENT 2: Knowledge Distillation (Temperature Sweep)

Training with Temperature = 2.0


Epoch 1/5: 100%|██████████| 157/157 [04:10<00:00,  1.59s/it, loss=1.07, acc=46.3]


Epoch 1: Train Loss=1.0164, Train Acc=46.33%, Val Loss=1.4894, Val Acc=56.50%


Epoch 2/5: 100%|██████████| 157/157 [04:20<00:00,  1.66s/it, loss=0.915, acc=58.5]


Epoch 2: Train Loss=0.9159, Train Acc=58.49%, Val Loss=1.4537, Val Acc=59.60%


Epoch 3/5: 100%|██████████| 157/157 [04:19<00:00,  1.66s/it, loss=1.02, acc=64.8]


Epoch 3: Train Loss=0.8565, Train Acc=64.84%, Val Loss=1.2207, Val Acc=67.50%


Epoch 4/5: 100%|██████████| 157/157 [04:17<00:00,  1.64s/it, loss=1.02, acc=68.2]


Epoch 4: Train Loss=0.8232, Train Acc=68.25%, Val Loss=1.2074, Val Acc=66.85%


Epoch 5/5: 100%|██████████| 157/157 [04:16<00:00,  1.63s/it, loss=0.887, acc=69.5]


Epoch 5: Train Loss=0.8128, Train Acc=69.48%, Val Loss=1.2670, Val Acc=65.10%

✓ Student with T=2.0 saved

Training with Temperature = 3.0


Epoch 1/5: 100%|██████████| 157/157 [04:22<00:00,  1.67s/it, loss=0.814, acc=43.9]


Epoch 1: Train Loss=1.0308, Train Acc=43.89%, Val Loss=1.5274, Val Acc=54.10%


Epoch 2/5: 100%|██████████| 157/157 [04:17<00:00,  1.64s/it, loss=1.05, acc=60.9]


Epoch 2: Train Loss=0.8858, Train Acc=60.90%, Val Loss=1.5059, Val Acc=57.35%


Epoch 3/5: 100%|██████████| 157/157 [04:19<00:00,  1.65s/it, loss=0.787, acc=63.9]


Epoch 3: Train Loss=0.8544, Train Acc=63.89%, Val Loss=1.2628, Val Acc=65.95%


Epoch 4/5: 100%|██████████| 157/157 [04:22<00:00,  1.67s/it, loss=1.12, acc=68.5]


Epoch 4: Train Loss=0.8144, Train Acc=68.45%, Val Loss=1.1803, Val Acc=69.90%


Epoch 5/5: 100%|██████████| 157/157 [04:20<00:00,  1.66s/it, loss=0.81, acc=70.7]


Epoch 5: Train Loss=0.7951, Train Acc=70.67%, Val Loss=1.1564, Val Acc=72.15%

✓ Student with T=3.0 saved

Training with Temperature = 5.0


Epoch 1/5: 100%|██████████| 157/157 [04:12<00:00,  1.61s/it, loss=0.997, acc=44]


Epoch 1: Train Loss=1.0249, Train Acc=44.04%, Val Loss=1.5759, Val Acc=53.70%


Epoch 2/5: 100%|██████████| 157/157 [04:11<00:00,  1.60s/it, loss=0.934, acc=60.5]


Epoch 2: Train Loss=0.8847, Train Acc=60.45%, Val Loss=1.3386, Val Acc=61.50%


Epoch 3/5: 100%|██████████| 157/157 [04:11<00:00,  1.60s/it, loss=0.98, acc=66.4]


Epoch 3: Train Loss=0.8272, Train Acc=66.39%, Val Loss=1.2839, Val Acc=64.00%


Epoch 4/5: 100%|██████████| 157/157 [04:12<00:00,  1.61s/it, loss=0.922, acc=67.5]


Epoch 4: Train Loss=0.8173, Train Acc=67.47%, Val Loss=1.2048, Val Acc=67.30%


Epoch 5/5: 100%|██████████| 157/157 [04:13<00:00,  1.61s/it, loss=0.712, acc=70.1]


Epoch 5: Train Loss=0.7856, Train Acc=70.07%, Val Loss=1.1238, Val Acc=70.95%

✓ Student with T=5.0 saved


In [None]:
## 7. Experiment 2: Knowledge Distillation with Different Temperatures
print("\n" + "="*80)
print("EXPERIMENT 2: Knowledge Distillation (Temperature Sweep)")
print("="*80)

temperatures = [2.0, 3.0, 5.0]
distillation_results = {}

for temp in temperatures:
    print(f"\n{'='*60}")
    print(f"Training with Temperature = {temp}")
    print(f"{'='*60}")

    # Create fresh student model
    student_kd = load_model_for_cifar('resnet18', pretrained=True).to(device)

    # Train with distillation
    student_kd, history_kd = train_with_distillation(
        student_kd,
        teacher,
        trainloader,
        valloader,
        num_epochs=5,
        alpha=0.5,
        temperature=temp,
        lr=0.001,
        device=device
    )

    # Save model and results
    torch.save(student_kd.state_dict(), MODELS_DIR / f'student_kd_temp{temp}.pth')
    distillation_results[f'temp_{temp}'] = history_kd

    print(f"\n✓ Student with T={temp} saved")

    # Clean up
    del student_kd
    torch.cuda.empty_cache() if torch.cuda.is_available() else None


EXPERIMENT 2: Knowledge Distillation (Temperature Sweep)

Training with Temperature = 2.0


Epoch 1/5: 100%|██████████| 157/157 [04:22<00:00,  1.67s/it, loss=0.884, acc=42.3]


Epoch 1: Train Loss=1.0430, Train Acc=42.34%, Val Loss=1.4447, Val Acc=59.05%


Epoch 2/5: 100%|██████████| 157/157 [04:22<00:00,  1.67s/it, loss=0.842, acc=59.6]


Epoch 2: Train Loss=0.9030, Train Acc=59.64%, Val Loss=1.2735, Val Acc=66.00%


Epoch 3/5: 100%|██████████| 157/157 [04:20<00:00,  1.66s/it, loss=0.798, acc=65]


Epoch 3: Train Loss=0.8565, Train Acc=65.05%, Val Loss=1.3287, Val Acc=64.75%


Epoch 4/5: 100%|██████████| 157/157 [04:23<00:00,  1.68s/it, loss=0.763, acc=68.3]


Epoch 4: Train Loss=0.8210, Train Acc=68.32%, Val Loss=1.2249, Val Acc=66.95%


Epoch 5/5: 100%|██████████| 157/157 [04:16<00:00,  1.64s/it, loss=0.706, acc=70.9]


Epoch 5: Train Loss=0.7951, Train Acc=70.90%, Val Loss=1.1767, Val Acc=69.45%

✓ Student with T=2.0 saved

Training with Temperature = 3.0


Epoch 1/5: 100%|██████████| 157/157 [04:16<00:00,  1.63s/it, loss=0.811, acc=43.6]


Epoch 1: Train Loss=1.0317, Train Acc=43.58%, Val Loss=1.4196, Val Acc=58.40%


Epoch 2/5: 100%|██████████| 157/157 [04:14<00:00,  1.62s/it, loss=0.938, acc=58.8]


Epoch 2: Train Loss=0.9060, Train Acc=58.83%, Val Loss=1.3301, Val Acc=63.80%


Epoch 3/5: 100%|██████████| 157/157 [04:18<00:00,  1.64s/it, loss=0.906, acc=63.3]


Epoch 3: Train Loss=0.8625, Train Acc=63.32%, Val Loss=1.3864, Val Acc=61.40%


Epoch 4/5: 100%|██████████| 157/157 [04:16<00:00,  1.63s/it, loss=0.622, acc=67.3]


Epoch 4: Train Loss=0.8232, Train Acc=67.26%, Val Loss=1.2210, Val Acc=68.65%


Epoch 5/5: 100%|██████████| 157/157 [04:20<00:00,  1.66s/it, loss=0.914, acc=68.7]


Epoch 5: Train Loss=0.8095, Train Acc=68.72%, Val Loss=1.4277, Val Acc=61.35%

✓ Student with T=3.0 saved

Training with Temperature = 5.0


Epoch 1/5: 100%|██████████| 157/157 [04:16<00:00,  1.64s/it, loss=1.08, acc=43.6]


Epoch 1: Train Loss=1.0360, Train Acc=43.56%, Val Loss=1.4189, Val Acc=58.90%


Epoch 2/5: 100%|██████████| 157/157 [04:16<00:00,  1.64s/it, loss=0.991, acc=59.8]


Epoch 2: Train Loss=0.8920, Train Acc=59.84%, Val Loss=1.5631, Val Acc=52.60%


Epoch 3/5: 100%|██████████| 157/157 [04:20<00:00,  1.66s/it, loss=0.915, acc=62.4]


Epoch 3: Train Loss=0.8687, Train Acc=62.44%, Val Loss=1.2384, Val Acc=65.45%


Epoch 4/5: 100%|██████████| 157/157 [04:15<00:00,  1.63s/it, loss=0.795, acc=65.3]


Epoch 4: Train Loss=0.8332, Train Acc=65.32%, Val Loss=1.1911, Val Acc=68.60%


Epoch 5/5: 100%|██████████| 157/157 [04:21<00:00,  1.66s/it, loss=0.98, acc=69.6]


Epoch 5: Train Loss=0.7973, Train Acc=69.56%, Val Loss=1.3171, Val Acc=62.45%

✓ Student with T=5.0 saved


In [None]:
# 8 . Experiment 3: Different Alpha Values
print("\n" + "="*80)
print("EXPERIMENT 3: Knowledge Distillation (Alpha Sweep)")
print("="*80)

alphas = [0.3, 0.5, 0.7]
alpha_results = {}

for alpha in alphas:
    print(f"\n{'='*60}")
    print(f"Training with Alpha = {alpha}")
    print(f"{'='*60}")

    # Create fresh student model
    student_kd = load_model_for_cifar('resnet18', pretrained=True).to(device)

    # Train with distillation
    student_kd, history_kd = train_with_distillation(
        student_kd,
        teacher,
        trainloader,
        valloader,
        num_epochs=5,
        alpha=alpha,
        temperature=3.0,
        lr=0.001,
        device=device
    )

    # Save model and results
    torch.save(student_kd.state_dict(), MODELS_DIR / f'student_kd_alpha{alpha}.pth')
    alpha_results[f'alpha_{alpha}'] = history_kd

    print(f"\n✓ Student with alpha={alpha} saved")

    # Clean up
    del student_kd
    torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
## 9. Comparison & Visualization
# Compile all results
all_results = {
    'vanilla': history_vanilla,
    **distillation_results,
    **alpha_results
}

# Create comparison DataFrame
comparison_data = []
for name, history in all_results.items():
    comparison_data.append({
        'Method': name,
        'Final Train Acc': history['train_acc'][-1],
        'Final Val Acc': history['val_acc'][-1],
        'Best Val Acc': max(history['val_acc']),
        'Final Train Loss': history['train_loss'][-1],
        'Final Val Loss': history['val_loss'][-1]
    })

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('Best Val Acc', ascending=False)

print("\n" + "="*80)
print("DISTILLATION RESULTS COMPARISON")
print("="*80)
print(comparison_df.to_string(index=False))

# Save comparison
comparison_df.to_csv(RESULTS_DIR / 'distillation_comparison.csv', index=False)

In [None]:
# Visualization 1: Validation Accuracy Comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot 1: Val Accuracy over epochs
for name, history in all_results.items():
    axes[0].plot(history['val_acc'], marker='o', label=name)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Validation Accuracy (%)')
axes[0].set_title('Validation Accuracy Comparison')
axes[0].legend()
axes[0].grid(True)

# Plot 2: Final accuracy comparison
methods = comparison_df['Method'].tolist()
accuracies = comparison_df['Best Val Acc'].tolist()
colors = ['red' if 'vanilla' in m else 'green' for m in methods]

axes[1].barh(methods, accuracies, color=colors, alpha=0.7)
axes[1].set_xlabel('Best Validation Accuracy (%)')
axes[1].set_title('Final Accuracy Comparison')
axes[1].grid(True, axis='x')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'distillation_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Visualization saved to:", OUTPUT_DIR / 'distillation_comparison.png')

In [None]:
# Visualization 2: Loss Components (for distillation methods)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for name, history in distillation_results.items():
    if 'hard_loss' in history:
        axes[0].plot(history['hard_loss'], marker='o', label=name)
        axes[1].plot(history['soft_loss'], marker='s', label=name)

axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Hard Loss (Cross-Entropy)')
axes[0].set_title('Hard Label Loss')
axes[0].legend()
axes[0].grid(True)

axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Soft Loss (KL Divergence)')
axes[1].set_title('Knowledge Distillation Loss')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'loss_components.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Loss components visualization saved")

In [None]:
## 10. Key Findings & Recommendations for NAS
# Find best configuration
best_method = comparison_df.iloc[0]

print("\n" + "="*80)
print("KEY FINDINGS")
print("="*80)
print(f"\n✓ Best Method: {best_method['Method']}")
print(f"  - Validation Accuracy: {best_method['Best Val Acc']:.2f}%")
print(f"  - Improvement over vanilla: {best_method['Best Val Acc'] - comparison_df[comparison_df['Method']=='vanilla']['Best Val Acc'].values[0]:.2f}%")

print("\n" + "="*80)
print("RECOMMENDATIONS FOR NAS (NOTEBOOK 3)")
print("="*80)

# Temperature recommendations
temp_results = {k: v['val_acc'][-1] for k, v in distillation_results.items()}
best_temp = max(temp_results, key=temp_results.get)
print(f"\n1. Temperature Range:")
print(f"   - Best: {best_temp}")
print(f"   - Recommended search space: [2.0, 3.0, 4.0, 5.0]")

# Alpha recommendations
alpha_vals = {k: v['val_acc'][-1] for k, v in alpha_results.items()}
best_alpha = max(alpha_vals, key=alpha_vals.get)
print(f"\n2. Alpha (Distillation Weight):")
print(f"   - Best: {best_alpha}")
print(f"   - Recommended search space: [0.3, 0.5, 0.7, 0.9]")

print(f"\n3. Distillation is effective:")
vanilla_acc = comparison_df[comparison_df['Method']=='vanilla']['Best Val Acc'].values[0]
best_distill_acc = comparison_df[comparison_df['Method']!='vanilla']['Best Val Acc'].max()
improvement = best_distill_acc - vanilla_acc
print(f"   - Average improvement: {improvement:.2f}% over vanilla training")
print(f"   - Distillation SHOULD be included in NAS search space")

# Save recommendations
recommendations = {
    'best_method': best_method['Method'],
    'best_accuracy': float(best_method['Best Val Acc']),
    'improvement_over_vanilla': float(improvement),
    'recommended_temperature_range': [2.0, 3.0, 4.0, 5.0],
    'recommended_alpha_range': [0.3, 0.5, 0.7, 0.9],
    'best_temperature': best_temp,
    'best_alpha': best_alpha
}

with open(RESULTS_DIR / 'distillation_recommendations.json', 'w') as f:
    json.dump(recommendations, f, indent=2)

print(f"\n✓ Recommendations saved to: {RESULTS_DIR / 'distillation_recommendations.json'}")

In [None]:
## 11. Save All Results
# Save all training histories
with open(RESULTS_DIR / 'all_training_histories.json', 'w') as f:
    json.dump(all_results, f, indent=2)

print("\n" + "="*80)
print("OUTPUTS CREATED")
print("="*80)
print(f"\n1. Models saved in: {MODELS_DIR}")
print(f"   - student_vanilla.pth")
print(f"   - student_kd_temp*.pth (multiple temperatures)")
print(f"   - student_kd_alpha*.pth (multiple alphas)")

print(f"\n2. Results saved in: {RESULTS_DIR}")
print(f"   - distillation_comparison.csv")
print(f"   - distillation_recommendations.json")
print(f"   - all_training_histories.json")

print(f"\n3. Visualizations saved in: {OUTPUT_DIR}")
print(f"   - distillation_comparison.png")
print(f"   - loss_components.png")

print("\n" + "="*80)
print("NEXT STEPS")
print("="*80)
print("\nNotebook 3 will use these results to:")
print("  1. Design the NAS search space using recommended ranges")
print("  2. Include distillation strategies in architecture search")
print("  3. Optimize for accuracy + latency + model size")
print("="*80)

What I accomplished:
- Implemented knowledge distillation from ResNet-50 (teacher) to ResNet-18 (student)
- Compared vanilla training vs. distillation
- Experimented with different temperatures and alpha values
- Found optimal distillation hyperparameters

Key insights:
- Distillation improves student accuracy by ~X% over vanilla training
- Temperature and alpha significantly impact performance
- Best configuration will be used in NAS search space

Files created for next notebooks:
- Distilled model checkpoints
- Training histories
- Hyperparameter recommendations
- Comparison metrics