<a href="https://colab.research.google.com/github/ReyhanehRazavi-99/AlexNet-vs-VisionTransformer/blob/main/AlexNet_on_CIFAR10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#type 2

# ============================
# CIFAR-10 + AlexNet (no fine-tuning)
# Feature extraction + Linear SVM classifier
# ============================

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import random

# ----------------------------
# Reproducibility (optional)
# ----------------------------
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# ----------------------------
# Device
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ----------------------------
# Data: CIFAR-10
# AlexNet expects 224x224 and ImageNet normalization
# ----------------------------
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

transform_eval = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

train_ds = datasets.CIFAR10(root="./data", train=True, download=True, transform=transform_eval)
test_ds  = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform_eval)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=False, num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

# ----------------------------
# Model: AlexNet (pretrained on ImageNet), used as a frozen feature extractor
# We do NOT modify weights, and we do NOT fine-tune.
# ----------------------------
weights = models.AlexNet_Weights.IMAGENET1K_V1
alexnet = models.alexnet(weights=weights)
alexnet.eval()  # inference mode
alexnet.to(device)

# We’ll grab the feature vector *before* the classifier:
# feature pipeline is: features -> avgpool -> flatten -> classifier
# The flattened feature dimension is 256*6*6 = 9216 for AlexNet with 224x224 input.
@torch.no_grad()
def extract_features(dataloader):
    feats_list = []
    labels_list = []
    for images, labels in tqdm(dataloader, desc="Extracting features"):
        images = images.to(device, non_blocking=True)
        # forward through convolutional backbone
        x = alexnet.features(images)
        x = alexnet.avgpool(x)
        x = torch.flatten(x, 1)  # shape: [B, 9216]
        feats_list.append(x.cpu().numpy())
        labels_list.append(labels.numpy())
    feats = np.concatenate(feats_list, axis=0)
    labs  = np.concatenate(labels_list, axis=0)
    return feats, labs

print("Extracting train features...")
X_train, y_train = extract_features(train_loader)
print("Extracting test features...")
X_test,  y_test  = extract_features(test_loader)

print("Train feats:", X_train.shape, "Test feats:", X_test.shape)

# ----------------------------
# Classifier: Linear SVM (no deep learning training here)
# Standardize features -> LinearSVC
# ----------------------------
clf = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("svm", LinearSVC(C=1.0, max_iter=10000, dual=True))
])

print("Training Linear SVM on frozen AlexNet features...")
clf.fit(X_train, y_train)

# ----------------------------
# Evaluate
# ----------------------------
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc*100:.2f}%\n")
print("Classification report:")
print(classification_report(y_test, y_pred, target_names=train_ds.classes))


Device: cuda


100%|██████████| 170M/170M [00:13<00:00, 12.5MB/s]


Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth


100%|██████████| 233M/233M [00:01<00:00, 242MB/s]


Extracting train features...


Extracting features: 100%|██████████| 391/391 [00:47<00:00,  8.18it/s]


Extracting test features...


Extracting features: 100%|██████████| 79/79 [00:09<00:00,  8.14it/s]


Train feats: (50000, 9216) Test feats: (10000, 9216)
Training Linear SVM on frozen AlexNet features...

Test Accuracy: 71.95%

Classification report:
              precision    recall  f1-score   support

    airplane       0.74      0.73      0.74      1000
  automobile       0.81      0.81      0.81      1000
        bird       0.64      0.63      0.64      1000
         cat       0.55      0.55      0.55      1000
        deer       0.68      0.67      0.68      1000
         dog       0.65      0.65      0.65      1000
        frog       0.76      0.76      0.76      1000
       horse       0.74      0.73      0.74      1000
        ship       0.80      0.84      0.82      1000
       truck       0.81      0.82      0.81      1000

    accuracy                           0.72     10000
   macro avg       0.72      0.72      0.72     10000
weighted avg       0.72      0.72      0.72     10000



In [1]:
# ============================
# CIFAR-10 + AlexNet (Type 2 = Partial Fine-Tune)
# Unfreeze the last block (tail of conv backbone) + full classifier; keep earlier backbone frozen
# ============================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

import numpy as np
from sklearn.metrics import classification_report
from tqdm import tqdm
import random

# ----------------------------
# Reproducibility (optional)
# ----------------------------
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# ----------------------------
# Device
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ----------------------------
# Data: CIFAR-10
# Resize to 224 and use ImageNet normalization
# ----------------------------
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

transform_train = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

transform_eval = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

train_ds = datasets.CIFAR10(root="./data", train=True,  download=True, transform=transform_train)
test_ds  = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform_eval)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True,  num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

num_classes = 10
class_names = train_ds.classes

# ----------------------------
# Model: AlexNet pretrained; PARTIAL fine-tune
# - Freeze early conv layers
# - Unfreeze the LAST conv "block" + the ENTIRE classifier
# ----------------------------
weights = models.AlexNet_Weights.IMAGENET1K_V1
model = models.alexnet(weights=weights)

# Replace final layer to match CIFAR-10
in_features = model.classifier[6].in_features  # 4096
model.classifier[6] = nn.Linear(in_features, num_classes)

# 1) Freeze EVERYTHING first
for p in model.parameters():
    p.requires_grad = False

# 2) Unfreeze the last conv block (tail of model.features)
#    AlexNet features structure indices (PyTorch torchvision):
#    0:Conv,1:ReLU,2:Pool,3:Conv,4:ReLU,5:Pool,
#    6:Conv,7:ReLU,8:Conv,9:ReLU,10:Conv,11:ReLU,12:Pool
#    We'll unfreeze indices >= 10 (last Conv + ReLU + final Pool)
last_block_start = 10
for idx, layer in enumerate(model.features):
    if idx >= last_block_start:
        for p in layer.parameters():
            p.requires_grad = True

# 3) Unfreeze the ENTIRE classifier head
for p in model.classifier.parameters():
    p.requires_grad = True

model = model.to(device)

# ----------------------------
# Loss / Optimizer / Scheduler
# Use two LR groups: smaller for last conv block, larger for classifier
# ----------------------------
criterion = nn.CrossEntropyLoss()

# Collect params for two groups
conv_tail_params = []
for idx, layer in enumerate(model.features):
    if idx >= last_block_start:
        conv_tail_params += list(layer.parameters())

classifier_params = list(model.classifier.parameters())

optimizer = optim.AdamW([
    {"params": conv_tail_params,    "lr": 3e-5, "weight_decay": 1e-4},  # conservative on conv tail
    {"params": classifier_params,   "lr": 1e-3, "weight_decay": 1e-4},  # bigger LR for classifier
], betas=(0.9, 0.999))

scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-5)

# ----------------------------
# Train / Eval helpers
# ----------------------------
def train_one_epoch(model, loader):
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for images, labels in tqdm(loader, desc="Train", leave=False):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad(set_to_none=True)
        logits = model(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)
        preds = logits.argmax(1)
        correct += (preds == labels).sum().item()
        total   += labels.size(0)
    return running_loss / total, correct / total

@torch.no_grad()
def evaluate(model, loader, desc="Eval"):
    model.eval()
    running_loss, correct, total = 0.0, 0, 0
    all_preds, all_labels = [], []
    for images, labels in tqdm(loader, desc=desc, leave=False):
        images, labels = images.to(device), labels.to(device)
        logits = model(images)
        loss = criterion(logits, labels)
        running_loss += loss.item() * images.size(0)
        preds = logits.argmax(1)
        correct += (preds == labels).sum().item()
        total   += labels.size(0)
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())
    avg_loss = running_loss / total
    acc = correct / total
    y_pred = np.concatenate(all_preds)
    y_true = np.concatenate(all_labels)
    return avg_loss, acc, y_true, y_pred

# ----------------------------
# Train (partial fine-tune)
# ----------------------------
epochs = 10
for ep in range(1, epochs + 1):
    tr_loss, tr_acc = train_one_epoch(model, train_loader)
    te_loss, te_acc, _, _ = evaluate(model, test_loader, desc="Test")
    scheduler.step()
    print(f"Epoch {ep:02d} | Train Loss {tr_loss:.4f} Acc {tr_acc*100:.2f}% | "
          f"Test Loss {te_loss:.4f} Acc {te_acc*100:.2f}%")

# ----------------------------
# Final evaluation with report
# ----------------------------
_, test_acc, y_true, y_pred = evaluate(model, test_loader, desc="Final Test")
print(f"\nFinal Test Accuracy (Partial Fine-Tune): {test_acc*100:.2f}%\n")
print("Classification report:")
print(classification_report(y_true, y_pred, target_names=class_names, digits=4))


Device: cuda


100%|██████████| 170M/170M [00:06<00:00, 24.4MB/s]


Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth


100%|██████████| 233M/233M [00:00<00:00, 249MB/s]


Epoch 01 | Train Loss 0.7492 Acc 74.20% | Test Loss 0.5458 Acc 80.80%




Epoch 02 | Train Loss 0.5659 Acc 80.48% | Test Loss 0.4926 Acc 82.93%




Epoch 03 | Train Loss 0.4872 Acc 83.36% | Test Loss 0.4813 Acc 83.26%




Epoch 04 | Train Loss 0.4269 Acc 85.34% | Test Loss 0.4314 Acc 84.87%




Epoch 05 | Train Loss 0.3731 Acc 87.08% | Test Loss 0.4040 Acc 86.12%




Epoch 06 | Train Loss 0.3309 Acc 88.46% | Test Loss 0.3966 Acc 86.38%




Epoch 07 | Train Loss 0.2836 Acc 90.14% | Test Loss 0.3770 Acc 87.22%




Epoch 08 | Train Loss 0.2463 Acc 91.41% | Test Loss 0.3616 Acc 87.92%




Epoch 09 | Train Loss 0.2219 Acc 92.29% | Test Loss 0.3561 Acc 88.06%




Epoch 10 | Train Loss 0.2018 Acc 92.92% | Test Loss 0.3557 Acc 88.23%


                                                           


Final Test Accuracy (Partial Fine-Tune): 88.23%

Classification report:
              precision    recall  f1-score   support

    airplane     0.9069    0.8860    0.8963      1000
  automobile     0.9301    0.9320    0.9311      1000
        bird     0.9057    0.8360    0.8695      1000
         cat     0.7812    0.7710    0.7760      1000
        deer     0.8449    0.8880    0.8659      1000
         dog     0.8535    0.8040    0.8280      1000
        frog     0.9096    0.9160    0.9128      1000
       horse     0.8999    0.9080    0.9039      1000
        ship     0.9091    0.9400    0.9243      1000
       truck     0.8820    0.9420    0.9110      1000

    accuracy                         0.8823     10000
   macro avg     0.8823    0.8823    0.8819     10000
weighted avg     0.8823    0.8823    0.8819     10000





In [3]:
#type 1





import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import random

# ----------------------------
# Reproducibility (optional)
# ----------------------------
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# ----------------------------
# Device
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ----------------------------
# Data: CIFAR-10
# AlexNet expects 224x224 and ImageNet normalization
# ----------------------------
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

# Keep transforms similar to your original (no augmentation for a clean probe)
transform_eval = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

train_ds = datasets.CIFAR10(root="./data", train=True,  download=True, transform=transform_eval)
test_ds  = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform_eval)

# For training the linear head, it's better to shuffle the training set
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True,  num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

# ----------------------------
# Model: AlexNet pretrained on ImageNet
# - Freeze all parameters (backbone + earlier classifier layers)
# - Replace final classifier layer with new Linear to 10 classes
# ----------------------------
weights = models.AlexNet_Weights.IMAGENET1K_V1
model = models.alexnet(weights=weights)

# Replace final layer (classifier[6]) — input dim is 4096 for AlexNet
in_features = model.classifier[6].in_features
model.classifier[6] = nn.Linear(in_features, 10)

# Freeze EVERYTHING first
for p in model.parameters():
    p.requires_grad = False

# Unfreeze ONLY the final layer
for p in model.classifier[6].parameters():
    p.requires_grad = True

model = model.to(device)
print("Trainable params (should be only final layer):",
      sum(p.numel() for p in model.parameters() if p.requires_grad))

# ----------------------------
# Loss / Optimizer
# ----------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.classifier[6].parameters(), lr=1e-3, weight_decay=0.0)

# Optional: a simple cosine schedule over a small number of epochs
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-5)

# ----------------------------
# Train / Eval helpers
# ----------------------------
@torch.no_grad()
def evaluate(loader, model):
    model.eval()
    all_preds, all_labels = [], []
    for images, labels in loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        logits = model(images)
        preds = logits.argmax(1)
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())
    y_pred = np.concatenate(all_preds)
    y_true = np.concatenate(all_labels)
    acc = (y_pred == y_true).mean()
    return acc, y_true, y_pred

def train_linear_probe(epochs=10, print_every=100):
    model.train()
    n_batches = len(train_loader)
    for ep in range(1, epochs + 1):
        running_loss, correct, total = 0.0, 0, 0
        for b, (images, labels) in enumerate(train_loader, 1):
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)
            logits = model(images)                 # full forward, but only final layer has grads
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            preds = logits.argmax(1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

            if b % print_every == 0 or b == n_batches:
                print(f"Epoch {ep}/{epochs} | Batch {b}/{n_batches} | "
                      f"Loss {running_loss/b:.4f} | Acc {(correct/total)*100:.2f}%")

        scheduler.step()

# ----------------------------
# Train the linear head
# ----------------------------
train_linear_probe(epochs=10, print_every=100)

# ----------------------------
# Evaluate on test set
# ----------------------------
test_acc, y_true, y_pred = evaluate(test_loader, model)
print(f"\nTest Accuracy (linear probe): {test_acc*100:.2f}%\n")
print("Classification report:")
print(classification_report(y_true, y_pred, target_names=train_ds.classes))

Device: cuda
Trainable params (should be only final layer): 40970
Epoch 1/10 | Batch 100/391 | Loss 0.9062 | Acc 68.46%
Epoch 1/10 | Batch 200/391 | Loss 0.8198 | Acc 71.16%
Epoch 1/10 | Batch 300/391 | Loss 0.7742 | Acc 72.79%
Epoch 1/10 | Batch 391/391 | Loss 0.7497 | Acc 73.73%
Epoch 2/10 | Batch 100/391 | Loss 0.6272 | Acc 77.68%
Epoch 2/10 | Batch 200/391 | Loss 0.6411 | Acc 77.47%
Epoch 2/10 | Batch 300/391 | Loss 0.6359 | Acc 77.51%
Epoch 2/10 | Batch 391/391 | Loss 0.6331 | Acc 77.63%
Epoch 3/10 | Batch 100/391 | Loss 0.5845 | Acc 79.16%
Epoch 3/10 | Batch 200/391 | Loss 0.6064 | Acc 78.60%
Epoch 3/10 | Batch 300/391 | Loss 0.6007 | Acc 78.85%
Epoch 3/10 | Batch 391/391 | Loss 0.5997 | Acc 78.83%
Epoch 4/10 | Batch 100/391 | Loss 0.5712 | Acc 79.93%
Epoch 4/10 | Batch 200/391 | Loss 0.5717 | Acc 80.00%
Epoch 4/10 | Batch 300/391 | Loss 0.5749 | Acc 79.79%
Epoch 4/10 | Batch 391/391 | Loss 0.5802 | Acc 79.59%
Epoch 5/10 | Batch 100/391 | Loss 0.5524 | Acc 80.15%
Epoch 5/10 | Bat

In [2]:
# ============================
# CIFAR-10 + AlexNet (Linear Probe)
# Freeze all backbone weights; train final layer only
# ============================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import random

# ----------------------------
# Reproducibility (optional)
# ----------------------------
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# ----------------------------
# Device
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ----------------------------
# Data: CIFAR-10
# Resize to 224 and use ImageNet normalization
# ----------------------------
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

transform_train = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

transform_eval = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

train_ds = datasets.CIFAR10(root="./data", train=True,  download=True, transform=transform_train)
test_ds  = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform_eval)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True,  num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

num_classes = 10
class_names = train_ds.classes

# ----------------------------
# Model: AlexNet pretrained; freeze backbone; replace final layer
# ----------------------------
weights = models.AlexNet_Weights.IMAGENET1K_V1
model = models.alexnet(weights=weights)

# Freeze ALL parameters first (backbone + classifier)
for p in model.parameters():
    p.requires_grad = False

# Replace ONLY the last linear layer to match CIFAR-10 and train it
in_features = model.classifier[6].in_features  # 4096
model.classifier[6] = nn.Linear(in_features, num_classes)

# Make sure the new head is trainable
for p in model.classifier[6].parameters():
    p.requires_grad = True

model = model.to(device)

# ----------------------------
# Loss / Optimizer / Scheduler
# ----------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.classifier[6].parameters(), lr=1e-3, weight_decay=0.0)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-5)

# ----------------------------
# Train / Eval helpers
# ----------------------------
def train_one_epoch(model, loader):
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for images, labels in tqdm(loader, desc="Train", leave=False):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad(set_to_none=True)
        logits = model(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)
        preds = logits.argmax(1)
        correct += (preds == labels).sum().item()
        total   += labels.size(0)
    return running_loss / total, correct / total

@torch.no_grad()
def evaluate(model, loader, desc="Eval"):
    model.eval()
    running_loss, correct, total = 0.0, 0, 0
    all_preds, all_labels = [], []
    for images, labels in tqdm(loader, desc=desc, leave=False):
        images, labels = images.to(device), labels.to(device)
        logits = model(images)
        loss = criterion(logits, labels)
        running_loss += loss.item() * images.size(0)
        preds = logits.argmax(1)
        correct += (preds == labels).sum().item()
        total   += labels.size(0)
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())
    avg_loss = running_loss / total
    acc = correct / total
    y_pred = np.concatenate(all_preds)
    y_true = np.concatenate(all_labels)
    return avg_loss, acc, y_true, y_pred

# ----------------------------
# Train (linear probe)
# ----------------------------
epochs = 10
for ep in range(1, epochs + 1):
    tr_loss, tr_acc = train_one_epoch(model, train_loader)
    te_loss, te_acc, _, _ = evaluate(model, test_loader, desc="Test")
    scheduler.step()
    print(f"Epoch {ep:02d} | Train Loss {tr_loss:.4f} Acc {tr_acc*100:.2f}% | "
          f"Test Loss {te_loss:.4f} Acc {te_acc*100:.2f}%")

# ----------------------------
# Final evaluation with report
# ----------------------------
_, test_acc, y_true, y_pred = evaluate(model, test_loader, desc="Final Test")
print(f"\nFinal Test Accuracy (Linear Probe): {test_acc*100:.2f}%\n")
print("Classification report:")
print(classification_report(y_true, y_pred, target_names=class_names, digits=4))
 # ============================
# Try multiple classifiers on frozen AlexNet features
# ============================
from collections import OrderedDict
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# ---- split a validation set out of the extracted train features
# use a fixed size (5k) so class balance is nice; adjust as you like
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=5000, stratify=y_train, random_state=42
)
print("Train feats:", X_tr.shape, "| Val feats:", X_val.shape, "| Test feats:", X_test.shape)

def fit_eval(name, clf, Xtr, ytr, Xva, yva, Xte, yte):
    clf.fit(Xtr, ytr)
    y_val_pred = clf.predict(Xva); val_acc = accuracy_score(yva, y_val_pred)
    y_te_pred  = clf.predict(Xte); te_acc  = accuracy_score(yte, y_te_pred)
    print(f"{name:>20} | Val: {val_acc*100:6.2f}% | Test: {te_acc*100:6.2f}%")
    return {"name": name, "model": clf, "val_acc": val_acc, "test_acc": te_acc, "y_test_pred": y_te_pred}

print("\n=== Comparing classifiers on frozen AlexNet features (9216-d) ===")
models = OrderedDict()

# 1) Linear SVM (strong linear baseline)
models["LinearSVM"] = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LinearSVC(C=1.0, max_iter=20000, dual=True))
])

# 2) Multinomial Logistic Regression (often close to LinearSVM)
models["LogReg_LBFGS"] = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(C=1.0, max_iter=3000, n_jobs=-1, multi_class="multinomial", solver="lbfgs"))
])

# 3) RBF SVM — PCA→256 for speed/stability
models["SVM_RBF_PCA256"] = Pipeline([
    ("pca", PCA(n_components=256, random_state=42)),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SVC(kernel="rbf", C=5.0, gamma="scale"))
])

# 4) Linear SGD (hinge) — very fast linear baseline
models["SGD_Hinge"] = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SGDClassifier(loss="hinge", alpha=1e-4, max_iter=3000, tol=1e-3, n_jobs=-1, random_state=42))
])

# 5) kNN — PCA→128 then scale (distance-friendly)
models["kNN_PCA128_k7"] = Pipeline([
    ("pca", PCA(n_components=128, random_state=42)),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", KNeighborsClassifier(n_neighbors=7, n_jobs=-1))
])

# 6) Random Forest — tree-based (no scaling)
models["RandomForest400"] = Pipeline([
    ("clf", RandomForestClassifier(n_estimators=400, max_depth=None, n_jobs=-1, random_state=42))
])

# 7) MLP — shallow NN on features
models["MLP_512"] = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", MLPClassifier(hidden_layer_sizes=(512,), activation="relu",
                          alpha=1e-4, batch_size=256, learning_rate_init=1e-3,
                          max_iter=200, random_state=42))
])

# ---- evaluate all
results = []
for name, pipe in models.items():
    res = fit_eval(name, pipe, X_tr, y_tr, X_val, y_val, X_test, y_test)
    results.append(res)

# ---- pick best by validation accuracy
results_sorted = sorted(results, key=lambda r: r["val_acc"], reverse=True)
best = results_sorted[0]
print("\n=== Summary (sorted by Val acc) ===")
for r in results_sorted:
    print(f"{r['name']:>20} | Val: {r['val_acc']*100:6.2f}% | Test: {r['test_acc']*100:6.2f}%")
print(f"\nBest on validation: {best['name']}")

# ---- refit best on FULL train (train + val), then final test report
X_trva = np.concatenate([X_tr, X_val], axis=0)
y_trva = np.concatenate([y_tr, y_val], axis=0)

best_model = models[best["name"]]
best_model.fit(X_trva, y_trva)
y_test_pred_final = best_model.predict(X_test)
final_test_acc = accuracy_score(y_test, y_test_pred_final)

print(f"\n=== FINAL (refit on train+val) — {best['name']} ===")
print(f"Final Test Accuracy: {final_test_acc*100:.2f}%")
print("\nClassification report (Test):")
print(classification_report(y_test, y_test_pred_final, target_names=train_ds.classes, digits=4))

cm = confusion_matrix(y_test, y_test_pred_final)
print("Confusion Matrix (rows=true, cols=pred):")
print(cm)


Device: cuda


100%|██████████| 170M/170M [01:26<00:00, 1.97MB/s]


Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth


100%|██████████| 233M/233M [00:03<00:00, 75.6MB/s]


Epoch 01 | Train Loss 0.7742 Acc 72.87% | Test Loss 0.5787 Acc 79.43%




Epoch 02 | Train Loss 0.6624 Acc 76.75% | Test Loss 0.5660 Acc 80.01%




Epoch 03 | Train Loss 0.6293 Acc 77.86% | Test Loss 0.5508 Acc 80.60%




Epoch 04 | Train Loss 0.6136 Acc 78.50% | Test Loss 0.5308 Acc 81.10%




Epoch 05 | Train Loss 0.5925 Acc 78.89% | Test Loss 0.5213 Acc 81.54%




Epoch 06 | Train Loss 0.5777 Acc 79.50% | Test Loss 0.5288 Acc 81.63%




Epoch 07 | Train Loss 0.5635 Acc 80.13% | Test Loss 0.5384 Acc 81.10%




Epoch 08 | Train Loss 0.5507 Acc 80.54% | Test Loss 0.5011 Acc 82.43%




Epoch 09 | Train Loss 0.5483 Acc 80.54% | Test Loss 0.4972 Acc 82.68%




Epoch 10 | Train Loss 0.5366 Acc 81.02% | Test Loss 0.4961 Acc 82.76%





Final Test Accuracy (Linear Probe): 82.76%

Classification report:
              precision    recall  f1-score   support

    airplane     0.8542    0.7970    0.8246      1000
  automobile     0.8811    0.8890    0.8850      1000
        bird     0.8770    0.7060    0.7823      1000
         cat     0.7297    0.6910    0.7098      1000
        deer     0.7680    0.8210    0.7936      1000
         dog     0.7982    0.7910    0.7946      1000
        frog     0.8451    0.8840    0.8641      1000
       horse     0.8368    0.8510    0.8438      1000
        ship     0.8505    0.9160    0.8820      1000
       truck     0.8409    0.9300    0.8832      1000

    accuracy                         0.8276     10000
   macro avg     0.8281    0.8276    0.8263     10000
weighted avg     0.8281    0.8276    0.8263     10000



NameError: name 'X_train' is not defined

In [None]:
# ============================================================
# CIFAR-10 — Full Fine-Tuning of AlexNet (PyTorch)
# - Uses ImageNet-pretrained AlexNet, replaces head with 10 classes
# - Trains ALL layers (no freezing)
# - Data augmentation, mixed precision, cosine LR, best checkpoint
# ============================================================

import os
import random
from pathlib import Path

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms, models

from tqdm import tqdm
import numpy as np

# ----------------------------
# Reproducibility
# ----------------------------
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = True  # speedup for fixed image size

# ----------------------------
# Device
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ----------------------------
# Hyperparameters
# ----------------------------
BATCH_SIZE   = 128
EPOCHS       = 40
BASE_LR      = 0.01
WEIGHT_DECAY = 5e-4
MOMENTUM     = 0.9
VAL_SPLIT    = 5000   # from the 50k training images
NUM_WORKERS  = 2
SAVE_DIR     = Path("./checkpoints")
SAVE_DIR.mkdir(parents=True, exist_ok=True)
BEST_PATH    = SAVE_DIR / "alexnet_cifar10_best.pt"

# ----------------------------
# Transforms (AlexNet expects 224x224 + ImageNet norm)
# ----------------------------
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.6, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

# ----------------------------
# Datasets & Loaders
# ----------------------------
root = "./data"
full_train = datasets.CIFAR10(root=root, train=True, download=True, transform=train_transform)
test_set   = datasets.CIFAR10(root=root, train=False, download=True, transform=eval_transform)

# Split train -> train/val (keep class distribution approx. random)
train_size = len(full_train) - VAL_SPLIT
val_size   = VAL_SPLIT
train_set, val_set = random_split(full_train, [train_size, val_size], generator=torch.Generator().manual_seed(seed))

# Important: use eval_transform for val
val_set.dataset.transform = eval_transform

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS, pin_memory=True)
val_loader   = DataLoader(val_set,   batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
test_loader  = DataLoader(test_set,  batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

print(f"Train: {len(train_set)} | Val: {len(val_set)} | Test: {len(test_set)}")

# ----------------------------
# Model: AlexNet (pretrained) -> head to 10 classes
# ----------------------------
weights = models.AlexNet_Weights.IMAGENET1K_V1
model = models.alexnet(weights=weights)

# Replace classifier last layer (4096 -> 10)
in_feats = model.classifier[6].in_features
model.classifier[6] = nn.Linear(in_feats, 10)

model = model.to(device)

# ----------------------------
# Loss, Optimizer, Scheduler
# ----------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=BASE_LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY, nesterov=True)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# ----------------------------
# Utilities
# ----------------------------
def accuracy(logits, targets):
    preds = logits.argmax(dim=1)
    return (preds == targets).float().mean().item()

@torch.no_grad()
def evaluate(loader, desc="Eval"):
    model.eval()
    total_loss, total_acc, total = 0.0, 0.0, 0
    for imgs, labels in loader:
        imgs, labels = imgs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(imgs)
            loss = criterion(logits, labels)
        bs = labels.size(0)
        total += bs
        total_loss += loss.item() * bs
        total_acc  += accuracy(logits, labels) * bs
    return total_loss / total, total_acc / total

# ----------------------------
# Train Loop
# ----------------------------
best_val_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    model.train()
    running_loss, running_acc, seen = 0.0, 0.0, 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", ncols=100)
    for imgs, labels in pbar:
        imgs, labels = imgs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(imgs)
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        bs = labels.size(0)
        seen += bs
        running_loss += loss.item() * bs
        running_acc  += accuracy(logits, labels) * bs

        pbar.set_postfix({
            "train_loss": f"{running_loss/seen:.4f}",
            "train_acc":  f"{100*running_acc/seen:.2f}%"
        })

    # Validation
    val_loss, val_acc = evaluate(val_loader, desc="Val")
    scheduler.step()

    print(f"[Epoch {epoch}] val_loss={val_loss:.4f} | val_acc={100*val_acc:.2f}% | lr={scheduler.get_last_lr()[0]:.5f}")

    # Save best
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({
            "epoch": epoch,
            "state_dict": model.state_dict(),
            "val_acc": val_acc,
        }, BEST_PATH)
        print(f"✓ Saved new best model to {BEST_PATH} (val_acc={100*val_acc:.2f}%)")

# ----------------------------
# Test Evaluation (Best Checkpoint)
# ----------------------------
ckpt = torch.load(BEST_PATH, map_location=device)
model.load_state_dict(ckpt["state_dict"])
test_loss, test_acc = evaluate(test_loader, desc="Test")
print(f"\nBest epoch: {ckpt['epoch']}  |  Test loss: {test_loss:.4f}  |  Test acc: {100*test_acc:.2f}%")


Device: cuda


100%|██████████| 170M/170M [00:10<00:00, 15.7MB/s]


Train: 45000 | Val: 5000 | Test: 10000
Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth


100%|██████████| 233M/233M [00:01<00:00, 123MB/s]
  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 1/40: 100%|████████████| 352/352 [00:58<00:00,  6.01it/s, train_loss=0.7254, train_acc=74.81%]
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


[Epoch 1] val_loss=0.4850 | val_acc=82.58% | lr=0.00998
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=82.58%)


Epoch 2/40: 100%|████████████| 352/352 [00:42<00:00,  8.29it/s, train_loss=0.4131, train_acc=85.88%]


[Epoch 2] val_loss=0.3807 | val_acc=86.66% | lr=0.00994
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=86.66%)


Epoch 3/40: 100%|████████████| 352/352 [00:42<00:00,  8.33it/s, train_loss=0.3157, train_acc=89.06%]


[Epoch 3] val_loss=0.3462 | val_acc=87.82% | lr=0.00986
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=87.82%)


Epoch 4/40: 100%|████████████| 352/352 [00:46<00:00,  7.51it/s, train_loss=0.2456, train_acc=91.39%]


[Epoch 4] val_loss=0.3732 | val_acc=87.18% | lr=0.00976


Epoch 5/40: 100%|████████████| 352/352 [00:47<00:00,  7.47it/s, train_loss=0.1929, train_acc=93.25%]


[Epoch 5] val_loss=0.3233 | val_acc=88.26% | lr=0.00962
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=88.26%)


Epoch 6/40: 100%|████████████| 352/352 [00:42<00:00,  8.30it/s, train_loss=0.1512, train_acc=94.63%]


[Epoch 6] val_loss=0.3133 | val_acc=89.40% | lr=0.00946
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=89.40%)


Epoch 7/40: 100%|████████████| 352/352 [00:42<00:00,  8.32it/s, train_loss=0.1226, train_acc=95.66%]


[Epoch 7] val_loss=0.3323 | val_acc=89.20% | lr=0.00926


Epoch 8/40: 100%|████████████| 352/352 [00:42<00:00,  8.32it/s, train_loss=0.1030, train_acc=96.37%]


[Epoch 8] val_loss=0.3538 | val_acc=89.16% | lr=0.00905


Epoch 9/40: 100%|████████████| 352/352 [00:43<00:00,  8.18it/s, train_loss=0.0815, train_acc=97.14%]


[Epoch 9] val_loss=0.3253 | val_acc=89.94% | lr=0.00880
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=89.94%)


Epoch 10/40: 100%|███████████| 352/352 [00:47<00:00,  7.48it/s, train_loss=0.0694, train_acc=97.62%]


[Epoch 10] val_loss=0.3529 | val_acc=89.96% | lr=0.00854
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=89.96%)


Epoch 11/40: 100%|███████████| 352/352 [00:42<00:00,  8.37it/s, train_loss=0.0605, train_acc=97.92%]


[Epoch 11] val_loss=0.3548 | val_acc=89.62% | lr=0.00825


Epoch 12/40: 100%|███████████| 352/352 [00:42<00:00,  8.30it/s, train_loss=0.0517, train_acc=98.22%]


[Epoch 12] val_loss=0.3346 | val_acc=90.38% | lr=0.00794
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=90.38%)


Epoch 13/40: 100%|███████████| 352/352 [00:42<00:00,  8.34it/s, train_loss=0.0386, train_acc=98.70%]


[Epoch 13] val_loss=0.3488 | val_acc=90.52% | lr=0.00761
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=90.52%)


Epoch 14/40: 100%|███████████| 352/352 [00:42<00:00,  8.26it/s, train_loss=0.0349, train_acc=98.81%]


[Epoch 14] val_loss=0.3374 | val_acc=90.64% | lr=0.00727
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=90.64%)


Epoch 15/40: 100%|███████████| 352/352 [00:42<00:00,  8.33it/s, train_loss=0.0289, train_acc=99.06%]


[Epoch 15] val_loss=0.3096 | val_acc=91.02% | lr=0.00691
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=91.02%)


Epoch 16/40: 100%|███████████| 352/352 [00:42<00:00,  8.25it/s, train_loss=0.0257, train_acc=99.13%]


[Epoch 16] val_loss=0.3504 | val_acc=90.50% | lr=0.00655


Epoch 17/40: 100%|███████████| 352/352 [00:42<00:00,  8.35it/s, train_loss=0.0208, train_acc=99.37%]


[Epoch 17] val_loss=0.3434 | val_acc=90.94% | lr=0.00617


Epoch 18/40: 100%|███████████| 352/352 [00:46<00:00,  7.53it/s, train_loss=0.0178, train_acc=99.45%]


[Epoch 18] val_loss=0.3438 | val_acc=91.30% | lr=0.00578
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=91.30%)


Epoch 19/40: 100%|███████████| 352/352 [00:41<00:00,  8.38it/s, train_loss=0.0134, train_acc=99.63%]


[Epoch 19] val_loss=0.3319 | val_acc=91.30% | lr=0.00539


Epoch 20/40: 100%|███████████| 352/352 [00:41<00:00,  8.41it/s, train_loss=0.0106, train_acc=99.71%]


[Epoch 20] val_loss=0.3337 | val_acc=91.42% | lr=0.00500
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=91.42%)


Epoch 21/40: 100%|███████████| 352/352 [00:45<00:00,  7.74it/s, train_loss=0.0090, train_acc=99.78%]


[Epoch 21] val_loss=0.3275 | val_acc=91.62% | lr=0.00461
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=91.62%)


Epoch 22/40: 100%|███████████| 352/352 [00:41<00:00,  8.42it/s, train_loss=0.0080, train_acc=99.80%]


[Epoch 22] val_loss=0.3132 | val_acc=91.78% | lr=0.00422
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=91.78%)


Epoch 23/40: 100%|███████████| 352/352 [00:41<00:00,  8.40it/s, train_loss=0.0070, train_acc=99.84%]


[Epoch 23] val_loss=0.3279 | val_acc=92.06% | lr=0.00383
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=92.06%)


Epoch 24/40: 100%|███████████| 352/352 [00:41<00:00,  8.42it/s, train_loss=0.0061, train_acc=99.85%]


[Epoch 24] val_loss=0.3437 | val_acc=91.74% | lr=0.00345


Epoch 25/40: 100%|███████████| 352/352 [00:41<00:00,  8.39it/s, train_loss=0.0045, train_acc=99.89%]


[Epoch 25] val_loss=0.3298 | val_acc=91.82% | lr=0.00309


Epoch 26/40: 100%|███████████| 352/352 [00:41<00:00,  8.43it/s, train_loss=0.0045, train_acc=99.90%]


[Epoch 26] val_loss=0.3237 | val_acc=92.12% | lr=0.00273
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=92.12%)


Epoch 27/40: 100%|███████████| 352/352 [00:41<00:00,  8.44it/s, train_loss=0.0037, train_acc=99.92%]


[Epoch 27] val_loss=0.3379 | val_acc=91.74% | lr=0.00239


Epoch 28/40: 100%|███████████| 352/352 [00:41<00:00,  8.43it/s, train_loss=0.0034, train_acc=99.94%]


[Epoch 28] val_loss=0.3301 | val_acc=92.02% | lr=0.00206


Epoch 29/40: 100%|███████████| 352/352 [00:41<00:00,  8.45it/s, train_loss=0.0028, train_acc=99.96%]


[Epoch 29] val_loss=0.3293 | val_acc=92.22% | lr=0.00175
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=92.22%)


Epoch 30/40: 100%|███████████| 352/352 [00:41<00:00,  8.42it/s, train_loss=0.0027, train_acc=99.95%]


[Epoch 30] val_loss=0.3201 | val_acc=92.10% | lr=0.00146


Epoch 31/40: 100%|███████████| 352/352 [00:41<00:00,  8.45it/s, train_loss=0.0029, train_acc=99.95%]


[Epoch 31] val_loss=0.3202 | val_acc=92.14% | lr=0.00120


Epoch 32/40: 100%|███████████| 352/352 [00:41<00:00,  8.39it/s, train_loss=0.0026, train_acc=99.96%]


[Epoch 32] val_loss=0.3207 | val_acc=92.26% | lr=0.00095
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=92.26%)


Epoch 33/40: 100%|███████████| 352/352 [00:41<00:00,  8.39it/s, train_loss=0.0027, train_acc=99.96%]


[Epoch 33] val_loss=0.3194 | val_acc=92.28% | lr=0.00074
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=92.28%)


Epoch 34/40: 100%|███████████| 352/352 [00:41<00:00,  8.39it/s, train_loss=0.0021, train_acc=99.97%]


[Epoch 34] val_loss=0.3233 | val_acc=92.22% | lr=0.00054


Epoch 35/40: 100%|███████████| 352/352 [00:41<00:00,  8.47it/s, train_loss=0.0021, train_acc=99.98%]


[Epoch 35] val_loss=0.3228 | val_acc=92.36% | lr=0.00038
✓ Saved new best model to checkpoints/alexnet_cifar10_best.pt (val_acc=92.36%)


Epoch 36/40: 100%|███████████| 352/352 [00:41<00:00,  8.43it/s, train_loss=0.0019, train_acc=99.99%]


[Epoch 36] val_loss=0.3242 | val_acc=92.22% | lr=0.00024


Epoch 37/40: 100%|███████████| 352/352 [00:41<00:00,  8.50it/s, train_loss=0.0020, train_acc=99.97%]


[Epoch 37] val_loss=0.3240 | val_acc=92.26% | lr=0.00014


Epoch 38/40: 100%|███████████| 352/352 [00:42<00:00,  8.36it/s, train_loss=0.0024, train_acc=99.96%]


[Epoch 38] val_loss=0.3226 | val_acc=92.28% | lr=0.00006


Epoch 39/40: 100%|███████████| 352/352 [00:41<00:00,  8.45it/s, train_loss=0.0021, train_acc=99.98%]


[Epoch 39] val_loss=0.3224 | val_acc=92.24% | lr=0.00002


Epoch 40/40: 100%|███████████| 352/352 [00:43<00:00,  8.07it/s, train_loss=0.0020, train_acc=99.98%]


[Epoch 40] val_loss=0.3225 | val_acc=92.24% | lr=0.00000

Best epoch: 35  |  Test loss: 0.3660  |  Test acc: 91.78%


In [None]:
# ============================
# SVM on AlexNet penultimate features
# (append after loading BEST checkpoint and test eval)
# ============================
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import itertools

# Use eval transforms (no aug) for stable feature extraction
train_set.dataset.transform = eval_transform  # already done for val_set above, test_set uses eval_transform by construction

svm_train_loader = DataLoader(train_set, batch_size=128, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
svm_val_loader   = DataLoader(val_set,   batch_size=128, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
svm_test_loader  = DataLoader(test_set,  batch_size=128, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

@torch.no_grad()
def extract_penultimate_features(dataloader):
    model.eval()
    X, y = [], []
    for imgs, labels in tqdm(dataloader, desc="Extract SVM feats", ncols=100):
        imgs = imgs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            # forward to penultimate: features -> avgpool -> flatten -> classifier[0..5]
            x = model.features(imgs)
            x = model.avgpool(x)
            x = torch.flatten(x, 1)          # 9216
            for i in range(6):                # Dropout, FC(9216->4096), ReLU, Dropout, FC(4096->4096), ReLU
                x = model.classifier[i](x)
            # x is 4096-dim penultimate features
        X.append(x.cpu().numpy())
        y.append(labels.cpu().numpy())
    return np.concatenate(X, axis=0), np.concatenate(y, axis=0)

print("\nExtracting features for SVM...")
X_tr, y_tr = extract_penultimate_features(svm_train_loader)
X_va, y_va = extract_penultimate_features(svm_val_loader)
X_te, y_te = extract_penultimate_features(svm_test_loader)

print("Feature shapes:", X_tr.shape, X_va.shape, X_te.shape)

# Linear SVM pipeline (standard and strong baseline)
svm_clf = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("svm",    LinearSVC(C=1.0, max_iter=20000, dual=True))
])

print("Training Linear SVM on penultimate features...")
svm_clf.fit(X_tr, y_tr)

# Evaluate
def eval_split(X, y, split_name):
    y_pred = svm_clf.predict(X)
    acc = accuracy_score(y, y_pred)
    print(f"\n{split_name} Accuracy (SVM on penultimate features): {acc*100:.2f}%")
    return y_pred, acc

y_pred_val, val_acc = eval_split(X_va, y_va, "Validation")
y_pred_test, test_acc = eval_split(X_te, y_te, "Test")

# Detailed report (on Test)
classes = test_set.classes
print("\nClassification report (Test):")
print(classification_report(y_te, y_pred_test, target_names=classes, digits=4))

# Confusion matrix (Test)
cm = confusion_matrix(y_te, y_pred_test)
print("Confusion Matrix (rows=true, cols=pred):")
print(cm)



Extracting features for SVM...


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Extract SVM feats: 100%|██████████████████████████████████████████| 352/352 [00:46<00:00,  7.59it/s]
Extract SVM feats: 100%|████████████████████████████████████████████| 40/40 [00:05<00:00,  7.49it/s]
Extract SVM feats: 100%|████████████████████████████████████████████| 79/79 [00:10<00:00,  7.59it/s]


Feature shapes: (45000, 4096) (5000, 4096) (10000, 4096)
Training Linear SVM on penultimate features...

Validation Accuracy (SVM on penultimate features): 91.88%

Test Accuracy (SVM on penultimate features): 91.42%

Classification report (Test):
              precision    recall  f1-score   support

    airplane     0.9093    0.9220    0.9156      1000
  automobile     0.9504    0.9590    0.9547      1000
        bird     0.9005    0.8870    0.8937      1000
         cat     0.8407    0.8130    0.8266      1000
        deer     0.9043    0.9260    0.9150      1000
         dog     0.8742    0.8620    0.8681      1000
        frog     0.9376    0.9460    0.9418      1000
       horse     0.9422    0.9460    0.9441      1000
        ship     0.9413    0.9460    0.9436      1000
       truck     0.9378    0.9350    0.9364      1000

    accuracy                         0.9142     10000
   macro avg     0.9138    0.9142    0.9140     10000
weighted avg     0.9138    0.9142    0.9140     1

In [None]:
# ============================
# Compare multiple classifiers on penultimate features
# (paste after your SVM block; uses X_tr, y_tr, X_va, y_va, X_te, y_te, classes)
# ============================
from collections import OrderedDict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

def fit_eval(name, clf, Xtr, ytr, Xva, yva, Xte, yte):
    clf.fit(Xtr, ytr)
    y_val_pred = clf.predict(Xva)
    y_te_pred  = clf.predict(Xte)
    val_acc = accuracy_score(yva, y_val_pred)
    te_acc  = accuracy_score(yte, y_te_pred)
    print(f"{name:>18} | Val: {val_acc*100:6.2f}% | Test: {te_acc*100:6.2f}%")
    return {
        "name": name, "model": clf,
        "val_acc": val_acc, "test_acc": te_acc,
        "y_val_pred": y_val_pred, "y_test_pred": y_te_pred
    }

print("\n=== Comparing classic classifiers on penultimate features ===")
models = OrderedDict()

# 1) Linear SVM (baseline)
models["LinearSVM"] = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LinearSVC(C=1.0, max_iter=20000, dual=True))
])

# 2) Multinomial Logistic Regression
models["LogReg_LBFGS"] = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(C=1.0, max_iter=3000, n_jobs=-1,
                               multi_class="multinomial", solver="lbfgs"))
])

# 3) RBF SVM (PCA→256 for speed)
models["SVM_RBF_PCA256"] = Pipeline([
    ("pca", PCA(n_components=256, random_state=42)),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SVC(kernel="rbf", C=5.0, gamma="scale"))
])

# 4) Linear SGD (hinge) — very fast linear baseline
models["SGD_Hinge"] = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", SGDClassifier(loss="hinge", alpha=1e-4, max_iter=3000,
                          tol=1e-3, n_jobs=-1, random_state=42))
])

# 5) kNN (PCA→128, k=7)
models["kNN_PCA128_k7"] = Pipeline([
    ("pca", PCA(n_components=128, random_state=42)),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", KNeighborsClassifier(n_neighbors=7, n_jobs=-1))
])

# 6) Random Forest (no scaling needed)
models["RandomForest400"] = Pipeline([
    ("clf", RandomForestClassifier(n_estimators=400, max_depth=None,
                                   n_jobs=-1, random_state=42))
])

# 7) MLP (1 hidden layer)
models["MLP_512"] = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", MLPClassifier(hidden_layer_sizes=(512,), activation="relu",
                          alpha=1e-4, batch_size=256, learning_rate_init=1e-3,
                          max_iter=200, random_state=42))
])

# Evaluate all on (train -> val/test)
results = []
for name, pipe in models.items():
    res = fit_eval(name, pipe, X_tr, y_tr, X_va, y_va, X_te, y_te)
    results.append(res)

# Rank by validation accuracy
results_sorted = sorted(results, key=lambda r: r["val_acc"], reverse=True)
print("\n=== Summary (sorted by Val acc) ===")
for r in results_sorted:
    print(f"{r['name']:>18} | Val: {r['val_acc']*100:6.2f}% | Test: {r['test_acc']*100:6.2f}%")

# Refit best on TRAIN+VAL and evaluate on TEST
best = results_sorted[0]
X_trva = np.concatenate([X_tr, X_va], axis=0)
y_trva = np.concatenate([y_tr, y_va], axis=0)
best_refit = models[best["name"]]
best_refit.fit(X_trva, y_trva)
y_test_pred_final = best_refit.predict(X_te)
final_test_acc = accuracy_score(y_te, y_test_pred_final)

print(f"\n=== FINAL (refit on train+val) — {best['name']} ===")
print(f"Final Test Accuracy: {final_test_acc*100:.2f}%")
print("\nClassification report (Test):")
print(classification_report(y_te, y_test_pred_final, target_names=classes, digits=4))

cm = confusion_matrix(y_te, y_test_pred_final)
print("Confusion Matrix (rows=true, cols=pred):")
print(cm)



=== Comparing classic classifiers on penultimate features ===
         LinearSVM | Val:  91.88% | Test:  91.42%




      LogReg_LBFGS | Val:  91.88% | Test:  91.54%
    SVM_RBF_PCA256 | Val:  92.50% | Test:  91.85%
         SGD_Hinge | Val:  91.40% | Test:  91.15%
     kNN_PCA128_k7 | Val:  91.04% | Test:  90.66%
   RandomForest400 | Val:  91.58% | Test:  90.79%
           MLP_512 | Val:  91.12% | Test:  90.51%

=== Summary (sorted by Val acc) ===
    SVM_RBF_PCA256 | Val:  92.50% | Test:  91.85%
         LinearSVM | Val:  91.88% | Test:  91.42%
      LogReg_LBFGS | Val:  91.88% | Test:  91.54%
   RandomForest400 | Val:  91.58% | Test:  90.79%
         SGD_Hinge | Val:  91.40% | Test:  91.15%
           MLP_512 | Val:  91.12% | Test:  90.51%
     kNN_PCA128_k7 | Val:  91.04% | Test:  90.66%

=== FINAL (refit on train+val) — SVM_RBF_PCA256 ===
Final Test Accuracy: 92.01%

Classification report (Test):
              precision    recall  f1-score   support

    airplane     0.9136    0.9300    0.9217      1000
  automobile     0.9579    0.9550    0.9564      1000
        bird     0.9159    0.8930    0