In [1]:
import os, random, math, json
import numpy as np
import scipy.io
import torch, torch.nn as nn
from PIL import Image
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms
from sklearn.model_selection import train_test_split
import timm
from torchmetrics.classification import MulticlassAccuracy, MulticlassF1Score


# Dataset locations inside the Kaggle input mount
DATA_ROOT = "/kaggle/input/stanford-cars-dataset"
TRAIN_DIR = os.path.join(DATA_ROOT, "cars_train/cars_train")
# Test images (labels usually not available)
TEST_DIR = os.path.join(DATA_ROOT, "cars_test/cars_test")
DEVKIT = os.path.join(DATA_ROOT, "car_devkit", "devkit")

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Per-channel normalization statistics handed to transforms.Normalize
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Worker process count passed to every DataLoader
NUM_WORKERS = 2


In [2]:
# Devkit metadata file
meta = scipy.io.loadmat(os.path.join(DEVKIT, "cars_meta.mat"))

# Unwrap each entry of the class-name table into a plain list (196 entries)
CLASS_NAMES = [name_cell[0] for name_cell in meta["class_names"][0]]

print(f"Number of Classes: {len(CLASS_NAMES)}")


# Training annotation records (fname, class in 1..196, bbox, etc.)
train_annos = scipy.io.loadmat(
    os.path.join(DEVKIT, "cars_train_annos.mat")
)["annotations"][0]




Number of Classes: 196


In [3]:
class StanfordCars(Dataset):
    """Image dataset driven by devkit ``.mat`` annotation records.

    Indexing yields ``(image, label)``: ``label`` is the record's ``class``
    value minus one (so 0..195 for 1..196 annotations) and ``image`` is the
    RGB picture, passed through ``transform`` when one is given.
    """

    def __init__(self, img_dir, annos, transform=None, use_bbox=False):
        # img_dir:   directory that the annotation file names are relative to
        # annos:     indexable collection of annotation records
        # transform: optional callable applied to the PIL image
        # use_bbox:  crop to the annotated bounding box before transforming
        self.img_dir, self.annos = img_dir, annos
        self.transform, self.use_bbox = transform, use_bbox

    def __len__(self):
        # One sample per annotation record.
        return len(self.annos)

    def __getitem__(self, i):
        record = self.annos[i]
        file_name = str(record["fname"][0])
        label = int(record["class"][0, 0]) - 1  # shift 1-based class to 0-based
        image = Image.open(os.path.join(self.img_dir, file_name)).convert("RGB")

        if self.use_bbox:
            # Optional crop to the car's bounding box for a stronger fine-grained signal.
            box_keys = ("bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2")
            left, top, right, bottom = (int(record[key][0, 0]) for key in box_keys)
            image = image.crop((left, top, right, bottom))

        if self.transform:
            image = self.transform(image)

        return image, label

In [4]:
# Training pipeline: random crop/flip/color augmentation, then tensor + normalization
train_tf = transforms.Compose(
    [
        transforms.RandomResizedCrop(size=224, scale=(0.7, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

# Validation pipeline: deterministic resize to 224x224, then tensor + normalization
val_tf = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

In [5]:
# Zero-based class label for every annotation, in annotation order (drives the stratification)
labels = np.array([int(anno["class"][0, 0]) - 1 for anno in train_annos])
indices = np.arange(len(train_annos))

# Stratified 85/15 split over annotation indices with a fixed seed
train_idx, val_idx = train_test_split(
    indices,
    test_size=0.15,
    stratify=labels,
    random_state=42,
)

# Two datasets over the same annotations: augmented for training, deterministic for validation
train_base = StanfordCars(TRAIN_DIR, train_annos, transform=train_tf, use_bbox=False)
val_base = StanfordCars(TRAIN_DIR, train_annos, transform=val_tf, use_bbox=False)

train_ds = Subset(train_base, train_idx)
val_ds = Subset(val_base, val_idx)

train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)
val_loader = DataLoader(
    val_ds,
    batch_size=32,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

# Pull one training batch as a sanity check; images come out as (B, C, H, W) = (32, 3, 224, 224)
xb, yb = next(iter(train_loader))

# Per-channel mean/std of the normalized batch, reduced over batch and spatial dims
print(
    "Batch mean:", xb.mean(dim=(0, 2, 3)).tolist(),
    "std:", xb.std(dim=(0, 2, 3)).tolist(),
)
print(
    "Train/Val sizes:", len(train_ds), len(val_ds),
    "Num classes:", len(CLASS_NAMES),
)


Batch mean: [-0.2167356312274933, -0.17636573314666748, -0.03354310244321823] std: [1.2685903310775757, 1.2715939283370972, 1.2584930658340454]
Train/Val sizes: 6922 1222 Num classes: 196


In [6]:
def train_model(epochs_frozen=3, epochs_full=17, arch="timm:convnext_tiny",
                lr_head=3e-4, lr_full=1e-4, out_dir="/kaggle/working/models"):
    """Fine-tune a timm classifier in two stages and checkpoint the best epoch.

    Stage A trains only the parameters whose names contain "head" or
    "classifier" (everything else is frozen). Stage B unfreezes all parameters
    and continues with a fresh optimizer, scheduler and grad scaler. After every
    epoch the model is evaluated on ``val_loader``; whenever the macro-F1
    improves, the weights are written to ``<out_dir>/best.pt`` together with
    ``CLASS_NAMES`` and ``arch``.

    Relies on the module-level ``train_loader``, ``val_loader``, ``CLASS_NAMES``
    and ``DEVICE``.

    Args:
        epochs_frozen: number of Stage A (head-only) epochs.
        epochs_full: number of Stage B (all parameters trainable) epochs.
        arch: model name handed to ``timm.create_model`` and stored in the checkpoint.
        lr_head: AdamW learning rate for Stage A.
        lr_full: AdamW learning rate for Stage B.
        out_dir: directory for ``best.pt``; created if missing.

    Returns:
        The model as it is after the final epoch. The best-F1 weights live in
        the checkpoint file and are not reloaded into the returned model.

    Raises:
        ValueError: if no parameter name of the model contains "head" or
            "classifier", i.e. there is nothing to train in Stage A.
    """
    os.makedirs(out_dir, exist_ok=True)
    ckpt_path = os.path.join(out_dir, "best.pt")
    num_classes = len(CLASS_NAMES)

    # Build the architecture that was actually requested; the same name is
    # recorded in the checkpoint, so the two must agree.
    model = timm.create_model(arch, pretrained=True, num_classes=num_classes).to(DEVICE)

    # Metrics setup
    acc_metric = MulticlassAccuracy(num_classes=num_classes).to(DEVICE)
    f1_metric = MulticlassF1Score(num_classes=num_classes, average="macro").to(DEVICE)
    crit = nn.CrossEntropyLoss()

    # Mixed precision is only meaningful on CUDA; disable it explicitly
    # elsewhere instead of relying on the library to warn and turn it off.
    use_amp = DEVICE == "cuda"

    def new_scaler():
        return torch.amp.GradScaler('cuda', enabled=use_amp)

    def run_epoch(opt, scaler, train=True):
        """Run one pass over the train or val loader.

        Returns (plain accuracy, torchmetrics accuracy, macro F1, mean loss).
        """
        loader = train_loader if train else val_loader
        model.train(train)
        acc_metric.reset()
        f1_metric.reset()
        total = correct = 0
        running_loss = 0.0
        for xb, yb in loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            if train:
                opt.zero_grad(set_to_none=True)
                with torch.amp.autocast('cuda', enabled=use_amp):
                    logits = model(xb)
                    loss = crit(logits, yb)
                scaler.scale(loss).backward()
                scaler.step(opt)
                scaler.update()
            else:
                with torch.no_grad():
                    logits = model(xb)
                    loss = crit(logits, yb)
            # Accumulate the loss in both modes so the returned mean is
            # meaningful for validation too.
            running_loss += loss.item() * xb.size(0)
            preds = logits.detach()
            acc_metric.update(preds, yb)
            f1_metric.update(preds, yb)
            correct += (preds.argmax(1) == yb).sum().item()
            total += yb.numel()
        seen = max(1, total)  # guard against an empty loader
        return (correct / seen, acc_metric.compute().item(),
                f1_metric.compute().item(), running_loss / seen)

    # -inf so the first evaluated epoch is always checkpointed, even at F1 == 0.
    best_f1 = float("-inf")
    saved_any = False

    def checkpoint_if_best(va_f1):
        nonlocal best_f1, saved_any
        if va_f1 > best_f1:
            best_f1 = va_f1
            torch.save({"model": model.state_dict(), "classes": CLASS_NAMES, "arch": arch},
                       ckpt_path)
            saved_any = True

    # Stage A: freeze backbone (train only head)
    head_params = []
    for name, param in model.named_parameters():
        if any(key in name for key in ("head", "classifier")):
            if param.requires_grad:
                head_params.append(param)
        else:
            param.requires_grad = False
    if not head_params:
        raise ValueError(
            f"No trainable 'head'/'classifier' parameters found for arch {arch!r}"
        )
    opt = torch.optim.AdamW(head_params, lr=lr_head, weight_decay=1e-4)
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs_frozen)
    scaler = new_scaler()

    for ep in range(1, epochs_frozen + 1):
        tr_acc, _, _, _ = run_epoch(opt, scaler, train=True)
        va_acc, _, va_f1, _ = run_epoch(opt, scaler, train=False)
        print(f"[A] E{ep:02d} train_acc={tr_acc:.3f} val_acc={va_acc:.3f} val_f1={va_f1:.3f}")
        checkpoint_if_best(va_f1)
        sched.step()

    # Stage B: unfreeze all, lower LR
    for param in model.parameters():
        param.requires_grad = True
    opt = torch.optim.AdamW(model.parameters(), lr=lr_full, weight_decay=1e-4)
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs_full)
    scaler = new_scaler()  # same non-deprecated API as Stage A

    for ep in range(1, epochs_full + 1):
        tr_acc, _, _, _ = run_epoch(opt, scaler, train=True)
        va_acc, _, va_f1, _ = run_epoch(opt, scaler, train=False)
        checkpoint_if_best(va_f1)
        print(f"[B] E{ep:02d} train_acc={tr_acc:.3f} val_acc={va_acc:.3f} val_f1={va_f1:.3f} (best_f1={best_f1:.3f})")
        sched.step()

    if saved_any:
        print("Saved:", ckpt_path)
    else:
        print("No checkpoint was written.")
    return model


In [7]:
# Two-stage fine-tuning: a short head-only warm-up, then the full network (~20 epochs total)
model = train_model(
    arch="timm:convnext_tiny",
    epochs_frozen=3,   # 3–5 is fine
    lr_head=3e-4,
    epochs_full=17,
    lr_full=1e-4,
)


model.safetensors:   0%|          | 0.00/114M [00:00<?, ?B/s]

[A] E01 train_acc=0.132 val_acc=0.243 val_f1=0.203
[A] E02 train_acc=0.366 val_acc=0.337 val_f1=0.313
[A] E03 train_acc=0.481 val_acc=0.402 val_f1=0.382


  scaler = torch.cuda.amp.GradScaler()


[B] E01 train_acc=0.538 val_acc=0.718 val_f1=0.703 (best_f1=0.703)
[B] E02 train_acc=0.894 val_acc=0.804 val_f1=0.798 (best_f1=0.798)
[B] E03 train_acc=0.966 val_acc=0.818 val_f1=0.810 (best_f1=0.810)
[B] E04 train_acc=0.987 val_acc=0.849 val_f1=0.844 (best_f1=0.844)
[B] E05 train_acc=0.993 val_acc=0.843 val_f1=0.837 (best_f1=0.844)
[B] E06 train_acc=0.994 val_acc=0.842 val_f1=0.835 (best_f1=0.844)
[B] E07 train_acc=0.993 val_acc=0.836 val_f1=0.832 (best_f1=0.844)
[B] E08 train_acc=0.991 val_acc=0.859 val_f1=0.853 (best_f1=0.853)
[B] E09 train_acc=0.997 val_acc=0.874 val_f1=0.869 (best_f1=0.869)
[B] E10 train_acc=0.999 val_acc=0.885 val_f1=0.881 (best_f1=0.881)
[B] E11 train_acc=0.999 val_acc=0.891 val_f1=0.887 (best_f1=0.887)
[B] E12 train_acc=0.998 val_acc=0.885 val_f1=0.881 (best_f1=0.887)
[B] E13 train_acc=0.999 val_acc=0.889 val_f1=0.885 (best_f1=0.887)
[B] E14 train_acc=0.999 val_acc=0.892 val_f1=0.889 (best_f1=0.889)
[B] E15 train_acc=0.999 val_acc=0.894 val_f1=0.892 (best_f1=0.