In [32]:
import os, time, random, gc
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
import pandas as pd

In [33]:
# ---- Runtime and dataset location -------------------------------------------
DATA_ROOT = "data/CUB_200_2011"
NUM_CLASSES = 200
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---- Optimisation hyper-parameters ------------------------------------------
BATCH_SIZE = 32
WEIGHT_DECAY = 1e-4
# One learning rate per training stage (stage 1 / stage 2 / stage 3).
INIT_LR = 1e-4
MID_LR = 1e-5
FINAL_LR = 1e-6
# StepLR period (in epochs) used by the stage-1 scheduler.
STEP_SIZE = 7

# ---- Schedule length per stage ----------------------------------------------
EPOCHS_STAGE1 = 10
EPOCHS_STAGE2 = 50
EPOCHS_STAGE3 = 250

# ---- Reproducibility --------------------------------------------------------
SEED = 87
# Seed every RNG the notebook touches: torch (CPU), torch (all CUDA devices),
# NumPy's legacy global generator, and Python's `random` module.
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
    _seed_fn(SEED)
del _seed_fn



In [34]:
# Per-channel statistics applied by both pipelines (the standard ImageNet
# values), kept in one place so train and test normalisation cannot drift apart.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]


def _to_normalized_tensor():
    """Return the common tail of both pipelines: PIL image -> normalised tensor."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]


# Training pipeline: random crop/flip/colour/rotation augmentation, then the
# shared tensor conversion and normalisation.
_train_augmentations = [
    transforms.RandomResizedCrop(224, scale=(0.5, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.RandomRotation(degrees=15),
]
train_transforms = transforms.Compose(_train_augmentations + _to_normalized_tensor())

# Evaluation pipeline: deterministic resize + centre crop, no augmentation.
_test_geometry = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
]
test_transforms = transforms.Compose(_test_geometry + _to_normalized_tensor())

class CUBDataset(Dataset):
    """Train or test split of a CUB-200-2011 style directory.

    Three whitespace-separated metadata files are read from ``root``:

    * ``images.txt``              -- ``<image_id> <relative_path>``
    * ``image_class_labels.txt``  -- ``<image_id> <class_id>`` (1-based)
    * ``train_test_split.txt``    -- ``<image_id> <flag>`` (1 = train, 0 = test)

    Rows are joined on ``image_id`` rather than on line position, so the files
    do not need to share the same ordering, and blank lines are ignored.

    Args:
        root: Directory that contains the three metadata files and an
            ``images/`` sub-directory.
        train: Truthy selects rows whose split flag is 1, falsy selects rows
            whose split flag is 0.
        transform: Optional callable applied to the loaded RGB PIL image.

    Attributes:
        samples: List of ``(absolute_or_root_relative_path, zero_based_label)``
            tuples in the order they appear in ``images.txt``.

    Raises:
        ValueError: If an id listed in ``images.txt`` has no label or no split
            entry, or a metadata line does not have two fields.
    """

    def __init__(self, root, train=True, transform=None):
        image_rows = self._read_id_value_rows(os.path.join(root, "images.txt"))
        label_by_id = dict(self._read_id_value_rows(os.path.join(root, "image_class_labels.txt")))
        split_by_id = dict(self._read_id_value_rows(os.path.join(root, "train_test_split.txt")))

        wanted_flag = 1 if train else 0
        self.samples = []
        for img_id, rel_path in image_rows:
            if img_id not in label_by_id or img_id not in split_by_id:
                raise ValueError(
                    f"image id {img_id!r} from images.txt has no label or split entry"
                )
            # Class ids in the label file are 1-based; shift to 0-based.
            label = int(label_by_id[img_id]) - 1
            if int(split_by_id[img_id]) == wanted_flag:
                self.samples.append((os.path.join(root, "images", rel_path), label))
        self.transform = transform

    @staticmethod
    def _read_id_value_rows(path):
        """Parse ``<id> <value>`` lines from ``path`` into a list of string pairs."""
        rows = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank / trailing empty lines
                # Split on the first whitespace run only, so the value may
                # itself contain spaces.
                img_id, value = line.split(None, 1)
                rows.append((img_id, value))
        return rows

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``."""
        img_path, label = self.samples[idx]
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied by convert().
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        return img, label

# Settings shared by both loaders; only the dataset split and shuffling differ.
_loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=10,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=4,
)

# Training split: augmented and reshuffled every epoch.
_train_split = CUBDataset(DATA_ROOT, train=True, transform=train_transforms)
train_loader = DataLoader(_train_split, shuffle=True, **_loader_options)

# Test split: deterministic preprocessing, fixed order.
_test_split = CUBDataset(DATA_ROOT, train=False, transform=test_transforms)
test_loader = DataLoader(_test_split, shuffle=False, **_loader_options)

def mixup_data(x, y, alpha=0.2, device='cuda'):
    """Blend every sample in a batch with a randomly chosen partner (MixUp).

    Args:
        x: Batch tensor; dimension 0 is the batch dimension.
        y: Per-sample targets, indexable along dimension 0 by an index tensor.
        alpha: Parameter of the Beta(alpha, alpha) distribution the mixing
            weight is drawn from. ``alpha <= 0`` disables mixing (``lam = 1``).
        device: Kept only for backward compatibility and ignored. The
            permutation is always placed on ``x.device``, so the function
            works on CPU-only machines and cannot index ``x`` with a tensor
            that lives on a different device.

    Returns:
        Tuple ``(mixed_x, y_a, y_b, lam)`` where
        ``mixed_x = lam * x + (1 - lam) * x[perm]``, ``y_a`` is ``y``,
        ``y_b`` is ``y[perm]`` and ``lam`` is a Python float.
    """
    # Convert NumPy's scalar to a plain float so the arithmetic below is a
    # straightforward tensor-by-Python-number multiplication.
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0
    batch_size = x.size(0)
    # Draw on the CPU generator (as before, so seeded runs are unchanged) and
    # then move the permutation to wherever the batch actually lives.
    index = torch.randperm(batch_size).to(x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the loss against both MixUp targets, weighted by ``lam``.

    ``criterion`` is called twice on the same ``pred`` -- first with ``y_a``,
    then with ``y_b`` -- and the results are mixed as
    ``lam * loss_a + (1 - lam) * loss_b``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    weight_b = 1 - lam
    return lam * loss_a + weight_b * loss_b

@torch.no_grad()
def evaluate_model(model, loader, criterion, device):
    """Score ``model`` on every batch of ``loader`` with gradients disabled.

    The model is switched to eval mode and left there. The per-batch loss
    returned by ``criterion`` is weighted by the batch size, so the reported
    loss is a per-sample average over the whole loader.

    Returns:
        ``(accuracy_percent, mean_loss)``.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_images, batch_labels in loader:
        batch_images = batch_images.to(device, non_blocking=True)
        batch_labels = batch_labels.to(device, non_blocking=True)
        n_batch = batch_labels.size(0)

        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        loss_sum += batch_loss.item() * n_batch

        # The predicted class is the arg-max over dimension 1 of the output.
        predicted = logits.argmax(1)
        n_correct += (predicted == batch_labels).sum().item()
        n_seen += n_batch

    accuracy_percent = 100 * n_correct / n_seen
    mean_loss = loss_sum / n_seen
    return accuracy_percent, mean_loss

def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one MixUp training pass over ``loader`` and return the mean loss.

    The model is put into train mode. Every batch is moved to ``device``,
    mixed with ``mixup_data``, scored with ``mixup_criterion``, and followed
    by one optimizer step. The returned value is the batch losses averaged
    per sample (each batch loss weighted by its batch size).
    """
    model.train()
    loss_sum = 0.0
    n_seen = 0

    for batch_images, batch_labels in loader:
        batch_images = batch_images.to(device, non_blocking=True)
        batch_labels = batch_labels.to(device, non_blocking=True)

        # MixUp: blend the inputs and keep both target sets plus the weight.
        blended, targets_a, targets_b, lam = mixup_data(batch_images, batch_labels, device=device)

        optimizer.zero_grad()
        logits = model(blended)
        batch_loss = mixup_criterion(criterion, logits, targets_a, targets_b, lam)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        n_batch = batch_labels.size(0)
        loss_sum += batch_loss.item() * n_batch
        n_seen += n_batch

    return loss_sum / n_seen



# Model: ResNet152

In [35]:
def get_resnet():
    """Return a ResNet-152 with IMAGENET1K_V1 weights and a fresh classifier.

    The original ``fc`` layer is replaced by a new ``nn.Linear`` that keeps
    the same input width and produces ``NUM_CLASSES`` outputs.
    """
    pretrained_weights = models.ResNet152_Weights.IMAGENET1K_V1
    backbone = models.resnet152(weights=pretrained_weights)
    backbone.fc = nn.Linear(backbone.fc.in_features, NUM_CLASSES)
    return backbone

In [36]:
def _run_stage(model, tag, num_epochs, optimizer, scheduler, criterion,
               epoch_digits, best_acc=0.0, checkpoint_path=None):
    """Train and evaluate ``model`` for one stage, logging one line per epoch.

    Args:
        model: Network being trained (already on ``DEVICE``).
        tag: Short stage label used in the log prefix, e.g. ``"S1"``.
        num_epochs: Number of epochs to run.
        optimizer: Optimizer stepped by ``train_one_epoch``.
        scheduler: LR scheduler stepped once after every epoch.
        criterion: Loss used for both training and evaluation.
        epoch_digits: Zero-padded width of the epoch number in the log line.
        best_acc: Best test accuracy seen so far (only used for checkpointing).
        checkpoint_path: When given, the model's ``state_dict`` is saved here
            every time the test accuracy exceeds ``best_acc``. ``None``
            disables checkpointing.

    Returns:
        ``(history, best_acc)`` where ``history`` is a list of
        ``(epoch, train_loss, test_loss, test_acc)`` tuples with 1-based epochs.
    """
    history = []
    for epoch in range(1, num_epochs + 1):
        tr_loss = train_one_epoch(model, train_loader, optimizer, criterion, DEVICE)
        te_acc, te_loss = evaluate_model(model, test_loader, criterion, DEVICE)
        scheduler.step()

        history.append((epoch, tr_loss, te_loss, te_acc))

        if checkpoint_path is not None and te_acc > best_acc:
            best_acc = te_acc
            torch.save(model.state_dict(), checkpoint_path)

        print(f"[{tag}-Epoch {epoch:0{epoch_digits}d}] "
              f"Train Loss={tr_loss:.4f} | Test Loss={te_loss:.4f} | Test Acc={te_acc:.2f}%")

    return history, best_acc


def run_training_3stage(model_name, model):
    """Train ``model`` in three stages and return the per-epoch history of each.

    Stage 1 trains only ``model.fc`` with every other parameter frozen.
    Stage 2 unfreezes everything and fine-tunes at ``MID_LR``.
    Stage 3 keeps training the whole network at ``FINAL_LR``.

    The best-scoring weights are written to ``best_{model_name}.pt`` during
    stages 2 and 3. Stage 3 continues from stage 2's best accuracy, so the
    checkpoint is only replaced by a strictly better model.

    NOTE(review): the checkpoint is selected on ``test_loader``, so the "best"
    accuracy is chosen on the same data it is reported on.

    Args:
        model_name: Name used to build the checkpoint file name.
        model: Network with an ``fc`` classification head.

    Returns:
        ``(stage1_hist, stage2_hist, stage3_hist)``; each is a list of
        ``(epoch, train_loss, test_loss, test_acc)`` tuples.
    """
    print("=== Training ===")
    criterion = nn.CrossEntropyLoss()
    checkpoint_path = f"best_{model_name}.pt"

    # ============================================================
    # Stage 1: Train classifier only (linear probing)
    # ============================================================
    model.requires_grad_(False)
    model.fc.requires_grad_(True)

    print("--- Stage 1 ---")
    optimizer = optim.AdamW(model.fc.parameters(), lr=INIT_LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=0.1)
    stage1_hist, _ = _run_stage(
        model, "S1", EPOCHS_STAGE1, optimizer, scheduler, criterion, epoch_digits=2
    )

    # ============================================================
    # Stage 2: Fine-tune entire network (moderate LR)
    # ============================================================
    model.requires_grad_(True)

    print("--- Stage 2---")
    optimizer = optim.AdamW(model.parameters(), lr=MID_LR, weight_decay=WEIGHT_DECAY)
    # step_size equals the stage length, so the decay only fires on the last
    # scheduler.step() and the LR is constant for every epoch of the stage.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=EPOCHS_STAGE2, gamma=0.1)
    stage2_hist, best_acc = _run_stage(
        model, "S2", EPOCHS_STAGE2, optimizer, scheduler, criterion,
        epoch_digits=3, best_acc=0.0, checkpoint_path=checkpoint_path,
    )

    # ============================================================
    # Stage 3: Low-LR refinement (very small LR)
    # ============================================================
    print("--- Stage 3---")
    optimizer = optim.AdamW(model.parameters(), lr=FINAL_LR, weight_decay=WEIGHT_DECAY)
    # Same as stage 2: effectively a constant LR for the whole stage.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=EPOCHS_STAGE3, gamma=0.1)
    # Keep tracking the best model here too; otherwise nothing learned in the
    # longest stage would ever reach the checkpoint file.
    stage3_hist, best_acc = _run_stage(
        model, "S3", EPOCHS_STAGE3, optimizer, scheduler, criterion,
        epoch_digits=3, best_acc=best_acc, checkpoint_path=checkpoint_path,
    )

    return stage1_hist, stage2_hist, stage3_hist


In [37]:
# Release whatever earlier cells left behind before allocating a new model.
gc.collect()
torch.cuda.empty_cache()

HISTORY_COLUMNS = ["epoch", "train_loss", "test_loss", "test_acc"]
STAGE_NAMES = ("Stage 1", "Stage 2", "Stage 3")

# Per-model training history: model name -> DataFrame with one row per epoch
# and the columns in HISTORY_COLUMNS plus "stage" and "model".
results = {}

for name, model_fn in [("ResNet", get_resnet)]:
    model = model_fn().to(DEVICE)
    stage_histories = run_training_3stage(name, model)

    # One frame per stage, tagged with its stage and model, then stacked.
    results[name] = pd.concat(
        [
            pd.DataFrame(rows, columns=HISTORY_COLUMNS).assign(stage=stage, model=name)
            for stage, rows in zip(STAGE_NAMES, stage_histories)
        ],
        ignore_index=True,
    )

# Combined history of every model trained above (previously only the last
# model's history survived the loop).
hist = pd.concat(list(results.values()), ignore_index=True)

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.12/multiprocessing/queues.py", line 259, in _feed
    reader_close()
  File "/root/miniconda3/lib/python3.12/multiprocessing/connection.py", line 178, in close
    self._close()
  File "/root/miniconda3/lib/python3.12/multiprocessing/connection.py", line 377, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor


=== Training ===
--- Stage 1 ---
[S1-Epoch 01] Train Loss=5.2301 | Test Loss=4.9211 | Test Acc=7.47%
[S1-Epoch 02] Train Loss=4.8781 | Test Loss=4.5642 | Test Acc=16.60%
[S1-Epoch 03] Train Loss=4.5674 | Test Loss=4.2221 | Test Acc=28.22%
[S1-Epoch 04] Train Loss=4.3136 | Test Loss=3.9506 | Test Acc=33.02%
[S1-Epoch 05] Train Loss=4.1176 | Test Loss=3.7169 | Test Acc=35.69%
[S1-Epoch 06] Train Loss=3.9044 | Test Loss=3.4722 | Test Acc=40.30%
[S1-Epoch 07] Train Loss=3.8009 | Test Loss=3.3109 | Test Acc=41.80%
[S1-Epoch 08] Train Loss=3.6076 | Test Loss=3.2835 | Test Acc=45.58%
[S1-Epoch 09] Train Loss=3.5737 | Test Loss=3.2716 | Test Acc=47.70%
[S1-Epoch 10] Train Loss=3.5548 | Test Loss=3.2311 | Test Acc=50.29%
--- Stage 2---
[S2-Epoch 001] Train Loss=3.0986 | Test Loss=2.2417 | Test Acc=60.03%
[S2-Epoch 002] Train Loss=2.6874 | Test Loss=1.9290 | Test Acc=64.64%
[S2-Epoch 003] Train Loss=2.4526 | Test Loss=1.7393 | Test Acc=68.88%
[S2-Epoch 004] Train Loss=2.2076 | Test Loss=1.5912 |

In [38]:
# Write the training history to disk, leaving out the DataFrame index column.
HISTORY_CSV_PATH = "ResNet_3stg.csv"
hist.to_csv(HISTORY_CSV_PATH, index=False)