In [1]:
# Install into the *kernel's* environment (%pip), not whatever `pip` is first on PATH.
%pip install -q kaggle torch torchvision tqdm matplotlib

# --- Kaggle setup (upload kaggle.json first) ---
# The API token must live in ~/.kaggle with owner-only permissions.
!mkdir -p ~/.kaggle && cp kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d ifigotin/imagenetmini-1000 -p /content
# -n: never overwrite existing files, so re-running this cell does not stop at an
# interactive "replace?" prompt once the archive has already been extracted.
!unzip -q -n /content/imagenetmini-1000.zip -d /content/imagenet-mini


Dataset URL: https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000
License(s): unknown
Downloading imagenetmini-1000.zip to /content
 99% 3.90G/3.92G [01:02<00:00, 196MB/s]
100% 3.92G/3.92G [01:03<00:00, 66.7MB/s]


In [3]:
import random

import matplotlib.pyplot as plt
import numpy as np
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.models import resnet50
from tqdm import tqdm

# ---- run configuration -------------------------------------------------
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
NUM_CLASSES = 1000
BATCH_SIZE  = 256          # larger batch speeds convergence
EPOCHS      = 30
MAX_LR      = 0.2          # higher LR works with OneCycle
MOMENTUM    = 0.9
WEIGHT_DECAY = 1e-4
LABEL_SMOOTH = 0.1

# Augmentation, MixUp/CutMix and shuffling all draw random numbers; seed every
# RNG the notebook touches so that Restart & Run All is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---- preprocessing -----------------------------------------------------
# Per-channel statistics passed to Normalize for both pipelines.
mean = [0.485,0.456,0.406]; std=[0.229,0.224,0.225]

# Training pipeline: random crop / flip / colour jitter, then tensor + normalise.
train_tfms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.4,0.4,0.4,0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean,std)
])
# Validation pipeline: deterministic resize + centre crop, same normalisation.
val_tfms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean,std)
])

# ---- datasets and loaders ----------------------------------------------
train_ds = datasets.ImageFolder("/content/imagenet-mini/imagenet-mini/train", transform=train_tfms)
val_ds   = datasets.ImageFolder("/content/imagenet-mini/imagenet-mini/val",   transform=val_tfms)

# Options shared by both loaders; only shuffling differs between train and val.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True,  **_loader_kwargs)
val_loader   = DataLoader(val_ds,   shuffle=False, **_loader_kwargs)




In [4]:
def rand_bbox(size, lam):
    """Pick a random rectangular patch for CutMix.

    Args:
        size: a 4-element shape; only ``size[2]`` and ``size[3]`` are read.
        lam:  mixing coefficient; the patch side is ``sqrt(1 - lam)`` times
              the corresponding extent (before clipping).

    Returns:
        ``(x1, y1, x2, y2)`` where the ``x`` pair bounds ``size[2]`` and the
        ``y`` pair bounds ``size[3]``. The patch is centred on a uniformly
        drawn point and clipped to the valid range, so it may be smaller
        than requested (or empty).
    """
    extent_x, extent_y = size[2], size[3]
    side_ratio = (1 - lam) ** 0.5
    half_x = int(extent_x * side_ratio) // 2
    half_y = int(extent_y * side_ratio) // 2

    # Centre is drawn inclusively on both ends; x first, then y.
    centre_x = random.randint(0, extent_x)
    centre_y = random.randint(0, extent_y)

    low_x = max(centre_x - half_x, 0)
    low_y = max(centre_y - half_y, 0)
    high_x = min(centre_x + half_x, extent_x)
    high_y = min(centre_y + half_y, extent_y)
    return low_x, low_y, high_x, high_y

def mixup_cutmix_data(x, y, alpha=1.0, mixup_prob=0.5):
    """Randomly apply MixUp or CutMix to a batch.

    With probability ``mixup_prob`` the batch is mixed with a shuffled copy
    of itself (an even coin flip chooses MixUp vs. CutMix); otherwise it is
    returned untouched.

    Args:
        x: input batch; the CutMix branch indexes it with four dimensions.
        y: labels aligned with the first dimension of ``x``.
        alpha: both parameters of the Beta distribution that ``lam`` is drawn from.
        mixup_prob: probability of applying any mixing at all.

    Returns:
        ``(mixed_x, y_a, y_b, lam)``. The loss is meant to be combined as
        ``lam * loss(y_a) + (1 - lam) * loss(y_b)``. When no mixing happens
        this is ``(x, y, y, 1.0)``.
    """
    # Guard clause: leave the batch alone.
    if not (random.random() < mixup_prob):
        return x, y, y, 1.0

    lam = np.random.beta(alpha, alpha)
    n_samples = x.size()[0]
    perm = torch.randperm(n_samples).to(x.device)

    if random.random() < 0.5:
        # MixUp: convex combination of each sample with its shuffled partner.
        blended = lam * x + (1 - lam) * x[perm, :]
    else:
        # CutMix: paste a rectangle from the shuffled partner into a copy.
        x1, y1, x2, y2 = rand_bbox(x.size(), lam)
        blended = x.clone()
        blended[:, :, x1:x2, y1:y2] = x[perm, :, x1:x2, y1:y2]
        # The box may have been clipped, so recompute lam from its real area.
        patch_area = (x2 - x1) * (y2 - y1)
        lam = 1 - (patch_area / (x.size()[-1] * x.size()[-2]))

    return blended, y, y[perm], lam


In [5]:
# ResNet-50 trained from scratch (no pretrained weights).
model = resnet50(weights=None, num_classes=NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTH)
# The lr given here is only a placeholder: OneCycleLR below takes over the
# learning rate of every param group as soon as it is constructed.
optimizer = optim.SGD(model.parameters(), lr=MAX_LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
# torch.cuda.amp.GradScaler() is deprecated in favour of torch.amp.GradScaler("cuda").
# Loss scaling is only meaningful on GPU, so turn it off explicitly on CPU.
scaler = torch.amp.GradScaler("cuda", enabled=(torch.device(DEVICE).type == "cuda"))

# OneCycleLR setup
# Stepped once per batch: EPOCHS * len(train_loader) steps in total, with a
# 30% warm-up from MAX_LR/25 to MAX_LR followed by cosine annealing down to
# (MAX_LR/25)/1e4.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=MAX_LR,
    steps_per_epoch=len(train_loader), epochs=EPOCHS,
    pct_start=0.3, anneal_strategy='cos', div_factor=25.0, final_div_factor=1e4
)


  scaler = torch.cuda.amp.GradScaler()


In [6]:
import copy, numpy as np
def lr_finder(model, loader, optimizer, init_value=1e-5, final_value=1, beta=0.98):
    """Exponential learning-rate range test.

    Trains ``model`` for at most one pass over ``loader`` while multiplying
    the learning rate by a constant factor after every batch, and plots the
    bias-corrected exponentially smoothed loss against ``log10(lr)``.

    Args:
        model: network to probe. It is trained in place, so pass a throwaway copy.
        loader: iterable of ``(inputs, targets)`` batches with a ``len()``;
            must contain at least two batches.
        optimizer: optimizer bound to ``model``'s parameters. Its learning
            rate is overwritten on every param group and left at the last
            swept value.
        init_value: first learning rate of the sweep.
        final_value: learning rate reached on the last batch.
        beta: smoothing factor of the exponential moving average of the loss.

    Returns:
        ``(log_lrs, losses)``: the swept ``log10`` learning rates and the
        matching smoothed losses (the same data that is plotted).

    Raises:
        ValueError: if ``loader`` has fewer than two batches.
    """
    num = len(loader) - 1
    if num < 1:
        # The per-step multiplier is undefined for a single batch.
        raise ValueError("lr_finder needs a loader with at least 2 batches")
    mult = (final_value / init_value) ** (1 / num)

    def _set_lr(value):
        # Sweep every param group, not just the first one.
        for group in optimizer.param_groups:
            group['lr'] = value

    use_amp = torch.device(DEVICE).type == "cuda"
    # Private scaler: the probe must not disturb the scale state used for training.
    probe_scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    model.train()
    lr = init_value
    _set_lr(lr)
    avg_loss, best_loss = 0., 0.
    losses = []
    log_lrs = []
    for i, (x, y) in enumerate(loader):
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        with torch.autocast(device_type=torch.device(DEVICE).type, enabled=use_amp):
            out = model(x)
            loss = criterion(out, y)
        # Bias-corrected exponential moving average of the loss.
        avg_loss = beta * avg_loss + (1 - beta) * loss.item()
        smoothed = avg_loss / (1 - beta ** (i + 1))
        # Stop once the loss explodes; a NaN/inf loss never satisfies ">" so
        # it has to be checked explicitly.
        if i > 0 and (not np.isfinite(smoothed) or smoothed > 4 * best_loss):
            break
        if smoothed < best_loss or i == 0:
            best_loss = smoothed
        losses.append(smoothed)
        log_lrs.append(np.log10(lr))
        probe_scaler.scale(loss).backward()
        probe_scaler.step(optimizer)
        probe_scaler.update()
        lr *= mult
        _set_lr(lr)
    plt.plot(log_lrs,losses); plt.xlabel("log10 LR"); plt.ylabel("Loss"); plt.title("LR Finder"); plt.show()
    return log_lrs, losses

# optional quick check
# The optimizer must own the parameters of the model being probed, so build a
# throwaway pair instead of reusing the real optimizer (whose LR would also be
# overwritten by the sweep):
# probe_model = copy.deepcopy(model)
# probe_opt = optim.SGD(probe_model.parameters(), lr=1e-5, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
# lr_finder(probe_model, train_loader, probe_opt)


In [None]:
@torch.no_grad()
def accuracy_top1(model, loader):
    """Return the top-1 accuracy of ``model`` over ``loader`` as a fraction.

    Puts the model in eval mode (it is not switched back afterwards) and
    runs without gradient tracking. Each batch is moved to ``DEVICE``.
    """
    model.eval()
    hits, seen = 0, 0
    for images, targets in loader:
        images = images.to(DEVICE)
        targets = targets.to(DEVICE)
        logits = model(images)
        # Index of the largest logit per row is the predicted class.
        top1 = logits.max(1)[1]
        hits += (top1 == targets).sum().item()
        seen += targets.size(0)
    return hits / seen

def train_one_epoch(epoch):
    """Run one training epoch with MixUp/CutMix, AMP and per-batch OneCycle steps.

    Uses the notebook-level ``model``, ``train_loader``, ``criterion``,
    ``optimizer``, ``scaler`` and ``scheduler``.

    Args:
        epoch: index of the current epoch (accepted for the caller's
            convenience; not used inside the loop).

    Returns:
        ``(mean_loss, train_acc)``: the loss averaged over batches and the
        top-1 training accuracy as a fraction. On mixed batches a
        prediction is credited ``lam`` for matching the first label set and
        ``1 - lam`` for matching the second, mirroring how the loss is
        combined. Scoring against the unmixed labels alone would
        understate accuracy whenever MixUp/CutMix is applied.
    """
    model.train()
    device_type = torch.device(DEVICE).type
    use_amp = device_type == "cuda"
    run_loss = 0.0
    correct = 0.0
    total = 0
    for x, y in tqdm(train_loader, leave=False):
        x, y = x.to(DEVICE), y.to(DEVICE)
        x, y_a, y_b, lam = mixup_cutmix_data(x, y)
        lam = float(lam)  # may arrive as a numpy scalar
        optimizer.zero_grad()
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        with torch.autocast(device_type=device_type, enabled=use_amp):
            out = model(x)
            loss = lam * criterion(out, y_a) + (1 - lam) * criterion(out, y_b)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # OneCycleLR is configured per batch, so it advances every iteration.
        scheduler.step()
        run_loss += loss.item()
        pred = out.argmax(dim=1)
        # With no mixing y_a is y_b and lam == 1, so this reduces to plain accuracy.
        correct += lam * (pred == y_a).sum().item() + (1 - lam) * (pred == y_b).sum().item()
        total += y.size(0)
    train_acc = correct / total
    return run_loss / len(train_loader), train_acc

# ---- main loop ----
# Train for EPOCHS epochs, evaluate on the validation set after each one,
# keep per-epoch history for plotting, and checkpoint whenever validation
# accuracy improves on the best seen so far.
best = 0
train_losses, train_accs, val_accs = [], [], []
for ep in range(EPOCHS):
    tl, ta = train_one_epoch(ep)
    va = accuracy_top1(model, val_loader)

    train_losses.append(tl)
    train_accs.append(ta)
    val_accs.append(va)
    print(f"Epoch {ep+1:02d} | Loss {tl:.4f} | Train@1 {ta*100:.2f}% | Val@1 {va*100:.2f}%")

    if not (va > best):
        continue
    best = va
    torch.save(model.state_dict(), "resnet50_imagenetmini_mixcut_best.pth")
    print(f"✅ Saved new best model ({best*100:.2f}%)")


In [None]:
# Training summary
fig, (ax_loss, ax_acc, ax_lr) = plt.subplots(1, 3, figsize=(14, 4))

ax_loss.plot(train_losses)
ax_loss.set_title("Train Loss")

ax_acc.plot([v*100 for v in train_accs], label='Train@1')
ax_acc.plot([v*100 for v in val_accs], label='Val@1')
ax_acc.legend()
ax_acc.set_title("Accuracy (%)")

# Reading optimizer.param_groups after training only yields the *final*
# learning rate, repeated. Rebuild the schedule instead by stepping a throwaway
# optimizer/scheduler pair. NOTE: these OneCycleLR arguments must mirror the
# ones used for the real scheduler.
_steps_per_epoch = len(train_loader)
_total_steps = EPOCHS * _steps_per_epoch
_probe_opt = optim.SGD([nn.Parameter(torch.zeros(1))], lr=MAX_LR)
_probe_sched = torch.optim.lr_scheduler.OneCycleLR(
    _probe_opt, max_lr=MAX_LR,
    steps_per_epoch=_steps_per_epoch, epochs=EPOCHS,
    pct_start=0.3, anneal_strategy='cos', div_factor=25.0, final_div_factor=1e4
)
lrs = []
for _step in range(_total_steps):
    lrs.append(_probe_opt.param_groups[0]['lr'])
    if _step < _total_steps - 1:
        # No gradients exist, so optimizer.step() is a no-op; it is called only
        # to keep the optimizer-before-scheduler order PyTorch expects.
        _probe_opt.step()
        _probe_sched.step()

ax_lr.plot(lrs[:_steps_per_epoch], color='orange')
ax_lr.set_title("One-Cycle LR (first epoch)")

fig.tight_layout()
plt.show()

print(f"🏁 Best Val Top-1: {best*100:.2f}%")
