
# Vision Playground — Multi-task PyTorch Notebook

This notebook contains modular examples for **multiple computer-vision tasks**, each in its own section so you can run them separately:
- Classification: ResNet50 (CIFAR10) and MobileNetV2 (CIFAR100)
- Object Detection: Faster R-CNN (ResNet50-FPN)
- Instance Segmentation: Mask R-CNN (ResNet50-FPN)
- Semantic Segmentation: DeepLabV3 (ResNet50)
- Keypoint Detection: Keypoint R-CNN (ResNet50-FPN)
- Super-Resolution: SRCNN (simple example)
- Generative Models: DCGAN (MNIST example)

**Notes before running heavy cells**
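- Some datasets (e.g. COCO) are large and are not auto-downloaded here. For detection/segmentation you can test on small local images or use a lightweight dataset such as Pascal VOC (available through `torchvision.datasets`) or PennFudan (used in the torchvision tutorials; it must be downloaded separately).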
- Training large models requires a GPU. Many training cells are provided as demonstrative 'one-epoch' or 'smoke-test' runs and are commented out (or disabled) by default for convenience.
- The notebook is modular: run only the sections you need.


In [None]:

# Utilities: data loaders, training loop, checkpoints, plotting, tensorboard
import os, torch, torch.nn as nn, torch.optim as optim
import torchvision
import torchvision.transforms as T
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
from collections import Counter

def imshow_batch(images, titles=None, n=6, mean=0.5, std=0.5):
    """Show the first ``n`` images of a batch side by side in a single row.

    Args:
        images: Indexable batch of channel-first image tensors; each item is
            rearranged with ``permute(1, 2, 0)`` before plotting.
        titles: Optional sequence of per-image titles. Images without a
            matching title are shown untitled.
        n: Maximum number of images to show; capped at ``len(images)``.
        mean: Value added back after scaling (scalar or per-channel
            sequence). The default of 0.5 keeps the historical behaviour.
        std: Value the image is multiplied by (scalar or per-channel
            sequence). The default of 0.5 keeps the historical behaviour.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    n = min(n, len(images))
    if n <= 0:
        # Nothing to draw; plt.subplots(1, 0) would raise.
        return

    # squeeze=False always yields a 2-D axes array, so n == 1 works too.
    fig, axes = plt.subplots(1, n, figsize=(15, 3), squeeze=False)
    scale = torch.as_tensor(std)
    shift = torch.as_tensor(mean)
    for i, ax in enumerate(axes[0]):
        # After the permute the channel axis is last, so per-channel
        # mean/std broadcast over it directly.
        img = images[i].detach().cpu().permute(1, 2, 0) * scale + shift
        # imshow clips float data to [0, 1] anyway; clamping up front avoids
        # the clipping warning without changing what is displayed.
        ax.imshow(img.clamp(0, 1))
        if titles is not None and i < len(titles):
            ax.set_title(titles[i])
        ax.axis('off')
    plt.show()

def save_checkpoint(model, optimizer, epoch, path):
    """Serialise model and optimizer state plus the epoch counter to ``path``.

    Args:
        model: Object exposing ``state_dict()``.
        optimizer: Object exposing ``state_dict()``.
        epoch: Epoch value stored under the ``'epoch'`` key.
        path: Destination handed to ``torch.save``. When it is a filesystem
            path, missing parent directories are created first.
    """
    # torch.save also accepts file-like objects; only real paths have a
    # parent directory that may need creating.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    checkpoint = {
        'model_state': model.state_dict(),
        'optim_state': optimizer.state_dict(),
        'epoch': epoch,
    }
    torch.save(checkpoint, path)
    print(f"Checkpoint saved at {path}")

def load_checkpoint(model, optimizer, path):
    """Restore model (and optionally optimizer) state from a checkpoint file.

    Args:
        model: Module whose ``load_state_dict`` receives ``'model_state'``.
        optimizer: Optimizer to restore, or ``None`` to skip optimizer state.
        path: Checkpoint file path.

    Returns:
        The stored ``'epoch'`` value (0 when the key is absent), or 0 when no
        file exists at ``path``.
    """
    if not os.path.exists(path):
        print(f"No checkpoint found at {path}")
        return 0

    # Load onto the CPU so a checkpoint written on a GPU machine can still be
    # read on a CPU-only host; load_state_dict copies values into the
    # existing parameters on whatever device they live on.
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['model_state'])

    # Model-only checkpoints carry no optimizer state; skip it instead of
    # failing with a KeyError.
    optim_state = ckpt.get('optim_state')
    if optimizer is not None and optim_state is not None:
        optimizer.load_state_dict(optim_state)

    print(f"Loaded checkpoint from {path}, epoch {ckpt.get('epoch')}")
    return ckpt.get('epoch', 0)

# Simple train/eval helpers for classification tasks
def train_one_epoch_classification(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` for a classifier.

    Args:
        model: Module called as ``model(x)``; its output is reduced with
            ``max(1)`` to obtain predicted class indices.
        loader: Iterable yielding ``(x, y)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(out, y)``. The running loss is
            weighted by batch size, which assumes a mean-reduced loss
            (TODO confirm for custom criteria).
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)`` over the samples actually processed, or
        ``(0.0, 0.0)`` when the loader yields no batches.
    """
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for x, y in tqdm(loader, leave=False):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()

        batch_size = y.size(0)
        running_loss += loss.item() * batch_size
        _, preds = out.max(1)
        correct += preds.eq(y).sum().item()
        total += batch_size

    if total == 0:
        return 0.0, 0.0
    # Divide both metrics by the number of samples seen. len(loader.dataset)
    # over-counts when the loader drops the last batch or uses a sampler.
    return running_loss / total, correct / total

def evaluate_classification(model, loader, criterion, device):
    """Evaluate a classifier on ``loader`` without tracking gradients.

    Args:
        model: Module called as ``model(x)``; its output is reduced with
            ``max(1)`` to obtain predicted class indices.
        loader: Iterable yielding ``(x, y)`` batches.
        criterion: Loss called as ``criterion(out, y)``. The running loss is
            weighted by batch size, which assumes a mean-reduced loss
            (TODO confirm for custom criteria).
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)`` over the samples actually processed, or
        ``(0.0, 0.0)`` when the loader yields no batches.
    """
    model.eval()
    running_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            loss = criterion(out, y)

            batch_size = y.size(0)
            running_loss += loss.item() * batch_size
            _, preds = out.max(1)
            correct += preds.eq(y).sum().item()
            total += batch_size

    if total == 0:
        return 0.0, 0.0
    # Divide both metrics by the number of samples seen. len(loader.dataset)
    # over-counts when the loader drops the last batch or uses a sampler.
    return running_loss / total, correct / total



## Classification

Two examples:
- ResNet50 on CIFAR10 (demo)
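- MobileNetV2 on CIFAR100 (demo; `get_classification_loaders('CIFAR100')` provides the data, but only the ResNet50 model is instantiated in the cell below)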

These are small, runnable examples using torchvision datasets.


In [None]:

# Classification examples: CIFAR10 + CIFAR100 demos
from torchvision import models

def get_classification_loaders(dataset='CIFAR10', batch_size=128, augment=True,
                               num_workers=2, root='./data'):
    """Build train/test ``DataLoader``s for CIFAR10 or CIFAR100.

    Args:
        dataset: ``'CIFAR10'`` or ``'CIFAR100'``.
        batch_size: Batch size used for both loaders.
        augment: When true, the training set gets a random horizontal flip and
            a padded random 32x32 crop before tensor conversion.
        num_workers: Worker processes per loader. The default of 2 is the
            previous fixed value; pass 0 where worker processes are a problem.
        root: Directory the dataset is downloaded to / read from.

    Returns:
        ``(train_loader, test_loader, class_names, train_set)``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    # Per-dataset (mean, std) used for normalisation.
    stats = {
        'CIFAR10': ((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
        'CIFAR100': ((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762)),
    }
    try:
        mean, std = stats[dataset]
    except (KeyError, TypeError):
        raise ValueError('dataset not supported') from None

    # The supported names match the torchvision dataset class names.
    dataset_cls = getattr(torchvision.datasets, dataset)

    def to_normalised_tensor():
        return [T.ToTensor(), T.Normalize(mean, std)]

    augmentations = [T.RandomHorizontalFlip(), T.RandomCrop(32, padding=4)] if augment else []
    train_tfms = T.Compose(augmentations + to_normalised_tensor())
    test_tfms = T.Compose(to_normalised_tensor())

    train_set = dataset_cls(root, train=True, download=True, transform=train_tfms)
    test_set = dataset_cls(root, train=False, download=True, transform=test_tfms)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return train_loader, test_loader, train_set.classes, train_set

# ResNet50 on CIFAR10 (demo)
# Build the loaders, swap the ImageNet head for one sized to the CIFAR10
# classes, and prepare the loss/optimizer used by the training helper.
train_loader, test_loader, classes, train_set = get_classification_loaders('CIFAR10', batch_size=128)
model = models.resnet50(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, len(classes))
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Smoke test: one full training epoch. This is expensive (ResNet50 over the
# whole CIFAR10 training set), so it is disabled by default as the notebook
# introduction promises; set the flag to True to run it.
RUN_TRAIN_SMOKE_TEST = False
if RUN_TRAIN_SMOKE_TEST:
    train_loss, train_acc = train_one_epoch_classification(model, train_loader, optimizer, criterion, device)
    print(f'Smoke-test epoch: loss={train_loss:.4f}, acc={train_acc:.4f}')

# Visualize a batch
# NOTE: imshow_batch undoes a 0.5/0.5 normalisation by default, while these
# images were normalised with the CIFAR10 mean/std, so colours are approximate.
images, labels = next(iter(train_loader))
imshow_batch(images, [classes[label] for label in labels], n=6)



## Object Detection — Faster R-CNN (ResNet50-FPN)

Uses `torchvision.models.detection.fasterrcnn_resnet50_fpn`. For demo purposes you can use a small custom dataset or the PennFudan dataset (used in the torchvision detection tutorials).


In [None]:

# Detection model creation + inference smoke test (no full training here)
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms.functional import resize

# create model, set number of classes (including background)
num_classes = 2  # example: background + 1 object class
model_det = fasterrcnn_resnet50_fpn(pretrained=True)
# For fine-tuning with different number of classes you'd replace the box predictor
# but for inference demo we'll use the pretrained model as-is.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_det = model_det.to(device)
model_det.eval()

# Run inference on a batch of CIFAR images resized to expected sizes as quick demo.
# `images` comes from the CIFAR10 loader and is therefore normalised with the
# CIFAR10 mean/std. ToPILImage expects float images in [0, 1], so undo the
# normalisation and clamp first; otherwise the out-of-range values are
# corrupted by the uint8 conversion.
_cifar10_mean = torch.tensor((0.4914, 0.4822, 0.4465)).view(3, 1, 1)
_cifar10_std = torch.tensor((0.247, 0.243, 0.261)).view(3, 1, 1)
_demo_images = (images[:4] * _cifar10_std + _cifar10_mean).clamp(0, 1)

# `batch` (PIL images) and `inputs` (resized tensors on `device`) are reused
# by the later inference cells.
batch = [T.ToPILImage()(img) for img in _demo_images]
inputs = [T.ToTensor()(resize(img, (224,224))).to(device) for img in batch]
with torch.no_grad():
    preds = model_det(inputs)
for i,p in enumerate(preds):
    print(f'Image {i}: {len(p["boxes"])} boxes (showing scores):', p.get('scores')[:5])



## Instance Segmentation — Mask R-CNN (ResNet50-FPN)

`torchvision.models.detection.maskrcnn_resnet50_fpn` can produce per-instance masks.


In [None]:

from torchvision.models.detection import maskrcnn_resnet50_fpn

# Build the pretrained Mask R-CNN, move it to the active device and switch it
# to inference mode, one step at a time.
model_mask = maskrcnn_resnet50_fpn(pretrained=True)
model_mask = model_mask.to(device)
model_mask = model_mask.eval()

# Reuse the resized tensors prepared for the detection demo.
with torch.no_grad():
    preds = model_mask(inputs)

# One prediction dict per input image; report how many masks/boxes it holds.
for i, p in enumerate(preds):
    print(f'Image {i}: {len(p["masks"])} masks, boxes: {len(p["boxes"])}')



## Semantic Segmentation — DeepLabV3 (ResNet50 backbone)

Use `torchvision.models.segmentation.deeplabv3_resnet50`. The example below runs a quick inference demo on resized CIFAR images; Pascal VOC images would be a more representative input.


In [None]:

from torchvision.models.segmentation import deeplabv3_resnet50
model_seg = deeplabv3_resnet50(pretrained=True).to(device).eval()

# Unlike the detection models, the pretrained segmentation model does not
# normalise its inputs internally: it expects ImageNet mean/std normalised
# tensors, so apply that after resizing and tensor conversion.
seg_preprocess = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# run inference on a resized batch
inputs_seg = torch.stack([seg_preprocess(resize(img, (224,224))) for img in batch]).to(device)
with torch.no_grad():
    out = model_seg(inputs_seg)['out']  # (N, C, H, W)
    pred_mask = out.argmax(1)  # per-pixel class index
print('Segmentation output shape:', out.shape)



## Keypoint Detection — Keypoint R-CNN (pose estimation)

`torchvision.models.detection.keypointrcnn_resnet50_fpn` estimates COCO-style keypoints.


In [None]:

from torchvision.models.detection import keypointrcnn_resnet50_fpn
model_kp = keypointrcnn_resnet50_fpn(pretrained=True).to(device).eval()

# Reuse the resized tensors prepared for the detection demo.
with torch.no_grad():
    kp_preds = model_kp(inputs)

for i,p in enumerate(kp_preds):
    keypoints = p.get('keypoints', [])
    # The printed shape belongs to the whole per-image tensor (all detected
    # instances), not to a single instance, so the label says so.
    kp_shape = keypoints.shape if len(keypoints) > 0 else 'none'
    print(f'Image {i}: keypoints tensor shape for all detected instances (if any):', kp_shape)



## Super-Resolution — SRCNN (simple example)

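A small SRCNN-like model to demonstrate the super-resolution pipeline (downsample, bilinear upsample, refine with the network). The cell below only runs a forward pass through the untrained model; no training is performed.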


In [None]:

# Small SRCNN-like model
class SRCNN(nn.Module):
    """Three-layer SRCNN-style refinement network.

    Every convolution is padded so the spatial size of the input is preserved;
    the network expects an image that has already been upsampled to the
    target resolution.

    Args:
        channels: Number of input and output image channels. Defaults to 3,
            the previous fixed value; use 1 for single-channel images.
    """

    def __init__(self, channels=3):
        super().__init__()
        # Layer order (and therefore the state_dict keys) is kept unchanged.
        self.net = nn.Sequential(
            nn.Conv2d(channels, 64, 9, padding=4),
            nn.ReLU(),
            nn.Conv2d(64, 32, 5, padding=2),
            nn.ReLU(),
            nn.Conv2d(32, channels, 5, padding=2),
        )

    def forward(self, x):
        """Return the refined image, same shape as ``x``."""
        return self.net(x)

# Demo: degrade four CIFAR images by halving and then restoring their
# resolution, and push the blurry result through the (untrained) SRCNN.
import torch.nn.functional as F


def _bilinear_rescale(tensor, factor):
    # Bilinear resampling by `factor`, used for both the down- and up-scaling step.
    return F.interpolate(tensor, scale_factor=factor, mode='bilinear', align_corners=False)


model_sr = SRCNN().to(device)
imgs = images[:4].to(device)
low = _bilinear_rescale(imgs, 0.5)
low_up = _bilinear_rescale(low, 2.0)

with torch.no_grad():
    out = model_sr(low_up)

print('SR output shape:', out.shape)
imshow_batch(out.cpu(), n=4)



## Generative Models — DCGAN (MNIST demo)

Simple DCGAN generator + discriminator with loss and optimizer setup for MNIST. The cell below only runs a sample-generation smoke test with the untrained generator; no training loop is included.


In [None]:

# DCGAN simple implementation (MNIST)
import torch.nn.functional as F
from torchvision import datasets

# Data: MNIST training split, scaled from [0, 1] to [-1, 1] via a 0.5/0.5 normalisation.
mnist_tfms = T.Compose([
    T.ToTensor(),
    T.Normalize((0.5,), (0.5,)),
])
mnist = datasets.MNIST(root='./data', train=True, transform=mnist_tfms, download=True)
mn_loader = DataLoader(dataset=mnist, batch_size=128, shuffle=True, num_workers=2)

# Models (small DCGAN)
class DCGAN_G(nn.Module):
    """DCGAN generator mapping a latent vector to a 1x28x28 image in [-1, 1].

    The first transposed convolution uses a 7x7 kernel so the feature map
    grows 1 -> 7 -> 14 -> 28. That matches MNIST images and the 7x7 feature
    map the discriminator's linear head is sized for; a 4x4 first kernel
    yields 16x16 outputs the discriminator cannot consume.

    Args:
        zdim: Number of latent channels; the input is shaped
            ``(N, zdim, 1, 1)``.
    """

    def __init__(self, zdim=100):
        super().__init__()
        self.net = nn.Sequential(
            # (zdim, 1, 1) -> (128, 7, 7)
            nn.ConvTranspose2d(zdim, 128, 7, 1, 0, bias=False),
            nn.BatchNorm2d(128), nn.ReLU(True),
            # (128, 7, 7) -> (64, 14, 14)
            nn.ConvTranspose2d(128, 64, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64), nn.ReLU(True),
            # (64, 14, 14) -> (1, 28, 28); tanh bounds the output to [-1, 1]
            nn.ConvTranspose2d(64, 1, 4, 2, 1, bias=False),
            nn.Tanh()
        )

    def forward(self, z):
        """Generate images from latent input ``z`` of shape ``(N, zdim, 1, 1)``."""
        return self.net(z)

class DCGAN_D(nn.Module):
    """DCGAN discriminator scoring single-channel images with a sigmoid output.

    Two stride-2 convolutions halve the spatial size twice; the linear head
    is sized for a 128x7x7 feature map, i.e. 28x28 inputs.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor (layers created in the original order).
        feature_layers = [
            nn.Conv2d(1, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        # Flatten the feature map and squash one logit into (0, 1).
        head_layers = [
            nn.Flatten(),
            nn.Linear(128 * 7 * 7, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Return one score per image, shape ``(N, 1)``."""
        return self.net(x)

# Instantiate both networks on the active device.
G = DCGAN_G().to(device)
D = DCGAN_D().to(device)

# NOTE: this rebinds the module-level `criterion` that the classification
# cell set to a cross-entropy loss.
criterion = nn.BCELoss()

# Both optimizers share the same Adam hyper-parameters.
adam_kwargs = dict(lr=2e-4, betas=(0.5, 0.999))
optG = optim.Adam(G.parameters(), **adam_kwargs)
optD = optim.Adam(D.parameters(), **adam_kwargs)

# Smoke-test: generate samples from the untrained generator.
with torch.no_grad():
    z = torch.randn(16, 100, 1, 1, device=device)
    samples = G(z).cpu()

# Map the tanh range [-1, 1] to [0, 1] for plotting.
samples = (samples + 1) / 2

# Show the first eight samples in one row.
fig, axes = plt.subplots(1, 8, figsize=(12, 2))
for i, ax in enumerate(axes):
    ax.imshow(samples[i, 0].numpy(), cmap='gray')
    ax.axis('off')
plt.show()



---

### Notes and tips
- For detection/segmentation/keypoint tasks you usually need COCO or Pascal VOC datasets. These are large; for quick experiments prefer small datasets like PennFudan for instance segmentation (see torchvision tutorials).
- Many models above are pretrained and intended for inference or fine-tuning. Fine-tuning requires adjusting the head (box predictor, mask predictor, classifier layer) for the desired number of classes.
- Possible extensions: ready-made dataset adapters (e.g. a PennFudan loader), full training loops for detection/segmentation, and automated fine-tuning recipes.
