<a href="https://colab.research.google.com/github/Santhosh-2308/AIRL-Assignment/blob/main/q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab setup cell: lines starting with "!" are executed as shell commands.
# (q2 needs extra libraries; see the commented-out installs below and adapt as needed)
!nvidia-smi
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install timm
!pip install einops
!pip install pytorch_lightning
!pip install wandb # optional for logging
# Q2 additional installs (run only in q2.ipynb)
# !pip install git+https://github.com/facebookresearch/segment-anything.git
# !pip install groundingdino clipseg  # if available; else use repo clones


/bin/bash: line 1: nvidia-smi: command not found
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting pytorch_lightning
  Downloading pytorch_lightning-2.5.5-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading pytorch_lightning-2.5.5-py3-none-any.whl (832 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m832.4/832.4 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successf

In [None]:
# Q1 - Vision Transformer on CIFAR-10 (PyTorch)
# Google Colab Notebook

# -------------------------------
# 1. Imports
# -------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader
import time

# -------------------------------
# 2. Vision Transformer (ViT)
# -------------------------------
class PatchEmbed(nn.Module):
    """Cut an image into non-overlapping square patches and embed each one.

    A convolution whose kernel size and stride both equal ``patch_size``
    extracts the patches and applies the linear projection in a single op.
    """

    def __init__(self, img_size=32, patch_size=4, in_chans=3, embed_dim=128):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        # Patches per side, squared: the token count for a square image.
        per_side = img_size // patch_size
        self.n_patches = per_side ** 2
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Turn a batch of images into a batch of patch-token sequences."""
        grid = self.proj(x)            # (B, embed_dim, H/patch, W/patch)
        tokens = grid.flatten(2)       # (B, embed_dim, n_patches)
        tokens = tokens.transpose(1, 2)  # (B, n_patches, embed_dim)
        return tokens

class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        dim: Channel width of the input and output tokens.
        num_heads: Number of attention heads; must divide ``dim`` evenly.
        qkv_bias: Whether the fused query/key/value projection has a bias.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``dim``. Without this check the mismatch only surfaces later as
            a reshape error inside ``forward``.
    """

    def __init__(self, dim, num_heads=4, qkv_bias=True):
        super().__init__()
        if num_heads <= 0 or dim % num_heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        # Scale logits by 1/sqrt(head_dim) before the softmax.
        self.scale = self.head_dim ** -0.5
        # One linear layer produces q, k and v together.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, N, C); returns (B, N, C)."""
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, heads, N, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale  # (B, heads, N, N)
        attn = attn.softmax(dim=-1)
        # Merge the heads back into the channel dimension.
        out = (attn @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj(out)

class MLP(nn.Module):
    """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

    ``hidden_features`` falls back to ``4 * in_features`` and ``out_features``
    falls back to ``in_features`` when left unset (or passed as a falsy value).
    One dropout module with probability ``p`` is reused after both layers.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, p=0.1):
        super().__init__()
        if not hidden_features:
            hidden_features = in_features * 4
        if not out_features:
            out_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        """Run the feed-forward network over the last dimension of ``x``."""
        hidden = self.drop(F.gelu(self.fc1(x)))
        return self.drop(self.fc2(hidden))

class Block(nn.Module):
    """Transformer encoder block with LayerNorm applied before each sub-layer.

    Both the attention sub-layer and the MLP sub-layer are wrapped in a
    residual connection.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, num_heads)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = self.attn(self.norm1(x))
        x = x + attended
        transformed = self.mlp(self.norm2(x))
        return x + transformed

class ViT(nn.Module):
    """Vision Transformer classifier.

    The image is split into patch tokens, a learnable class token is
    prepended, learnable position embeddings are added, the sequence passes
    through ``depth`` transformer blocks, and the normalized class token is
    fed to a linear classification head.

    Args:
        img_size: Side length of the (square) input images.
        patch_size: Side length of each square patch.
        num_classes: Number of output logits.
        embed_dim: Token embedding width.
        depth: Number of transformer blocks.
        num_heads: Attention heads per block.
        in_chans: Number of input image channels. Defaults to 3, the value
            that was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, img_size=32, patch_size=4, num_classes=10,
                 embed_dim=128, depth=6, num_heads=4, in_chans=3):
        super().__init__()
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
        # Learnable class token and position embeddings, both starting at zero.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim))
        self.blocks = nn.ModuleList([Block(embed_dim, num_heads) for _ in range(depth)])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for a batch of images."""
        B = x.shape[0]
        x = self.patch_embed(x)
        # Share the single class token across the batch without copying it.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        # Classify from the class token (sequence position 0) only.
        return self.head(x[:, 0])

# -------------------------------
# 3. Config
# -------------------------------
# Hyperparameters read by the model constructor, data loader and optimizer setup.
config = dict(
    # model
    img_size=32,
    patch_size=4,
    embed_dim=128,
    depth=6,
    num_heads=4,
    # optimization
    batch_size=128,
    epochs=5,
    lr=3e-4,
    weight_decay=0.05,
    label_smoothing=0.1,
)

# -------------------------------
# 4. Data
# -------------------------------
# Per-channel statistics handed to T.Normalize (presumably the CIFAR-10
# training-set mean/std — verify if the dataset changes).
mean = (0.4914, 0.4822, 0.4465)
std = (0.247, 0.243, 0.261)

# Training images get random crop + flip augmentation; test images are only
# converted to tensors and normalized.
train_transform = T.Compose(
    [
        T.RandomCrop(32, padding=4),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean, std),
    ]
)
test_transform = T.Compose([T.ToTensor(), T.Normalize(mean, std)])

# Build the train split first, then the test split; both download on demand
# into ./data.
trainset, testset = (
    torchvision.datasets.CIFAR10(
        root="./data", train=is_train, download=True, transform=split_transform
    )
    for is_train, split_transform in (
        (True, train_transform),
        (False, test_transform),
    )
)

trainloader = DataLoader(
    trainset,
    batch_size=config["batch_size"],
    shuffle=True,
    num_workers=2,
)
testloader = DataLoader(
    testset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
)

# -------------------------------
# 5. Training Utils
# -------------------------------
# Train on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The architecture hyperparameters come straight from `config`.
model = ViT(
    num_classes=10,
    **{
        name: config[name]
        for name in ("img_size", "patch_size", "embed_dim", "depth", "num_heads")
    },
)
model = model.to(device)

criterion = nn.CrossEntropyLoss(label_smoothing=config["label_smoothing"])
optimizer = optim.AdamW(
    model.parameters(),
    lr=config["lr"],
    weight_decay=config["weight_decay"],
)
# Cosine schedule whose period equals the configured number of epochs.
scheduler = CosineAnnealingLR(optimizer, T_max=config["epochs"])

def evaluate(model, loader, criterion, device):
    """Compute the average loss and top-1 accuracy of ``model`` over ``loader``.

    The model is switched to eval mode (and left in it) and gradients are
    disabled for the whole pass. Each batch loss is weighted by the batch
    size before averaging (this assumes ``criterion`` returns a per-batch
    mean).

    Returns:
        A ``(mean_loss, accuracy_percent)`` tuple.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            batch_loss = criterion(logits, targets).item()
            loss_sum += batch_loss * inputs.size(0)
            preds = logits.max(1).indices
            n_correct += (preds == targets).sum().item()
            n_seen += targets.size(0)
    mean_loss = loss_sum / n_seen
    accuracy = 100. * n_correct / n_seen
    return mean_loss, accuracy


# -------------------------------
# 6. ⚡ Updated Fast Training Loop with torch.amp (future-proof)
# ----------------------------

from torch.amp import GradScaler, autocast

# Take the AMP device type from `device` (where the model and batches are
# moved) instead of re-querying CUDA availability at every call site, so the
# autocast context cannot drift from where the tensors actually live.
amp_device = torch.device(device).type
# Gradient scaling is there to keep float16 gradients from underflowing, so
# it is enabled on CUDA only. A disabled scaler passes scale()/step()/update()
# straight through, which keeps the loop body identical on both devices.
scaler = GradScaler(amp_device, enabled=(amp_device == "cuda"))
best_acc = 0.0

for epoch in range(config["epochs"]):
    model.train()
    # train_loss is a sample-weighted running sum; it is tracked but not printed.
    train_loss, correct, total = 0, 0, 0

    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        # Forward pass and loss run under mixed precision.
        with autocast(amp_device):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item() * inputs.size(0)
        _, preds = outputs.max(1)
        correct += preds.eq(targets).sum().item()
        total += targets.size(0)

    train_acc = 100.*correct/total
    # NOTE(review): the "Val" metrics are computed on `testloader`, and the
    # checkpoint below is selected on them, so the reported best accuracy is
    # tuned on the test split.
    val_loss, val_acc = evaluate(model, testloader, criterion, device)
    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Keep only the weights of the best-scoring epoch so far.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_vit_cifar10.pth")

    print(f"Epoch {epoch+1}/{config['epochs']} "
          f"Train Acc={train_acc:.2f}% | Val Acc={val_acc:.2f}% | Best={best_acc:.2f}%")

print(f"\n✅ Done. Best Test Accuracy: {best_acc:.2f}%")


KeyboardInterrupt: 

In [1]:
# Q1 - Vision Transformer (Tiny ViT) on CIFAR-10 (Fast + Boosted Accuracy)

# -------------------------------
# 1. Imports
# -------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader

# -------------------------------
# 2. Tiny ViT
# -------------------------------
class PatchEmbed(nn.Module):
    """Convert images into sequences of patch embeddings.

    Patch extraction and projection are done by one convolution whose kernel
    size and stride are both ``patch_size``.
    """

    def __init__(self, img_size=32, patch_size=4, in_chans=3, embed_dim=80):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        # Number of patches along one side, squared.
        side = img_size // patch_size
        self.n_patches = side ** 2
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Return patch tokens with the embedding channel as the last dimension."""
        projected = self.proj(x)           # (B, embed_dim, rows, cols)
        flat = projected.flatten(2)        # merge rows and cols into one token axis
        return flat.transpose(1, 2)        # (B, tokens, embed_dim)

class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        dim: Channel width of the input and output tokens.
        num_heads: Number of attention heads; must divide ``dim`` evenly.
        qkv_bias: Whether the fused query/key/value projection has a bias.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``dim``. Failing here avoids an opaque reshape error on the
            first forward pass.
    """

    def __init__(self, dim, num_heads=2, qkv_bias=True):
        super().__init__()
        if num_heads <= 0 or dim % num_heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        # Scale logits by 1/sqrt(head_dim) before the softmax.
        self.scale = self.head_dim ** -0.5
        # One linear layer produces q, k and v together.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, N, C); returns (B, N, C)."""
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, heads, N, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale  # (B, heads, N, N)
        attn = attn.softmax(dim=-1)
        # Merge the heads back into the channel dimension.
        out = (attn @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj(out)

class MLP(nn.Module):
    """Position-wise feed-forward network with GELU activation and dropout.

    When ``hidden_features`` / ``out_features`` are unset (or falsy) they
    default to ``4 * in_features`` and ``in_features`` respectively. The same
    dropout module (probability ``p``) follows both linear layers.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, p=0.1):
        super().__init__()
        if not hidden_features:
            hidden_features = in_features * 4
        if not out_features:
            out_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        """Expand, activate, drop, project back, drop."""
        expanded = self.fc1(x)
        activated = self.drop(F.gelu(expanded))
        projected = self.fc2(activated)
        return self.drop(projected)

class Block(nn.Module):
    """One transformer encoder layer: normalized attention, then normalized MLP.

    Each sub-layer output is added back onto its input (residual connection).
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, num_heads)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim)

    def forward(self, x):
        """Apply the attention residual followed by the MLP residual."""
        after_attn = x + self.attn(self.norm1(x))
        after_mlp = after_attn + self.mlp(self.norm2(after_attn))
        return after_mlp

class ViT(nn.Module):
    """Tiny Vision Transformer classifier.

    Images become patch tokens, a learnable class token is prepended,
    learnable position embeddings are added, the sequence runs through
    ``depth`` transformer blocks, and the normalized class token is passed to
    a linear classification head.

    Args:
        img_size: Side length of the (square) input images.
        patch_size: Side length of each square patch.
        num_classes: Number of output logits.
        embed_dim: Token embedding width.
        depth: Number of transformer blocks.
        num_heads: Attention heads per block.
        in_chans: Number of input image channels. Defaults to 3, the value
            that was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, img_size=32, patch_size=4, num_classes=10,
                 embed_dim=80, depth=5, num_heads=2, in_chans=3):
        super().__init__()
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
        # Learnable class token and position embeddings, both starting at zero.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim))
        self.blocks = nn.ModuleList([Block(embed_dim, num_heads) for _ in range(depth)])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for a batch of images."""
        B = x.shape[0]
        x = self.patch_embed(x)
        # Share the single class token across the batch without copying it.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        # Classify from the class token (sequence position 0) only.
        return self.head(x[:, 0])

# -------------------------------
# 3. Config
# -------------------------------
# Hyperparameters for the tiny ViT run, read by the model, loader and optimizer setup.
config = dict(
    # model
    img_size=32,
    patch_size=4,
    embed_dim=80,
    depth=5,
    num_heads=2,
    # optimization
    batch_size=128,
    epochs=5,
    lr=5e-4,
    weight_decay=0.05,
    label_smoothing=0.1,
)

# -------------------------------
# 4. Data
# -------------------------------
# Per-channel statistics handed to T.Normalize (presumably the CIFAR-10
# training-set mean/std — verify if the dataset changes).
mean = (0.4914, 0.4822, 0.4465)
std = (0.247, 0.243, 0.261)

# Training pipeline: geometric and color augmentation, then tensor conversion
# and normalization. The test pipeline skips the augmentation.
train_transform = T.Compose(
    [
        T.RandomCrop(32, padding=4),
        T.RandomHorizontalFlip(),
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        T.RandomRotation(15),
        T.ToTensor(),
        T.Normalize(mean, std),
    ]
)
test_transform = T.Compose([T.ToTensor(), T.Normalize(mean, std)])

# Build the train split first, then the test split; both download on demand
# into ./data.
trainset, testset = (
    torchvision.datasets.CIFAR10(
        root="./data", train=is_train, download=True, transform=split_transform
    )
    for is_train, split_transform in (
        (True, train_transform),
        (False, test_transform),
    )
)

trainloader = DataLoader(
    trainset,
    batch_size=config["batch_size"],
    shuffle=True,
    num_workers=2,
)
testloader = DataLoader(
    testset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
)

# -------------------------------
# 5. Training Setup
# -------------------------------
# Prefer the GPU when available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Architecture hyperparameters are pulled from `config` by name.
model = ViT(
    num_classes=10,
    **{
        name: config[name]
        for name in ("img_size", "patch_size", "embed_dim", "depth", "num_heads")
    },
)
model = model.to(device)

criterion = nn.CrossEntropyLoss(label_smoothing=config["label_smoothing"])
optimizer = optim.AdamW(
    model.parameters(),
    lr=config["lr"],
    weight_decay=config["weight_decay"],
)
# Cosine schedule whose period equals the configured number of epochs.
scheduler = CosineAnnealingLR(optimizer, T_max=config["epochs"])

def evaluate(model, loader, criterion, device):
    """Return ``(mean_loss, accuracy_percent)`` for ``model`` over ``loader``.

    Puts the model in eval mode (it is not switched back) and runs without
    gradient tracking. Batch losses are weighted by batch size before being
    averaged (this assumes ``criterion`` returns a per-batch mean).
    """
    model.eval()
    weighted_loss = 0
    hits = 0
    seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            scores = model(inputs)
            weighted_loss += criterion(scores, targets).item() * inputs.size(0)
            predicted = scores.max(1).indices
            hits += (predicted == targets).sum().item()
            seen += targets.size(0)
    return weighted_loss / seen, 100. * hits / seen

# -------------------------------
# 6. Training Loop (Fast + AMP)
# -------------------------------
from torch.amp import GradScaler, autocast

# Take the AMP device type from `device` (where the model and batches are
# moved) instead of re-querying CUDA availability at every call site, so the
# autocast context cannot drift from where the tensors actually live.
amp_device = torch.device(device).type
# Gradient scaling is there to keep float16 gradients from underflowing, so
# it is enabled on CUDA only. A disabled scaler passes scale()/step()/update()
# straight through, which keeps the loop body identical on both devices.
scaler = GradScaler(amp_device, enabled=(amp_device == "cuda"))
best_acc = 0.0

for epoch in range(config["epochs"]):
    model.train()
    # train_loss is a sample-weighted running sum; it is tracked but not printed.
    train_loss, correct, total = 0,0,0

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        # Forward pass and loss run under mixed precision.
        with autocast(amp_device):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item() * inputs.size(0)
        _, preds = outputs.max(1)
        correct += preds.eq(targets).sum().item()
        total += targets.size(0)

        # Lightweight progress report every 50 batches.
        if batch_idx % 50 == 0:
            print(f"Batch {batch_idx}/{len(trainloader)} - loss {loss.item():.4f}")

    train_acc = 100.*correct/total
    # NOTE(review): the "Val" metrics are computed on `testloader`, and the
    # checkpoint below is selected on them, so the reported best accuracy is
    # tuned on the test split.
    val_loss, val_acc = evaluate(model, testloader, criterion, device)
    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Keep only the weights of the best-scoring epoch so far.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_vit_cifar10.pth")

    print(f"Epoch {epoch+1}/{config['epochs']} "
          f"Train Acc={train_acc:.2f}% | Val Acc={val_acc:.2f}% | Best={best_acc:.2f}%")

print(f"\n✅ Done. Best Test Accuracy: {best_acc:.2f}%")


100%|██████████| 170M/170M [00:03<00:00, 54.0MB/s]


Batch 0/391 - loss 2.5479
Batch 50/391 - loss 2.0050
Batch 100/391 - loss 1.9643
Batch 150/391 - loss 1.9690
Batch 200/391 - loss 1.8594
Batch 250/391 - loss 1.9397
Batch 300/391 - loss 1.7940
Batch 350/391 - loss 1.8874
Epoch 1/5 Train Acc=30.65% | Val Acc=36.19% | Best=36.19%
Batch 0/391 - loss 1.7627
Batch 50/391 - loss 1.8145
Batch 100/391 - loss 1.7740
Batch 150/391 - loss 1.8155
Batch 200/391 - loss 1.6993
Batch 250/391 - loss 1.6656
Batch 300/391 - loss 1.6939
Batch 350/391 - loss 1.7121
Epoch 2/5 Train Acc=40.54% | Val Acc=48.50% | Best=48.50%
Batch 0/391 - loss 1.7314
Batch 50/391 - loss 1.7696
Batch 100/391 - loss 1.6109
Batch 150/391 - loss 1.6845
Batch 200/391 - loss 1.7322
Batch 250/391 - loss 1.6117
Batch 300/391 - loss 1.6073
Batch 350/391 - loss 1.5497
Epoch 3/5 Train Acc=47.15% | Val Acc=50.33% | Best=50.33%
Batch 0/391 - loss 1.6512
Batch 50/391 - loss 1.6222
Batch 100/391 - loss 1.6121
Batch 150/391 - loss 1.5614
Batch 200/391 - loss 1.4664
Batch 250/391 - loss 1.493