<a href="https://colab.research.google.com/github/SaiRajesh228/DA6401_Assignment2/blob/main/DA6401_Assignment2_PartB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset path used in main() lives
# under this mount point. force_remount=True is passed so the mount is redone
# when the cell is re-run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os
import time
from tqdm.notebook import tqdm
import numpy as np
import torch
import torchvision
import torch.nn as nn
from torchvision import transforms
from torchvision.models import resnet50, ResNet50_Weights
import wandb
from torch.utils.data import ConcatDataset

# Set seeds
# Seed the PyTorch and NumPy global RNGs once, when this cell executes.
seed = 76
torch.manual_seed(seed)
np.random.seed(seed)

class DataPreparation:
    """Create ``DataLoader`` objects from ``ImageFolder`` directories.

    ``data_dir`` is the root that contains the split sub-directories,
    ``device`` is stored for reference only, and ``default_transforms`` is the
    transform applied to every image. The dataset built by the most recent
    ``create_dataloader`` call is kept in ``self.dataset``.
    """

    def __init__(self, data_dir, device, default_transforms=None):
        self.base_dir, self.device = data_dir, device
        self.default_transforms = default_transforms

    def create_dataloader(self, sub_dir, batch_size=16, shuffle=True,
                          num_workers=0,  # Force 0 workers for Colab
                          data_augmentation_transforms=None,
                          pin_memory=False):
        """Return a ``DataLoader`` over the images in ``base_dir/sub_dir``.

        Args:
            sub_dir: Split directory relative to the base directory.
            batch_size: Number of samples per batch.
            shuffle: Whether the loader reshuffles every epoch.
            num_workers: Worker processes used by the loader.
            data_augmentation_transforms: Optional iterable of transform
                lists. When it is non-empty and ``sub_dir`` contains
                ``"train"``, one extra copy of the dataset is appended per
                transform list (augmentation followed by the default
                transforms), on top of the un-augmented copy.
            pin_memory: Forwarded to the ``DataLoader``.

        Raises:
            FileNotFoundError: If the split directory does not exist.
        """
        print(f"\nPreparing data from {sub_dir}")

        full_path = os.path.join(self.base_dir, sub_dir)
        if not os.path.exists(full_path):
            raise FileNotFoundError(f"Directory {full_path} not found!")

        # Every entry directly under the split directory is reported as a class.
        print(f"Found {len(os.listdir(full_path))} classes in {full_path}")

        wants_augmentation = "train" in sub_dir and bool(data_augmentation_transforms)

        if not wants_augmentation:
            self.dataset = torchvision.datasets.ImageFolder(
                full_path, transform=self.default_transforms
            )
        else:
            print("Applying data augmentations")
            # First the plain copy, then one augmented copy per transform list.
            parts = [
                torchvision.datasets.ImageFolder(
                    full_path, transform=self.default_transforms
                )
            ]
            parts.extend(
                torchvision.datasets.ImageFolder(
                    full_path,
                    transform=transforms.Compose(
                        [*pipeline, self.default_transforms]
                    ),
                )
                for pipeline in data_augmentation_transforms
            )
            self.dataset = ConcatDataset(parts)

        print(f"Total samples: {len(self.dataset)}")
        loader_options = dict(
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,  # Critical for Colab
            pin_memory=pin_memory,
        )
        return torch.utils.data.DataLoader(self.dataset, **loader_options)

class Experiment:
    """Fine-tune an ImageNet-pretrained ResNet-50 by training only a new head.

    Intended call order: ``createResNet`` -> ``create_dataloaders`` ->
    ``train``. ``train`` reads ``self.model``, ``self.train_loader`` and
    ``self.val_loader``.
    """

    def __init__(self, device, base_dir, wandb_logging=False):
        """Store the run settings and report the device in use.

        Args:
            device: ``torch.device`` (or device string) tensors are moved to.
            base_dir: Dataset root containing the ``train/`` and ``val/`` splits.
            wandb_logging: When true, per-epoch metrics are sent to ``wandb.log``.
        """
        self.device = device
        self.base_data_dir = base_dir
        self.wandb_logging = wandb_logging
        # Defined here so compute_accuracy() works even before train() runs.
        self.criterion = nn.CrossEntropyLoss()

        print(f"\nUsing device: {device}")
        # torch.device(...) normalises strings and indexed devices such as
        # "cuda:0", which a plain string comparison against 'cuda' would miss.
        resolved = torch.device(device)
        if resolved.type == 'cuda':
            print(f"GPU: {torch.cuda.get_device_name(resolved)}")

    def createResNet(self, num_output_neurons):
        """Load pretrained ResNet-50, freeze it, and attach a fresh final layer.

        Sets ``self.model`` and ``self.default_transforms`` (the preprocessing
        bundled with the pretrained weights).

        Args:
            num_output_neurons: Output size of the replacement ``fc`` layer.
        """
        self.model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        self.default_transforms = ResNet50_Weights.IMAGENET1K_V2.transforms()

        # Freeze base layers
        for param in self.model.parameters():
            param.requires_grad = False

        # Modify final layer
        num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Linear(num_ftrs, num_output_neurons)

        # Initialize weights
        nn.init.xavier_uniform_(self.model.fc.weight)
        nn.init.constant_(self.model.fc.bias, 0.01)

        # Unfreeze final layer (kept explicit so the trainable part is obvious)
        for param in self.model.fc.parameters():
            param.requires_grad = True

        self.model.to(self.device)
        print("\nModel architecture:")
        print(self.model)

    def create_dataloaders(self, batch_size, shuffle,
                          list_of_train_data_augmentation_transforms):
        """Build the training and validation loaders.

        The loaders are also stored on the instance as ``self.train_loader``
        and ``self.val_loader`` because ``train`` reads them from there.

        Args:
            batch_size: Batch size for both loaders.
            shuffle: Whether the training loader shuffles; validation never does.
            list_of_train_data_augmentation_transforms: Transform lists used to
                add augmented copies of the training set (may be empty).

        Returns:
            ``(train_loader, val_loader)``.
        """
        dataprep = DataPreparation(
            self.base_data_dir,
            self.device,
            self.default_transforms
        )

        train_loader = dataprep.create_dataloader(
            sub_dir="train/",
            batch_size=batch_size,
            shuffle=shuffle,
            data_augmentation_transforms=list_of_train_data_augmentation_transforms
        )

        val_loader = dataprep.create_dataloader(
            sub_dir="val/",
            batch_size=batch_size,
            shuffle=False
        )

        self.train_loader = train_loader
        self.val_loader = val_loader
        return train_loader, val_loader

    def compute_accuracy(self, model, data_loader):
        """Evaluate ``model`` on ``data_loader``.

        Puts the model in eval mode and runs without gradient tracking.

        Returns:
            ``(mean loss per sample, accuracy in percent)``.
        """
        model.eval()
        correct = 0
        total = 0
        loss = 0.0

        with torch.no_grad():
            for images, labels in tqdm(data_loader, desc="Validating", leave=False):
                images, labels = images.to(self.device), labels.to(self.device)
                outputs = model(images)
                # criterion returns the batch mean; re-weight by batch size so
                # a smaller final batch does not skew the average.
                loss += self.criterion(outputs, labels).item() * images.size(0)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        return loss/total, 100 * correct/total

    def train(self, lr, weight_decay, optimiser, epochs):
        """Train the unfrozen parameters and report metrics after every epoch.

        Args:
            lr: Learning rate.
            weight_decay: Weight-decay coefficient passed to the optimiser.
            optimiser: One of ``'adam'``, ``'nadam'``, ``'rmsprop'``
                (case-insensitive).
            epochs: Number of passes over ``self.train_loader``.

        Raises:
            KeyError: If ``optimiser`` is not a supported name.
        """
        optimisers = {
            'adam': torch.optim.Adam,
            'nadam': torch.optim.NAdam,
            'rmsprop': torch.optim.RMSprop
        }
        try:
            optimiser_cls = optimisers[optimiser.lower()]
        except KeyError:
            raise KeyError(
                f"Unknown optimiser {optimiser!r}; expected one of {sorted(optimisers)}"
            ) from None

        # Only hand the optimiser the parameters that are actually trained.
        trainable_params = [p for p in self.model.parameters() if p.requires_grad]
        self.optimiser = optimiser_cls(
            trainable_params,
            lr=lr,
            weight_decay=weight_decay
        )

        self.criterion = nn.CrossEntropyLoss()

        # Test single batch first. Run it in eval mode without autograd so the
        # check has no side effects on BatchNorm statistics and builds no graph;
        # the previous train/eval mode is restored afterwards.
        print("\nTesting single batch...")
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                test_inputs, _ = next(iter(self.train_loader))
                test_inputs = test_inputs.to(self.device)
                outputs = self.model(test_inputs)
        finally:
            self.model.train(was_training)
        print("Batch test successful!")
        print(f"Input shape: {test_inputs.shape}")
        print(f"Output shape: {outputs.shape}")

        start_time = time.time()
        for epoch in tqdm(range(epochs), desc="Epochs"):
            # NOTE(review): train() also switches the frozen backbone's
            # BatchNorm layers to training mode, so their running statistics
            # keep adapting to this dataset - confirm that is intended.
            self.model.train()
            running_loss = 0.0
            correct = 0
            total = 0

            for images, labels in tqdm(self.train_loader, desc="Batches", leave=False):
                images, labels = images.to(self.device), labels.to(self.device)

                self.optimiser.zero_grad()
                outputs = self.model(images)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimiser.step()

                running_loss += loss.item() * images.size(0)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

            train_loss = running_loss / total
            train_acc = 100 * correct / total
            val_loss, val_acc = self.compute_accuracy(self.model, self.val_loader)

            if self.wandb_logging:
                wandb.log({
                    "epoch": epoch+1,
                    "train_loss": train_loss,
                    "train_acc": train_acc,
                    "val_loss": val_loss,
                    "val_acc": val_acc
                })

            print(f"\nEpoch {epoch+1}/{epochs}")
            print(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.2f}%")
            print(f"Val Loss: {val_loss:.4f} | Acc: {val_acc:.2f}%")
            print("-" * 50)

        print(f"\nTraining completed in {(time.time()-start_time)/60:.2f} minutes")

# Sweep Configuration
# Random search that maximises the logged `val_acc` metric; every entry under
# `parameters` lists the discrete values a run may be assigned.
sweep_config = dict(
    method='random',
    name='PA2 ResNet Fine Tuning',
    metric=dict(name='val_acc', goal='maximize'),
    parameters=dict(
        optimiser=dict(values=["adam", "rmsprop", "nadam"]),
        lr=dict(values=[1e-3, 1e-4, 3e-4]),
        weight_decay=dict(values=[0]),
        data_aug=dict(values=[0, 1]),
        epochs=dict(values=[3, 5, 8]),
        batch_size=dict(values=[16, 32]),
    ),
)

def main(base_path='/content/drive/MyDrive/inaturalist_12K/', num_classes=10):
    """Run one sweep trial: build the model and loaders, then train.

    Called without arguments by ``wandb.agent``; the hyperparameters for the
    trial are read from ``wandb.config``.

    Args:
        base_path: Dataset root containing the ``train`` and ``val`` splits.
        num_classes: Output size of the replacement classification layer.
    """
    wandb.init()
    config = wandb.config

    # Verify paths. The validation split is loaded from "val/", so that is the
    # directory checked here (checking "validation" always reported False).
    print("\nDirectory structure verification:")
    print(f"Base path exists: {os.path.exists(base_path)}")
    print(f"Train folder exists: {os.path.exists(os.path.join(base_path, 'train'))}")
    print(f"Validation folder exists: {os.path.exists(os.path.join(base_path, 'val'))}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    experiment = Experiment(
        device=device,
        base_dir=base_path,
        wandb_logging=True
    )
    experiment.createResNet(num_output_neurons=num_classes)

    # data_aug == 1 adds one augmented copy of the training set (random
    # horizontal flip followed by brightness jitter).
    augmentation_transforms = []
    if config.data_aug == 1:
        augmentation_transforms = [
            [transforms.RandomHorizontalFlip(p=0.5),
             transforms.ColorJitter(brightness=0.2)]
        ]

    train_loader, val_loader = experiment.create_dataloaders(
        batch_size=config.batch_size,
        shuffle=True,
        list_of_train_data_augmentation_transforms=augmentation_transforms
    )
    # train() reads the loaders from the experiment instance.
    experiment.train_loader = train_loader
    experiment.val_loader = val_loader

    experiment.train(
        lr=config.lr,
        weight_decay=config.weight_decay,
        optimiser=config.optimiser,
        epochs=config.epochs
    )

# Authenticate and run
# Order matters: log in, register the sweep built from sweep_config, then let
# an agent call main() once per trial. count=30 is the number of trials
# requested from this agent.
wandb.login()
sweep_id = wandb.sweep(sweep_config, project="DA6401_Assignment2_PartB")
wandb.agent(sweep_id, function=main, count=30)
wandb.finish()

Mounted at /content/drive
Create sweep with ID: mk9ba53d
Sweep URL: https://wandb.ai/cs24m040-iit-madras/DA6401_Assignment2_PartB/sweeps/mk9ba53d


[34m[1mwandb[0m: Agent Starting Run: 12as7eul with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	data_aug: 1
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	optimiser: nadam
[34m[1mwandb[0m: 	weight_decay: 0



Directory structure verification:
Base path exists: True
Train folder exists: True
Validation folder exists: False

Using device: cuda
GPU: Tesla T4

Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentu

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 1/5
Train Loss: 0.9041 | Acc: 73.93%
Val Loss: 0.5810 | Acc: 82.95%
--------------------------------------------------


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 2/5
Train Loss: 0.5445 | Acc: 83.49%
Val Loss: 0.5210 | Acc: 84.05%
--------------------------------------------------


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 3/5
Train Loss: 0.4707 | Acc: 85.33%
Val Loss: 0.5052 | Acc: 83.95%
--------------------------------------------------


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 4/5
Train Loss: 0.4141 | Acc: 87.08%
Val Loss: 0.5096 | Acc: 83.00%
--------------------------------------------------


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 5/5
Train Loss: 0.3769 | Acc: 88.18%
Val Loss: 0.5065 | Acc: 83.35%
--------------------------------------------------

Training completed in 90.78 minutes


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇▇█
train_loss,█▃▂▁▁
val_acc,▁█▇▁▄
val_loss,█▂▁▁▁

0,1
epoch,5.0
train_acc,88.18382
train_loss,0.37693
val_acc,83.35
val_loss,0.50654


[34m[1mwandb[0m: Agent Starting Run: hhqjxk6v with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	data_aug: 0
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimiser: nadam
[34m[1mwandb[0m: 	weight_decay: 0



Directory structure verification:
Base path exists: True
Train folder exists: True
Validation folder exists: False

Using device: cuda
GPU: Tesla T4

Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentu

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 1/5
Train Loss: 1.9774 | Acc: 36.70%
Val Loss: 1.6840 | Acc: 58.20%
--------------------------------------------------


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 2/5
Train Loss: 1.4867 | Acc: 62.44%
Val Loss: 1.3339 | Acc: 69.00%
--------------------------------------------------


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 3/5
Train Loss: 1.2310 | Acc: 70.22%
Val Loss: 1.1384 | Acc: 73.55%
--------------------------------------------------


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 4/5
Train Loss: 1.0790 | Acc: 72.63%
Val Loss: 1.0237 | Acc: 75.70%
--------------------------------------------------


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 5/5
Train Loss: 0.9743 | Acc: 74.71%
Val Loss: 0.9204 | Acc: 77.60%
--------------------------------------------------

Training completed in 18.26 minutes


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▅▃▂▁
val_acc,▁▅▇▇█
val_loss,█▅▃▂▁

0,1
epoch,5.0
train_acc,74.70747
train_loss,0.97434
val_acc,77.6
val_loss,0.92036


[34m[1mwandb[0m: Agent Starting Run: guam9dra with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	data_aug: 0
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	optimiser: adam
[34m[1mwandb[0m: 	weight_decay: 0



Directory structure verification:
Base path exists: True
Train folder exists: True
Validation folder exists: False

Using device: cuda
GPU: Tesla T4

Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentu

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 1/5
Train Loss: 1.9726 | Acc: 37.78%
Val Loss: 1.6791 | Acc: 60.05%
--------------------------------------------------


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 2/5
Train Loss: 1.4828 | Acc: 63.03%
Val Loss: 1.3439 | Acc: 70.35%
--------------------------------------------------


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 3/5
Train Loss: 1.2220 | Acc: 70.31%
Val Loss: 1.1290 | Acc: 76.35%
--------------------------------------------------


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 4/5
Train Loss: 1.0731 | Acc: 72.66%
Val Loss: 1.0119 | Acc: 77.55%
--------------------------------------------------


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 5/5
Train Loss: 0.9714 | Acc: 74.95%
Val Loss: 0.9157 | Acc: 79.20%
--------------------------------------------------

Training completed in 18.07 minutes


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▅▃▂▁
val_acc,▁▅▇▇█
val_loss,█▅▃▂▁

0,1
epoch,5.0
train_acc,74.94749
train_loss,0.9714
val_acc,79.2
val_loss,0.91571


[34m[1mwandb[0m: Agent Starting Run: hn4pluki with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	data_aug: 1
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	lr: 0.0003
[34m[1mwandb[0m: 	optimiser: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0



Directory structure verification:
Base path exists: True
Train folder exists: True
Validation folder exists: False

Using device: cuda
GPU: Tesla T4

Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentu

Epochs:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Validating:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 1/8
Train Loss: 1.0569 | Acc: 70.52%
Val Loss: 0.6890 | Acc: 81.65%
--------------------------------------------------


Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Validating:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 2/8
Train Loss: 0.6955 | Acc: 79.13%
Val Loss: 0.5754 | Acc: 83.10%
--------------------------------------------------


Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Validating:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 3/8
Train Loss: 0.6054 | Acc: 81.33%
Val Loss: 0.5205 | Acc: 84.00%
--------------------------------------------------


Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Validating:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 4/8
Train Loss: 0.5584 | Acc: 82.61%
Val Loss: 0.5090 | Acc: 84.20%
--------------------------------------------------


Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Validating:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 5/8
Train Loss: 0.5273 | Acc: 83.56%
Val Loss: 0.4879 | Acc: 84.70%
--------------------------------------------------


Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Validating:   0%|          | 0/125 [00:00<?, ?it/s]


Epoch 6/8
Train Loss: 0.5035 | Acc: 84.16%
Val Loss: 0.4891 | Acc: 84.45%
--------------------------------------------------


Batches:   0%|          | 0/1250 [00:00<?, ?it/s]