<a href="https://colab.research.google.com/github/SaiRajesh228/DA6401_Assignment2/blob/main/DA6401_Assignment2_PartA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, ConcatDataset
from torchvision import transforms, datasets
from tqdm.notebook import tqdm
import wandb

# Mount Google Drive if in Colab
# DATA_ROOT is the directory that the rest of the file joins with a subset
# name ('train' / 'val') to locate the images.
try:
    # The import only succeeds when the google.colab package is installed.
    from google.colab import drive
    # force_remount=True remounts even if the drive is already mounted, so
    # re-running the cell does not fail on an existing mount point.
    drive.mount('/content/drive', force_remount=True)
    DATA_ROOT = '/content/drive/MyDrive/inaturalist_12K'
except ImportError:
    # Not running in Colab: fall back to a folder relative to the working directory.
    DATA_ROOT = './data'

# ---------------------------
# Data Manager
# ---------------------------
class ImageDataManager:
    """Build normalised (and optionally augmented) ``DataLoader``s from image folders.

    Every subset is read from ``<data_root>/<subset>`` through
    ``datasets.ImageFolder``.

    Args:
        img_size: Target size handed to ``transforms.Resize``.
        data_root: Directory that contains the subset folders.
        device: Device each batch is moved to while measuring statistics.
        standardize: When true, a ``transforms.Normalize`` step using the
            measured per-channel mean/std is appended to every pipeline.

    Attributes set lazily:
        mean, std: Per-channel statistics (CPU float tensors) once computed.
        classes: Class names of the first dataset that was loaded.
    """
    def __init__(self, img_size, data_root, device, standardize=True):
        self.img_size = img_size
        self.data_root = data_root
        self.device = device
        self.standardize = standardize
        self.mean, self.std = None, None
        # Store classes for later reference
        self.classes = None

    @staticmethod
    def _channel_stats(batches, device):
        """Return the exact per-channel mean and std over every pixel in ``batches``.

        Args:
            batches: Iterable of ``(images, labels)`` pairs where ``images`` is a
                float tensor that can be reshaped to ``(batch, 3, -1)``.
            device: Device the images are moved to for the reductions.

        Returns:
            ``(mean, std)`` as two CPU float32 tensors of length 3. ``std`` is the
            population standard deviation of all pixels of a channel.

        Raises:
            ValueError: If ``batches`` yields no pixels.
        """
        # Accumulate in float64 on the CPU so the running sums stay accurate
        # over a whole dataset.
        pixel_sum = torch.zeros(3, dtype=torch.float64)
        pixel_sq_sum = torch.zeros(3, dtype=torch.float64)
        n_pixels = 0
        for imgs, _ in batches:
            flat = imgs.to(device).reshape(imgs.size(0), 3, -1)
            pixel_sum += flat.sum(dim=(0, 2)).double().cpu()
            pixel_sq_sum += flat.pow(2).sum(dim=(0, 2)).double().cpu()
            n_pixels += flat.size(0) * flat.size(2)
        if n_pixels == 0:
            raise ValueError("Cannot compute statistics from an empty dataset.")
        mean = pixel_sum / n_pixels
        # Var = E[x^2] - E[x]^2. Averaging per-image variances instead would
        # drop the variation *between* images and under-estimate the std.
        # Rounding can push the difference slightly below zero, hence the clamp.
        var = (pixel_sq_sum / n_pixels - mean ** 2).clamp(min=0.0)
        return mean.float(), var.sqrt().float()

    def _compute_stats(self, subset):
        """Measure per-channel mean/std on ``subset`` and cache them on the instance.

        Raises:
            FileNotFoundError: If ``<data_root>/<subset>`` is not a directory.
        """
        path = os.path.join(self.data_root, subset)
        trans = transforms.Compose([transforms.Resize(self.img_size), transforms.ToTensor()])
        if not os.path.isdir(path):
            raise FileNotFoundError(f"Dataset path not found: {path}")
        dataset = datasets.ImageFolder(path, trans)
        loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=2)
        self.mean, self.std = self._channel_stats(loader, self.device)
        return self.mean, self.std

    def create_loader(self, subset, batch_size=32, augmentations=None):
        """Create a ``DataLoader`` for ``subset``.

        Args:
            subset: Folder name under ``data_root`` (e.g. ``'train'``, ``'val'``).
            batch_size: Batch size of the returned loader.
            augmentations: Optional iterable of transforms. Only used when
                ``subset == 'train'``: for each one an extra copy of the dataset
                is added whose pipeline starts with that transform, and all
                copies are concatenated with the un-augmented dataset.

        Returns:
            A ``DataLoader`` that shuffles only for the ``'train'`` subset.

        Raises:
            FileNotFoundError: If ``<data_root>/<subset>`` is not a directory.
        """
        path = os.path.join(self.data_root, subset)
        if not os.path.isdir(path):
            raise FileNotFoundError(f"Dataset path not found: {path}")

        transforms_list = [transforms.Resize(self.img_size), transforms.ToTensor()]
        if self.standardize:
            if self.mean is None or self.std is None:
                # Normalisation statistics should come from the training data
                # regardless of which loader is requested first; fall back to
                # the requested subset only when there is no 'train' folder.
                has_train = os.path.isdir(os.path.join(self.data_root, 'train'))
                self._compute_stats('train' if has_train else subset)
            transforms_list.append(transforms.Normalize(self.mean, self.std))

        # Create a base dataset first to get classes
        base_dataset = datasets.ImageFolder(path, transforms.Compose(transforms_list))

        # Store classes for later reference
        if self.classes is None:
            self.classes = base_dataset.classes

        if subset == 'train' and augmentations:
            # Un-augmented dataset first, then one full copy per augmentation.
            # Each augmentation runs before Resize/ToTensor/Normalize.
            datasets_list = [base_dataset]
            datasets_list.extend(
                datasets.ImageFolder(path, transforms.Compose([aug] + transforms_list))
                for aug in augmentations
            )
            final_dataset = ConcatDataset(datasets_list)
        else:
            final_dataset = base_dataset

        return DataLoader(
            final_dataset,
            batch_size=batch_size,
            shuffle=(subset == 'train'),
            num_workers=2,
            pin_memory=True
        )

# ---------------------------
# CNN Model
# ---------------------------
class CustomCNN(nn.Module):
    """Configurable CNN: a conv/pool feature extractor followed by a dense classifier.

    Args:
        input_size: ``(height, width)`` of the input images.
        in_channels: Number of channels of the input images.
        num_classes: Size of the final linear layer's output.
        conv_layers: Sequence of layer dicts, applied in order. Supported forms:
            ``{'type': 'conv', 'filters': int, 'kernel': int, 'stride': int, 'padding': int}``
            (``stride`` defaults to 1 and ``padding`` to 0 when omitted) and
            ``{'type': 'pool', 'size': int, 'stride': int}``
            (``stride`` defaults to ``size`` when omitted).
        dense_units: Widths of the hidden fully connected layers.
        conv_activation: Module class instantiated after every conv (and its
            optional batch norm).
        fc_activation: Module class instantiated after every hidden linear layer.
        use_bn: Insert ``BatchNorm2d`` after each convolution.
        dropout_rate: If > 0, add ``Dropout`` after each hidden linear layer.

    Raises:
        ValueError: If a layer dict has an unsupported ``type`` or the layer
            stack shrinks the feature map to a non-positive size.
    """
    def __init__(self, input_size, in_channels, num_classes,
                 conv_layers, dense_units,
                 conv_activation=nn.ReLU, fc_activation=nn.ReLU,
                 use_bn=True, dropout_rate=0.0):
        super().__init__()
        h, w = input_size
        c = in_channels
        self.features = nn.Sequential()
        # The index advances for every entry (conv or pool), so module names
        # look like conv0, pool1, conv2, ... and stay stable for state_dicts.
        for idx, cfg in enumerate(conv_layers):
            layer_type = cfg['type']
            if layer_type == 'conv':
                kernel = cfg['kernel']
                stride = cfg.get('stride', 1)
                padding = cfg.get('padding', 0)
                self.features.add_module(
                    f"conv{idx}",
                    nn.Conv2d(c, cfg['filters'], kernel, stride, padding)
                )
                if use_bn:
                    self.features.add_module(f"bn{idx}", nn.BatchNorm2d(cfg['filters']))
                self.features.add_module(f"act{idx}", conv_activation())
                c = cfg['filters']
                # Track the spatial size with the same floor formula the layer uses.
                h = (h - kernel + 2 * padding) // stride + 1
                w = (w - kernel + 2 * padding) // stride + 1
            elif layer_type == 'pool':
                size = cfg['size']
                stride = cfg.get('stride', size)
                self.features.add_module(f"pool{idx}", nn.MaxPool2d(size, stride))
                h = (h - size) // stride + 1
                w = (w - size) // stride + 1
            else:
                # A misspelt type would otherwise silently change the architecture.
                raise ValueError(
                    f"Unsupported layer type {layer_type!r} at position {idx}; "
                    "expected 'conv' or 'pool'."
                )
            if h <= 0 or w <= 0:
                raise ValueError(
                    f"Layer {idx} ({layer_type}) reduces the feature map to "
                    f"{h}x{w}; use fewer/smaller layers or a larger input."
                )

        # Size of the flattened feature map that feeds the classifier.
        in_feat = h * w * c
        layers = []
        for u in dense_units:
            layers.append(nn.Linear(in_feat, u))
            layers.append(fc_activation())
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            in_feat = u
        layers.append(nn.Linear(in_feat, num_classes))
        self.classifier = nn.Sequential(*layers)
        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal init for convs, Xavier-normal for linears, zero biases."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # NOTE: the gain is always computed for ReLU, even when a
                # different conv_activation was supplied.
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        """Run the feature extractor, flatten all non-batch dims, then classify."""
        x = self.features(x)
        x = torch.flatten(x, 1)
        return self.classifier(x)

# ---------------------------
# Experiment Pipeline
# ---------------------------
class DLExperiment:
    """Training/validation driver that ties a data manager to a CustomCNN.

    Typical use: ``setup_data`` -> ``setup_model`` -> ``train``. Testing on a
    held-out split is not handled here.
    """
    def __init__(self, img_size, data_root, device=None, use_wandb=False):
        # Fall back to CUDA when available, otherwise CPU.
        if device:
            self.device = device
        else:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.data_mgr = ImageDataManager(img_size, data_root, self.device)
        self.use_wandb = use_wandb
        self.train_loader = None
        self.val_loader = None
        self.model = None

    def setup_data(self, batch_size, augmentations=None):
        """Create the 'train' loader (with optional augmentations) and the 'val' loader."""
        manager = self.data_mgr
        self.train_loader = manager.create_loader('train', batch_size, augmentations)
        self.val_loader = manager.create_loader('val', batch_size)
        if not self.train_loader or not self.val_loader:
            raise FileNotFoundError("Both 'train' and 'val' directories must exist under data_root.")

    def setup_model(self, conv_config, dense_units, num_classes,
                    use_bn=True, dropout_rate=0.0):
        """Instantiate a 3-channel CustomCNN for the manager's image size on ``self.device``."""
        network = CustomCNN(
            input_size=self.data_mgr.img_size,
            in_channels=3,
            num_classes=num_classes,
            conv_layers=conv_config,
            dense_units=dense_units,
            use_bn=use_bn,
            dropout_rate=dropout_rate
        )
        self.model = network.to(self.device)

    def train(self, epochs, lr, weight_decay):
        """Train with Adam + cross-entropy and return the best validation accuracy (%).

        After each epoch the model is validated, metrics are printed (and
        logged to wandb when enabled), and the weights are written to
        'best_model.pth' whenever validation accuracy improves.

        Raises:
            ValueError: If ``setup_model`` has not been called.
        """
        if self.model is None:
            raise ValueError("Model not initialized. Call setup_model() first.")

        optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.CrossEntropyLoss()
        best_acc = 0.0

        epoch = 0
        while epoch < epochs:
            epoch += 1

            # ---- training pass ----
            self.model.train()
            seen = 0
            hits = 0
            running_loss = 0.0

            for imgs, labels in tqdm(self.train_loader, desc=f"Epoch {epoch}"):
                imgs = imgs.to(self.device)
                labels = labels.to(self.device)

                optimizer.zero_grad()
                outputs = self.model(imgs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                n_batch = labels.size(0)
                # The criterion returns a batch mean; weight it by batch size
                # so the epoch figure is a per-sample average.
                running_loss += loss.item() * n_batch
                preds = outputs.max(1)[1]
                hits += preds.eq(labels).sum().item()
                seen += n_batch

            train_loss = running_loss / seen
            train_acc = 100 * hits / seen

            # ---- validation pass ----
            val_loss, val_acc = self.evaluate()

            if self.use_wandb:
                metrics = {
                    "epoch": epoch,
                    "train_loss": train_loss,
                    "train_accuracy": train_acc,
                    "val_loss": val_loss,
                    "val_accuracy": val_acc
                }
                wandb.log(metrics)

            print(f"Epoch {epoch}/{epochs} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

            # Nothing more to do unless this epoch set a new record.
            if not val_acc > best_acc:
                continue
            best_acc = val_acc
            torch.save(self.model.state_dict(), 'best_model.pth')
            if self.use_wandb:
                wandb.run.summary["best_val_accuracy"] = best_acc

        return best_acc

    def evaluate(self):
        """Return ``(mean loss, accuracy in %)`` of the model on ``self.val_loader``.

        Raises:
            ValueError: If ``setup_model`` has not been called.
        """
        if self.model is None:
            raise ValueError("Model not initialized. Call setup_model() first.")

        self.model.eval()
        criterion = nn.CrossEntropyLoss()
        seen = 0
        hits = 0
        loss_total = 0.0

        with torch.no_grad():
            for imgs, labels in self.val_loader:
                imgs = imgs.to(self.device)
                labels = labels.to(self.device)
                outputs = self.model(imgs)
                n_batch = labels.size(0)
                loss_total += criterion(outputs, labels).item() * n_batch
                preds = outputs.max(1)[1]
                hits += preds.eq(labels).sum().item()
                seen += n_batch

        return loss_total / seen, 100 * hits / seen

# ---------------------------
# Main Execution
# ---------------------------
def _build_conv_config(config):
    """Translate the flat hyperparameter dict into CustomCNN's list of layer dicts.

    Produces ``config['conv_layers']`` repetitions of a stride-1 convolution
    (padding ``filter_size // 2``) followed by a 2x2, stride-2 max-pool. After
    each repetition the filter count is doubled, halved (never below 16), or
    multiplied by ``config['filter_growth_factor']`` depending on
    ``config.get('filter_growth_strategy')``.
    """
    kernel = config['filter_size']
    strategy = config.get('filter_growth_strategy')
    filters = config['num_filters']
    layers = []
    for _ in range(config['conv_layers']):
        layers.append({'type': 'conv', 'filters': filters,
                       'kernel': kernel, 'stride': 1,
                       'padding': kernel // 2})
        layers.append({'type': 'pool', 'size': 2, 'stride': 2})

        if strategy == 'double':
            filters *= 2
        elif strategy == 'half':
            filters = max(16, filters // 2)  # Prevent filters from becoming too small
        else:
            # No named strategy: fall back to the multiplicative factor.
            filters = int(filters * config['filter_growth_factor'])
    return layers


def run_experiment(config=None):
    """Run one wandb-tracked training run and return the best validation accuracy.

    Args:
        config: Dict of hyperparameters (same keys as ``default_config``). May
            be omitted when the function is launched by ``wandb.agent``, which
            calls it without arguments; in that case the values chosen by the
            sweep are read from ``wandb.config`` and layered over
            ``default_config``, so keys the sweep does not define (for example
            ``augmentations``) keep their defaults.

    Returns:
        The best validation accuracy (in percent) reported by ``DLExperiment.train``.
    """
    wandb.init(project="DA6401_Assignment2_PartA", config=config)
    try:
        if config is None:
            config = {**default_config, **dict(wandb.config)}

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {device}")

        exp = DLExperiment(img_size=(config['crop_size'], config['crop_size']),
                           data_root=DATA_ROOT,
                           device=device,
                           use_wandb=True)

        # Copy so the caller's list is never shared with the data pipeline.
        augmentations = list(config.get('augmentations') or [])
        exp.setup_data(batch_size=config['batch_size'], augmentations=augmentations)

        conv_cfg = _build_conv_config(config)

        # The class list is only known once the data loaders have been built.
        num_classes = len(exp.data_mgr.classes)
        print(f"Number of classes: {num_classes}")

        exp.setup_model(
            conv_cfg,
            dense_units=[config['hidden_units']] * config['dense_layers'],
            num_classes=num_classes,
            use_bn=config['batch_norm'],
            dropout_rate=config['dropout_rate']
        )

        # Report model parameters
        total_params = sum(p.numel() for p in exp.model.parameters())
        trainable_params = sum(p.numel() for p in exp.model.parameters() if p.requires_grad)
        print(f"Total parameters: {total_params:,}")
        print(f"Trainable parameters: {trainable_params:,}")

        return exp.train(
            epochs=config['training_epochs'],
            lr=config['learning_rate'],
            weight_decay=config['l2_regularization']
        )
    finally:
        # Close the run even when setup or training raises, so the next call
        # (e.g. the next sweep trial or notebook re-run) starts a fresh run.
        wandb.finish()

# Example config with corrected augmentations
# Hyperparameters for a single stand-alone run; run_experiment reads every key below.
default_config = {
    # --- convolutional stack ---
    'conv_layers': 4,           # number of conv + pool pairs
    'num_filters': 32,          # filters in the first conv layer
    'filter_size': 3,           # square kernel size; padding is filter_size // 2
    'filter_growth_factor': 2,  # only used when filter_growth_strategy is not 'double'/'half'
    'filter_growth_strategy': 'double',  # Options: 'double', 'half', None (use factor)
    # --- dense head ---
    'dense_layers': 2,          # number of hidden fully connected layers
    'hidden_units': 512,        # width of each hidden fully connected layer
    'batch_norm': True,
    'dropout_rate': 0.2,
    # Each augmentation is applied on its own: the data manager adds one extra
    # copy of the training set per entry.
    'augmentations': [
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.1, contrast=0.1)
    ],
    # --- input / optimisation ---
    'crop_size': 224,  # Reduced from 600 to save memory
    'batch_size': 32,
    'learning_rate': 1e-3,
    'l2_regularization': 1e-4,  # passed to Adam as weight_decay
    'training_epochs': 10
}

# For setting up hyperparameter sweeps
def sweep_configuration():
    """Return a fresh wandb sweep definition for the CNN hyperparameter search.

    The sweep uses Bayesian search to maximise the logged ``val_accuracy``.
    Parameters given as ``values`` are searched; parameters given as ``value``
    are held constant but still delivered to every run, so that each trial
    receives every key ``run_experiment`` looks up.
    """
    searched = {
        'conv_layers': [3, 4, 5],
        'num_filters': [16, 32, 64],
        'filter_size': [3, 5],
        'filter_growth_strategy': ['double', 'half', None],
        'dense_layers': [1, 2],
        'hidden_units': [256, 512, 1024],
        'batch_norm': [True, False],
        'dropout_rate': [0.0, 0.2, 0.3, 0.5],
        'learning_rate': [1e-4, 3e-4, 1e-3],
        'batch_size': [16, 32, 64],
        'crop_size': [224],  # Fixed for memory efficiency
    }
    fixed = {
        'training_epochs': 30,  # Fixed for all runs
        # run_experiment reads these two keys but the search space used to
        # omit them, so sweep runs failed with KeyError.
        'l2_regularization': 1e-4,
        'filter_growth_factor': 2,  # consulted when the strategy is None
    }

    parameters = {name: {'values': choices} for name, choices in searched.items()}
    parameters.update({name: {'value': constant} for name, constant in fixed.items()})

    return {
        'method': 'bayes',  # Can be 'grid', 'random' or 'bayes'
        'metric': {
            'name': 'val_accuracy',
            'goal': 'maximize'
        },
        'parameters': parameters,
    }

# For running a sweep
def run_sweep(count=20):
    """Register the sweep with wandb and run ``count`` trials in this process.

    Args:
        count: Maximum number of runs the agent executes (defaults to 20).
    """
    sweep_id = wandb.sweep(sweep_configuration(), project="DA6401_Assignment2_PartA")
    # NOTE(review): wandb.agent invokes `function` with no arguments, so
    # run_experiment has to be callable that way and take its hyperparameters
    # from wandb.config.
    wandb.agent(sweep_id, function=run_experiment, count=count)

if __name__ == '__main__':
    # Entry point: keep exactly one of the two calls below active.
    run_experiment(default_config)  # Run a single experiment with default config
    # run_sweep()  # Run hyperparameter sweep

Mounted at /content/drive


Using device: cuda
Number of classes: 10
Total parameters: 26,347,786
Trainable parameters: 26,347,786


Epoch 1:   0%|          | 0/1250 [00:00<?, ?it/s]

Epoch 1/5 - Train Loss: 2.7103, Train Acc: 11.62%, Val Loss: 2.2664, Val Acc: 13.50%


Epoch 2:   0%|          | 0/1250 [00:00<?, ?it/s]

Epoch 2/5 - Train Loss: 2.2534, Train Acc: 14.18%, Val Loss: 2.2098, Val Acc: 16.75%


Epoch 3:   0%|          | 0/1250 [00:00<?, ?it/s]

Epoch 3/5 - Train Loss: 2.2249, Train Acc: 16.09%, Val Loss: 2.1910, Val Acc: 16.35%


Epoch 4:   0%|          | 0/1250 [00:00<?, ?it/s]

Epoch 4/5 - Train Loss: 2.1833, Train Acc: 17.75%, Val Loss: 2.1043, Val Acc: 23.15%


Epoch 5:   0%|          | 0/1250 [00:00<?, ?it/s]

Epoch 5/5 - Train Loss: 2.1021, Train Acc: 21.60%, Val Loss: 1.9998, Val Acc: 26.70%


0,1
epoch,▁▃▅▆█
train_accuracy,▁▃▄▅█
train_loss,█▃▂▂▁
val_accuracy,▁▃▃▆█
val_loss,█▇▆▄▁

0,1
best_val_accuracy,26.7
epoch,5.0
train_accuracy,21.59966
train_loss,2.10207
val_accuracy,26.7
val_loss,1.99981
