<a href="https://colab.research.google.com/github/baranceanuvlad/Advanced-Topics-in-Neural-Networks-Template-2023/blob/main/Lab07/Solution/Homework7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from multiprocessing import freeze_support

import torch
from torchvision.datasets import CIFAR10
from torchvision.transforms import v2
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import random
import torchvision.transforms as transformss
import torch.nn as nn
import torch.nn.functional as F

In [None]:
!pip install sam-pytorch


In [None]:
!pip install wandb

In [None]:
from sam import SAM

In [None]:
import wandb

In [2]:
class Handmade_Conv2d_implementation(nn.Module):
    """Naive 2D convolution with fixed weights (stride 1, no padding, no bias).

    Intended as a readable reference that matches
    ``torch.nn.functional.conv2d(x, weights)`` with default arguments; it
    loops over every output position and is therefore slow.

    Args:
        weights: Kernel tensor of shape
            ``(out_channels, in_channels, kernel_height, kernel_width)``.
            It is stored as a plain attribute, not as a parameter or buffer.
    """

    def __init__(self, weights):
        super(Handmade_Conv2d_implementation, self).__init__()
        self.out_channels = weights.size(dim=0)
        self.kernel_height = weights.size(dim=2)
        self.kernel_width = weights.size(dim=3)
        self.weights = weights

    def forward(self, x):
        """Convolve ``x`` of shape ``(batch, in_channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, out_channels, height - kernel_height + 1,
            width - kernel_width + 1)`` on the same device as ``x``.
        """
        batch_size, in_channels, height, width = x.size()

        out_height = height - self.kernel_height + 1
        out_width = width - self.kernel_width + 1

        # Allocate next to the input so the layer also works off-CPU and in
        # non-default dtypes.
        output = torch.zeros(
            batch_size, self.out_channels, out_height, out_width,
            dtype=torch.result_type(x, self.weights), device=x.device,
        )

        # (1, out_channels, in_channels, kh, kw): the leading axis broadcasts
        # against the batch axis of each patch.
        kernels = self.weights.unsqueeze(0)

        for i in range(out_height):
            for j in range(out_width):
                # (batch, 1, in_channels, kh, kw): the inserted axis broadcasts
                # against out_channels, so every sample meets every filter.
                patch = x[:, :, i:i + self.kernel_height, j:j + self.kernel_width].unsqueeze(1)
                output[:, :, i, j] = (patch * kernels).sum(dim=(2, 3, 4))

        return output

In [3]:
# Sanity check: compare the hand-written layer against PyTorch's own conv2d
# on random data and print the largest absolute difference.
input = torch.randn(1, 3, 10, 12)
w = torch.randn(2, 3, 4, 5)

custom_conv2d_layer = Handmade_Conv2d_implementation(weights=w)
out = custom_conv2d_layer(input)
print(torch.max(torch.abs(F.conv2d(input, w) - out)))

tensor(5.7220e-06)


In [None]:
def get_default_device():
    """Pick the device to train on: CUDA if available, else Apple MPS, else CPU.

    Returns:
        torch.device: ``cuda``, ``mps`` or ``cpu``.
    """
    if torch.cuda.is_available():
        # For multi-gpu workstations, PyTorch will use the first available GPU (cuda:0), unless specified otherwise
        # (cuda:1).
        return torch.device('cuda')
    # Older PyTorch builds have no `torch.backends.mps`; treat that as "not available".
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')
    return torch.device('cpu')


In [None]:
class CachedDataset(Dataset):
    """Dataset wrapper that can materialise every sample up front.

    Args:
        dataset: Any iterable / indexable collection of samples.
        cache: When true, the wrapped dataset is iterated once and its items
            are kept in a tuple, so later lookups do not touch the original
            dataset again. When false, the dataset is wrapped as-is.
    """

    def __init__(self, dataset, cache=True):
        self.dataset = tuple(sample for sample in dataset) if cache else dataset

    def __len__(self):
        """Number of samples in the (possibly cached) dataset."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return the sample stored at position ``i``."""
        return self.dataset[i]

In [None]:
class MLP(torch.nn.Module):
    """Small convolutional classifier: three conv/pool stages and two linear layers.

    The constructor arguments are accepted but not used; every layer size is
    fixed below. The flatten step assumes three-channel 32x32 inputs, which
    three 2x2 poolings reduce to 4x4 maps with 64 channels.
    """

    def __init__(self, input_size, hidden_size_1, hidden_size_2,hidden_size_3,hidden_size_4, output_size):
        super().__init__()

        # 3x3 kernels with padding 1 keep the spatial size; the channel count
        # doubles at every stage (3 -> 16 -> 32 -> 64).
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        # Classifier head: flattened 4*4*64 features -> 500 hidden units -> 10 outputs.
        self.fc1 = nn.Linear(in_features=4 * 4 * 64, out_features=500)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=500, out_features=10)

    def forward(self, x):
        """Return the 10 raw class scores for each image in ``x``."""
        # Each stage: convolution -> ReLU -> 2x2 max pooling with stride 2.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.max_pool2d(F.relu(conv(x)), kernel_size=2, stride=2)
        # Flatten the feature maps for the fully connected layers.
        x = x.view(-1, 4 * 4 * 64)
        # Dropout sits between the two fully connected layers.
        x = self.dropout1(F.relu(self.fc1(x)))
        return self.fc2(x)

In [None]:
class Block(nn.Module):
    """Residual unit: 1x1 expansion, 3x3 depthwise conv, 1x1 projection.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        expansion: Multiplier giving the width of the two inner layers.
        stride: Stride of the depthwise convolution. The residual connection
            is only applied when this is 1.
    """

    def __init__(self, in_planes, out_planes, expansion, stride):
        super().__init__()
        self.stride = stride
        hidden = expansion * in_planes

        self.conv1 = nn.Conv2d(in_planes, hidden, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(hidden)
        # groups == channels makes this a depthwise convolution.
        self.conv2 = nn.Conv2d(hidden, hidden, kernel_size=3, stride=stride, padding=1, groups=hidden, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden)
        self.conv3 = nn.Conv2d(hidden, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # The skip path is the identity unless the channel count changes, in
        # which case a 1x1 conv + batch norm matches the shapes.
        if stride == 1 and in_planes != out_planes:
            shortcut_layers = [
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_planes),
            ]
        else:
            shortcut_layers = []
        self.shortcut = nn.Sequential(*shortcut_layers)

    def forward(self, x):
        """Apply the three conv stages and, for stride 1, add the skip path."""
        expanded = F.relu(self.bn1(self.conv1(x)))
        filtered = F.relu(self.bn2(self.conv2(expanded)))
        projected = self.bn3(self.conv3(filtered))
        if self.stride != 1:
            return projected
        return projected + self.shortcut(x)


class MobileNetV2(nn.Module):
    """Stack of ``Block`` units between a 3x3 stem conv and a linear classifier.

    Every stride in this configuration is 1, so the feature maps keep the
    spatial size of the input all the way to the pooling layer. The classifier
    expects ``1280 * 64`` features, i.e. an 8x8 map after ``avg_pool2d(4)``,
    which corresponds to 32x32 inputs.
    """

    # (expansion, out_planes, num_blocks, stride)
    cfg = [(1,  16, 1, 1),
           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
           (6,  32, 3, 1),
           (6,  64, 4, 1),
           (6,  96, 3, 1),
           (6, 160, 3, 1),
           (6, 320, 1, 1)]

    def __init__(self, num_classes=10):
        super().__init__()
        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(1280)
        self.linear = nn.Linear(1280 * 64, num_classes)

    def _make_layers(self, in_planes):
        """Build the block stack described by ``cfg``.

        Only the first block of each group uses the configured stride; the
        remaining blocks of the group use stride 1.
        """
        blocks = []
        for expansion, out_planes, num_blocks, first_stride in self.cfg:
            for block_stride in [first_stride] + [1] * (num_blocks - 1):
                blocks.append(Block(in_planes, out_planes, expansion, block_stride))
                in_planes = out_planes
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Return raw class scores of shape ``(batch, num_classes)``."""
        features = F.relu(self.bn1(self.conv1(x)))
        features = self.layers(features)
        features = F.relu(self.bn2(self.conv2(features)))
        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
        pooled = F.avg_pool2d(features, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


In [None]:
def accuracy(output, labels):
    """Fraction of entries where ``output`` equals ``labels``.

    Args:
        output: Tensor of predicted class indices.
        labels: Tensor of target class indices, comparable with ``output``.

    Returns:
        float: ``(len(output) - mismatches) / len(output)``.
    """
    total = len(output)
    mismatches = torch.ne(output, labels).sum().item()
    return (total - mismatches) / total

In [None]:
def train(model, train_loader, criterion, optimizer, device, writer):
    """Run one optimisation pass over ``train_loader``.

    Accuracy is measured on the same forward pass that produced the loss, so
    each batch is sent through the model once. A second forward pass in
    training mode would double the compute and activation memory and update
    the BatchNorm running statistics twice per batch.

    Args:
        model: Network to optimise; it is switched to training mode.
        train_loader: Sized iterable of ``(data, labels)`` batches.
        criterion: Loss called as ``criterion(output, labels)``.
        optimizer: Optimiser stepped once per batch.
        device: Device the batches are moved to.
        writer: Not used; kept so existing callers keep working.

    Returns:
        tuple: ``(accuracy rounded to 4 decimals, mean loss per batch)``.

    Raises:
        ZeroDivisionError: If the loader yields no samples.
    """
    model.train()

    correct = 0
    seen = 0
    total_loss = 0

    for data, labels in train_loader:
        data = data.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        output = model(data)
        loss = criterion(output, labels)
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

        # argmax over the class axis without squeezing, so a batch holding a
        # single sample keeps its batch dimension. Softmax is monotonic and
        # therefore not needed to find the winning class.
        predictions = output.detach().argmax(dim=1)
        correct += (predictions == labels.reshape(-1)).sum().item()
        seen += predictions.numel()

    return round(correct / seen, 4), total_loss / len(train_loader)

In [None]:
def val(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        val_loader: Sized iterable of ``(data, labels)`` batches.
        criterion: Loss called as ``criterion(output, labels)``.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(accuracy rounded to 4 decimals, mean loss per batch)``.

    Raises:
        ZeroDivisionError: If the loader yields no samples.
    """
    model.eval()

    correct = 0
    seen = 0
    total_loss = 0

    with torch.no_grad():
        for data, labels in val_loader:
            data = data.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            output = model(data)
            total_loss += criterion(output, labels).item()

            # argmax over the class axis without squeezing, so a batch holding
            # a single sample keeps its batch dimension. Softmax is monotonic
            # and therefore not needed to find the winning class.
            predictions = output.argmax(dim=1)
            correct += (predictions == labels.reshape(-1)).sum().item()
            seen += predictions.numel()

    return round(correct / seen, 4), total_loss / len(val_loader)

In [None]:
def do_epoch(model, train_loader, val_loader, criterion, optimizer, device, writer):
    """Run one training pass followed by one validation pass.

    Returns:
        tuple: ``(train_accuracy, val_accuracy, train_loss, val_loss)``.
    """
    train_acc, train_loss = train(model, train_loader, criterion, optimizer, device, writer)
    val_acc, val_loss = val(model, val_loader, criterion, device)
    return train_acc, val_acc, train_loss, val_loss

In [None]:
def get_model_norm(model):
    """Sum of ``torch.norm`` over every parameter tensor of ``model``.

    Note that this adds up the per-tensor norms; it is not the norm of all
    parameters taken together. Returns the float ``0.0`` for a model without
    parameters and a scalar tensor otherwise.
    """
    return sum((torch.norm(param) for param in model.parameters()), 0.0)


In [None]:
def main_train():
    """Train ``MobileNetV2`` on CIFAR-10, logging to Weights & Biases and TensorBoard.

    Hyperparameters are read from the W&B run config. The defaults below are
    registered with ``wandb.init``, so a plain call trains with them, while a
    sweep agent that injects ``learning_rate``, ``batch_size`` or
    ``num_epochs`` overrides them. The W&B run and the TensorBoard writer are
    closed when training ends, including when it ends with an exception.
    """
    default_config = {
        "learning_rate": 0.001,
        "batch_size": 64,
        "num_epochs": 100,
    }

    # Using the run as a context manager finishes it on exit, so repeated
    # notebook executions do not leave dangling runs behind.
    with wandb.init(
        project="image-classification-convolutions",
        notes="My first experiment",
        config=default_config,
    ) as run:
        config = run.config
        batch_size = config["batch_size"]
        base_learning_rate = config["learning_rate"]
        num_epochs = config["num_epochs"]

        device = get_default_device()

        # Per-channel statistics used to normalise the images. These were
        # previously defined but ignored in favour of a second, hard-coded set
        # of constants.
        mean = [0.4914, 0.4822, 0.4465]
        std = [0.247, 0.243, 0.261]
        train_transforms = [
            transformss.ToTensor(),
            transformss.Normalize(mean=mean, std=std),
        ]
        val_transforms = [
            transformss.ToTensor(),
            transformss.Normalize(mean=mean, std=std),
        ]

        data_path = '../data'
        train_dataset = CIFAR10(root=data_path, train=True, transform=v2.Compose(train_transforms), download=True)
        val_dataset = CIFAR10(root=data_path, train=False, transform=v2.Compose(val_transforms), download=True)
        # Apply the transforms once and keep the resulting tensors in memory.
        train_dataset = CachedDataset(train_dataset)
        val_dataset = CachedDataset(val_dataset)

        val_batch_size = 500
        num_workers = 2
        persistent_workers = (num_workers != 0)
        # Pinned host memory only makes sense when batches are copied to a CUDA device.
        pin_memory = device.type == 'cuda'
        train_loader = DataLoader(train_dataset, shuffle=True, pin_memory=pin_memory, num_workers=num_workers,
                                  batch_size=batch_size, drop_last=True, persistent_workers=persistent_workers)
        val_loader = DataLoader(val_dataset, shuffle=False, pin_memory=pin_memory, num_workers=0,
                                batch_size=val_batch_size, drop_last=False)

        model = MobileNetV2()
        model = model.to(device)
        base_optimizer = torch.optim.AdamW(model.parameters(), lr=base_learning_rate)
        criterion = torch.nn.CrossEntropyLoss()

        log_dir = "logs"
        writer = SummaryWriter(log_dir)
        try:
            writer.add_scalar('Batch size', batch_size)
            writer.add_text('Optimizer', base_optimizer.__class__.__name__)
            writer.add_scalar('Learning rate', base_learning_rate)

            tbar = tqdm(range(num_epochs))
            for epoch in tbar:
                acc, acc_val, loss, loss_val = do_epoch(model, train_loader, val_loader, criterion, base_optimizer,
                                                        device, writer)
                wandb.log({"accuracy": acc_val, "loss": loss_val})
                tbar.set_postfix_str(f"Acc: {acc}, Acc_val: {acc_val}")
                writer.add_scalar("Train/Loss", loss, epoch)
                writer.add_scalar("Train/Accuracy", acc, epoch)
                writer.add_scalar("Val/Loss", loss_val, epoch)
                writer.add_scalar("Val/Accuracy", acc_val, epoch)
                writer.add_scalar("Model/Norm", get_model_norm(model), epoch)
        finally:
            # Flush pending TensorBoard events even if training was interrupted.
            writer.close()


In [None]:
def main(device=None):
    """Create a W&B random-search sweep and run an agent that calls ``main_train``.

    Args:
        device: Not used by this function; accepted only so existing calls
            that pass a device keep working. The default is ``None`` rather
            than ``get_default_device()`` so that no device probing happens
            when the function is merely defined.

    NOTE(review): the sampled values only take effect if ``main_train`` reads
    them from the run config — confirm. No ``count`` is passed to
    ``wandb.agent``; confirm the agent is meant to keep running until stopped.
    """
    sweep_config = {
      'method': 'random',  # You can choose other methods like 'grid', 'bayes', etc.
      'project': 'image-classification',
      'parameters': {
          'learning_rate': {'values': [0.001,0.05, 0.01, 0.1]},
          'batch_size': {'values': [32, 64, 128]},
          'num_epochs': {'values': [50, 100, 150]},
      }
    }

    sweep_id = wandb.sweep(sweep_config, project='image-classification')
    wandb.agent(sweep_id, function=main_train)



In [None]:
# Entry point: run one training session when the notebook/module is executed directly.
if __name__ == "__main__":
    # Only matters for frozen Windows executables that use multiprocessing.
    freeze_support()
    main_train()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112662044443925, max=1.0…

Files already downloaded and verified
Files already downloaded and verified


  0%|          | 0/100 [00:00<?, ?it/s]


OutOfMemoryError: ignored