## Set up paths and imports

In [None]:
import os

import torch
from torchvision import transforms

# If the current working directory has no ./notebooks entry, move one level up
# using the IPython `%cd` magic (this line is not plain Python).
# Presumably this makes the `src` package imported below resolvable when the
# notebook is started from inside notebooks/ — TODO confirm.
if not os.path.exists("./notebooks"):
    %cd ..

import src.model
from src.training import do_train, do_test
from src.dataset import prepare_dataset_loaders
from src.data_processing import load_mean_std
from src.config import DATASET_DIR

# Flag passed as the last argument to do_train / do_test in the experiment
# cells. Off by default; a later optional cell sets it to True.
wandb_enabled = False

## 1. Load standardization data and define Config

In [None]:
# Read the (mean, std) pair from scaling_params.json under DATASET_DIR.
# Both values are later handed to transforms.Normalize in the experiment cells.
mean, std = load_mean_std(f"{DATASET_DIR}/scaling_params.json")

class Config:
    """Hyperparameter bundle for a single training run.

    The constructor stores its three arguments as instance attributes:
    ``learning_rate`` (taken from ``lr``), ``epochs`` and ``batch_size``.
    """

    def __init__(self, lr=0.001, epochs=40, batch_size=32):
        # Callers pass ``lr`` but read the value back as ``learning_rate``.
        self.learning_rate, self.epochs, self.batch_size = lr, epochs, batch_size

### Optionally initialize W&B project

In [None]:
# Optional: flips the flag that the experiment cells pass to do_train / do_test.
# Skip this cell to keep the default of False.
wandb_enabled = True

## 2. Choose device

In [None]:
# Use CUDA when torch reports it as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## 3. Choose your architecture

In [None]:
# Experiment: TutorialCNN on images resized to 32x32, with default Config values
# (lr=0.001, epochs=40, batch_size=32) and the Adam optimizer.
name = "TutorialCNN"
model = src.model.TutorialCNN()
config = Config()
# Pipeline: resize to 32x32 -> tensor -> normalize with the loaded mean/std.
transform = transforms.Compose([
    transforms.Resize((32,32)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# `run` stays at notebook scope; its id is forwarded to do_test on the next line.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
# The empty dict sits where the OurResNet cells pass their constructor kwargs
# (presumably "no model kwargs" — TutorialCNN() is built without arguments).
do_test(name, test_loader, model.__class__, {}, run.id, device, wandb_enabled)

In [None]:
# Experiment: OriginalSizeCNN on images at their original size (no Resize
# step), with default Config values and the Adam optimizer.
name = "OriginalSizeCNN"
model = src.model.OriginalSizeCNN()
config = Config()
# Pipeline: tensor -> normalize with the loaded mean/std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# Train inside this cell so that `run` belongs to this experiment. With the
# call commented out, do_test received the run id left behind by whichever
# cell was executed last (or raised NameError on a fresh kernel).
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
# The empty dict fills the model-kwargs slot: OriginalSizeCNN() takes none here.
do_test(name, test_loader, model.__class__, {}, run.id, device, wandb_enabled)

In [None]:
# Experiment: OurResNet with both batch normalization and residual connections
# switched on, default Config values, Adam optimizer.
name = "ResNetBatchNormalization"
# Constructor kwargs; the same dict is passed to do_test below.
model_args={
    "batch_normalization": True,
    "residual_connections": True
}
model = src.model.OurResNet(**model_args)
config = Config()
# Pipeline: tensor -> normalize with the loaded mean/std (no resizing).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# `run` stays at notebook scope; its id is forwarded to do_test on the next line.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
do_test(name, test_loader, model.__class__, model_args, run.id, device, wandb_enabled)

In [None]:
# Experiment: DropoutCNN on images at their original size (no Resize step),
# with default Config values and the Adam optimizer.
name = "DropoutCNN"
model = src.model.DropoutCNN()
config = Config()
# Pipeline: tensor -> normalize with the loaded mean/std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# Pass device and wandb_enabled explicitly, as the other experiment cells do;
# without them the chosen device and the W&B flag never reached do_train.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
# Same call shape as the other cells: model kwargs (none for DropoutCNN())
# followed by the run id rather than the run object itself.
do_test(name, test_loader, model.__class__, {}, run.id, device, wandb_enabled)

In [None]:
import numpy as np

# Train ten OriginalSizeCNN models that differ only in their random seed and
# collect them in `ensemble_models`.

# Loop-invariant pieces are built once instead of on every iteration.
config = Config()
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
criterion = torch.nn.CrossEntropyLoss()

ensemble_models = []
for i in range(10):
    # Seed *before* the model is constructed so that weight initialization,
    # and not only the steps after it, is governed by the seed.
    torch.manual_seed(i)
    np.random.seed(i)

    # Give every member its own name. The original reused whatever `name` the
    # previously executed cell had left behind, so all ten runs shared one
    # label (and presumably one output location — verify against do_train).
    name = f"OriginalSizeCNN-seed{i}"
    model = src.model.OriginalSizeCNN()
    train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
    # Same call shape as the other cells: model kwargs (none for
    # OriginalSizeCNN()) followed by the run id rather than the run object.
    do_test(name, test_loader, model.__class__, {}, run.id, device, wandb_enabled)
    ensemble_models.append(model)

In [None]:
# Build an EnsembleCNN from three OriginalSizeCNN checkpoints stored under
# ./models, then train and test the combined model.
ensemble_model_names = ["OriginalSizeCNN-HE-RELU", "OriginalSizeCNN-UNIFORM-RELU", "OriginalSizeCNN-XAVIER-RELU"]
ensemble_models = []
for model_name in ensemble_model_names:
    model = src.model.OriginalSizeCNN()
    # map_location remaps the stored tensors onto the selected device, so
    # checkpoints saved on a GPU also load on a CPU-only machine.
    state_dict = torch.load(f"./models/{model_name}.pth", map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.device = device
    model.to(device)
    ensemble_models.append(model)

name = "EnsembleCNN"
# NOTE(review): the meaning of the second argument (2) is not visible from
# this notebook — confirm against src.model.EnsembleCNN.
model = src.model.EnsembleCNN(ensemble_models, 2)
config = Config()
# Pipeline: tensor -> normalize with the loaded mean/std (no resizing).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# Pass device and wandb_enabled explicitly, as the other experiment cells do;
# without them the chosen device and the W&B flag never reached do_train.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
# NOTE(review): most cells call
#   do_test(name, loader, model_class, model_kwargs, run.id, device, wandb_enabled).
# EnsembleCNN is built from positional arguments whose keyword names are not
# visible here, so this call is left unchanged — confirm it against do_test.
do_test(name, test_loader, model.__class__, run, device, wandb_enabled)

## Comparison of our CNN architectures
Comparison of architectures trainable using this notebook can be seen [here](https://wandb.ai/mytkom-warsaw-university-of-technology/iml/reports/Comparison-of-from-scratch-architectures--VmlldzoxMDU0MDk4NQ?accessToken=mle3zdqu8bxvrc4z8pdhl89talltdlml5gw5zmictx9e0qhvue0k5awsdggr37vp).

`TutorialCNN` demonstrated considerably lower validation and test accuracy as well as F1 scores. It also converged much more slowly. In contrast, `OriginalSizeCNN` and `ResNetBatchNormalization` exhibited similar performance, with `ResNetBatchNormalization` converging slightly faster. We believe the lack of a significant difference between their performances is due to the simplicity of the classification task.

## Appendix A. Impact of Data Normalization on Neural Network Performance

This section introduces an alternative configuration of the `TutorialCNN` neural network, where data normalization is omitted. All other settings and hyperparameters remain unchanged. The results of the performance comparison between the normalized and non-normalized datasets using `TutorialCNN` can be found [here](https://api.wandb.ai/links/mytkom-warsaw-university-of-technology/wj0f1okh).

In [None]:
# Experiment: TutorialCNN with the Normalize step left out of the transform;
# everything else matches the TutorialCNN cell (32x32 resize, default Config,
# Adam optimizer).
name = "TutorialCNN without standardization"
model = src.model.TutorialCNN()
config = Config()
# Pipeline: resize to 32x32 -> tensor (no mean/std normalization).
transform = transforms.Compose([
    transforms.Resize((32,32)),
    transforms.ToTensor()
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# `run` stays at notebook scope; its id is forwarded to do_test on the next line.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
# The empty dict sits where the OurResNet cells pass their constructor kwargs.
do_test(name, test_loader, model.__class__, {}, run.id, device, wandb_enabled)

## Appendix B. Impact of Skip Connections and Batch Normalization in Residual Networks

In this section, we delve into the significance of **residual addition (skip connections)** and **batch normalization** in training deep convolutional neural networks, particularly focusing on their role within the `OurResNet` architecture. To demonstrate their impact, we compare three training configurations:

1. **`NotResNet`:** A baseline architecture without skip connections or batch normalization. This configuration employs only convolutional and pooling layers, which limits its ability to mitigate vanishing gradients and optimize deeper architectures effectively.

2. **`ResNetNoBatchNormalization`:** This version incorporates skip connections but excludes batch normalization. It demonstrates the advantage of residual addition while highlighting the challenges posed by the lack of batch normalization.

3. **`ResNetBatchNormalization` (default configuration):** This configuration includes both skip connections and batch normalization, showcasing the synergistic effect of these features on performance.

### Role of Skip Connections (Residual Addition)

Skip connections are central to residual networks, enabling the output of a layer to "skip" over one or more subsequent layers and be added to the output of a deeper layer. This technique facilitates:
- **Gradient flow:** During backpropagation, skip connections ensure that gradients can bypass intermediate layers, reducing the risk of vanishing gradients.
- **Optimization stability:** They provide a direct pathway for information, simplifying optimization for deeper networks.
- **Feature reuse:** By combining learned features with earlier representations, skip connections enable the network to refine its understanding without losing essential information.

In the `OurResNet` architecture, each residual block optionally applies residual addition. When enabled, the network adds the input of the block (possibly downsampled) to the output of the final convolutional layer in that block, ensuring efficient information propagation.

### Role of Batch Normalization

Batch normalization normalizes the activations of a layer for each mini-batch during training. This process has multiple benefits:
- **Faster convergence:** By reducing internal covariate shift, batch normalization allows the model to converge faster.
- **Regularization:** It introduces slight noise during training, which acts as a regularizer and reduces the risk of overfitting.
- **Stabilized training:** By keeping the activation values within a standardized range, batch normalization improves the overall stability of the training process.

In `OurResNet`, batch normalization is applied after each convolutional layer within the residual blocks (when enabled). It helps maintain consistent activation distributions, even as the network grows deeper.


### Results and Comparison

A detailed comparison of these configurations' performance is available in the following **Weights & Biases (wandb) report**:  
[Link to wandb report](https://wandb.ai/mytkom-warsaw-university-of-technology/iml/reports/Impact-of-Skip-Connections-and-Batch-Normalization-in-Residual-Networks--VmlldzoxMTEwNDI2MQ?accessToken=kbbga0avcuq4pimsxfd0quvdtuay3mqfkhudfkdkwj6yoosy3srq138d5s06np7v).

In our experiment, turning batch normalization and skip connections on or off made little difference to training convergence. We believe the lack of a substantial difference between their training performances is due to the simplicity of the classification task and the shallowness of the neural network. Also, the use of the Adam optimizer probably helped NotResNet to converge faster.

In [None]:
# Experiment: OurResNet with batch normalization and residual connections both
# switched off, default Config values, Adam optimizer.
name = "NotResNet"
# Constructor kwargs; the same dict is passed to do_test below.
model_args={
    "batch_normalization": False,
    "residual_connections": False
}
model = src.model.OurResNet(**model_args)
config = Config()
# Pipeline: tensor -> normalize with the loaded mean/std (no resizing).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# `run` stays at notebook scope; its id is forwarded to do_test on the next line.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
do_test(name, test_loader, model.__class__, model_args, run.id, device, wandb_enabled)

In [None]:
# Experiment: OurResNet with residual connections on and batch normalization
# off, default Config values, Adam optimizer.
name = "ResNetNoBatchNormalization"
# Constructor kwargs; the same dict is passed to do_test below.
model_args={
    "batch_normalization": False,
    "residual_connections": True
}
model = src.model.OurResNet(**model_args)
config = Config()
# Pipeline: tensor -> normalize with the loaded mean/std (no resizing).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# `run` stays at notebook scope; its id is forwarded to do_test on the next line.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
do_test(name, test_loader, model.__class__, model_args, run.id, device, wandb_enabled)

We suspected that the Adam optimizer reduces the impact of batch normalization and skip connections on training performance, so we performed a similar experiment using Stochastic Gradient Descent with a momentum of 0.9 and a 10 times larger learning rate (0.01). The results are presented [here](https://wandb.ai/mytkom-warsaw-university-of-technology/iml/reports/SGD-Impact-of-Skip-Connections-and-Batch-Normalization-in-Residual-Networks--VmlldzoxMTEwNjI2Ng?accessToken=ronkgh3c8etvumnzmcettjrbfbjnkwosuq5ownyt7vzjxko44ous49ecb3oaspmj).

Compared to Adam, using SGD as the optimizer shows significant differences between networks with skip connections and those without them. Adding batch normalization brought a further slight improvement in training convergence. If the neural network were deeper and the classification task harder, we would expect more significant differences (possibly even NotResNetSGD failing to train at all, stuck in some local minimum of the loss function).

In [None]:
# Experiment: OurResNet with batch normalization and residual connections both
# switched off, trained with SGD (momentum 0.9) at lr=0.01 instead of Adam.
name = "NotResNetSGD"
# Constructor kwargs; the same dict is passed to do_test below.
model_args={
    "batch_normalization": False,
    "residual_connections": False
}
model = src.model.OurResNet(**model_args)
# Learning rate raised from the Config default of 0.001 to 0.01.
config = Config(lr=0.01)
# Pipeline: tensor -> normalize with the loaded mean/std (no resizing).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate, momentum=0.9)

# `run` stays at notebook scope; its id is forwarded to do_test on the next line.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
do_test(name, test_loader, model.__class__, model_args, run.id, device, wandb_enabled)

In [None]:
# Experiment: OurResNet with residual connections on and batch normalization
# off, trained with SGD (momentum 0.9) at lr=0.01 instead of Adam.
name = "ResNetNoBatchNormalizationSGD"
# Constructor kwargs; the same dict is passed to do_test below.
model_args={
    "batch_normalization": False,
    "residual_connections": True
}
model = src.model.OurResNet(**model_args)
# Learning rate raised from the Config default of 0.001 to 0.01.
config = Config(lr=0.01)
# Pipeline: tensor -> normalize with the loaded mean/std (no resizing).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate, momentum=0.9)

# `run` stays at notebook scope; its id is forwarded to do_test on the next line.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
do_test(name, test_loader, model.__class__, model_args, run.id, device, wandb_enabled)

In [None]:
# Experiment: OurResNet with both batch normalization and residual connections
# switched on, trained with SGD (momentum 0.9) at lr=0.01 instead of Adam.
name = "ResNetBatchNormalizationSGD"
# Constructor kwargs; the same dict is passed to do_test below.
model_args={
    "batch_normalization": True,
    "residual_connections": True
}
model = src.model.OurResNet(**model_args)
# Learning rate raised from the Config default of 0.001 to 0.01.
config = Config(lr=0.01)
# Pipeline: tensor -> normalize with the loaded mean/std (no resizing).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate, momentum=0.9)

# `run` stays at notebook scope; its id is forwarded to do_test on the next line.
run = do_train(name, train_loader, val_loader, config, model, criterion, optimizer, device, wandb_enabled)
do_test(name, test_loader, model.__class__, model_args, run.id, device, wandb_enabled)