# Lab (2): Improving the training pipeline


## Step 0: Set up the SimpleNN model
As you have practiced to implement simple neural networks in Homework 1, we just prepare the implementation for you.

In [11]:
# import necessary dependencies
import argparse
import os, sys
import time
import datetime
from tqdm import tqdm_notebook as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:
# define the SimpleNN mode;
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 8, 5)
        self.conv2 = nn.Conv2d(8, 16, 3)
        self.fc1   = nn.Linear(16*6*6, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 10)

    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out

### Question (a)
data augumentation

## Step 1: Set up preprocessing functions
Preprocessing is very important as discussed in the lecture.
You will need to write preprocessing functions with the help of *torchvision.transforms* in this step.
You can find helpful tutorial/API at [here](https://pytorch.org/vision/stable/transforms.html).

In [23]:
import torchvision
import torchvision.transforms as transforms

#############################################
# Define preprocessing functions for training and validation data

# Preprocessing for training data with augmentation
transform_train_aug = transforms.Compose([
    transforms.RandomCrop(32, padding=4),       
    transforms.RandomHorizontalFlip(),          
    transforms.ToTensor(),                     
    transforms.Normalize(                       
        mean=(0.4914, 0.4822, 0.4465),         
        std=(0.2023, 0.1994, 0.2010)           
    )
])

transform_val_aug = transforms.Compose([
    transforms.ToTensor(),                      
    transforms.Normalize(                      
        mean=(0.4914, 0.4822, 0.4465),         
        std=(0.2023, 0.1994, 0.2010)           
    )
])
#############################################

In [24]:
# specify preprocessing function without data augmentation
transform_train = transforms.Compose(
    [
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010))]
)

transform_val = transforms.Compose(
    [transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010))
    ]
)
#############################################

## Step 2: Set up dataset and dataloader


In [16]:
# do NOT change these
from tools.dataset import CIFAR10
from torch.utils.data import DataLoader

# a few arguments, do NOT change these
DATA_ROOT = "./data"
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 100

#############################################
# your code here
# construct dataset
train_set_aug = CIFAR10(
    root=DATA_ROOT, 
    mode='train', 
    download=True,
    transform=transform_train_aug    # your code
)
val_set_aug = CIFAR10(
    root=DATA_ROOT, 
    mode='val', 
    download=True,
    transform=transform_val_aug    # your code
)

# construct dataloader
train_loader_aug = DataLoader(
    train_set_aug, 
    batch_size=TRAIN_BATCH_SIZE,  # your code
    shuffle=True,     # your code
    num_workers=4
)
val_loader_aug = DataLoader(
    val_set_aug, 
    batch_size=VAL_BATCH_SIZE,  # your code
    shuffle=False,     # your code
    num_workers=4
)

# construct dataset without data augmentation
train_set = CIFAR10(
    root=DATA_ROOT, 
    mode='train', 
    download=True,
    transform=transform_train   
)
val_set = CIFAR10(
    root=DATA_ROOT, 
    mode='val', 
    download=True,
    transform=transform_val
)

# construct dataloader without data augmentation
train_loader = DataLoader(
    train_set, 
    batch_size=TRAIN_BATCH_SIZE,  
    shuffle=True,     
    num_workers=4
)

val_loader = DataLoader(
    val_set, 
    batch_size=VAL_BATCH_SIZE,  
    shuffle=False,     
    num_workers=4
)
#############################################

Using downloaded and verified file: ./data/cifar10_trainval_F22.zip
Extracting ./data/cifar10_trainval_F22.zip to ./data
Files already downloaded and verified
Using downloaded and verified file: ./data/cifar10_trainval_F22.zip
Extracting ./data/cifar10_trainval_F22.zip to ./data
Files already downloaded and verified
Using downloaded and verified file: ./data/cifar10_trainval_F22.zip
Extracting ./data/cifar10_trainval_F22.zip to ./data
Files already downloaded and verified
Using downloaded and verified file: ./data/cifar10_trainval_F22.zip
Extracting ./data/cifar10_trainval_F22.zip to ./data
Files already downloaded and verified


## Step 3: Instantiate your SimpleNN model and deploy it to GPU devices.


In [17]:
# initialize the model
net = SimpleNN()
# specify the device for computation
#############################################
# your code here
# Check if CUDA (GPU support) is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    model = net.to(device)
    print("Model - GPU.")
else:
    device = torch.device("cpu")
    model = net.to(device)
    print("Model - CPU.")
    
#############################################

Model - CPU.


## Step 4: Set up the loss function and optimizer
Loss function/objective function is used to provide "feedback" for the neural networks. Typically, we use multi-class cross-entropy as the loss function for classification models. As for the optimizer, we will use SGD with momentum. 

In [18]:
import torch.nn as nn
import torch.optim as optim

# hyperparameters, do NOT change right now
# initial learning rate
INITIAL_LR = 0.01

# momentum for optimizer
MOMENTUM = 0.9

# L2 regularization strength
REG = 1e-4

#############################################
# your code here
# create loss function
criterion = nn.CrossEntropyLoss().to(device)

# Add optimizer
optimizer = optim.SGD(
    net.parameters(),       
    lr=INITIAL_LR,         
    momentum=MOMENTUM,     
    weight_decay=REG
)      
#############################################
#############################################

## Step 5: Start the training process.


In [19]:
def train_test_NN(train_loader, val_loader, model, epoch, lr, optimizer, criterion, device, checkpoint_folder):
    """
    Train and validate the neural network model.

    Args:
        train_loader (DataLoader): DataLoader for the training dataset.
        val_loader (DataLoader): DataLoader for the validation dataset.
        model (nn.Module): The neural network model to train.
        epoch (int): Total number of training epochs.
        lr (float): Initial learning rate.
        optimizer (torch.optim): Optimizer for training.
        criterion (nn.Module): Loss function.
        device (torch.device): Device to run the model on (e.g., 'cuda' or 'cpu').
        checkpoint_folder (str): Folder to save the best model checkpoint.

    Returns:
        l_train_acc (list): Training accuracy for each epoch.
        l_train_loss (list): Training loss for each epoch.
        l_val_acc (list): Validation accuracy for each epoch.
        l_val_loss (list): Validation loss for each epoch.
    """
    # Initialize variables
    best_val_acc = 0  
    current_learning_rate = lr  

    print("==> Training starts!")
    print("=" * 50)

    l_train_acc = []
    l_train_loss = []
    l_val_acc = []
    l_val_loss = []

    
    for i in range(1, epoch + 1):
        model.train()  # Switch to training mode

        # Track training metrics
        total_examples = 0
        correct_examples = 0
        train_loss = 0

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass: compute gradients and update weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            _, predicted = torch.max(outputs, 1)
            correct = predicted.eq(targets).sum()

            # Accumulate metrics
            train_loss += loss.item()
            total_examples += targets.size(0)
            correct_examples += correct.item()

        # Compute average training loss and accuracy
        avg_train_loss = train_loss / len(train_loader)
        avg_train_acc = correct_examples / total_examples
        l_train_acc.append(avg_train_acc)
        l_train_loss.append(avg_train_loss)

        # Print training metrics every 10 epochs or on the first epoch
        if i % 10 == 0 or i == 1:
            print(f"EPOCH: {i}, Training loss: {avg_train_loss:.4f}, Training accuracy: {avg_train_acc:.4f}")

        # Validation phase
        model.eval()  

        # Track validation metrics
        total_examples = 0
        correct_examples = 0
        val_loss = 0

        # Disable gradient computation for validation
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(val_loader):
                # Move data to the device
                inputs = inputs.to(device)
                targets = targets.to(device)

                # Forward pass: compute outputs and loss
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                # Count correct predictions
                _, predicted = torch.max(outputs, 1)
                correct = predicted.eq(targets).sum()

                # Accumulate metrics
                val_loss += loss.item()
                total_examples += targets.size(0)
                correct_examples += correct.item()

        # Compute average validation loss and accuracy
        avg_val_loss = val_loss / len(val_loader)
        avg_val_acc = correct_examples / total_examples
        l_val_loss.append(avg_val_loss)
        l_val_acc.append(avg_val_acc)

        # Print validation metrics every 10 epochs or on the first epoch
        if i % 10 == 0 or i == 1:
            print(f"EPOCH: {i}, Validation loss: {avg_val_loss:.4f}, Validation accuracy: {avg_val_acc:.4f}")

        # Save the model checkpoint if validation accuracy improves
        if avg_val_acc > best_val_acc:
            best_val_acc = avg_val_acc
            if not os.path.exists(checkpoint_folder):
                os.makedirs(checkpoint_folder)
            # Save model state
            state = {
                'state_dict': model.state_dict(),
                'epoch': i,
                'lr': current_learning_rate
            }
            torch.save(state, os.path.join(checkpoint_folder, 'simplenn.pth'))

    # Print final results
    print("=" * 50)
    print(f"==> Optimization finished! Best validation accuracy: {best_val_acc:.4f}")

    return l_train_acc, l_train_loss, l_val_acc, l_val_loss

In [20]:
def weight_reset(m):

    # Check if the layer is one of the types that need resetting
    if isinstance(m, (nn.Conv2d, nn.Linear, nn.BatchNorm2d)):
        # Reset the parameters of the layer
        m.reset_parameters()

In [22]:
# some hyperparameters
# total number of training epochs
EPOCHS = 30

# the folder where the trained model is saved
CHECKPOINT_FOLDER = "./saved_model"

# start training model without data augmentation
l_train_acc_basic, _, l_val_acc_basic, _ = train_test_NN(
    train_loader, 
    val_loader, 
    model, 
    EPOCHS, 
    INITIAL_LR, 
    optimizer, 
    criterion, 
    device, 
    CHECKPOINT_FOLDER
)



==> Training starts!
EPOCH: 1, Training loss: 0.2708, Training accuracy: 0.9025
EPOCH: 1, Validation loss: 2.0146, Validation accuracy: 0.6094
EPOCH: 10, Training loss: 0.2127, Training accuracy: 0.9258
EPOCH: 10, Validation loss: 2.5088, Validation accuracy: 0.6110
EPOCH: 20, Training loss: 0.2007, Training accuracy: 0.9319
EPOCH: 20, Validation loss: 2.6476, Validation accuracy: 0.6140
EPOCH: 30, Training loss: 0.1812, Training accuracy: 0.9400
EPOCH: 30, Validation loss: 2.9274, Validation accuracy: 0.6172
==> Optimization finished! Best validation accuracy: 0.6274


# Bonus: with learning rate decay

The following code can help you adjust the learning rate during training. You need to figure out how to incorporate this code into your training loop.
```python
    if i % DECAY_EPOCHS == 0 and i != 0:
        current_learning_rate = current_learning_rate * DECAY
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_learning_rate
        print("Current learning rate has decayed to %f" %current_learning_rate)
```

In [11]:
model_weight_decay = SimpleNN().to(device)

In [12]:
import torch.nn as nn
import torch.optim as optim

# hyperparameters, do NOT change right now
# initial learning rate
INITIAL_LR = 0.01

# momentum for optimizer
MOMENTUM = 0.9

# L2 regularization strength
REG = 1e-4

#############################################
# your code here
# create loss function
criterion = nn.CrossEntropyLoss().to(device)

# Add optimizer
optimizer = optim.SGD(
    model_weight_decay.parameters(), 
    lr=INITIAL_LR, 
    momentum=MOMENTUM, 
    weight_decay=REG
)

# total number of training epochs
EPOCHS = 30
DECAY_EPOCHS = 5
DECAY = 0.5

# the folder where the trained model is saved
CHECKPOINT_FOLDER = "./saved_model"

# start the training/validation process
# the process should take about 5 minutes on a GTX 1070-Ti
# if the code is written efficiently.
best_val_acc = 0
current_learning_rate = INITIAL_LR

print("==> Training starts!")
print("="*50)

l_train_acc = []
l_train_loss = []
l_val_acc = []
l_val_loss = []

for i in range(0, EPOCHS):
    # handle the learning rate scheduler.
    if i % DECAY_EPOCHS == 0 and i != 0:
        current_learning_rate = current_learning_rate * DECAY
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_learning_rate
        print("Current learning rate has decayed to %f" %current_learning_rate)

#######################
    # your code here
    # switch to train mode
    model_weight_decay.train()
    
    #######################
    
    print("Epoch %d:" %i)
    # this help you compute the training accuracy
    total_examples = 0
    correct_examples = 0

    train_loss = 0 # track training loss if you want

    
    # Train the model for 1 epoch.
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        ####################################
        # your code here
        # copy inputs to device
        inputs = inputs.to(device)
        targets = targets.to(device)
        
        # compute the output and loss
        outputs = model_weight_decay(inputs)
        loss = criterion(outputs, targets)
        
        # zero the gradient
        optimizer.zero_grad()
        
        # backpropagation
        loss.backward()

       # apply gradient and update the weights
        optimizer.step()
        
        # count the number of correctly predicted samples in the current batch
        _, predicted = torch.max(outputs, 1)
        correct = predicted.eq(targets).sum()
        
        # add to totals
        train_loss += loss.item()
        total_examples += targets.size(0)
        correct_examples += correct.item()
        ####################################
                
    avg_loss = train_loss / len(train_loader)
    avg_acc = correct_examples / total_examples
    l_train_acc.append(avg_acc)
    l_train_loss.append(avg_loss)
    print("Training loss: %.4f, Training accuracy: %.4f" %(avg_loss, avg_acc))

    # Validate on the validation dataset
    #######################
    # your code here
    # switch to eval mode
    model_weight_decay.eval()
    
    #######################

    # this help you compute the validation accuracy
    total_examples = 0
    correct_examples = 0
    val_loss = 0 # again, track the validation loss if you want

    # disable gradient during validation, which can save GPU memory
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            ####################################
            # your code here
            # copy inputs to device
            inputs = inputs.to(device)
            targets = targets.to(device)
            
            # compute the output and loss
            outputs = model_weight_decay(inputs)
            loss = criterion(outputs, targets)
            
            # count the number of correctly predicted samples in the current batch
            _, predicted = torch.max(outputs, 1)
            correct = predicted.eq(targets).sum()
            
            # add to totals
            val_loss += loss.item()
            total_examples += targets.size(0)
            correct_examples += correct.item()
            ####################################

    avg_loss = val_loss / len(val_loader)
    avg_acc = correct_examples / total_examples
    l_val_loss.append(avg_loss)
    l_val_acc.append(avg_acc)
    print("Validation loss: %.4f, Validation accuracy: %.4f" % (avg_loss, avg_acc))
    
  # save the model checkpoint
    if avg_acc > best_val_acc:
        best_val_acc = avg_acc
        if not os.path.exists(CHECKPOINT_FOLDER):
           os.makedirs(CHECKPOINT_FOLDER)
        print("Saving ...")
        state = {'state_dict': model.state_dict(),
                'epoch': i,
                'lr': current_learning_rate}
        torch.save(state, os.path.join(CHECKPOINT_FOLDER, 'simplenn.pth'))
        
    print('')

print("="*50)
print(f"==> Optimization finished! Best validation accuracy: {best_val_acc:.4f}")

==> Training starts!
Epoch 0:
Training loss: 1.8999, Training accuracy: 0.3047
Validation loss: 1.5845, Validation accuracy: 0.4216
Saving ...

Epoch 1:
Training loss: 1.4443, Training accuracy: 0.4754
Validation loss: 1.3731, Validation accuracy: 0.5056
Saving ...

Epoch 2:
Training loss: 1.2718, Training accuracy: 0.5436
Validation loss: 1.2499, Validation accuracy: 0.5602
Saving ...

Epoch 3:
Training loss: 1.1621, Training accuracy: 0.5861
Validation loss: 1.1681, Validation accuracy: 0.5900
Saving ...

Epoch 4:
Training loss: 1.0564, Training accuracy: 0.6249
Validation loss: 1.1184, Validation accuracy: 0.6062
Saving ...

Current learning rate has decayed to 0.005000
Epoch 5:
Training loss: 0.9288, Training accuracy: 0.6737
Validation loss: 1.0248, Validation accuracy: 0.6454
Saving ...

Epoch 6:
Training loss: 0.8737, Training accuracy: 0.6920
Validation loss: 1.0351, Validation accuracy: 0.6390

Epoch 7:
Training loss: 0.8321, Training accuracy: 0.7079
Validation loss: 1.0236, 