# Training LeNet-5 on CIFAR-10
In this notebook, we will try to deploy the famous LeNet-5 to solve a simple image classification task, the CIFAR-10. CIFAR-10 is composed of 60K images from 10 categories. After splitting the dataset, we have 45K/5K/10K images for train/valiation/test dataset.
In this notebook, only the labels of training/validation dataset is visible to you, so you can use the training and validation data to tune your model. After you submitted your model, your final grade will be determined on the model performance on the holdout test dataset.

### Step 0 Setting up LeNet-5 model
As you have set up the LeNet-5 model in Homework 1, we will just move the implementation of LeNet-5 model here, so you can use it for this homework.

In [137]:
import argparse
import os, sys
import time
import datetime

# Import pytorch dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm

In [138]:
import math

def swish(x):
    return x * torch.sigmoid(x)
# Create the neural network module: LeNet-5
class ResNet20(nn.Module):
    def __init__(self):
        super(ResNet20, self).__init__()
        

        self.conv0 = nn.Conv2d(3, 16, 3)
        self.norm0 = nn.BatchNorm2d(16)


        self.conv11 = nn.Conv2d(16, 16, 3,padding=1)
        self.norm11 = nn.BatchNorm2d(16)
        self.conv12 = nn.Conv2d(16, 16, 3,padding=1)
        self.norm12 = nn.BatchNorm2d(16)
        self.conv13 = nn.Conv2d(16, 16, 3,padding=1)
        self.norm13 = nn.BatchNorm2d(16)
        self.conv14 = nn.Conv2d(16, 16, 3,padding=1)
        self.norm14 = nn.BatchNorm2d(16)
        self.conv15 = nn.Conv2d(16, 16, 3,padding=1)
        self.norm15 = nn.BatchNorm2d(16)
        self.conv16 = nn.Conv2d(16, 16, 3,padding=1)
        self.norm16 = nn.BatchNorm2d(16)

        self.con1to2=nn.Conv2d(16, 32, 1,stride=2) 
        self.norm1to2 = nn.BatchNorm2d(32)

        self.conv21 = nn.Conv2d(16, 32, 3,padding=1,stride=2)
        self.norm21 = nn.BatchNorm2d(32)
        self.conv22 = nn.Conv2d(32, 32, 3,padding=1)
        self.norm22 = nn.BatchNorm2d(32)
        self.conv23 = nn.Conv2d(32, 32, 3,padding=1)
        self.norm23 = nn.BatchNorm2d(32)
        self.conv24 = nn.Conv2d(32, 32, 3,padding=1)
        self.norm24 = nn.BatchNorm2d(32)
        self.conv25 = nn.Conv2d(32, 32, 3,padding=1)
        self.norm25 = nn.BatchNorm2d(32)
        self.conv26 = nn.Conv2d(32, 32, 3,padding=1)
        self.norm26 = nn.BatchNorm2d(32)

        self.con2to3=nn.Conv2d(32, 64, 1,stride=2) 
        self.norm2to3 = nn.BatchNorm2d(64)

        self.conv31 = nn.Conv2d(32, 64, 3,padding=1,stride=2)
        self.norm31 = nn.BatchNorm2d(64)
        self.conv32 = nn.Conv2d(64, 64, 3,padding=1)
        self.norm32 = nn.BatchNorm2d(64)
        self.conv33 = nn.Conv2d(64, 64, 3,padding=1)
        self.norm33 = nn.BatchNorm2d(64)
        self.conv34 = nn.Conv2d(64, 64, 3,padding=1)
        self.norm34 = nn.BatchNorm2d(64)
        self.conv35 = nn.Conv2d(64, 64, 3,padding=1)
        self.norm35 = nn.BatchNorm2d(64)
        self.conv36 = nn.Conv2d(64, 64, 3,padding=1)
        self.norm36 = nn.BatchNorm2d(64)

        self.avePool=nn.AvgPool2d(8) 
        
        self.fc   = nn.Linear(64, 10)

        # self.softmax=nn.Softmax()
        

    def forward(self, x):
        out0 = F.relu(self.conv0(x))
        out0 = self.norm0(out0)

        out11=F.relu(self.conv11(out0))
        out11=self.norm11(out11)
        out12=F.relu(self.conv12(out11))
        out12=self.norm12(out12)
        out13=F.relu(self.conv13(out12+out0))
        out13=self.norm13(out13)
        out14=F.relu(self.conv14(out13))
        out14=self.norm14(out14)
        out15=F.relu(self.conv15(out14+out12))
        out15=self.norm15(out15)
        out16=F.relu(self.conv16(out15))
        out16=self.norm16(out16)


        out21=F.relu(self.conv21(out16+out14))
        out21=self.norm21(out21)
        out22=F.relu(self.conv22(out21))
        out22=self.norm22(out22)
        out16t=self.con1to2(out16)
        out16t=self.norm1to2(out16t)
        out23=F.relu(self.conv23(out22+out16t))
        out23=self.norm23(out23)
        out24=F.relu(self.conv24(out23))
        out24=self.norm24(out24)
        out25=F.relu(self.conv25(out24+out22))
        out25=self.norm25(out25)
        out26=F.relu(self.conv26(out25))
        out26=self.norm26(out26)

        out31=F.relu(self.conv31(out26+out24))
        out31=self.norm31(out31)
        out32=F.relu(self.conv32(out31))
        out32=self.norm32(out32)
        out26t=self.con2to3(out26)
        out26t=self.norm2to3(out26t)
        out33=F.relu(self.conv33(out32+out26t))
        out33=self.norm33(out33)
        out34=F.relu(self.conv34(out33))
        out34=self.norm34(out34)
        out35=F.relu(self.conv35(out34+out32))
        out35=self.norm35(out35)
        out36=F.relu(self.conv36(out35))
        out36=self.norm36(out36)+out34

        out=self.avePool(out36)
        out = out.view(out.size(0), -1)
        out=self.fc(out)
        # out=self.softmax(out)
       
        return out

    """
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * (m.in_channels + m.out_channels)
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_() """

### Step 1: Setting up preprocessing functions.
Preprocessing is very important because it prepares your data for proceeding training steps.
Write functions to load dataset and preprocess the incoming data. We recommend that the preprocess scheme \textbf{must} include normalize, standardization, batch shuffling to make sure the training 
process goes smoothly. The preprocess scheme may also contain some data augmentation methods 
(e.g., random crop, random flip, etc.). 

Reference value for mean/std:

**mean(RGB-format): (0.4914, 0.4822, 0.4465)**

**std(RGB-format): (0.2023, 0.1994, 0.2010)**

In [139]:
# Specify preprocessing function.
# Reference mean/std value for 
transform_train  = transforms.Compose([
          transforms.RandomCrop(size=[32,32],padding=4),
        transforms.RandomHorizontalFlip(),
       transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

transform_val = transforms.Compose([
        transforms.RandomCrop(size=[32,32],padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])


### Step 2: Setting up data I/O
Data I/O reads data from the dataset and prepares it for further procedures. Note that you have to link transformation with data I/O so that these operations can be interleaved. Thus, the training process can be more efficient.

In [140]:
# You cannot change this line.
from tools.dataloader import CIFAR10
# Call the dataset Loader
DATAROOT = "./data"
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 100
trainset = CIFAR10(root=DATAROOT, train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=4)
valset = CIFAR10(root=DATAROOT, train=False, download=True, transform=transform_val)
valloader = torch.utils.data.DataLoader(valset, batch_size=VAL_BATCH_SIZE, shuffle=False, num_workers=4)

Using downloaded and verified file: ./data/cifar10_trainval_F20.zip
Extracting ./data/cifar10_trainval_F20.zip to ./data
Files already downloaded and verified
Training dataset has 45000 examples!
Using downloaded and verified file: ./data/cifar10_trainval_F20.zip
Extracting ./data/cifar10_trainval_F20.zip to ./data
Files already downloaded and verified
Validation dataset has 5000 examples!


### Step 3: Instantialize your LeNet-5 model and deploy it to GPU devices.
You may want to deploy your model to GPU device for efficient training. Please assign your model to GPU if possible. If you are training on a machine without GPUs, please deploy your model to CPUs.

In [141]:
# Specify the device for computation
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = ResNet20()
net = net.to(device)
if device =='cuda':
    print("Train on GPU...")
else:
    print("Train on CPU...")

Train on GPU...


### Hyperparameter settings
Hyperparameters are quite crucial in determining the performance of our model. The default hyperparameter settings are sufficient for a decent result. You may tune them wisely and carefully for better results.

In [142]:
# Initial learning rate
INITIAL_LR = 0.1
lr_list=[1.0, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001]
wd_list=[0.01, 0.001, 0.0001, 0.00001, 0.0]
# Momentum for optimizer.
MOMENTUM = 0.9
# Regularization
REG = 1e-4
# Total number of training epochs
EPOCHS = 200
# Learning rate decay policy.
DECAY_EPOCHS = 50
DECAY = 0.1

### Handling weights load/save protocols.
This handles the weight loading/saving protocols.You may be able to load from checkpoints.

In [143]:
CHECKPOINT_PATH = "./saved_model"
# FLAG for loading the pretrained model
TRAIN_FROM_SCRATCH = True
# Code for loading checkpoint and recover epoch id.
CKPT_PATH = "./saved_model/model.h5"
def get_checkpoint(ckpt_path):
    try:
        ckpt = torch.load(ckpt_path)
    except Exception as e:
        print(e)
        return None
    return ckpt

ckpt = get_checkpoint(CKPT_PATH)
if ckpt is None or TRAIN_FROM_SCRATCH:
    if not TRAIN_FROM_SCRATCH:
        print("Checkpoint not found.")
    print("Training from scratch ...")
    start_epoch = 0
    current_learning_rate = INITIAL_LR
else:
    print("Successfully loaded checkpoint: %s" %CKPT_PATH)
    net.load_state_dict(ckpt['net'])
    start_epoch = ckpt['epoch'] + 1
    current_learning_rate = ckpt['lr']
    print("Starting from epoch %d " %start_epoch)

print("Starting from learning rate %f:" %current_learning_rate)

Training from scratch ...
Starting from learning rate 0.100000:


### Step 4 Setting up loss functions and Optimizers
Loss function is your objective to train the neural networks. Typically, we use multi-class cross entropy as objectives for classification models (e.g., CIFAR-10, MNIST). In this homework, we use SGD optimizer with momentum as our optimizer. You need to formulate the cross-entropy loss function in PyTorch.
You should also specify a PyTorch Optimizer to optimize this loss function.

In [144]:
# Create loss function and specify regularization
criterion = nn.CrossEntropyLoss()
# Add optimizer
optimizer = optim.SGD(params=net.parameters(), lr=current_learning_rate, momentum=MOMENTUM, weight_decay=REG, nesterov=False)

### Step 5: Start the training process.
Congratulations! You have completed all of the previous steps and it is time to train our neural network.
Training a neural network usually composes the following 3 parts: 

**i) Get a batch of data from the dataloader and copy it to your device (GPU)**

**ii) Do a forward pass to get the output logits from the neural network. Compute the forward loss.**

**iii) Do a backward pass (back-propagation) to compute gradients of all weights with respect to the loss.**

You will also need to compute accuracy within all these parts to justify that your model is doing well on the dataset.


In [145]:
# Start the training/validation process
# The process should take about 5 minutes on a GTX 1070-Ti
# if the code is written efficiently.
global_step = 0
best_val_acc = 0

for i in range(start_epoch, EPOCHS):
    print(datetime.datetime.now())
    # Switch to train mode
    net.train()
    print("Epoch %d:" %i)

    total_examples = 0
    correct_examples = 0

    train_loss = 0
    train_acc = 0
    
    # Train the training dataset for 1 epoch.
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # Copy inputs to device
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Zero the gradient
        optimizer.zero_grad()
        # Generate output
        outputs = net(inputs)

        loss = criterion(outputs, targets)
#         if i==0:
#             print('initial loss: ',loss)
        # Now backward loss
        loss.backward()
        # Apply gradient
        optimizer.step()
        # Calculate predicted labels
        _, predicted = outputs.max(1)
        total_examples += predicted.size(0)
        correct_examples += predicted.eq(targets).sum().item()
        train_loss += loss
        global_step += 1
                
    avg_loss = train_loss / (batch_idx + 1)
    avg_acc = correct_examples / total_examples
    print("Training loss: %.4f, Training accuracy: %.4f" %(avg_loss, avg_acc))
    print(datetime.datetime.now())
    # Validate on the validation dataset
    print("Validation...")
    total_examples = 0
    correct_examples = 0
    
    net.eval()

    val_loss = 0
    val_acc = 0
    # Disable gradient during validation
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(valloader):
            # Copy inputs to device
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Zero the gradient
            optimizer.zero_grad()
            # Generate output from the DNN.
            outputs = net(inputs)
            loss = criterion(outputs, targets)            
            # Calculate predicted labels
            _, predicted = outputs.max(1)
            total_examples += predicted.size(0)
            correct_examples += predicted.eq(targets).sum().item()
            val_loss += loss

    avg_loss = val_loss / len(valloader)
    avg_acc = correct_examples / total_examples
    
    print("Validation loss: %.4f, Validation accuracy: %.4f" % (avg_loss, avg_acc))

    # Handle the learning rate scheduler.
    if i % DECAY_EPOCHS == 0 and i != 0:
        current_learning_rate = current_learning_rate * DECAY
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_learning_rate
        print("Current learning rate has decayed to %f" %current_learning_rate)
    
    # Save for checkpoint
    if avg_acc > best_val_acc:
        best_val_acc = avg_acc
        if not os.path.exists(CHECKPOINT_PATH):
            os.makedirs(CHECKPOINT_PATH)
        print("Saving ...")
        state = {'net': net.state_dict(),
                 'epoch': i,
                 'lr': current_learning_rate}
        torch.save(state, os.path.join(CHECKPOINT_PATH, 'model.h5'))

print("Optimization finished.")
print("best val",best_val_acc)

h 90:
Training loss: 0.0625, Training accuracy: 0.9784
2020-09-19 20:58:31.372999
Validation...
Validation loss: 0.3723, Validation accuracy: 0.8960
2020-09-19 20:58:32.223814
Epoch 91:
Training loss: 0.0626, Training accuracy: 0.9786
2020-09-19 20:58:44.196825
Validation...
Validation loss: 0.3545, Validation accuracy: 0.8994
2020-09-19 20:58:45.064538
Epoch 92:
Training loss: 0.0597, Training accuracy: 0.9796
2020-09-19 20:58:57.207138
Validation...
Validation loss: 0.3722, Validation accuracy: 0.8942
2020-09-19 20:58:58.058696
Epoch 93:
Training loss: 0.0617, Training accuracy: 0.9788
2020-09-19 20:59:10.160378
Validation...
Validation loss: 0.3784, Validation accuracy: 0.8932
2020-09-19 20:59:11.039911
Epoch 94:
Training loss: 0.0581, Training accuracy: 0.9803
2020-09-19 20:59:23.110156
Validation...
Validation loss: 0.3602, Validation accuracy: 0.8992
2020-09-19 20:59:24.003979
Epoch 95:
Training loss: 0.0606, Training accuracy: 0.9789
2020-09-19 20:59:36.152790
Validation...
Vali