# Part 5 DNN and backpropagation

### The material presented in this notebook is for using in Introduction to Deep Learning (ECE 685D) course, Duke University, Fall 2023.

# DNN

### Define a deep neural network (DNN) in Pytorch
    a. Subclass nn.Module
    b. Define the layers in __init__()  
    c. Define the forward pass in forward()
    
Example of a CNN model:

    class Network(nn.Module):
        def __init__(self, parameters):
            super().__init__()
            # your layers defined here
            self.layer1 = ...
            
        def forward(self, x):
            # your forward pass defined here
            output = self.layer1(x)
            ...
            return output


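    model = Network(parameters)       # create a model
    output = model(data)              # forward pass with your data
    loss = criterion(output, target)  # compute the loss
    loss.backward()                   # backward pass (called on the loss, not on the model)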

## 1. Different ways to define the network

### a. Directly define the layers

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

In [16]:
class Network1(nn.Module):
    """Small CNN classifier with every layer declared as its own attribute.

    Architecture: three un-padded 3x3 convolutions (the last two followed
    by ReLU), a 2x2 max-pooling, dropout, and a two-layer fully connected
    classifier.

    Args:
        in_channels: number of channels of the input images.
        out_channels: number of output scores (classes).

    Note:
        ``fc1`` expects 9216 = 64 * 12 * 12 flattened features, which is
        what a 30x30 input produces (30 -> 28 -> 26 -> 24 after the three
        convolutions, then 12 after pooling). Change that number when the
        input has different spatial dimensions.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.first_conv = nn.Conv2d(in_channels, 32, 3, 1)
        self.conv1 = nn.Conv2d(32, 32, 3, 1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.relu2 = nn.ReLU()
        self.max_pooling = nn.MaxPool2d(2)
        # you may need to change the numbers when given an input of
        # different dimensions
        self.fc1 = nn.Linear(9216, 128)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(128, out_channels)

        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        x = self.first_conv(x)
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        # The kernel size was already given to nn.MaxPool2d in __init__;
        # the module's forward only takes the input tensor.
        x = self.max_pooling(x)
        x = self.dropout1(x)
        # flatten the output to a vector for classification
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = self.relu3(x)
        x = self.dropout2(x)
        x = self.fc2(x)

        return x


#### Problem: the code becomes verbose and hard to read as the number of layers grows.

### b. use nn.Sequential() to define blocks
nn.Sequential(): A sequential container. Modules will be added to it **in the order** they are passed in the constructor.

In [17]:
class Network2(nn.Module):
    """CNN classifier whose layers are grouped into ``nn.Sequential`` blocks.

    A stem convolution is followed by a convolutional feature extractor
    and a fully connected classifier; each group is a single container.

    Args:
        in_channels: number of channels of the input images.
        out_channels: number of output scores (classes).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.first_conv = nn.Conv2d(in_channels, 32, kernel_size=3, stride=1)

        # Layers run in the order they appear in the list.
        conv_stack = [
            nn.Conv2d(32, 32, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(p=0.25),
        ]
        self.feature_extractor = nn.Sequential(*conv_stack)

        # 9216 is the flattened feature size the first linear layer accepts.
        head = [
            nn.Linear(in_features=9216, out_features=128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=128, out_features=out_channels),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        features = self.feature_extractor(self.first_conv(x))
        # Collapse everything but the batch dimension before the classifier.
        flat = torch.flatten(features, start_dim=1)
        return self.classifier(flat)

**Problem**: the `__init__` function is still verbose when defining the containers.  

What if we want to define **multiple blocks with the same layers** (but different shape, e.g., channels)?

In [18]:
## say we need to define a few blocks of (Conv + ReLU + Conv + ReLU + MaxPooling)

class Network2(nn.Module):
    """CNN classifier with three hand-written (Conv-ReLU-Conv-ReLU-Pool) blocks.

    A stem convolution maps the input to 32 channels; three feature
    extractor blocks then widen the channels 32 -> 64 -> 128 -> 256, each
    ending with 2x2 max-pooling and dropout; a fully connected classifier
    produces the scores.

    Args:
        in_channels: number of channels of the input images.
        out_channels: number of output scores (classes).

    Note:
        The classifier expects 9216 = 256 * 6 * 6 flattened features,
        which is what e.g. a 78x78 input produces with these un-padded
        3x3 convolutions. Change that number for other input sizes.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.first_conv = nn.Conv2d(in_channels, 32, 3, 1)

        self.feature_extractor1 = nn.Sequential(
            nn.Conv2d(32, 32, 3, 1),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),
        )

        self.feature_extractor2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, 1),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),
        )

        self.feature_extractor3 = nn.Sequential(
            nn.Conv2d(128, 128, 3, 1),
            nn.ReLU(),
            nn.Conv2d(128, 256, 3, 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),
        )

        #...

        self.classifier = nn.Sequential(
            nn.Linear(9216, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, out_channels)
        )

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        # The stem must run first: feature_extractor1 expects 32 input
        # channels, not the raw ``in_channels`` of the images.
        x = self.first_conv(x)
        x = self.feature_extractor1(x)
        x = self.feature_extractor2(x)
        x = self.feature_extractor3(x)
        # flatten the output to a vector for classification
        x = torch.flatten(x, 1)

        x = self.classifier(x)

        return x

The code is too long!

### c. Use functions and modules to make the code even more concise and readable


In [19]:
# define modules
class feature_extractor(nn.Module):
    """Reusable (Conv-ReLU-Conv-ReLU-MaxPool-Dropout) block.

    The first 3x3 convolution keeps ``in_channels`` channels, the second
    one maps them to ``out_channels``; a 2x2 max-pooling and a dropout
    with probability 0.25 follow.

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: number of channels of the produced feature map.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, 1),
            nn.ReLU(),
            nn.Conv2d(in_channels, out_channels, 3, 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),
        )

    def forward(self, x):
        """Apply the block to the feature map ``x`` and return the result."""
        # ``self`` is required: nn.Module.__call__ invokes forward as a
        # bound method, so a signature of ``forward(x)`` raises TypeError.
        return self.feature_extractor(x)
    
    
class Network3(nn.Module):
    """CNN classifier assembled from reusable ``feature_extractor`` modules.

    A stem convolution maps the input to 32 channels, three
    ``feature_extractor`` blocks widen them 32 -> 64 -> 128 -> 256, and a
    fully connected classifier (expecting 9216 flattened features)
    produces the scores.

    Args:
        in_channels: number of channels of the input images.
        out_channels: number of output scores (classes).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.first_conv = nn.Conv2d(in_channels, 32, kernel_size=3, stride=1)

        # One attribute per block; only the channel counts differ.
        self.feature_extractor1 = feature_extractor(32, 64)
        self.feature_extractor2 = feature_extractor(64, 128)
        self.feature_extractor3 = feature_extractor(128, 256)
        #...

        head_layers = [
            nn.Linear(in_features=9216, out_features=128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=128, out_features=out_channels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        features = self.first_conv(x)
        blocks = (
            self.feature_extractor1,
            self.feature_extractor2,
            self.feature_extractor3,
        )
        for block in blocks:
            features = block(features)

        # Collapse everything but the batch dimension before the classifier.
        flat = torch.flatten(features, start_dim=1)
        return self.classifier(flat)
    

In [20]:
## or use modules + container

class Network4(nn.Module):
    """CNN classifier combining custom modules with an ``nn.Sequential``.

    The three ``feature_extractor`` blocks are chained inside a single
    container, so the forward pass needs one call for all of them.

    Args:
        in_channels: number of channels of the input images.
        out_channels: number of output scores (classes).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.first_conv = nn.Conv2d(in_channels, 32, kernel_size=3, stride=1)

        # Channel widths of consecutive blocks: 32 -> 64 -> 128 -> 256.
        blocks = [
            feature_extractor(32, 64),
            feature_extractor(64, 128),
            feature_extractor(128, 256),
            #...
        ]
        self.feature_extractors = nn.Sequential(*blocks)

        head_layers = [
            nn.Linear(in_features=9216, out_features=128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=128, out_features=out_channels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        features = self.feature_extractors(self.first_conv(x))
        # Collapse everything but the batch dimension before the classifier.
        flat = torch.flatten(features, start_dim=1)
        return self.classifier(flat)

## 2. Forward pass
    In forward pass, we define the exact order of how we input the data into the network.  
    The order of operation does not need to follow the layer definition order in __init__()

In [21]:
class Network4(nn.Module):
    """Same network as before, with the layers declared in a shuffled order.

    The attributes are intentionally created as classifier, feature
    extractors, stem convolution: the data flow is decided by
    ``forward``, not by the order of the assignments in ``__init__``.

    Args:
        in_channels: number of channels of the input images.
        out_channels: number of output scores (classes).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()

        ## the order of definition doesn't matter here
        head_layers = [
            nn.Linear(in_features=9216, out_features=128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=128, out_features=out_channels),
        ]
        self.classifier = nn.Sequential(*head_layers)

        ## the order in nn.Sequential matters
        blocks = [
            feature_extractor(32, 64),
            feature_extractor(64, 128),
            feature_extractor(128, 256),
            #...
        ]
        self.feature_extractors = nn.Sequential(*blocks)

        self.first_conv = nn.Conv2d(in_channels, 32, kernel_size=3, stride=1)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        ## the order does matter here
        stem_out = self.first_conv(x)
        features = self.feature_extractors(stem_out)
        # Collapse everything but the batch dimension before the classifier.
        flat = torch.flatten(features, start_dim=1)
        scores = self.classifier(flat)
        return scores

**Sometimes we can use torch.nn.functional to apply stateless operations without declaring them as layers in `__init__`**

In [22]:
# These two are equivalent:

## method 1
# relu_layer = nn.ReLU()
# out = relu_layer(x)

# # method 2
# out = F.relu(x)

**Differences in forward pass between training mode and evaluation mode**

You might have noticed that in the previous example we call model.train() before each training epoch and model.eval() before each evaluation.  

In [23]:
### codes from Part 4

def train(model, device, train_loader, criterion, optimizer, epoch):
    """Train ``model`` for one epoch and print the running mean loss.

    Args:
        model: the network to optimise; switched to training mode.
        device: device the batches are moved to before the forward pass.
        train_loader: sized iterable yielding ``(data, target)`` batches.
        criterion: callable computing the loss from ``(output, target)``.
        optimizer: optimizer holding the parameters of ``model``.
        epoch: epoch index, only used in the printed message.
    """
    # Report at the first batch and about halfway through the epoch.
    # max(1, ...) keeps the interval non-zero: with a single-batch loader
    # ``len(train_loader) // 2`` is 0 and the modulo below would raise
    # ZeroDivisionError.
    log_interval = max(1, len(train_loader) // 2)

    train_loss = 0
    model.train()  # training mode: e.g. dropout is active
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if batch_idx % log_interval == 0:
            print('Train({})[{:.0f}%]: Loss: {:.4f}'.format(
                epoch, 100. * batch_idx / len(train_loader), train_loss/(batch_idx+1)))

def test(model, device, test_loader, criterion, epoch):
    model.eval() # what is the eval mode?
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss = (test_loss*batch_size)/len(test_loader.dataset)
    print('Test({}): Loss: {:.4f}, Accuracy: {:.4f}%'.format(
        epoch, test_loss, 100. * correct / len(test_loader.dataset)))

## Why?

**Not every layer is needed in evaluation!**  


**What layers are excluded from testing process?**
- Dropout: during training, dropout prevents overfitting by randomly dropping (zeroing) some nodes at each iteration. During testing, dropout is disabled and all the nodes are used.
![dropout Operation](https://miro.medium.com/v2/resize:fit:640/format:webp/1*dEi_IkVB7IpkzZ-6H0Vpsg.png)

**What layers are operated differently in the testing process?**
- Batch Normalization: In training, we use **per-batch** statistics. In testing, we use the **global** running statistics

**Conclusion: don't forget to switch mode when training/testing your CNN!**

# Back-propagation
In PyTorch, we don't need to define a backward pass function: autograd derives it from the forward pass.
Simply call .backward() on the loss tensor.

### when do we need backpropagation?
- Back-propagation = get gradients
- We need gradients when we need to update the parameters of something

**(Most of the cases) We won't need gradients and backpropagation in testing!**

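We can avoid the gradient calculation in testing by

    1. wrapping the test loop in `with torch.no_grad():`, so that no computation graph is recorded
    2. not calling the backward function (note: without torch.no_grad() the graph is still built during the forward pass and costs memory)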

In [24]:
def test(model, device, test_loader, criterion, epoch):
    model.eval() # what is the eval mode?
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss = (test_loss*batch_size)/len(test_loader.dataset)
    print('Test({}): Loss: {:.4f}, Accuracy: {:.4f}%'.format(
        epoch, test_loss, 100. * correct / len(test_loader.dataset)))