# Project ASPLOC - 1DT109 11201 (2024HT)
#### By Supun Madusanka

This script will use the LeNet NN architecture and quantize it to be used on hardware accelerator

Ref:
- [PyTorch tutorial on NN](https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html)
- [PyTorch tutorial on NN pruning](https://pytorch.org/tutorials/intermediate/pruning_tutorial.html)
- [CNN](https://towardsdatascience.com/a-comprehensive-guide-to-convolutional-neural-networks-the-eli5-way-3bd2b1164a53)
- [NN quantization](https://towardsdatascience.com/introduction-to-weight-quantization-2494701b9c0c)

Requried libraries

In [1]:
%matplotlib inline

import torch
from torch import nn
import torch.nn.utils.prune as prune
import torch.nn.functional as F
import numpy as np
import torchvision
import torchvision.transforms as transforms

# Define relevant variables for the ML task
batch_size = 64
num_classes = 10
learning_rate = 0.001
num_epochs = 10

DOWNLOAD_DATA = False

Load data

In [35]:
#Loading the dataset and preprocessing
train_dataset = torchvision.datasets.MNIST(root = './data',
                                            train = True,
                                            transform = transforms.Compose([
                                                    transforms.Resize((32,32)),
                                                    transforms.ToTensor()
                                                #     ,
                                                #     transforms.Normalize(mean = (0.1307,), std = (0.3081,))
                                                    ]),
                                            download = DOWNLOAD_DATA)


test_dataset = torchvision.datasets.MNIST(root = './data',
                                            train = False,
                                            transform = transforms.Compose([
                                                    transforms.Resize((32,32)),
                                                    transforms.ToTensor()
                                                #     ,
                                                #     transforms.Normalize(mean = (0.1325,), std = (0.3105,))
                                                    ]),
                                            download=DOWNLOAD_DATA)


train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True)


test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = True)

Create a model with PyTorch
==============

In this tutorial, we use the
[LeNet](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf) architecture
from LeCun et al., 1998.

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square conv kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5x5 image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, int(x.nelement() / x.shape[0]))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [4]:
model = LeNet().to(device=device)

#Setting the loss function
cost = nn.CrossEntropyLoss()

#Setting the optimizer with the model parameters and learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#this is defined to print how many steps are remaining when training
total_step = len(train_loader)

Train the model

In [36]:
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):  
        images = images.to(device)
        labels = labels.to(device)
        
        #Forward pass
        outputs = model(images)
        loss = cost(outputs, labels)
        #Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i+1) % 400 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
.format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/10], Step [400/938], Loss: 0.0026
Epoch [1/10], Step [800/938], Loss: 0.0031
Epoch [2/10], Step [400/938], Loss: 0.0042
Epoch [2/10], Step [800/938], Loss: 0.0404
Epoch [3/10], Step [400/938], Loss: 0.0002
Epoch [3/10], Step [800/938], Loss: 0.0875
Epoch [4/10], Step [400/938], Loss: 0.0004
Epoch [4/10], Step [800/938], Loss: 0.0002
Epoch [5/10], Step [400/938], Loss: 0.0009
Epoch [5/10], Step [800/938], Loss: 0.0058
Epoch [6/10], Step [400/938], Loss: 0.0043
Epoch [6/10], Step [800/938], Loss: 0.0001
Epoch [7/10], Step [400/938], Loss: 0.0051
Epoch [7/10], Step [800/938], Loss: 0.0006
Epoch [8/10], Step [400/938], Loss: 0.0001
Epoch [8/10], Step [800/938], Loss: 0.0203
Epoch [9/10], Step [400/938], Loss: 0.0006
Epoch [9/10], Step [800/938], Loss: 0.0004
Epoch [10/10], Step [400/938], Loss: 0.0158
Epoch [10/10], Step [800/938], Loss: 0.0004


Test the model

In [37]:
# In test phase, we don't need to compute gradients (for memory efficiency)
    
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

Accuracy of the network on the 10000 test images: 99.01 %


Inspect the model

In [7]:
print(list(model.named_parameters()))

[('conv1.weight', Parameter containing:
tensor([[[[ 7.2516e-02,  5.2665e-02, -3.8219e-03,  2.0556e-01, -1.3547e-01],
          [-6.7008e-02,  2.4579e-01,  4.5050e-01,  3.0839e-01,  1.2236e-01],
          [-9.0664e-02, -1.4577e-01,  1.7678e-01,  3.5698e-01,  1.4170e-01],
          [-1.7582e-01, -2.7391e-01, -2.9852e-01,  8.4123e-02,  7.4193e-02],
          [-3.0596e-01, -2.9633e-01, -1.4885e-01, -1.9747e-01, -1.9624e-04]]],


        [[[ 3.0369e-01,  1.6104e-01,  2.2792e-01,  2.3089e-02,  1.4041e-01],
          [ 7.5471e-02,  3.0492e-01,  1.9584e-01,  1.2743e-01,  4.0496e-02],
          [ 2.0146e-01,  1.0262e-01, -5.0157e-02,  3.4336e-02, -1.6055e-01],
          [-2.5590e-01, -1.7346e-01, -3.7317e-01, -1.1355e-01, -1.4698e-01],
          [-4.6209e-01, -3.9947e-01, -1.9265e-01, -9.4197e-02,  1.6645e-01]]],


        [[[-3.1350e-02, -2.5138e-01,  3.7261e-02,  5.5104e-02,  2.9144e-01],
          [-3.0489e-01, -4.2895e-02, -1.2456e-01,  7.5359e-02,  2.8774e-01],
          [-1.5446e-01, -2.4

Save the model

In [8]:
torch.save(model, 'lenet_model.pth')

Skip the training if already done and restart from here by loading existing model

In [None]:
model = torch.load('lenet_model.pth', weights_only=False)

Unwrap model functions for quantization
=======================================

Here we will unwrap most of the functions for quantization and make it hardware friendly. The trained weights and biases from previous model is used inside the functions and the training phase is not needed.

In [23]:
def absmax_quantize(X:np.matrix) -> np.matrix:
    # Calculate scale
    scale = 127 / np.max(np.absolute(X))

    # Quantize
    X_quant = np.round(scale * X)

    return X_quant.astype(np.int16), scale

In [42]:
def fs256_quantize(X:np.matrix) -> np.matrix:
    # Fix scale
    scale = 256

    # Quantize
    X_quant = np.round(scale * X)

    return X_quant.astype(np.int16)

In [47]:
def fs256_dequantize(X:np.matrix) -> np.matrix:
    # Fix scale
    scale = 256

    # Quantize
    X_quant = np.round(X/scale)

    return X_quant.astype(np.int16)

In [41]:
torch.detach(model.fc3.weight).numpy().max()

0.38244924

In [43]:
fs256_quantize(torch.detach(model.conv1.weight).numpy())

array([[[[  14,   12,    2,   54,  -47],
         [ -21,   71,  142,  104,   33],
         [ -39,  -42,   60,  113,   43],
         [ -76,  -99,  -89,   26,   26],
         [ -97, -100,  -62,  -56,   11]]],


       [[[  87,   54,   77,   20,   37],
         [  37,   95,   67,   41,   -1],
         [  69,   35,   -6,    5,  -59],
         [ -69,  -65, -111,  -36,  -46],
         [-147, -141,  -63,  -18,   50]]],


       [[[ -13,  -87,   -5,   16,   81],
         [ -94,  -42,  -35,   35,   88],
         [ -51,  -72,   75,   70,   24],
         [ -77,    0,   52,  107,  -24],
         [  -3,  -38,   47,   38,  -50]]],


       [[[  59,  -15,   -8,   54,   35],
         [ -33,  -39,   10,   37,    3],
         [ -23,  -19,   19,   55,  -27],
         [  34,   46,   67,   17,   -6],
         [  36,   42,   10,   11,  -72]]],


       [[[ -19,   -7,  -79, -100,    9],
         [ -18,  -76,  -30,   58,   44],
         [  -5,   44,  102,   86,   88],
         [  -1,   65,   37,  -29,  -66],


In [44]:
absmax_quantize(torch.detach(model.conv1.weight).numpy())

(array([[[[  12,   10,    2,   47,  -40],
          [ -18,   62,  123,   90,   28],
          [ -34,  -36,   51,   98,   37],
          [ -66,  -86,  -77,   23,   23],
          [ -83,  -86,  -53,  -49,    9]]],
 
 
        [[[  75,   47,   67,   17,   32],
          [  32,   82,   58,   35,   -1],
          [  60,   30,   -5,    4,  -51],
          [ -59,  -56,  -96,  -31,  -40],
          [-127, -121,  -54,  -15,   43]]],
 
 
        [[[ -11,  -75,   -4,   14,   70],
          [ -81,  -36,  -30,   31,   76],
          [ -44,  -62,   65,   60,   21],
          [ -67,    0,   45,   92,  -21],
          [  -2,  -33,   41,   33,  -43]]],
 
 
        [[[  51,  -13,   -7,   47,   30],
          [ -29,  -34,    9,   32,    3],
          [ -20,  -16,   16,   47,  -23],
          [  30,   39,   58,   15,   -5],
          [  31,   36,    8,   10,  -62]]],
 
 
        [[[ -16,   -6,  -68,  -86,    8],
          [ -16,  -66,  -26,   50,   38],
          [  -5,   38,   88,   74,   76],
          

In [9]:

class LeNet_quantize():
    def __init__(self, conv_layer_1w:np.ndarray, conv_layer_1b:np.ndarray, 
                 conv_layer_2w:np.ndarray, conv_layer_2b:np.ndarray,
                 dense_1_w:np.ndarray, dense_1_b:np.ndarray, 
                 dense_2_w:np.ndarray, dense_2_b:np.ndarray,
                 dense_3_w:np.ndarray, dense_3_b:np.ndarray):
        # 1 input image channel, 6 output channels, 5x5 square conv kernel
        self.conv1 = self.Conv2d(1, 6, 5, conv_layer_1w, conv_layer_1b)
        self.conv2 = self.Conv2d(6, 16, 5, conv_layer_2w, conv_layer_2b)
        self.fc1 = self.myDense2d(dense_1_w, dense_1_b)
        self.fc2 = self.myDense2d(dense_2_w, dense_2_b)
        self.fc3 = self.myDense2d(dense_3_w, dense_3_b)
    
    def Conv2d(self, inChan:int, outChan:int, kernalDim:int, weight: np.ndarray, bias: np.ndarray):
        myIn = inChan
        myOt = outChan
        myKr = kernalDim
        myWeight = weight
        myBias = bias

        def CalConv2d(img:np.ndarray) -> np.ndarray:
            inDim, imdim_r, imdim_c = img.shape
            outImg = np.zeros((myOt, imdim_r-myKr, imdim_c-myKr))
            for oc in range(myOt):
                bias_s = myBias[oc]
                for ic in range(myIn):
                    kernal = myWeight[oc][ic]
                    for kr_i in range(int(imdim_r-myKr)):
                        for kr_j in range(int(imdim_c-myKr)):
                            outImg[oc][kr_i][kr_j] += (kernal * img[ic][kr_i:kr_i+myKr,kr_j:kr_j+myKr]).sum() + bias_s
            return outImg
        return CalConv2d
    
    def myRelu(self, img:np.ndarray) -> np.ndarray:
        return img.clip(0)
    
    def maxPool2d(self, img:np.ndarray) -> np.ndarray:
        inDim, imdim_r, imdim_c = img.shape
        outImg = np.zeros_like(img, shape=(inDim, int((imdim_r+1)/2), int((imdim_c+1)/2)))
        for chn in range(inDim):
            for i in range(int((imdim_r+1)/2)):
                for j in range(int((imdim_c+1)/2)):
                    if(i*2 > imdim_r):
                        if(j*2 > imdim_c):
                            outImg[chn][i,j] = img[chn][i*2,j*2]
                        else:
                            outImg[chn][i,j] = (img[chn][i*2,j*2:j*2+2]).max()
                    else:
                        if(j*2 > imdim_c):
                            outImg[chn][i,j] = (img[chn][i*2:i*2+2,j*2]).max()
                        else:
                            outImg[chn][i,j] = (img[chn][i*2:i*2+2,j*2:j*2+2]).max()
        return outImg

    def myReshape(self, img:np.ndarray) -> np.ndarray:
        return np.reshape(img, (img.size, -1))
    
    def myDense2d(self, weight:np.ndarray, bias:np.ndarray) -> np.ndarray:
        myweight = weight
        mybias  = bias

        def dense(img:np.ndarray):
            return np.dot(myweight, img) +  mybias

        return dense


Load the weights and biases from the model and seperate them

In [11]:
cnv1w = torch.detach(model.conv1.weight).numpy()
cnv1b = torch.detach(model.conv1.bias).numpy()
cnv2w = torch.detach(model.conv2.weight).numpy()
cnv2b = torch.detach(model.conv2.bias).numpy()

den1w = torch.detach(model.fc1.weight).numpy()
den1b = np.reshape(torch.detach(model.fc1.bias).numpy(), (model.fc1.bias.size()[0], 1)) 
den2w = torch.detach(model.fc2.weight).numpy()
den2b = np.reshape(torch.detach(model.fc2.bias).numpy(), (model.fc2.bias.size()[0], 1)) 
den3w = torch.detach(model.fc3.weight).numpy()
den3b = np.reshape(torch.detach(model.fc3.bias).numpy(), (model.fc3.bias.size()[0], 1)) 

Fixed scale (256) quantized weights and biases

In [45]:
cnv1wq = fs256_quantize(torch.detach(model.conv1.weight).numpy())
cnv1bq = fs256_quantize(torch.detach(model.conv1.bias).numpy())
cnv2wq = fs256_quantize(torch.detach(model.conv2.weight).numpy())
cnv2bq = fs256_quantize(torch.detach(model.conv2.bias).numpy())

den1wq = fs256_quantize(torch.detach(model.fc1.weight).numpy())
den1bq = fs256_quantize(np.reshape(torch.detach(model.fc1.bias).numpy(), (model.fc1.bias.size()[0], 1)) )
den2wq = fs256_quantize(torch.detach(model.fc2.weight).numpy())
den2bq = fs256_quantize(np.reshape(torch.detach(model.fc2.bias).numpy(), (model.fc2.bias.size()[0], 1)) )
den3wq = fs256_quantize(torch.detach(model.fc3.weight).numpy())
den3bq = fs256_quantize(np.reshape(torch.detach(model.fc3.bias).numpy(), (model.fc3.bias.size()[0], 1)) )

Unquantized model

In [12]:
mm = LeNet_quantize(cnv1w, cnv1b, cnv2w, cnv2b, den1w, den1b, den2w, den2b, den3w, den3b)

Quantized model

In [46]:
mmq = LeNet_quantize(cnv1wq, cnv1bq, cnv2wq, cnv2bq, den1wq, den1bq, den2wq, den2bq, den3wq, den3bq)

The forward function LeNet class is copied here. 
- This is used to cross-validate the individual layer outputs with the unwraped LeNet_quantize class. Keep the required layer to cross-check and comment the other layers and check the output as needed.

In [16]:
def trch_forward(x):
    x = F.max_pool2d(F.relu(F.conv2d(x, weight=model.conv1.weight, bias=model.conv1.bias)), (2, 2))
    x = F.max_pool2d(F.relu(F.conv2d(x, weight=model.conv2.weight, bias=model.conv2.bias)), 2)
    x = x.view(-1, int(x.nelement() / x.shape[0]))
    x = F.relu(F.linear(x, weight=model.fc1.weight, bias=model.fc1.bias))
    x = F.relu(F.linear(x, weight=model.fc2.weight, bias=model.fc2.bias))
    x = F.linear(x, weight=model.fc3.weight, bias=model.fc3.bias)
    return x

Correspondencing unwraped function

In [17]:
def my_forward(x):
    x = mm.maxPool2d(mm.myRelu(mm.conv1(x)))
    x = mm.maxPool2d(mm.myRelu(mm.conv2(x)))
    x = mm.myReshape(x)
    x = mm.myRelu(mm.fc1(x))
    x = mm.myRelu(mm.fc2(x))
    x = mm.fc3(x)
    return x

Correspondencing unwraped quantized function

In [50]:
def my_q_forward(x):
    x = mmq.maxPool2d(mmq.myRelu(mmq.conv1(x)))
    x = fs256_dequantize(x)
    x = mmq.maxPool2d(mmq.myRelu(mmq.conv2(x)))
    x = fs256_dequantize(x)
    x = mmq.myReshape(x)
    x = mmq.myRelu(mmq.fc1(x))
    x = fs256_dequantize(x)
    x = mmq.myRelu(mmq.fc2(x))
    x = fs256_dequantize(x)
    x = mmq.fc3(x)
    x = fs256_dequantize(x)
    return x

Validate original model

In [None]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = trch_forward(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

Accuracy of the network on the 10000 test images: 99.1 %


Validate unwraped model

> [Note] my_forward() function is incredibly slow compared to trch_forward()

- Obs. my_forward does not support batch input hence the for loop.
- Obs. a bit of accuracy drop due to no padding and other missing small fine tunes that present in Torch

In [38]:
# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
    
with torch.no_grad():
      correct = 0
      total = 0
      for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = torch.zeros(images.shape[0])
            for i in range(images.shape[0]):
                  outputs[i] = np.argmax(my_forward(torch.Tensor.numpy(images[i])))
            total += labels.size(0)
            correct += (outputs == labels).sum().item()
      print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

Accuracy of the network on the 10000 test images: 98.42 %


Validate unwraped (FS) quantized model

In [51]:
# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
    
with torch.no_grad():
      correct = 0
      total = 0
      for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = torch.zeros(images.shape[0])
            for i in range(images.shape[0]):
                  outputs[i] = np.argmax(my_q_forward(torch.Tensor.numpy(images[i])))
            total += labels.size(0)
            correct += (outputs == labels).sum().item()
      print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

Accuracy of the network on the 10000 test images: 97.8 %
