# MNIST Network for Classification and Addition(Modified)

In [2]:
#import the necessary modules

import torch
import torchvision # provide access to datasets, models, transforms, utils, etc
import torchvision.transforms as transforms
from torch.utils.data import Dataset, TensorDataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
from random import randint

In [3]:
torch.set_printoptions(linewidth=120)

In [4]:
# Set up the device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# If we are on CUDA , the following will print CUDA
print("The device available is: ", device)

The device available is:  cuda:0


In [5]:
#Let's check if GPU is running or not:
!nvidia-smi

Sat Oct 23 03:59:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    31W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#Combined MNIST Dataset and random numbers between 0 and 9

In [6]:
class CombinedDataset():

  # We pass the train variable to get train or test data, and batch_size
  def __init__(self, train, batch_size):

      self.batch_size = batch_size
      # Load the MNIST data into the data_loader object
      self.data_loader = torch.utils.data.DataLoader(
          torchvision.datasets.MNIST('/files/', train=train, download=True,
                                transform=torchvision.transforms.Compose([
                                  torchvision.transforms.ToTensor(),
                                  torchvision.transforms.Normalize(
                                    (0.1307,), (0.3081,))
                                ])),
          batch_size=self.batch_size, shuffle=True)

      # Number of samples in the dataaset
      self.dataset = self.data_loader.dataset            

  # getitem function creats batches of our dataset on the fly by calling next(iter())
  def __getitem__(self, index):
      # Extract one batch of the MNIST data_loader
      image, label = next(iter(self.data_loader))

      # Generate randoms numbers between 0 and 9 of size=batch_size. The datatype is float as this is the input required for the network
      random_numbers = torch.tensor([randint(0,9) for _ in range(self.batch_size)], dtype=torch.float32)

      # Combine inputs and outputs as a list after transfering the tensors to the GPU
      x = [image.to(device), random_numbers.to(device)]
      # y labels for addition of number is reshaped to [32,1] as MSE requires it in this format
      y = [label.to(device), (label+random_numbers).reshape([32,1]).to(device)]

      return x, y

  def __len__(self):
      return len(self.data_loader)

# Set the batch size
batch_size = 32

# Create the train and test dataset
train_data = CombinedDataset(train=True, batch_size=batch_size)
test_data = CombinedDataset(train=False, batch_size=batch_size)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /files/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting /files/MNIST/raw/train-images-idx3-ubyte.gz to /files/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /files/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting /files/MNIST/raw/train-labels-idx1-ubyte.gz to /files/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /files/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting /files/MNIST/raw/t10k-images-idx3-ubyte.gz to /files/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /files/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting /files/MNIST/raw/t10k-labels-idx1-ubyte.gz to /files/MNIST/raw



  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [7]:
print(f"Number of train batches: {len(train_data)}")
print(f"Number of test batches: {len(test_data)}")

print(f"Number of train samples: {len(train_data.dataset)}")
print(f"Number of test samples: {len(test_data.dataset)}")

x, y = next(iter(train_data))

print(f"Shape of input data is: [{x[0].shape}, {x[1].shape}]")
print(f"Shape of output data is: [{y[0].shape}, {y[1].shape}]")


Number of train batches: 1875
Number of test batches: 313
Number of train samples: 60000
Number of test samples: 10000
Shape of input data is: [torch.Size([32, 1, 28, 28]), torch.Size([32])]
Shape of output data is: [torch.Size([32]), torch.Size([32, 1])]


#Creating the NN

In [8]:
class Network(nn.Module):
    def __init__(self):


        super().__init__()

        # Classifier Network
        self.input1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3) 
        self.conv1 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3) 
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3) 
        self.pool = nn.MaxPool2d(2, 2) 

        # 1x1 convolution
        self.oneconv1 = nn.Conv2d(in_channels=64, out_channels=16, kernel_size=1) 
        
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3) 
        self.conv4 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3) 
        self.conv5 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3) 
        self.conv6 = nn.Conv2d(in_channels=64, out_channels=10, kernel_size=5) 

        # Addition network using fully connected layers
        self.input2 = nn.Linear(in_features=2, out_features=5)
        self.layer1 = nn.Linear(in_features=5, out_features=5)
        self.out2 = nn.Linear(in_features=5, out_features=1)

    def forward(self, data1, data2):
        # first block of CNN classification layer
        x = F.relu(self.input1(data1))
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))
        x = F.relu(self.oneconv1(x))
        
        # Second Bloc
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))

        # Third Block
        x = self.conv6(x)

        # flattening all dimensions except batch
        output1 = torch.flatten(x, start_dim=1)       

        # Addition layer
        # Collect the output of the classifier network and select the index with maximum value
        x = torch.argmax(output1, dim=1)
        # Use torch.stack to create pairs of network outputs and random numbers
        x  = torch.stack((x.float(), data2), dim=1)
        
        # Pass the data through the addition network. No activation function required as addition of two numbers is a linear function
        x = self.input2(x)
        x = self.layer1(x)
        output2 = self.out2(x)

        # Return outputs from both the classifier and addition network
        return output1, output2

In [10]:
# Creating the model from the created NN
model = Network().to(device)

print(model)

print("Shape of parameters ")
for name, param in model.named_parameters():
    print(str(name) + "--->", str(param.shape))

# loss function and optimizer  
CE_loss = nn.CrossEntropyLoss()
MSE_loss = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

Network(
  (input1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1))
  (conv1): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (oneconv1): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv6): Conv2d(64, 10, kernel_size=(5, 5), stride=(1, 1))
  (input2): Linear(in_features=2, out_features=5, bias=True)
  (layer1): Linear(in_features=5, out_features=5, bias=True)
  (out2): Linear(in_features=5, out_features=1, bias=True)
)
Shape of parameters 
input1.weight---> torch.Size([16, 1, 3, 3])
input1.bias---> torch.Size([16])
conv1.weight---> torch.Size([32, 16, 3, 3])
conv1.bias---> torch.Size([32])
conv2.weight---> torch.Size([64, 32, 3, 3])
conv2.bias--->

# Model Training

In [11]:
for epoch in range(10): 

    total_loss = 0.0
    total_correct_1, total_correct_2 = 0, 0
    # Loop over the entire length of train data
    for i in range(len(train_data)):
        
        x, y = next(iter(train_data))

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + Backward + Optimize
        output1, output2 = model(x[0], x[1])
        # Use the CE loss for classification and MSE loss for addition 
        loss = CE_loss(output1, y[0]) + MSE_loss(output2, y[1])
        loss.backward()
        optimizer.step()

        # Calculate statistics
        total_loss += loss.item()
        total_correct_1 += output1.argmax(dim=1).eq(y[0]).sum().item()
        total_correct_2 += (torch.round(output2) == torch.round(y[1])).sum().item()
        
       
    print(f"Epoch: {epoch+1}, loss: {total_loss}, Classification Acc: {100 * (total_correct_1/(len(train_data.dataset)))}, Addition Acc: {100 * (total_correct_2/(len(train_data.dataset)))}")
    


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: 1, loss: 10439.560721114278, Classification Acc: 63.739999999999995, Addition Acc: 45.615
Epoch: 2, loss: 1374.7385136596859, Classification Acc: 96.73, Addition Acc: 91.39166666666667
Epoch: 3, loss: 875.6999483203981, Classification Acc: 97.92999999999999, Addition Acc: 95.34833333333333
Epoch: 4, loss: 690.4721625688253, Classification Acc: 98.37333333333333, Addition Acc: 96.97333333333333
Epoch: 5, loss: 545.3599636015715, Classification Acc: 98.70833333333333, Addition Acc: 97.23
Epoch: 6, loss: 432.04236380799557, Classification Acc: 99.00999999999999, Addition Acc: 98.535
Epoch: 7, loss: 437.615093589935, Classification Acc: 99.09666666666666, Addition Acc: 98.35666666666667
Epoch: 8, loss: 399.4035941637121, Classification Acc: 99.16333333333334, Addition Acc: 98.78333333333333
Epoch: 9, loss: 316.5287288064428, Classification Acc: 99.295, Addition Acc: 99.17166666666667
Epoch: 10, loss: 261.2785445338741, Classification Acc: 99.39833333333333, Addition Acc: 99.22166666

# Model Evaluation

In [12]:
correct_1, correct_2 = 0, 0
total_1, total_2 = 0, 0

# Since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    # Loop over the entire length of test data
    for i in range(len(test_data)):
        # Get the inputs and outputs
        # Input data x is a list of [images, random numbers], output data y is a list of [classes, sum of numbers]
        x, y = next(iter(test_data))

        # Calculate outputs by running data through the network 
        output1, output2 = model(x[0], x[1])

        # The class with the highest energy is what we choose as prediction
        _, predicted = torch.max(output1.data, 1)
        total_1 += y[0].size(0)
        # Calculate number of correction predictions for classifier
        correct_1 += (predicted == y[0]).sum().item()

        total_2 += y[1].to(device).size(0)
        # Calculate number of correction predictions for addition
        correct_2 += (torch.round(output2) == torch.round(y[1])).sum().item()

print('Accuracy of the network on the 10,000 test images: ', (100 * correct_1 / total_1))
print('Accuracy of the network on the 10,000 test images: ', (100 * correct_2 / total_2))

Accuracy of the network on the 10,000 test images:  98.77196485623003
Accuracy of the network on the 10,000 test images:  98.77196485623003


#Sample Prediction

In [13]:
x, y = next(iter(test_data))

In [16]:
#print(x[0],x[1])

In [17]:
output1, output2 = model(x[0], x[1])

In [18]:
#Class with the highest value ll be our prediction
_, predicted = torch.max(output1.data, 1)

In [20]:
print("[Predicted , Actual] ")
print(torch.stack((predicted, y[0]), dim=1)[:10])

[Predicted , Actual] 
tensor([[3, 3],
        [1, 1],
        [2, 2],
        [0, 0],
        [9, 9],
        [5, 5],
        [1, 1],
        [7, 7],
        [7, 7],
        [4, 4]], device='cuda:0')


In [22]:
print("[Predicted,Predicted Round-off,Actual of addition")

print(torch.stack((output2, torch.round(output2), y[1]), dim=1)[:3])

[Predicted,Predicted Round-off,Actual of addition
tensor([[[4.9948],
         [5.0000],
         [5.0000]],

        [[4.0101],
         [4.0000],
         [4.0000]],

        [[5.0028],
         [5.0000],
         [5.0000]]], device='cuda:0', grad_fn=<SliceBackward>)
