In [11]:
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD, Adam, Adagrad, Adadelta, RMSprop
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor


from torch.nn import Module
from torch import nn


class LeNet(Module):
    def __init__(self, act=nn.ReLU(), pool=nn.MaxPool2d(2)):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.relu1 = act
        self.pool1 = pool
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.relu2 = act
        self.pool2 = pool
        self.fc1 = nn.Linear(256, 120)
        self.relu3 = act
        self.fc2 = nn.Linear(120, 84)
        self.relu4 = act
        self.fc3 = nn.Linear(84, 10)
        self.relu5 = act

    def forward(self, x):
        y = self.conv1(x)
        y = self.relu1(y)
        y = self.pool1(y)
        y = self.conv2(y)
        y = self.relu2(y)
        y = self.pool2(y)
        y = y.view(y.shape[0], -1)
        y = self.fc1(y)
        y = self.relu3(y)
        y = self.fc2(y)
        y = self.relu4(y)
        y = self.fc3(y)
        y = self.relu5(y)
        return y
import time

In [12]:
def train(batch_size= 256, epochs=100, lr=0.01, act=nn.ReLU(), pool=nn.MaxPool2d(2), optim="adam"):
    startTime = time.time()
    batch_size = batch_size
    train_dataset = mnist.MNIST(root='./data_MNIST/train', train=True, transform=ToTensor())
    test_dataset = mnist.MNIST(root='./data_MNIST/test', train=False, transform=ToTensor())
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = LeNet(act=act, pool=pool)
    
    if optim == "adam":
        optimizer = Adam(model.parameters(), lr=lr)
    elif optim == "adagrad":
        optimizer = Adagrad(model.parameters(), lr=lr)
    elif optim == "sgdWithMomentum":
        optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)
    else: 
        optimizer = SGD(model.parameters(), lr=lr)
    
    cost = CrossEntropyLoss()
    epoch = epochs
    for _epoch in range(epoch):
        model.train()
        loss = 0
        for idx, (train_x, train_label) in enumerate(train_loader):
            label_np = np.zeros((train_label.shape[0], 10))
            optimizer.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                # print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
                pass
            loss.backward()
            optimizer.step()

        correct = 0
        _sum = 0
        model.eval()
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.float()).detach()
            predict_ys = np.argmax(predict_y, axis=-1)
            label_np = test_label.numpy()
            _ = predict_ys == test_label
            correct += np.sum(_.numpy(), axis=-1)
            _sum += _.shape[0]
        if _epoch % 5 == 0:
            print('Epoch {:.2f},  loss: {:.2f},  accuracy: {:.2f}'.format(_epoch, loss, correct / _sum), )
    print("Took: " + str((time.time() - startTime).__round__(2)) +" seconds")

In [13]:
# Normal
train(batch_size= 256, epochs=100, lr=0.01, act=nn.Tanh(), pool=nn.AvgPool2d(2), optim="sgd")

Epoch 0.00,  loss: 2.28,  accuracy: 0.12
Epoch 5.00,  loss: 1.43,  accuracy: 0.78
Epoch 10.00,  loss: 1.18,  accuracy: 0.81
Epoch 15.00,  loss: 1.10,  accuracy: 0.83
Epoch 20.00,  loss: 1.05,  accuracy: 0.85
Epoch 25.00,  loss: 1.01,  accuracy: 0.88
Epoch 30.00,  loss: 0.98,  accuracy: 0.91
Epoch 35.00,  loss: 0.95,  accuracy: 0.92
Epoch 40.00,  loss: 0.93,  accuracy: 0.93
Epoch 45.00,  loss: 0.91,  accuracy: 0.94
Epoch 50.00,  loss: 0.90,  accuracy: 0.94
Epoch 55.00,  loss: 0.89,  accuracy: 0.94
Epoch 60.00,  loss: 0.88,  accuracy: 0.95
Epoch 65.00,  loss: 0.87,  accuracy: 0.95
Epoch 70.00,  loss: 0.87,  accuracy: 0.95
Epoch 75.00,  loss: 0.86,  accuracy: 0.96
Epoch 80.00,  loss: 0.86,  accuracy: 0.96
Epoch 85.00,  loss: 0.86,  accuracy: 0.96
Epoch 90.00,  loss: 0.85,  accuracy: 0.96
Epoch 95.00,  loss: 0.85,  accuracy: 0.97
Took: 783.81 seconds


 #### Comment
 Best one. Worked very well
 

In [14]:
# Q1 ReLU instead of Tanh.  MaxPool instead of AvgPool. 
train(batch_size= 256, epochs=100, lr=0.01, act=nn.ReLU(), pool=nn.MaxPool2d(2), optim="sgd")

Epoch 0.00,  loss: 2.30,  accuracy: 0.10
Epoch 5.00,  loss: 2.20,  accuracy: 0.34
Epoch 10.00,  loss: 1.05,  accuracy: 0.63
Epoch 15.00,  loss: 0.66,  accuracy: 0.73
Epoch 20.00,  loss: 0.60,  accuracy: 0.76
Epoch 25.00,  loss: 0.57,  accuracy: 0.77
Epoch 30.00,  loss: 0.55,  accuracy: 0.78
Epoch 35.00,  loss: 0.54,  accuracy: 0.78
Epoch 40.00,  loss: 0.53,  accuracy: 0.78
Epoch 45.00,  loss: 0.53,  accuracy: 0.78
Epoch 50.00,  loss: 0.53,  accuracy: 0.78
Epoch 55.00,  loss: 0.53,  accuracy: 0.79
Epoch 60.00,  loss: 0.53,  accuracy: 0.79
Epoch 65.00,  loss: 0.53,  accuracy: 0.79
Epoch 70.00,  loss: 0.53,  accuracy: 0.79
Epoch 75.00,  loss: 0.53,  accuracy: 0.79
Epoch 80.00,  loss: 0.52,  accuracy: 0.79
Epoch 85.00,  loss: 0.52,  accuracy: 0.79
Epoch 90.00,  loss: 0.52,  accuracy: 0.79
Epoch 95.00,  loss: 0.52,  accuracy: 0.79
Took: 782.01 seconds


 #### Comment
Not much improvement.

In [15]:
# Q2 Try different batch sizes
train(batch_size= 128, epochs=100, lr=0.01, act=nn.ReLU(), pool=nn.MaxPool2d(2), optim="sgd")

Epoch 0.00,  loss: 2.30,  accuracy: 0.14
Epoch 5.00,  loss: 1.02,  accuracy: 0.66
Epoch 10.00,  loss: 0.99,  accuracy: 0.68
Epoch 15.00,  loss: 0.98,  accuracy: 0.69
Epoch 20.00,  loss: 0.69,  accuracy: 0.78
Epoch 25.00,  loss: 0.64,  accuracy: 0.79
Epoch 30.00,  loss: 0.63,  accuracy: 0.79
Epoch 35.00,  loss: 0.62,  accuracy: 0.79
Epoch 40.00,  loss: 0.61,  accuracy: 0.80
Epoch 45.00,  loss: 0.61,  accuracy: 0.80
Epoch 50.00,  loss: 0.60,  accuracy: 0.80
Epoch 55.00,  loss: 0.60,  accuracy: 0.80
Epoch 60.00,  loss: 0.60,  accuracy: 0.80
Epoch 65.00,  loss: 0.59,  accuracy: 0.80
Epoch 70.00,  loss: 0.59,  accuracy: 0.80
Epoch 75.00,  loss: 0.59,  accuracy: 0.80
Epoch 80.00,  loss: 0.58,  accuracy: 0.80
Epoch 85.00,  loss: 0.58,  accuracy: 0.80
Epoch 90.00,  loss: 0.57,  accuracy: 0.80
Epoch 95.00,  loss: 0.57,  accuracy: 0.80
Took: 945.34 seconds


 #### Comment 
 Accuracy increased a little but loss was not much better. 

In [16]:
# Q3.1 Normal with lr low 
train(batch_size= 256, epochs=100, lr=0.001, act=nn.ReLU(), pool=nn.MaxPool2d(2), optim="sgd")

Epoch 0.00,  loss: 2.30,  accuracy: 0.10
Epoch 5.00,  loss: 2.30,  accuracy: 0.10
Epoch 10.00,  loss: 2.30,  accuracy: 0.10
Epoch 15.00,  loss: 2.30,  accuracy: 0.14
Epoch 20.00,  loss: 2.30,  accuracy: 0.19
Epoch 25.00,  loss: 2.29,  accuracy: 0.20
Epoch 30.00,  loss: 2.29,  accuracy: 0.26
Epoch 35.00,  loss: 2.28,  accuracy: 0.35
Epoch 40.00,  loss: 2.27,  accuracy: 0.41
Epoch 45.00,  loss: 2.24,  accuracy: 0.43
Epoch 50.00,  loss: 2.14,  accuracy: 0.44
Epoch 55.00,  loss: 1.79,  accuracy: 0.49
Epoch 60.00,  loss: 1.43,  accuracy: 0.54
Epoch 65.00,  loss: 1.27,  accuracy: 0.58
Epoch 70.00,  loss: 1.18,  accuracy: 0.62
Epoch 75.00,  loss: 1.12,  accuracy: 0.65
Epoch 80.00,  loss: 1.08,  accuracy: 0.66
Epoch 85.00,  loss: 1.05,  accuracy: 0.67
Epoch 90.00,  loss: 1.02,  accuracy: 0.69
Epoch 95.00,  loss: 1.00,  accuracy: 0.70
Took: 929.09 seconds


 #### Comment
 Due to low learning rate, accuracy was not improved a lot on 100 epoch. Underfit

In [17]:
# Q3.2 Normal with lr high 
train(batch_size= 256, epochs=100, lr=0.1, act=nn.ReLU(), pool=nn.MaxPool2d(2), optim="sgd")

Epoch 0.00,  loss: 1.23,  accuracy: 0.72
Epoch 5.00,  loss: 0.17,  accuracy: 0.97
Epoch 10.00,  loss: 0.16,  accuracy: 0.98
Epoch 15.00,  loss: 0.16,  accuracy: 0.98
Epoch 20.00,  loss: 0.15,  accuracy: 0.98
Epoch 25.00,  loss: 0.13,  accuracy: 0.98
Epoch 30.00,  loss: 0.11,  accuracy: 0.98
Epoch 35.00,  loss: 0.08,  accuracy: 0.98
Epoch 40.00,  loss: 0.06,  accuracy: 0.98
Epoch 45.00,  loss: 0.04,  accuracy: 0.99
Epoch 50.00,  loss: 1.00,  accuracy: 0.72
Epoch 55.00,  loss: 0.42,  accuracy: 0.81
Epoch 60.00,  loss: 0.36,  accuracy: 0.85
Epoch 65.00,  loss: 0.32,  accuracy: 0.85
Epoch 70.00,  loss: 0.30,  accuracy: 0.86
Epoch 75.00,  loss: 0.30,  accuracy: 0.86
Epoch 80.00,  loss: 0.28,  accuracy: 0.86
Epoch 85.00,  loss: 0.27,  accuracy: 0.86
Epoch 90.00,  loss: 0.28,  accuracy: 0.86
Epoch 95.00,  loss: 0.27,  accuracy: 0.86
Took: 914.07 seconds


 #### Comment
 With high learning rate, accuracy was improved but loss was decreased.  45th epoch was the most optimal. And oscillated 

In [18]:
# Q4 sgdWithMomentum
train(batch_size= 256, epochs=100, lr=0.01, act=nn.ReLU(), pool=nn.MaxPool2d(2), optim="sgdWithMomentum")

Epoch 0.00,  loss: 0.82,  accuracy: 0.71
Epoch 5.00,  loss: 0.36,  accuracy: 0.88
Epoch 10.00,  loss: 0.37,  accuracy: 0.89
Epoch 15.00,  loss: 0.34,  accuracy: 0.89
Epoch 20.00,  loss: 0.30,  accuracy: 0.89
Epoch 25.00,  loss: 0.27,  accuracy: 0.89
Epoch 30.00,  loss: 0.27,  accuracy: 0.89
Epoch 35.00,  loss: 0.28,  accuracy: 0.89
Epoch 40.00,  loss: 0.27,  accuracy: 0.89
Epoch 45.00,  loss: 0.28,  accuracy: 0.89
Epoch 50.00,  loss: 0.27,  accuracy: 0.89
Epoch 55.00,  loss: 0.27,  accuracy: 0.89
Epoch 60.00,  loss: 0.26,  accuracy: 0.89
Epoch 65.00,  loss: 0.26,  accuracy: 0.89
Epoch 70.00,  loss: 0.26,  accuracy: 0.89
Epoch 75.00,  loss: 0.26,  accuracy: 0.89
Epoch 80.00,  loss: 0.26,  accuracy: 0.89
Epoch 85.00,  loss: 0.26,  accuracy: 0.89
Epoch 90.00,  loss: 0.26,  accuracy: 0.89
Epoch 95.00,  loss: 0.26,  accuracy: 0.89
Took: 934.11 seconds


 #### Comment
 Reached optimal state faster.

In [19]:
# Q4 adagrad
train(batch_size= 256, epochs=100, lr=0.01, act=nn.ReLU(), pool=nn.MaxPool2d(2), optim="adagrad")

Epoch 0.00,  loss: 0.50,  accuracy: 0.85
Epoch 5.00,  loss: 0.44,  accuracy: 0.89
Epoch 10.00,  loss: 0.43,  accuracy: 0.89
Epoch 15.00,  loss: 0.43,  accuracy: 0.89
Epoch 20.00,  loss: 0.43,  accuracy: 0.89
Epoch 25.00,  loss: 0.42,  accuracy: 0.89
Epoch 30.00,  loss: 0.42,  accuracy: 0.89
Epoch 35.00,  loss: 0.42,  accuracy: 0.89
Epoch 40.00,  loss: 0.42,  accuracy: 0.89
Epoch 45.00,  loss: 0.41,  accuracy: 0.89
Epoch 50.00,  loss: 0.41,  accuracy: 0.89
Epoch 55.00,  loss: 0.41,  accuracy: 0.89
Epoch 60.00,  loss: 0.40,  accuracy: 0.89
Epoch 65.00,  loss: 0.40,  accuracy: 0.89
Epoch 70.00,  loss: 0.39,  accuracy: 0.89
Epoch 75.00,  loss: 0.39,  accuracy: 0.89
Epoch 80.00,  loss: 0.39,  accuracy: 0.89
Epoch 85.00,  loss: 0.38,  accuracy: 0.89
Epoch 90.00,  loss: 0.38,  accuracy: 0.89
Epoch 95.00,  loss: 0.37,  accuracy: 0.89
Took: 920.87 seconds


In [None]:
_predict_y = model(test_x.float()).detach()
_predict_ys = np.argmax(predict_y, axis=-1)
_label_np = test_label.numpy()
_ = predict_ys == test_label
_correct += np.sum(_.numpy(), axis=-1)
_sum += _.shape[0]
correct/_sum

 #### Comment 
 slow improvement on loss
 