# MNIST Digit Classification with our own Framework

Lab Assignment from [AI for Beginners Curriculum](https://github.com/microsoft/ai-for-beginners).

### Reading the Dataset

This code downloads the dataset from the repository on the internet. You can also manually copy the dataset from the `/data` directory of the AI Curriculum repo.

In [39]:
# !rm *.pkl
# !wget https://raw.githubusercontent.com/microsoft/AI-For-Beginners/main/data/mnist.pkl.gz
# !gzip -d mnist.pkl.gz

In [40]:
import pickle

%matplotlib nbagg
import matplotlib.pyplot as plt 
from matplotlib import gridspec
from sklearn.datasets import make_classification
import numpy as np
# pick the seed for reproducibility - change it to explore the effects of random variations
np.random.seed(0)
import random
import gzip

# Read the gzip-compressed pickle from the Perceptron lab directory (path is
# relative to the notebook's working directory) and unpickle it into MNIST.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load archives obtained from a trusted source.
with gzip.open('../../03-Perceptron/lab/mnist.pkl.gz', 'rb') as mnist_pickle:
    MNIST = pickle.load(mnist_pickle)
    
# with open('mnist.pkl','rb') as f:
#     MNIST = pickle.load(f)

In [41]:
# Class labels and feature matrix of the training portion of the pickle.
labels = MNIST['Train']['Labels']
# Features are converted to float32 and divided by 256.0.
# NOTE(review): presumably raw pixels are 0..255, which would put the scaled
# values in [0, 1) - confirm against the dataset.
data = MNIST['Train']['Features'].astype('float32') / 256.0

Let's look at the shape of the data we have:

In [42]:
data.shape

(42000, 784)

### Splitting the Data

We will use Scikit Learn to split the data into training and test datasets:

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing.
# NOTE(review): no random_state is passed; the split presumably follows the
# global NumPy seed set earlier in the notebook - confirm against the
# scikit-learn documentation.
features_train, features_test, labels_train, labels_test = train_test_split(data,labels,test_size=0.2)

# Sample counts of both partitions.
print(f"Train samples: {len(features_train)}, test samples: {len(features_test)}")
# Same line again, but showing the full shape of the training matrix
# (the label text still reads "Train samples").
print(f"Train samples: {features_train.shape}, test samples: {len(features_test)}")
print(labels_train)

Train samples: 33600, test samples: 8400
Train samples: (33600, 784), test samples: 8400
[6 6 4 ... 9 0 9]



### Instructions

1. Take the framework code from the lesson and paste it into this notebook, or (even better) into a separate Python module
1. Define and train a one-layered perceptron, observing training and validation accuracy during training
1. Try to understand if overfitting took place, and adjust layer parameters to improve accuracy
1. Repeat previous steps for 2- and 3-layered perceptrons. Try to experiment with different activation functions between layers.
1. Try to answer the following questions:
    - Does the inter-layer activation function affect network performance?
    - Do we need a 2- or 3-layered network for this task?
    - Did you experience any problems training the network? Especially as the number of layers increased.
    - How do weights of the network behave during training? You may plot max abs value of weights vs. epoch to understand the relation.

In [44]:
def train_and_plot(n_epoch, net, train_x, train_labels, test_x, test_labels, loss, batch_size=4, lr=0.1):
    """Train ``net`` for ``n_epoch`` epochs, redrawing progress after each one.

    A figure with two stacked axes is created: the upper one shows training
    and validation accuracy per epoch, the lower one the decision boundary
    drawn by ``plot_decision_boundary``.

    Args:
        n_epoch: Number of passes over the training data.
        net: Network exposing ``forward``/``backward``/``update``.
        train_x, train_labels: Training features and integer labels.
        test_x, test_labels: Held-out features and labels used for validation.
        loss: Loss object passed on to ``train_epoch`` and ``get_loss_acc``.
        batch_size: Mini-batch size forwarded to ``train_epoch``.
        lr: Learning rate forwarded to ``train_epoch``.

    Returns:
        ``(train_acc, valid_acc)``: two ``(n_epoch, 3)`` arrays whose columns
        are ``[epoch, loss, accuracy]``.
    """
    fig, ax = plt.subplots(2, 1)
    ax[0].set_xlim(0, n_epoch + 1)
    ax[0].set_ylim(0, 1)

    # Rows of epochs that have not run yet stay NaN so they are not plotted.
    # (np.nan rather than the np.NAN alias, which NumPy 2.0 removed.)
    train_acc = np.full((n_epoch, 3), np.nan)
    valid_acc = np.full((n_epoch, 3), np.nan)

    for epoch in range(1, n_epoch + 1):
        train_epoch(net, train_x, train_labels, loss, batch_size, lr)

        tloss, taccuracy = get_loss_acc(net, train_x, train_labels, loss)
        train_acc[epoch - 1, :] = [epoch, tloss, taccuracy]
        vloss, vaccuracy = get_loss_acc(net, test_x, test_labels, loss)
        valid_acc[epoch - 1, :] = [epoch, vloss, vaccuracy]

        # Rescale the accuracy axis to the best value seen so far; nanmax
        # skips the NaN placeholders of the epochs still to come.
        best = max(np.nanmax(train_acc[:, 2]), np.nanmax(valid_acc[:, 2]))
        ax[0].set_ylim(0, best * 1.1)

        plot_training_progress(train_acc[:, 0],
                               (train_acc[:, 2], valid_acc[:, 2]),
                               fig, ax[0])
        plot_decision_boundary(net, fig, ax[1], train_x, train_labels, test_x, test_labels)
        fig.canvas.draw()
        fig.canvas.flush_events()

    return train_acc, valid_acc


import matplotlib.cm as cm


def plot_decision_boundary(net, fig, ax, train_x, train_labels, test_x, test_labels):
    """Draw the network's predictions over the plane of the first two features.

    A grid covering features 0 and 1 of ``train_x`` is evaluated by ``net``;
    every further feature is held at its training-set mean. The training
    points are scattered on top, coloured by label.

    Args:
        net: Object whose ``forward`` maps a feature matrix to class scores.
        fig: Figure that receives the colorbar on the first call.
        ax: Axes to draw into; artists from a previous call are removed.
        train_x: Training features, at least two columns.
        train_labels: Integer class labels for ``train_x``.
        test_x, test_labels: Accepted for signature compatibility; unused.
    """
    # Remove the previous plot. Artist.remove() is used because the list
    # returned by ax.collections is read-only in recent Matplotlib releases.
    previous = list(ax.collections)
    # The colorbar is only added when the axes held no collections yet.
    draw_colorbar = not previous
    for artist in previous:
        artist.remove()

    # generate contour grid
    x_min, x_max = train_x[:, 0].min() - 1, train_x[:, 0].max() + 1
    y_min, y_max = train_x[:, 1].min() - 1, train_x[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))
    grid_points = np.c_[xx.ravel().astype('float32'), yy.ravel().astype('float32')]
    n_classes = int(np.max(train_labels)) + 1

    # Pad the remaining dimensions (only the first two are plotted) with the
    # per-feature training mean, in one stack instead of column by column.
    n_extra = train_x.shape[1] - grid_points.shape[1]
    if n_extra > 0:
        feature_means = train_x[:, 2:].mean(axis=0).astype('float32')
        padding = np.broadcast_to(feature_means, (len(grid_points), n_extra))
        grid_points = np.hstack([grid_points, padding])

    # evaluate predictions
    prediction = np.array(net.forward(grid_points))
    if n_classes == 2:
        # for two classes: prediction difference mapped around 0.5
        Z = 0.5 + (prediction[:, 0] - prediction[:, 1]) / 2.0
    else:
        # otherwise: index of the highest score, scaled to [0, 1]
        Z = prediction.argsort(axis=1)[:, -1] / float(n_classes - 1)
    Z = Z.reshape(xx.shape)

    # draw contour
    levels = np.linspace(0, 1, 40)
    cs = ax.contourf(xx, yy, Z, alpha=0.4, levels=levels)
    if draw_colorbar:
        fig.colorbar(cs, ax=ax, ticks=[0, 0.5, 1])
    c_map = [cm.jet(x) for x in np.linspace(0.0, 1.0, n_classes)]
    colors = [c_map[label] for label in train_labels]
    ax.scatter(train_x[:, 0], train_x[:, 1], marker='o', c=colors, s=60, alpha=0.5)


def plot_training_progress(x, y_data, fig, ax):
    """Redraw the accuracy curves on ``ax``.

    Args:
        x: Shared x values (epoch numbers) for all curves.
        y_data: Sequence of y series; the first is drawn as a dashed black
            line, the second as a solid green line.
        fig: Unused; kept so the call signature matches the other plot helpers.
        ax: Axes to draw into; lines from a previous call are removed.
    """
    styles = ['k--', 'g-']
    # Remove the previous plot. Artist.remove() is used because the list
    # returned by ax.lines is read-only in recent Matplotlib releases.
    for line in list(ax.lines):
        line.remove()
    # draw updated lines
    for i, series in enumerate(y_data):
        ax.plot(x, series, styles[i])
    ax.legend(ax.lines, ['training accuracy', 'validation accuracy'],
              loc='upper center', ncol=2)

In [47]:

class Linear:
    """Fully connected layer computing ``x . W^T + b``.

    Attributes set by the constructor:
        W:  weight matrix of shape ``(nout, nin)``
        b:  bias row of shape ``(1, nout)``
        dW, db: gradients from the latest ``backward`` call (zeros initially)
    """

    def __init__(self, nin, nout):
        """Draw weights from N(0, 1/sqrt(nin)) and start with a zero bias."""
        spread = 1.0 / np.sqrt(nin)
        self.W = np.random.normal(0, spread, (nout, nin))
        self.b = np.zeros((1, nout))
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x):
        """Return the affine map of ``x``; the input is kept for ``backward``."""
        self.x = x
        projected = np.dot(x, self.W.T)
        return projected + self.b

    def backward(self, dz):
        """Store the parameter gradients for ``dz`` and return the input gradient."""
        self.dW = np.dot(dz.T, self.x)
        self.db = dz.sum(axis=0)
        return np.dot(dz, self.W)

    def update(self, lr):
        """Take one in-place gradient-descent step with learning rate ``lr``."""
        self.W -= lr * self.dW
        self.b -= lr * self.db

class Tanh:
    """Element-wise hyperbolic-tangent activation."""

    def forward(self, x):
        """Return ``tanh(x)`` and keep the result for the backward pass."""
        self.y = np.tanh(x)
        return self.y

    def backward(self, dy):
        """Scale ``dy`` by the tanh derivative, ``1 - y^2``, of the cached output."""
        slope = 1.0 - self.y ** 2
        return slope * dy

class Softmax:
    """Row-wise softmax over a 2-D array of scores."""

    def forward(self, z):
        """Return softmax probabilities for each row of ``z``.

        The row maximum is subtracted before exponentiation so large scores
        do not overflow. Both the input and the result are cached; the cached
        result is reused by ``backward`` and is the same array object that is
        returned, so callers should not modify it in place.
        """
        self.z = z
        expz = np.exp(z - z.max(axis=1, keepdims=True))
        self.p = expz / expz.sum(axis=1, keepdims=True)
        return self.p

    def backward(self, dp):
        """Propagate ``dp`` (gradient w.r.t. the probabilities) to the scores.

        Uses the probabilities cached by the last ``forward`` call instead of
        running the forward pass a second time.
        """
        p = self.p
        pdp = p * dp
        return pdp - p * pdp.sum(axis=1, keepdims=True)
    
class CrossEntropyLoss:
    """Mean negative log-likelihood of integer labels under probabilities."""

    # Lower bound applied to probabilities before taking a log or dividing.
    EPSILON = 1e-7

    def forward(self, p, y):
        """Return the mean of ``-log p[i, y[i]]``.

        Args:
            p: 2-D array of per-class probabilities, one row per sample.
            y: Integer array of class indices, one per row of ``p``.

        Both arguments are cached for ``backward``. Probabilities are clipped
        from below at ``EPSILON`` so a zero probability cannot produce ``inf``.
        """
        self.p = p
        self.y = y
        p_of_y = np.maximum(self.EPSILON, p[np.arange(len(y)), y])
        return -np.log(p_of_y).mean()

    def backward(self, loss):
        """Return the gradient of the mean loss with respect to ``p``.

        Args:
            loss: Unused; accepted so the call mirrors ``forward``'s result.

        Only the entry of the true class in each row is non-zero:
        ``-1 / (n * p[i, y[i]])``. The probability is clipped exactly as in
        ``forward``; dividing the whole array by the raw ``p`` would give
        0/0 = NaN for any class whose probability underflowed to zero.
        """
        n = len(self.y)
        rows = np.arange(n)
        p_of_y = np.maximum(self.EPSILON, self.p[rows, self.y])
        grad = np.zeros_like(self.p)
        grad[rows, self.y] = -1.0 / (n * p_of_y)
        return grad


class Net:
    """Sequential container that chains layers in the order they were added."""

    def __init__(self):
        self.layers = []

    def add(self, l):
        """Append layer ``l`` to the end of the chain."""
        self.layers.append(l)

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, z):
        """Feed gradient ``z`` through the layers last-to-first and return the result."""
        for layer in reversed(self.layers):
            z = layer.backward(z)
        return z

    def update(self, lr):
        """Call ``update(lr)`` on every layer that defines it.

        Layers without an ``update`` method (activations) are skipped.
        ``hasattr`` is used instead of scanning ``__dir__()``, which builds a
        full attribute list for every layer on every training step.
        """
        for layer in self.layers:
            if hasattr(layer, 'update'):
                layer.update(lr)
                


def get_loss_acc(net, x, y, loss):
    """Evaluate ``net`` on ``(x, y)`` and return ``(loss value, accuracy)``.

    The loss is ``loss.forward`` applied to the network output; the accuracy
    is the fraction of rows whose highest-scoring column equals the label.
    """
    scores = net.forward(x)
    loss_value = loss.forward(scores, y)
    hits = np.argmax(scores, axis=1) == y
    return loss_value, hits.mean()

# print("Initial loss={}, accuracy={}: ".format(*get_loss_acc(train_x,train_labels)))

def train_epoch(net, train_x, train_labels, loss=None, batch_size=4, lr=0.1):
    """Run one pass of mini-batch gradient descent over the training data.

    Args:
        net: Network exposing ``forward``, ``backward`` and ``update``.
        train_x: Training features, sliced along the first axis.
        train_labels: Labels aligned with ``train_x``.
        loss: Loss object with ``forward(p, y)`` and ``backward(l)``. When
            omitted, a new ``CrossEntropyLoss`` is created for this call; a
            default instance built in the signature would be one stateful
            object shared by every call.
        batch_size: Number of samples per update; the last batch may be
            smaller.
        lr: Learning rate passed to ``net.update``.
    """
    if loss is None:
        loss = CrossEntropyLoss()
    # Batches are taken in storage order; the data is not shuffled here.
    for start in range(0, len(train_x), batch_size):
        stop = start + batch_size
        p = net.forward(train_x[start:stop])
        l = loss.forward(p, train_labels[start:stop])
        net.backward(loss.backward(l))
        net.update(lr)
                
# Three-layer perceptron: 784 inputs -> 10 -> 10 -> 10 outputs, with tanh
# between the linear layers and softmax on the output. The Linear layers
# draw their weights from NumPy's global RNG in the order they are built.
nnet = Net()
nnet.add(Linear(784,10))
nnet.add(Tanh())
nnet.add(Linear(10,10))
nnet.add(Tanh())
nnet.add(Linear(10,10))
nnet.add(Softmax())
cross_loss = CrossEntropyLoss()
# Loss and accuracy of the untrained network on the training split.
print("Initial loss={}, accuracy={}: ".format(*get_loss_acc(nnet , features_train,labels_train,cross_loss)))
# A single epoch using train_epoch's default batch size and learning rate.
train_epoch(nnet, features_train, labels_train , cross_loss )
# Loss and accuracy after that epoch, on the training and the test split.
print("Final loss={}, accuracy={}: ".format(*get_loss_acc(nnet , features_train,labels_train,cross_loss)))
print("Test loss={}, accuracy={}: ".format(*get_loss_acc(nnet , features_test,labels_test,cross_loss)))


Initial loss=2.38302679099452, accuracy=0.1044047619047619: 
Final loss=0.4124141341208442, accuracy=0.8851785714285715: 
Test loss=0.4284796857876896, accuracy=0.8821428571428571: 
