In [None]:
import sys
sys.path.append("../")

import HMDL.Tensors as ht
from HMDL.nn import ReLUMultilayerPerceptron, CrossEntropyLoss
from HMDL.optim import StochasticGradientDescent, Adam
import numpy as np
import pandas as pd
import warnings

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and numerical (overflow / divide-by-zero) warnings
# from numpy and pandas. Consider narrowing the filter if debugging.
warnings.filterwarnings("ignore")

## Loading data

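You can download the MNIST dataset in CSV format from [Kaggle](https://www.kaggle.com/datasets/oddrationale/mnist-in-csv). The cell below expects `mnist_train.csv` and `mnist_test.csv` to be placed in `../data/`.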

In [None]:
# Paths are relative to the notebook's working directory.
train_df = pd.read_csv("../data/mnist_train.csv")  # full training split
test_df = pd.read_csv("../data/mnist_test.csv")[:1000]  # keep only the first 1000 test rows

## Defining our model and hyperparameters

Since this is a classification problem, we will use the cross-entropy loss. If we were solving a regression problem instead, we might use something like a mean-squared-error (MSE) loss.

The optimizer we will be using is Adam, one of the most commonly used and best-performing optimizers. Its update rule is as follows:

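$$m_t=\beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w}$$
$$v_t=\beta_2 v_{t-1} + (1 - \beta_2) \left(\frac{\partial L}{\partial w}\right)^2$$
$$\hat m_t=\frac{m_t}{1 - \beta_1^t}$$
$$\hat v_t=\frac{v_t}{1 - \beta_2^t}$$

$$w_{t}=w_{t-1}-\gamma\frac{\hat m_t}{\sqrt{\hat v_t} + \epsilon}$$

Here $\gamma$ is the learning rate, $\beta_1$ and $\beta_2$ are the decay rates of the first- and second-moment estimates $m_t$ and $v_t$, and $\epsilon$ is a small constant that prevents division by zero.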

In [None]:
# Number of samples per test batch (used when the test set is sliced below).
TEST_SIZE = 1
# Number of samples per training mini-batch.
BATCH_SIZE = 128
# Number of full passes over the training batches.
EPOCHS = 20
# Learning rate handed to the optimizer.
LEARNING_RATE = 1e-3

# 784 = 28 * 28 input pixels and 10 = number of digit classes; the 32 is
# presumably the hidden-layer width — TODO confirm against
# ReLUMultilayerPerceptron's constructor.
net = ReLUMultilayerPerceptron([784, 32, 10])
# Adam is given the network's parameters and the learning rate above.
optim = Adam(net.parameters(), lr=LEARNING_RATE)
# Loss object called as criterion(prediction, one_hot_label) in the training loop.
criterion = CrossEntropyLoss()

## Loading and formatting our data

Our image data will have the shape $(B \times C \times HW)$, where $B$ is the batch size, $C$ is the number of color channels (1 in our case), and $H$ and $W$ are the height and width of the image (our images are 28 by 28, so $HW$ is 784).

In [None]:
def _to_tensor_batches(rows, batch_size):
    """Cut a 2-D array into consecutive batches wrapped in ``ht.Tensor``.

    Each batch holds up to ``batch_size`` rows of ``rows`` and gets a
    singleton axis inserted at position 1, i.e. a slice of shape
    ``(b, n)`` becomes ``(b, 1, n)``. The last batch may be shorter.
    """
    batches = []
    for start in range(0, len(rows), batch_size):
        chunk = rows[start:start + batch_size]
        batches.append(ht.Tensor(np.array(chunk[:, np.newaxis, :])))
    return batches


# Labels are one-hot encoded by indexing rows of a 10x10 identity matrix;
# pixel values are divided by 255 to rescale them.
train_labels = _to_tensor_batches(
    np.eye(10)[train_df["label"].to_numpy()], BATCH_SIZE
)
train_data = _to_tensor_batches(
    train_df.drop(["label"], axis=1).to_numpy() / 255, BATCH_SIZE
)

test_labels = _to_tensor_batches(
    np.eye(10)[test_df["label"].to_numpy()], TEST_SIZE
)
test_data = _to_tensor_batches(
    test_df.drop(["label"], axis=1).to_numpy() / 255, TEST_SIZE
)

In [None]:
def test_model(model, test_data, test_labels):
    correct = 0

    # Counts the number of times the network gets the correct result
    for label, image in zip(test_labels, test_data):
        result = model.forward(image)
        prediciton = np.argmax(result.data, axis=-1)
        correct += np.count_nonzero(prediciton == np.argmax(label.data, axis=-1))

    print(f"Accuracy: {correct / len(test_data) * 100}%")

## Training our model

After every training epoch in our training loop we will test the performance of our model by testing its accuracy on our test dataset. Our test dataset does not contain any of the examples which are in our training dataset.

In [None]:
# One epoch = one pass over every pre-built training batch, followed by an
# accuracy check on the test batches.
for epoch in range(EPOCHS):
    for label, image in zip(train_labels, train_data):
        # Get the predicted results from the network
        result = net.forward(image)

        # Zero all the gradients of the parameters so this batch's gradients
        # are not mixed with those left over from the previous step
        optim.zero_grad()

        # Calculate the loss
        loss = criterion(result, label)

        # Backpropagate through the network
        loss.backward()

        # Update the network's parameters
        optim.step()

    # NOTE(review): `loss` here is the loss of the *last* batch of the epoch
    # only, not an epoch average. The -1 / 10 scaling is presumably tied to
    # how HMDL's CrossEntropyLoss is defined (sign / number of classes) —
    # TODO confirm against the library.
    print(f"Epoch {epoch + 1} Loss: {-1 / 10 * loss.data}")
    test_model(net, test_data, test_labels)