In [1]:
import numpy as np

In [2]:
from mnist import MNIST

mndata = MNIST('samples')

images, labels = mndata.load_training()

test_images, test_labels = mndata.load_testing()

In [15]:
x_train = np.array(mndata.train_images)
y_train = np.eye(len(np.unique(labels)))[mndata.train_labels]

In [31]:
def relu(x):
    return np.maximum(0, x)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def mse_loss(y_true, y_pred):
    return np.mean((y_true - y_pred)**2)

def softmax_derivative(softmax_output, y_true):
    m = y_true.shape[0]
    grad = softmax_output - y_true
    grad /= m
    return grad

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def forward(x, w1, b1, w2, b2):
    z1 = np.dot(x, w1) + b1
    a1 = relu(z1)
    z2 = np.dot(a1, w2) + b2
    a2 = softmax(z2)
    return a1, a2


def backward(x, a1, a2, y_true, w2):
    m = y_true.shape[0]

    dz2 = softmax_derivative(a2, y_true)
    dw2 = np.dot(a1.T, dz2)
    db2 = np.sum(dz2, axis=0, keepdims=True)

    dz1 = np.dot(dz2, w2.T) * relu_derivative(a1)
    dw1 = np.dot(x.T, dz1)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    return dw1, db1, dw2, db2

import numpy as np

def predict(x, w1, b1, w2, b2):
    _, a2 = forward(x, w1, b1, w2, b2)
    return np.argmax(a2, axis=1)

def plateau_scheduler(initial_lr, epoch, patience=10, factor=0.5, threshold=1e-4):
    return initial_lr * factor if epoch % patience == 0 and threshold < 0 else initial_lr * factor if epoch % patience == 0 else initial_lr

def train(x_train, y_train, initial_learning_rate, epochs, batch_size, patience=10, factor=0.5, threshold=1e-4):
    input_size = x_train.shape[1]
    output_size = y_train.shape[1]

    np.random.seed(42)
    w1 = np.random.randn(input_size, 64) * 0.01
    b1 = np.zeros((1, 64))
    w2 = np.random.randn(64, output_size) * 0.01
    b2 = np.zeros((1, output_size))

    m = x_train.shape[0]

    best_loss = float('inf')
    counter = 0

    for epoch in range(epochs):
        for i in range(0, m, batch_size):
            x_batch = x_train[i:i + batch_size]
            y_batch = y_train[i:i + batch_size]

            a1, a2 = forward(x_batch, w1, b1, w2, b2)

            loss = mse_loss(y_batch, a2)

            dw1, db1, dw2, db2 = backward(x_batch, a1, a2, y_batch, w2)

            w1 -= initial_learning_rate * dw1
            b1 -= initial_learning_rate * db1
            w2 -= initial_learning_rate * dw2
            b2 -= initial_learning_rate * db2

        print(f'Epoch {epoch}, Loss: {loss}, Learning Rate: {initial_learning_rate}')

        if loss < best_loss - threshold:
            best_loss = loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                initial_learning_rate = plateau_scheduler(initial_learning_rate, epoch, patience, factor, threshold)
                counter = 0

    return w1, b1, w2, b2

In [None]:
initial_learning_rate = 0.01
epochs = 1000
batch_size = 64
patience = 10
factor = 0.5
threshold = 1e-4

w1, b1, w2, b2 = train(x_train, y_train, initial_learning_rate, epochs, batch_size, patience, factor, threshold)

Cell output was cleared and posted in separate file `trainingProcess.txt`

In [52]:
y_pred = predict(mndata.test_images, w1, b1, w2, b2)

Accuracy

In [53]:
np.sum(y_pred == mndata.test_labels) / len(y_pred)

0.5826