In [11]:
import numpy as np

In [2]:
# importing MNIST dataset
from tensorflow.keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [59]:
# labels one-hot encoding
y_train_one_hot = np.zeros((y_train.shape[0], 10))
for i in range(y_train.shape[0]):
    y_train_one_hot[i][y_train[i]] = 1
y_train = y_train_one_hot

y_test_one_hot = np.zeros((y_test.shape[0], 10))
for i in range(y_test.shape[0]):
    y_test_one_hot[i][y_test[i]] = 1
y_test = y_test_one_hot

In [6]:
# flatten 28x28 images to 1x784 arrays
x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)

In [18]:
def relu(x):
    """
    Takes relu funcion of an array of values

    Parameters:
    - x: numpy array, input

    Returns:
    - relu(x): float, relu function applied to x
    """
    return np.maximum(x, 0)

In [152]:
def sigmoid(x):
    """
    Takes sigmoid funcion of an array of values

    Parameters:
    - x: numpy array, input

    Returns:
    - sigmoid(x): float, sigmoid function applied to x
    """
    exp_values = np.exp(-np.clip(x, -700, 700))
    return 1 / (1 + exp_values)

In [114]:
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

In [195]:
def get_accuracy(y, y_pred):
    # Convert one-hot encoded predictions to class labels
    pred_labels = np.argmax(y_pred, axis=1)
    
    # Convert one-hot encoded true labels to class labels
    true_labels = np.argmax(y, axis=1)

    # Count the number of correct predictions
    correct = np.sum(pred_labels == true_labels)

    # Calculate accuracy
    accuracy = correct / y.shape[0]
    
    return accuracy

In [196]:
# sparse categorical crossentropy loss
def sparse_categorical_crossentropy(y, y_hat):
    epsilon = 1e-15
    y_pred = np.clip(y_hat, epsilon, 1 - epsilon)
    return -np.sum(y * np.log(y_hat), axis=1)

In [197]:
def mean_squared_error(y_true, y_pred):
    """
    Compute the Mean Squared Error (MSE) between the true and predicted values.

    Parameters:
    - y_true: numpy array, true values
    - y_pred: numpy array, predicted values

    Returns:
    - mse: float, mean squared error
    """
    # Ensure both y_true and y_pred are numpy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Check if the shapes match
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes of y_true and y_pred must match.")

    # Compute the mean squared error
    mse = np.mean((y_true - y_pred)**2)
    
    return mse

In [198]:
def shuffle_data(x_train, y_train):
    num_samples = x_train.shape[0]
    indices = np.arange(num_samples)
    np.random.shuffle(indices)

    return x_train[indices], y_train[indices]

In [199]:
def forward_prop(X, W1, b1, W2, b2, W3, b3):
    # Input to hidden layer 1
    z1 = np.dot(X, W1) + b1
    a1 = sigmoid(z1)

    # Hidden layer 1 to hidden layer 2
    z2 = np.dot(a1, W2) + b2
    a2 = sigmoid(z2)

    # Hidden layer 2 to output layer
    z3 = np.dot(a2, W3) + b3
    a3 = softmax(z3)

    return a1, a2, a3

In [200]:
def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

def back_prop(x, y, a1, a2, a3, W1, b1, W2, b2, W3, b3, learning_rate):
    a3_error = a3 - y
    a2_error = np.dot(a3_error, W3.T) * sigmoid_derivative(a2)
    a1_error = np.dot(a2_error, W2.T) * sigmoid_derivative(a1)

    W3 -= learning_rate * np.dot(a2.T, a3_error)
    b3 -= learning_rate * np.sum(a3_error, axis=0, keepdims=True)
    
    W2 -= learning_rate * np.dot(a1.T, a2_error)
    b2 -= learning_rate * np.sum(a2_error, axis=0, keepdims=True)

    W1 -= learning_rate * np.dot(x.T, a1_error)
    b1 -= learning_rate * np.sum(a1_error, axis=0, keepdims=True)

    return W1, b1, W2, b2, W3, b3

In [209]:
# define random weights on interval [0, 1]
np.random.seed(42)
W1 = np.random.rand(784, 16)
W2 = np.random.rand(16, 16)
W3 = np.random.rand(16, 10)
b1 = np.zeros((1, 16))
b2 = np.zeros((1, 16))
b3 = np.zeros((1, 10))

In [210]:
# define hyperparameters
batch_size = 60
learning_rate = 0.01
epochs = 20

In [211]:
for epoch in range(epochs):
    x_train, y_train = shuffle_data(x_train, y_train)

    for i in range(0, x_train.shape[0], batch_size):
        x_batch = x_train[i: i + batch_size]
        y_batch = y_train[i: i + batch_size]

        a1, a2, a3 = forward_prop(x_batch, W1, b1, W2, b2, W3, b3)
        W1, b1, W2, b2, W3, b3 = back_prop(x_batch, y_batch, a1, a2, a3, W1, b1, W2, b2, W3, b3, learning_rate)

    # Calculate the mean squared error and accuracy over the entire training dataset
    _, _, a3 = forward_prop(x_train, W1, b1, W2, b2, W3, b3)
    mse = mean_squared_error(y_train, a3)
    accuracy = get_accuracy(y_train, a3)
    print(f"Epoch: {epoch} Mean Squared Error: {mse} Accuracy: {accuracy}")

Epoch: 0 Mean Squared Error: 0.05671419457540417 Accuracy: 0.5558666666666666
Epoch: 1 Mean Squared Error: 0.05019705056934677 Accuracy: 0.6384833333333333
Epoch: 2 Mean Squared Error: 0.04275700383200634 Accuracy: 0.6656666666666666
Epoch: 3 Mean Squared Error: 0.03894177478997454 Accuracy: 0.72145
Epoch: 4 Mean Squared Error: 0.03660823134682871 Accuracy: 0.7381833333333333
Epoch: 5 Mean Squared Error: 0.03406195379407588 Accuracy: 0.7653333333333333
Epoch: 6 Mean Squared Error: 0.030586435989880277 Accuracy: 0.7880666666666667
Epoch: 7 Mean Squared Error: 0.029083622007798137 Accuracy: 0.7980666666666667
Epoch: 8 Mean Squared Error: 0.028067196596593028 Accuracy: 0.8063333333333333
Epoch: 9 Mean Squared Error: 0.026807817865211216 Accuracy: 0.8133333333333334
Epoch: 10 Mean Squared Error: 0.026529086538265318 Accuracy: 0.8189666666666666
Epoch: 11 Mean Squared Error: 0.028068902298954302 Accuracy: 0.8086666666666666
Epoch: 12 Mean Squared Error: 0.028665834461659233 Accuracy: 0.7969

In [212]:
_, _, y_pred = forward_prop(x_test, W1, b1, W2, b2, W3, b3)
print(get_accuracy(y_test, y_pred))

0.7752


In [213]:
_, _, y_pred = forward_prop(x_train, W1, b1, W2, b2, W3, b3)
print(get_accuracy(y_train, y_pred))

0.7729333333333334
