In [22]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Load the data, it is a csv file obtained from Kaggle at https://www.kaggle.com/competitions/digit-recognizer
data = pd.read_csv('./train.csv')

In [23]:
data = np.array(data)
m, n = data.shape
np.random.shuffle(data) # shuffle before splitting into dev and training sets


#This will be the development set, it is the first 1000 examples
data_dev = data[0:1000].T # Transpose the data
Y_dev = data_dev[0] # This is the label
X_dev = data_dev[1:n] # This is the pixel data
X_dev = X_dev / 255. # Normalize the data to be between 0 and 1

#This will be the training set, it is the rest of the examples, we use the training set for gradient descent
data_train = data[1000:m].T # Transpose the data
Y_train = data_train[0] # This is the label
X_train = data_train[1:n] # This is the pixel data
X_train = X_train / 255. # Normalize the data to be between 0 and 1
_,m_train = X_train.shape # m is the number of examples, n is the number of features

In [None]:
#visualize the first data point
plt.imshow(X_dev[:, 1].reshape(28, 28), cmap='gray')

In [None]:
class Layer:
    def __init__(self, n_input, n_output):
        """
        Initializes a neural network layer with the specified number of inputs and outputs.
        
        Parameters:
        n_input (int): The number of input features to the layer.
        n_output (int): The number of output features from the layer.
        
        Attributes:
        W (ndarray): The weight matrix for the layer.
        b (ndarray): The bias vector for the layer.
        """
        self.W = np.random.rand(n_output, n_input) - 0.5
        self.b = np.zeros((n_output, 1)) - 0.5

    def forward(self, input):
        """
        Computes the forward pass through the layer.
        
        Parameters:
        input (ndarray): Input matrix of shape (n_input, m), where m is the number of examples.
        
        Returns:
        output (ndarray): Output matrix of shape (n_output, m). It is the result of the forward pass.
        """
        self.input = input
        self.output = np.dot(self.W, input) + self.b
        return self.output
    
    def backwardSoft(self, prediction, Y_one_hot, prevReLu, learningRate, m):
        """
        Performs backpropagation for the output layer using the softmax activation function.
        
        Parameters:
        prediction (ndarray): Predicted probabilities for each class, shape (n_output, m). These are the output of the softmax activation function.
        Y_one_hot (ndarray): One-hot encoded true labels, shape (n_output, m). 
        prevReLu (ndarray): Output of the previous layer after applying ReLU activation, shape (n_input, m).
        learningRate (float): Learning rate used for updating the weights and biases.
        m (int): Number of training examples (batch size).
        
        Returns:
        dPrediction (ndarray): Gradient of the loss with respect to the predictions, shape (n_output, m).
        """
        self.dPrediction = prediction - Y_one_hot
        dW = (1/ m) * self.dPrediction.dot(prevReLu.T)
        db = (1/ m) * np.sum(self.dPrediction)
        self.W = self.W - learningRate * dW
        self.b = self.b - learningRate * db
        return self.dPrediction
    
    def backwardReLu(self, NextLayer, ReLU, Y_one_hot, learningRate, m):
        """
        Performs backpropagation for a hidden layer using the ReLU activation function.
        
        Parameters:
        NextLayer (Layer): The next layer in the network, used to access its weights and gradients.
        ReLU (ReLU): The ReLU activation object, used to compute the derivative of ReLU.
        Y_one_hot (ndarray): One-hot encoded true labels, shape (n_output, m).
        learningRate (float): Learning rate used for updating the weights and biases.
        m (int): Number of training examples (batch size).
        
        Returns:
        dOutput (ndarray): Gradient of the loss with respect to the layer's output, shape (n_output, m).
        """
        self.dOutput = NextLayer.W.T.dot(NextLayer.dPrediction) * ReLU.derivative(self.output)
        dW = (1/ m) * self.dOutput.dot(self.input.T)
        db = (1/ m) * np.sum(self.dOutput)
        self.W = self.W - learningRate * dW
        self.b = self.b - learningRate * db
        return self.dOutput
        
class ReLU:
    def forward(self, input):
        """
        Computes the forward pass through the ReLU activation function.
        """
        self.input = input
        self.output = np.maximum(0, input)
        return self.output
    
    def derivative(self ,Layer_output):
        """
        Computes the derivative of the ReLU activation function.
        """
        return Layer_output > 0

class Softmax:
    def forward(self, input):
        """
        Computes the forward pass through the softmax activation function.
        """
        self.output = np.exp(input) / sum(np.exp(input))
        return self.output
    
def OneHotEncode(Y):
    """
    One hot encode the labels.
    """
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def getPrediction(output):
    """
    Get the prediction from the output of the softmax layer.
    """
    return np.argmax(output, axis=0)

def getAccuracy(prediction, Y):
    """
    Get the accuracy of the model.
    """
    return np.sum(prediction == Y) / Y.size

In [21]:
# Initialize the first layer of the neural network
# Input size is 784 (e.g., for a 28x28 pixel image), output size is 10 (e.g., for 10 classes)
layer1 = Layer(784, 10)

# Initialize the ReLU activation function object
relu = ReLU()

# Initialize the second layer of the neural network
# Input size is 10 (from the output of layer1), output size is 10 (for 10 classes)
layer2 = Layer(10, 10)

# Initialize the Softmax activation function object
softmax = Softmax()

# Set the learning rate for the gradient descent updates
learningRate = 0.1

# Training loop: Run for 10,000 epochs (iterations)
for epoch in range(10000):
    # Forward pass through the network

    # Pass the input data (X_train) through the first layer
    output1 = layer1.forward(X_train)
    
    # Apply the ReLU activation function to the output of the first layer
    output1_ReLU = relu.forward(output1)
    
    # Pass the output of ReLU through the second layer
    output2 = layer2.forward(output1_ReLU)
    
    # Apply the Softmax activation function to the output of the second layer
    output2_Softmax = softmax.forward(output2)
    
    # Get the predicted class labels from the softmax probabilities
    predection = getPrediction(output2_Softmax)
    
    # Calculate the accuracy of the predictions by comparing them to the true labels (Y_train)
    accuracy = getAccuracy(predection, Y_train)
    
    # Print every 50 epochs
    if epoch % 50 == 0:
        print(' Accuracy: ', accuracy, 'epoch: ', epoch)

    # Backward pass through the network
    
    # Compute the gradient of the loss with respect to the predictions (output2_Softmax)
    dPrediction = layer2.backwardSoft(output2_Softmax, OneHotEncode(Y_train), output1_ReLU, learningRate, m)
    
    # Compute the gradient of the loss with respect to the output of layer1
    dOutput = layer1.backwardReLu(layer2, relu, OneHotEncode(Y_train), learningRate, m)


 Accuracy:  0.09814634146341464 epoch:  0
 Accuracy:  0.47897560975609754 epoch:  50


KeyboardInterrupt: 

In [None]:
import os
from IPython.display import clear_output
# Now we test the model

output1 = layer1.forward(X_dev)
output1_ReLU = relu.forward(output1)
output2 = layer2.forward(output1_ReLU)
output2_Softmax = softmax.forward(output2)
predection = getPrediction(output2_Softmax)
accuracy = getAccuracy(predection, Y_dev)
print('Accuracy on the 3rd data point:', accuracy)
for i in range(X_dev.shape[1]):
    data_sample_index = i
    plt.imshow(X_dev[:, data_sample_index].reshape(28, 28), cmap='cool')
    plt.show()
    print('Prediction:', predection[data_sample_index])
    print('Label:', Y_dev[data_sample_index])
    # wait 1 second
    plt.pause(1)
    # clear the current figure
    plt.clf()
    os.system('clear')
    clear_output(wait=True)

In [None]:
#create a random image
random_image = np.random.rand(28*28)
plt.imshow(random_image.reshape(28, 28), cmap='rainbow')
plt.show()
#make it into a column
random_image = random_image.reshape(784, 1)
#pass it through the network
output1 = layer1.forward(random_image)
output1_ReLU = relu.forward(output1)
output2 = layer2.forward(output1_ReLU)
output2_Softmax = softmax.forward(output2)
predection = getPrediction(output2_Softmax)
print('Prediction:', predection[0])
