# Import the necessary libraries

In [110]:
import numpy as np
import pandas as pd
from PIL import Image

In [111]:
# Activation Functions
def leaky_ReLU(x, alpha=0.1):
    """Apply the leaky ReLU activation element-wise.

    Args:
        x: Array-like of pre-activation values.
        alpha: Slope applied to non-positive inputs. Defaults to 0.1, the
            value that used to be hard-coded here.

    Returns:
        Array holding ``x`` where ``x > 0`` and ``alpha * x`` elsewhere.
    """
    x = np.asarray(x)
    # np.where (instead of np.maximum) stays correct even if alpha > 1.
    return np.where(x > 0, x, alpha * x)

def leaky_ReLU_derivative(x, alpha=0.1):
    """Return the element-wise derivative of the leaky ReLU activation.

    Args:
        x: Array-like of pre-activation values.
        alpha: Slope of the non-positive branch. Defaults to 0.1, the value
            that used to be hard-coded here; pass the same value that was
            given to the forward activation.

    Returns:
        Float array holding 1 where ``x > 0`` and ``alpha`` elsewhere.
    """
    x = np.asarray(x)
    return np.where(x > 0, 1.0, alpha)

def softmax(z):
    """Row-wise softmax of a 2-D array of scores (one sample per row)."""
    # Subtracting each row's maximum keeps np.exp from overflowing and does
    # not change the result.
    row_peak = np.max(z, axis=1, keepdims=True)
    numerators = np.exp(z - row_peak)
    row_totals = np.sum(numerators, axis=1, keepdims=True)
    return numerators / row_totals

# Mean Squared Error Loss
def mse(y_true, y_pred):
    """Mean of the squared differences, averaged over every element."""
    residual = y_true - y_pred
    return np.mean(np.square(residual))

def mse_derivative(y_true, y_pred):
    """Gradient of ``mse(y_true, y_pred)`` with respect to ``y_pred``.

    ``mse`` averages over every element, so the gradient is divided by the
    total element count. Dividing by the number of rows only (the previous
    behaviour) made the gradient too large by a factor equal to the number
    of columns for 2-D inputs. 1-D inputs are unaffected.

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predictions, same shape as ``y_true``.

    Returns:
        Array with the shape of ``y_pred``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return 2 * (y_pred - y_true) / y_true.size

def categorical_crossentropy(y, y_pred, eps=1e-12):
    """Summed categorical cross-entropy between targets and predictions.

    Args:
        y: Array-like of target probabilities (for example one-hot rows).
        y_pred: Array-like of predicted probabilities, same shape as ``y``.
        eps: Lower bound applied to ``y_pred`` before taking the log.

    Returns:
        The sum of ``-y * log(y_pred)`` over all elements (not averaged).
    """
    y = np.asarray(y)
    # A probability that underflows to exactly 0 would give log(0) = -inf,
    # and -inf * 0 is NaN, which poisons the whole loss. Clamp from below.
    safe_pred = np.clip(y_pred, eps, None)
    return np.sum(-np.log(safe_pred) * y)

def categorical_crossentropy_derivative(y_true, y_pred):
    """Return ``y_pred - y_true``.

    NeuralNetwork.backward uses this as the gradient with respect to the
    output layer's pre-activations, where ``y_pred`` is the softmax output.
    """
    residual = y_pred - y_true
    return residual


# Preprocessing the data

In [112]:
def crop_image(image_path):
    """Load an image, crop it to its dark content and resize to 28x28.

    The image is converted to 8-bit grayscale. Pixels darker than 127 count
    as content; the image is cropped to their bounding box and then resized.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        A 28x28 grayscale (mode 'L') ``PIL.Image.Image``. If the image has
        no pixel darker than 127, the whole frame is resized instead of
        failing on an empty bounding box.
    """
    # The context manager closes the underlying file; convert() returns an
    # independent in-memory image, so it stays usable afterwards.
    with Image.open(image_path) as source:
        image = source.convert('L')

    binary_image = np.array(image) < 127
    rows, cols = np.nonzero(binary_image)

    if rows.size == 0:
        # No dark pixels: there is no bounding box to crop to.
        return image.resize((28, 28))

    # PIL boxes are (left, upper, right, lower) with exclusive right/lower.
    left, upper = int(cols.min()), int(rows.min())
    right, lower = int(cols.max()) + 1, int(rows.max()) + 1

    cropped = image.crop((left, upper, right, lower))
    return cropped.resize((28, 28))


In [113]:
import pandas as pd

# Load the id -> category mapping of the training images
label_map = pd.read_csv('./datasets/mnist.train.map.csv')

# Per-class sample counts; the rarest class sets the per-class quota
count = label_map["category"].value_counts()
min_samples = count.min()
print(count)

train_list = []
test_list = []

# Balanced 80/20 split, built one class at a time
for label in label_map["category"].unique():
    subset = (
        label_map.loc[label_map["category"] == label]
        .sample(n=min_samples, random_state=42)  # downsample to the quota
        .sample(frac=1, random_state=42)         # shuffle before splitting
        .reset_index(drop=True)
    )

    split_idx = int(len(subset) * 0.8)
    train_list.append(subset.iloc[:split_idx])  # first 80% -> train
    test_list.append(subset.iloc[split_idx:])   # last 20% -> test


def _merge_parts(parts):
    """Concatenate the per-class frames under a fresh 0..n-1 index."""
    return pd.concat(parts).reset_index(drop=True)


def _shuffle_rows(frame):
    """Shuffle all rows (fixed seed) and renumber the index."""
    return frame.sample(frac=1, random_state=42).reset_index(drop=True)


def _as_dataset(split_map):
    """Copy a split, turning ids into image paths and renaming the columns."""
    image_paths = split_map["id"].map(lambda x: f"datasets/train/{x}")
    return split_map.assign(id=image_paths).rename(
        columns={"id": "image_path", "category": "label"}
    )


# Merge the per-class pieces, then shuffle so classes are interleaved
train_map = _shuffle_rows(_merge_parts(train_list))
test_map = _shuffle_rows(_merge_parts(test_list))

# Both splits come from the same image folder (the test split is held-out
# training data)
train_datasets = _as_dataset(train_map)
test_datasets = _as_dataset(test_map)

print("Train set size:", train_datasets.shape)
print("Test set size:", test_datasets.shape)


category
0    1100
1    1048
2     966
3     910
4     809
5     790
6     718
7     691
9     612
8     611
Name: count, dtype: int64
Train set size: (4880, 2)
Test set size: (1230, 2)


In [101]:
# Per-class tallies for the train split and the test split, printed in that order
train_count, test_count = (
    split["label"].value_counts() for split in (train_datasets, test_datasets)
)
print(train_count)
print(test_count)

label
3    488
1    488
8    488
9    488
5    488
7    488
2    488
4    488
6    488
0    488
Name: count, dtype: int64
label
9    123
6    123
1    123
7    123
5    123
3    123
8    123
4    123
0    123
2    123
Name: count, dtype: int64


# Model size
 - 784 inputs, one per pixel of a 28x28 image
 - 4 hidden layers containing 64, 32, 32 and 16 nodes, in that order
 - 10 outputs, one for each digit 0-9
 

In [120]:
class NeuralNetwork:
    """Fully connected classifier trained with full-batch gradient descent.

    Hidden layers use leaky ReLU. The output layer uses softmax, and training
    minimizes categorical cross-entropy.
    """

    def __init__(self, input_layer_size=784, hidden_layers=None, output_layer_size=10, seed=1234):
        """Create the network and initialize its parameters.

        Args:
            input_layer_size: Number of input features (28 * 28 pixels by default).
            hidden_layers: Sizes of the hidden layers, in order. ``None``
                selects the default ``[64, 32, 32, 16]``.
            output_layer_size: Number of output classes.
            seed: Seed for the weight initialization. The default 1234
                reproduces the weights of the earlier version, which called
                ``np.random.seed(1234)``.
        """
        # None sentinel instead of a mutable default list in the signature.
        if hidden_layers is None:
            hidden_layers = [64, 32, 32, 16]

        self.input_layer_size = input_layer_size
        self.hidden_layers = list(hidden_layers)
        self.output_layer_size = output_layer_size
        self.activation_function = leaky_ReLU
        self.activation_derivative = leaky_ReLU_derivative

        # A private RandomState yields the same stream as the seeded global
        # generator, without resetting NumPy's global RNG as a side effect.
        rng = np.random.RandomState(seed)

        layer_sizes = [self.input_layer_size] + self.hidden_layers + [self.output_layer_size]
        # Weights are scaled by sqrt(1 / fan_in); biases start at zero.
        self.weights = [
            rng.randn(fan_in, fan_out) * np.sqrt(1 / fan_in)
            for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:])
        ]
        self.biases = [np.zeros(fan_out) for fan_out in layer_sizes[1:]]

    def forward(self, X):
        """Run a forward pass and cache intermediates for ``backward``.

        Args:
            X: Array-like of shape (n_samples, n_features). A single 1-D
                sample is treated as a batch of one.

        Returns:
            Array of shape (n_samples, n_outputs) with softmax probabilities.
        """
        X = np.atleast_2d(X)
        self.a = [X]  # activations per layer, starting with the input
        self.z = []   # weighted sums (pre-activations) per layer

        # Hidden layers: affine transform followed by the activation
        for W, b in zip(self.weights[:-1], self.biases[:-1]):
            self.z.append(np.dot(self.a[-1], W) + b)
            self.a.append(self.activation_function(self.z[-1]))

        # Output layer: affine transform followed by softmax
        self.z.append(np.dot(self.a[-1], self.weights[-1]) + self.biases[-1])
        self.a.append(softmax(self.z[-1]))

        return self.a[-1]

    def backward(self, y_true, learning_rate):
        """Backpropagate from the last ``forward`` call and update parameters.

        Args:
            y_true: Array of shape (n_samples, n_outputs) with the targets.
            learning_rate: Step size for this update.
        """
        n_layers = len(self.weights)
        batch_size = y_true.shape[0]

        # Gradient w.r.t. the output pre-activations (softmax + cross-entropy)
        delta = categorical_crossentropy_derivative(y_true, self.a[-1])

        dW = [None] * n_layers
        dB = [None] * n_layers

        for i in reversed(range(n_layers)):
            if i < n_layers - 1:
                # Push the error back through layer i + 1 and through the
                # activation of layer i.
                delta = np.dot(delta, self.weights[i + 1].T) * self.activation_derivative(self.z[i])

            dW[i] = np.dot(self.a[i].T, delta)
            dB[i] = np.sum(delta, axis=0)

        # Update only after every gradient is computed, so backpropagation
        # always sees the pre-update weights. Gradients are batch-averaged.
        for i in range(n_layers):
            self.weights[i] -= (learning_rate * dW[i]) / batch_size
            self.biases[i] -= (learning_rate * dB[i]) / batch_size

    def train(self, X, y, epochs=10, min_learning_rate=1e-6, max_learning_rate=1e-2, log_every=1):
        """Train on the whole data set once per epoch.

        The learning rate decays linearly with the epoch number and reaches
        ``min_learning_rate`` on the final epoch.

        Args:
            X: Array of shape (n_samples, n_features).
            y: Array of shape (n_samples, n_outputs) with the targets.
            epochs: Number of full-batch updates.
            min_learning_rate: Learning rate used on the last epoch.
            max_learning_rate: Upper end of the linear schedule.
            log_every: Print progress every this many epochs; 0 or ``None``
                disables printing. The default 1 prints every epoch.

        The per-epoch mean loss is stored in ``self.loss_history``. Nothing
        is returned, so a notebook cell does not echo the history.
        """
        self.loss_history = []

        for epoch in range(1, epochs + 1):
            current_learning_rate = max_learning_rate - (max_learning_rate - min_learning_rate) * epoch / epochs
            y_pred = self.forward(X)
            loss = categorical_crossentropy(y, y_pred) / y.shape[0]
            self.loss_history.append(loss)
            self.backward(y, current_learning_rate)

            if log_every and epoch % log_every == 0:
                print(f"Epoch {epoch}, learning rate: {current_learning_rate}, Loss: {loss}")

In [115]:
# Build the training inputs (flattened, scaled pixels) and one-hot targets
one_hot_rows = np.eye(10)
x_train, y_train = [], []

for image_path, label in zip(train_datasets["image_path"], train_datasets["label"]):
    pixels = np.array(crop_image(image_path).convert('L'))
    x_train.append(pixels.reshape(784) / 255.0)  # flatten and scale by 1/255
    y_train.append(one_hot_rows[label])          # one-hot encoding

print("Image count:", len(x_train))

# Stack the per-image rows into 2-D arrays
x_train = np.array(x_train)
y_train = np.array(y_train)


Image count: 4880


In [116]:
# Build the test inputs (flattened, scaled pixels) and one-hot targets
one_hot_rows = np.eye(10)  # built once instead of once per image
x_test = []
y_test = []

for image_path, label in zip(test_datasets["image_path"], test_datasets["label"]):
    img = crop_image(image_path).convert('L')
    img = np.array(img).reshape(784) / 255.0  # flatten and scale by 1/255
    x_test.append(img)
    y_test.append(one_hot_rows[label])  # one-hot encoding

print("Image count: ", len(x_test))

# Stack into 2-D arrays, as is done for x_train / y_train, so the test set
# can be passed to the network as a single batch.
x_test = np.array(x_test)
y_test = np.array(y_test)


Image count:  1230


In [153]:
# Build the network with its default architecture: 784 inputs, hidden layers
# of 64, 32, 32 and 16 nodes, and 10 outputs.
NN = NeuralNetwork()

In [160]:
# Full-batch training on the training split for 5000 epochs. The learning rate
# decays linearly from just under 2e-4 down to 1e-4 on the last epoch.
# train() does not reset the weights, so re-running this cell continues
# training from the current state.
NN.train(x_train, y_train, epochs=5000, min_learning_rate=1e-4, max_learning_rate=2e-4)

Epoch 1, Loss: 0.5477117797096405, learning rate: 0.00019998000000000002
Epoch 2, Loss: 0.5477100803799471, learning rate: 0.00019996
Epoch 3, Loss: 0.5477083736603029, learning rate: 0.00019994
Epoch 4, Loss: 0.5477066678296478, learning rate: 0.00019992
Epoch 5, Loss: 0.5477049672696254, learning rate: 0.0001999
Epoch 6, Loss: 0.5477032614500666, learning rate: 0.00019988000000000002
Epoch 7, Loss: 0.5477015594057393, learning rate: 0.00019986
Epoch 8, Loss: 0.5476998554491362, learning rate: 0.00019984
Epoch 9, Loss: 0.5476981530167844, learning rate: 0.00019982000000000002
Epoch 10, Loss: 0.5476964486108549, learning rate: 0.0001998
Epoch 11, Loss: 0.5476947483931751, learning rate: 0.00019978000000000002
Epoch 12, Loss: 0.5476930417244412, learning rate: 0.00019976
Epoch 13, Loss: 0.5476913374341121, learning rate: 0.00019974
Epoch 14, Loss: 0.5476896406575764, learning rate: 0.00019972000000000002
Epoch 15, Loss: 0.5476879338953852, learning rate: 0.0001997
Epoch 16, Loss: 0.5476

In [164]:
# NOTE: as written this measures accuracy on the TRAINING split. Point both
# names at test_datasets / x_test to measure held-out accuracy instead.
current_datasets = train_datasets.copy()
current_x_test = x_train.copy()

# One batched forward pass instead of one pass per image
probabilities = NN.forward(np.asarray(current_x_test).reshape(-1, 784))
predicted_labels = probabilities.argmax(axis=1)  # most probable class per row
actual_labels = current_datasets["label"].to_numpy()

# Pair predictions with labels by position, not by DataFrame index label
for i, (predicted_label, actual_label) in enumerate(zip(predicted_labels, actual_labels)):
    print(f"Image {i} - Predicted: {predicted_label}, Actual: {actual_label}")

correct_predictions = int((predicted_labels == actual_labels).sum())
total_predictions = len(current_datasets)

accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy}")


Image 0 - Predicted: 3, Actual: 3
Image 1 - Predicted: 1, Actual: 1
Image 2 - Predicted: 8, Actual: 8
Image 3 - Predicted: 9, Actual: 9
Image 4 - Predicted: 9, Actual: 5
Image 5 - Predicted: 5, Actual: 5
Image 6 - Predicted: 9, Actual: 9
Image 7 - Predicted: 1, Actual: 1
Image 8 - Predicted: 9, Actual: 9
Image 9 - Predicted: 9, Actual: 7
Image 10 - Predicted: 6, Actual: 2
Image 11 - Predicted: 8, Actual: 4
Image 12 - Predicted: 8, Actual: 8
Image 13 - Predicted: 6, Actual: 6
Image 14 - Predicted: 4, Actual: 4
Image 15 - Predicted: 5, Actual: 5
Image 16 - Predicted: 8, Actual: 8
Image 17 - Predicted: 6, Actual: 6
Image 18 - Predicted: 9, Actual: 5
Image 19 - Predicted: 7, Actual: 7
Image 20 - Predicted: 8, Actual: 8
Image 21 - Predicted: 5, Actual: 5
Image 22 - Predicted: 8, Actual: 8
Image 23 - Predicted: 3, Actual: 3
Image 24 - Predicted: 2, Actual: 2
Image 25 - Predicted: 9, Actual: 7
Image 26 - Predicted: 4, Actual: 4
Image 27 - Predicted: 1, Actual: 1
Image 28 - Predicted: 0, Actua

# Time to save the trained model

In [162]:
import pickle

# Serialize the trained network and write the bytes to disk
with open("model.pk", "wb") as model_file:
    model_file.write(pickle.dumps(NN))

# Load the saved model

In [166]:
# NOTE: unpickling can execute arbitrary code. Only load model files that you
# created yourself or otherwise trust.
# The context manager closes the file handle once loading finishes.
with open("model.pk", "rb") as model_file:
    model = pickle.load(model_file)

array([[6.52111060e-06, 7.39898703e-06, 4.88526727e-04, 1.16688070e-03,
        2.33216792e-02, 1.19896471e-02, 7.02679826e-06, 2.57336973e-01,
        8.13942365e-02, 6.24281110e-01]])