In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Seed NumPy's global RNG so the shuffle below (and therefore the train/dev split),
# as well as any later np.random draws such as weight initialisation, give the
# same result after "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)

data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
data = np.array(data)  # DataFrame -> 2-D array with one row per CSV row
m, n = data.shape  # m = number of rows (examples), n = number of columns
np.random.shuffle(data)  # shuffles the rows in place to remove any ordering patterns

In [None]:
# Hold out the first 1000 (already shuffled) rows as a dev set. Transposing puts
# one example per column: row 0 is used as the label, rows 1..n-1 as the inputs.
data_dev = data[:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n] / 255  # presumably 0-255 pixel intensities, scaled down by 255

# Every remaining row is used for training, laid out the same way.
data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n] / 255
_, m_train = X_train.shape  # m_train = number of training examples (columns)
print("Data shape:", data.shape)

In [None]:
# Width of every hidden layer (rows of each hidden weight matrix in init_params).
n_neurons = 64  # Increase from 32 to 64
# NOTE: despite its name this counts ALL weight layers, output layer included:
# init_params() builds this many weight matrices and the last one has 10 rows,
# so a value of 2 means one hidden layer followed by the 10-way output layer.
n_hidden_layers = 2

In [None]:
def init_params():
    """Create randomly initialised weights and zero biases for every layer.

    Layer sizes come from the notebook-level ``n_neurons`` and
    ``n_hidden_layers``: the first layer maps 784 inputs to ``n_neurons``
    units, the last layer maps ``n_neurons`` units to 10 outputs, and any
    layer in between is ``n_neurons`` x ``n_neurons``. Weights are drawn from
    a standard normal and scaled by ``sqrt(1 / fan_in)`` to keep them small.

    Note: the first-layer rule is checked first, so with
    ``n_hidden_layers == 1`` only the (n_neurons, 784) layer is created.

    Returns:
        (W, b): two lists with one weight matrix / bias column per layer.
    """
    W = []
    b = []
    last_layer = n_hidden_layers - 1
    for layer in range(n_hidden_layers):
        if layer == 0:
            fan_out, fan_in = n_neurons, 784  # input layer
        elif layer == last_layer:
            fan_out, fan_in = 10, n_neurons  # output layer
        else:
            fan_out, fan_in = n_neurons, n_neurons  # intermediate hidden layer
        scale = np.sqrt(1. / fan_in)
        W.append(np.random.normal(size=(fan_out, fan_in)) * scale)
        b.append(np.zeros((fan_out, 1)))

    return W, b

In [None]:
def ReLU(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``."""
    floor = 0
    return np.maximum(floor, Z)

ReLU function (applied element-wise):
if z > 0:
    return z
else:
    return 0

In [None]:
def softmax(Z):
    """Column-wise softmax: turn each column of ``Z`` into values that sum to 1.

    The column maximum is subtracted before exponentiating so that large
    inputs do not overflow; this does not change the result.
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    weights = np.exp(shifted)
    # keepdims keeps the totals as a row so the division broadcasts per column
    column_totals = weights.sum(axis=0, keepdims=True)
    return weights / column_totals

Converts numbers into probabilities (each column sums to 1).

In [None]:
def forward_prop(W, b, X):
    """Run a forward pass through every layer described by ``W`` and ``b``.

    The number of layers is taken from ``len(W)`` so the pass always matches
    the parameters it is given, even if the notebook-level layer-count
    constant has been changed since those parameters were created.

    Args:
        W: list of weight matrices, one per layer.
        b: list of bias columns, one per layer.
        X: input matrix; assumed to hold one example per column so that
           ``W[0].dot(X)`` is defined.

    Returns:
        (Z, A): lists of pre-activations and activations, one entry per
        layer. Layers use ReLU, except that the final layer uses softmax
        when it is not also the first layer.
    """
    n_layers = len(W)
    Z = []
    A = []
    for i in range(n_layers):
        # The first layer reads the raw input, later layers read the previous activation.
        layer_input = X if i == 0 else A[i - 1]
        Z.append(W[i].dot(layer_input) + b[i])
        if i != 0 and i == n_layers - 1:
            A.append(softmax(Z[i]))  # output layer
        else:
            A.append(ReLU(Z[i]))

    return Z, A

In [None]:
def ReLU_prime(Z):
    """Derivative of ReLU: True where ``Z`` is strictly positive, else False.

    The boolean result acts as 1/0 when multiplied with numeric arrays.
    """
    is_positive = Z > 0
    return is_positive

The derivative of ReLU: 1 where Z > 0, otherwise 0.

In [None]:
def one_hot_encode(y, num_classes=10):
    """One-hot encode integer labels, one column per example.

    Args:
        y: 1-D array of non-negative integer class labels.
        num_classes: minimum number of rows (classes) in the result. Defaults
            to 10 to match the 10 output units built by ``init_params``, so
            the encoding keeps its shape even when the largest label does not
            occur in ``y``. If ``y`` contains a label >= ``num_classes`` the
            result is widened to ``y.max() + 1`` rows, as before.

    Returns:
        Array of shape (rows, y.size) holding a single 1 per column at the
        row given by that example's label.
    """
    y = np.asarray(y)
    if y.size == 0:
        # y.max() is undefined for empty input; return an empty encoding instead.
        return np.zeros((num_classes, 0))
    n_rows = max(num_classes, int(y.max()) + 1)  # +1 because labels start at 0
    one_hot_y = np.zeros((y.size, n_rows))
    one_hot_y[np.arange(y.size), y] = 1  # for example k, set column y[k] to 1
    return one_hot_y.T  # transpose so examples are columns, like the other data

In [None]:
def backward_prop(Z, A, W, X, Y, m, lambda_reg=0.01):
    """Back-propagate the output error and return per-layer gradients.

    The number of layers is taken from ``len(W)`` so the backward pass always
    matches the parameters it is given rather than notebook-level state.
    At least two layers are required (the output layer reads ``A[-2]``).

    Args:
        Z: list of pre-activations from the forward pass, one per layer.
        A: list of activations from the forward pass, one per layer.
        W: list of weight matrices, one per layer.
        X: the input batch that produced ``Z`` and ``A``.
        Y: one-hot targets with the same shape as ``A[-1]``.
        m: number of examples the gradients are averaged over.
        lambda_reg: strength of the weight penalty; ``(lambda_reg / m) * W``
            is added to every weight gradient.

    Returns:
        (dW, db): lists of weight and bias gradients, ordered first layer
        to last layer.
    """
    n_layers = len(W)
    scale = 1 / m
    decay = lambda_reg / m

    # Output layer: the error is the difference between predictions and
    # targets (the form the gradient takes for softmax with cross-entropy).
    delta = A[-1] - Y
    dW = [scale * delta.dot(A[-2].T) + decay * W[-1]]
    db = [scale * np.sum(delta, axis=1, keepdims=True)]

    # Remaining layers, walking from the second-to-last layer back to the first.
    for i in range(n_layers - 2, -1, -1):
        delta = W[i + 1].T.dot(delta) * ReLU_prime(Z[i])
        layer_input = X if i == 0 else A[i - 1]
        dW.append(scale * delta.dot(layer_input.T) + decay * W[i])
        db.append(scale * np.sum(delta, axis=1, keepdims=True))

    # Gradients were collected last-to-first; flip them into layer order.
    dW.reverse()
    db.reverse()
    return dW, db

In [None]:
def test_prediction(index, W, b):
    """Predict one training example, print the result and occasionally plot it.

    The previous version read ``x_test`` / ``y_test``, names that are never
    defined in this notebook, so any call raised NameError. It now reads
    ``X_train`` / ``Y_train``, matching the later redefinition of this
    function further down the notebook.

    Args:
        index: column index of the example in ``X_train``.
        W, b: per-layer weights and biases passed to ``make_predictions``.

    Returns:
        (predicted_label, true_label) for that example.
    """
    # Indexing with None keeps the example as a column vector of shape (features, 1).
    current_image = X_train[:, index, None]
    prediction = make_predictions(current_image, W, b)
    label = Y_train[index]
    print("Prediction: ", prediction)
    print("Label: ", label)

    # Only draw every 100th example to avoid flooding the output with figures.
    if index % 100 == 0:
        image_2d = current_image.reshape((28, 28)) * 255  # undo the /255 scaling
        plt.gray()
        plt.imshow(image_2d, interpolation='nearest')
        plt.show()
    return prediction[0], label

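cost = (a2 - y)^2

Note: the output-layer error used in `backward_prop`, `A[-1] - Y`, is the gradient of softmax combined with cross-entropy loss, cost = -sum(y * log(a2)), not of the squared error written above.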

In [None]:
def update_params(W, b, dW, db, alpha):
    """Apply one gradient-descent step to every layer.

    The layer count is taken from ``len(W)`` so the update always covers
    exactly the parameters it is given, independent of notebook-level state.

    Args:
        W, b: lists of weight matrices and bias columns, one per layer.
        dW, db: matching lists of gradients.
        alpha: learning rate.

    Returns:
        The same ``W`` and ``b`` list objects, whose entries have been
        replaced by the updated arrays (the old arrays are not modified).
    """
    for i in range(len(W)):
        W[i] = W[i] - alpha * dW[i]
        b[i] = b[i] - alpha * db[i]
    return W, b

In [None]:
def get_predictions(A):
    """Return, for each column of ``A``, the row index holding the largest value."""
    winning_rows = np.argmax(A, axis=0)
    return winning_rows

def get_accuracy(predictions, Y):
    """Return the fraction of entries where ``predictions`` equals ``Y``."""
    matches = predictions == Y
    n_correct = np.sum(matches)
    return n_correct / Y.size

In [None]:
def gradient_descent_early_stopping(X_train, Y_train, X_dev, Y_dev, alpha, batch_size=64, patience=50, max_epochs=1000):
    """Train with mini-batch gradient descent, stopping early on a plateau.

    After every pass over the training data the accuracy on the dev set is
    measured; training stops once it has failed to beat the best value seen
    so far for ``patience`` consecutive passes.

    Args:
        X_train: training inputs, one example per column.
        Y_train: integer training labels.
        X_dev: dev-set inputs, one example per column.
        Y_dev: integer dev-set labels.
        alpha: learning rate.
        batch_size: number of examples per mini-batch.
        patience: number of passes without improvement tolerated before stopping.
        max_epochs: upper bound on the number of passes (previously fixed at 1000).

    Returns:
        (W, b): the parameters as they are when training ends.
    """
    W, b = init_params()
    one_hot_Y_train = one_hot_encode(Y_train)
    m = X_train.shape[1]
    best_accuracy = 0
    no_improvement = 0

    for i in range(max_epochs):  # Number of epochs
        for j in range(0, m, batch_size):
            X_batch = X_train[:, j:j + batch_size]
            Y_batch = one_hot_Y_train[:, j:j + batch_size]
            # The final batch is shorter whenever m is not a multiple of
            # batch_size, so average over the examples actually in the batch.
            current_batch_size = X_batch.shape[1]
            Z, A = forward_prop(W, b, X_batch)
            dW, db = backward_prop(Z, A, W, X_batch, Y_batch, current_batch_size)
            W, b = update_params(W, b, dW, db, alpha)

        # Check validation accuracy
        predictions = make_predictions(X_dev, W, b)
        accuracy = get_accuracy(predictions, Y_dev)
        print(f"Iteration: {i}, Validation Accuracy: {accuracy}")

        # Early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            no_improvement = 0
        else:
            no_improvement += 1
            if no_improvement >= patience:
                print(f"Early stopping at iteration {i}")
                break
    return W, b

In [None]:
def make_predictions(X, W, b):
    """Return the predicted class index for every column of ``X``.

    Args:
        X: input matrix, one example per column.
        W, b: per-layer weights and biases passed to ``forward_prop``.

    Returns:
        1-D array with the row index of the largest output for each example.
    """
    Z, A = forward_prop(W, b, X)
    # Read the final layer's activations. The previous hard-coded A[1] was
    # only the output layer when the network had exactly two layers.
    predictions = get_predictions(A[-1])
    return predictions

def test_prediction(index, W, b):
    """Return ``(predicted_label, true_label)`` for one training example.

    ``index`` selects a column of ``X_train`` and the matching entry of
    ``Y_train``.
    """
    # Indexing with None keeps the example as a single-column matrix.
    sample = X_train[:, index, None]
    predicted = make_predictions(sample, W, b)
    return predicted[0], Y_train[index]
    #print("Prediction: ", prediction)
    #print("Label: ", label)
    
    #current_image = current_image.reshape((28, 28)) * 255
    #plt.gray()
    #plt.imshow(current_image, interpolation='nearest')
    #plt.show()

Different alpha values and their accuracy:
0.1 : 93%
0.001 : 54%
0.01 : 88%
0.12 : 93.3%

16 neurons & 0.12 alpha: 95.5%
0.10 : 94%

In [None]:
# Train the network with learning rate 0.12 and mini-batches of 64 examples.
W, b = gradient_descent_early_stopping(
    X_train, Y_train, X_dev, Y_dev, alpha=0.12, batch_size=64
)

In [None]:
# Count correct predictions over the first 1000 examples served by
# test_prediction (which indexes X_train / Y_train) and report the fraction.
n_checked = 1000
correct = 0
for sample_idx in range(n_checked):
    predicted, actual = test_prediction(sample_idx, W, b)
    if predicted == actual:
        correct += 1
print('accuracy: ', correct / n_checked)

In [None]:
# Load data_human (single image)
#data_human = pd.read_csv('/kaggle/input/pixelss/pixels.csv', header=None)
#data_human = np.array(data_human)

#Normalize the data
#X_human = data_human / 255

def make_predictions_human(X, W, b):
    """Predict the class index for each column of a hand-drawn input ``X``.

    Runs the forward pass and picks the strongest unit of the last layer.
    """
    _, activations = forward_prop(W, b, X)
    output_layer = activations[-1]
    # Optional preview of the drawn digit, currently disabled:
    #plt.imshow(X_human.reshape(28, 28), cmap='gray')
    #plt.title("Human-Drawn Digit")
    #plt.show()
    return get_predictions(output_layer)


#prediction = make_predictions_human(X_human, W, b)
#print("Predicted digit for data_human:", prediction[0])

In [None]:
# Load data_test
data_test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
data_test = np.array(data_test)
data_test = data_test.T  # Now shape is (784, 28000)

# The test file has no label column, so every row of data_test is pixel data.
# The earlier code sliced pixel rows into Y_test (using the training frame's
# column count); Y_test is now None so accidental use fails loudly.
Y_test = None

# Normalize the data
X_test = data_test / 255

In [None]:
def test_prediction_test(index, W, b):
    """Return the predicted label for column ``index`` of ``X_test``."""
    # Indexing with None keeps the example as a single-column matrix.
    sample = X_test[:, index, None]
    predicted = make_predictions(sample, W, b)
    return predicted[0]

There are no labels for the testing data

In [None]:
# Show the first 10 test images, each titled with the network's prediction.
n_preview = 10
for sample_idx in range(n_preview):
    predicted_digit = test_prediction_test(sample_idx, W, b)
    pixels = X_test[:, sample_idx].reshape(28, 28)
    plt.imshow(pixels, cmap='gray')
    plt.title(f"Predicted: {predicted_digit} ")
    plt.show()

Improvements:
use mini-batches
increase hidden layer neurons to 16 neurons