In [112]:
from sklearn.datasets import load_digits #import the dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [114]:
#Load the data
digits = load_digits()
X = digits.data
Y = digits.target

#Create a DataFrame
digits_data = pd.DataFrame(digits.data, columns=[f'pixel_{i}' for i in range(digits.data.shape[1])])
digits_data['target'] = digits.target  # Add target labels for easier inspection

# Display the first few rows for visualization purposes
print(digits_data.head())

y_one_hot =[]
for (target) in Y:
    one_hot=np.zeros(10)
    one_hot[target] = 1
    y_one_hot.append(one_hot)
y_one_hot = np.array(y_one_hot)

#Divide into train and test data
X_train2, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)

#scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train2)
# Transform the test data using the parameters learned from the training data
X_test_scaled = scaler.transform(X_test)

#Set the network structure as [64, 30, 10] -> 64 for the input layer, 30 for the hidden layer, and 10 for the output layer
input_size = 64     # Input layer size
hidden_size = 30    # Hidden layer size
output_size = 10    # Output layer size

# #One hot encoded the target
# y_train_encoded = np.zeros((y_train.size, output_size))
# y_train_encoded[np.arange(y_train.size), y_train] = 1
# print (y_train_encoded)


   pixel_0  pixel_1  pixel_2  pixel_3  pixel_4  pixel_5  pixel_6  pixel_7  \
0      0.0      0.0      5.0     13.0      9.0      1.0      0.0      0.0   
1      0.0      0.0      0.0     12.0     13.0      5.0      0.0      0.0   
2      0.0      0.0      0.0      4.0     15.0     12.0      0.0      0.0   
3      0.0      0.0      7.0     15.0     13.0      1.0      0.0      0.0   
4      0.0      0.0      0.0      1.0     11.0      0.0      0.0      0.0   

   pixel_8  pixel_9  ...  pixel_55  pixel_56  pixel_57  pixel_58  pixel_59  \
0      0.0      0.0  ...       0.0       0.0       0.0       6.0      13.0   
1      0.0      0.0  ...       0.0       0.0       0.0       0.0      11.0   
2      0.0      0.0  ...       0.0       0.0       0.0       0.0       3.0   
3      0.0      8.0  ...       0.0       0.0       0.0       7.0      13.0   
4      0.0      0.0  ...       0.0       0.0       0.0       0.0       2.0   

   pixel_60  pixel_61  pixel_62  pixel_63  target  
0      10.0     

In [115]:
# Activation Function: Sigmoid

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    sig = sigmoid(x)
    return sig * (1 - sig)

In [116]:
# Activation Function: ReLu

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

In [117]:
# Activation Function: tanH

def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - np.tanh(x)**2

In [118]:
#FeedForward function
#For every X, multiply by the corresponding weight and add the bias
#Pass the previous result into the activation function
#Pass previous result as input into the next layer

def FeedForward_sigmoid (X_train1, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output):
    hidden_layer_input = np.dot(X_train1, weights_input_hidden) + biases_hidden
    hidden_layer_output = sigmoid(hidden_layer_input)
    output_layer_input = np.dot(hidden_layer_output, weights_hidden_output) + biases_output
    output_layer_output = sigmoid(output_layer_input)
    return hidden_layer_input, output_layer_input, hidden_layer_output, output_layer_output
     
def FeedForward_relu (X_train1, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output):
    hidden_layer_input = np.dot(X_train1, weights_input_hidden) + biases_hidden
    hidden_layer_output = relu(hidden_layer_input)
    output_layer_input = np.dot(hidden_layer_output, weights_hidden_output) + biases_output
    output_layer_output = relu(output_layer_input)
    return hidden_layer_input, output_layer_input, hidden_layer_output, output_layer_output

def FeedForward_tanh (X_train1, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output):
    hidden_layer_input = np.dot(X_train1, weights_input_hidden) + biases_hidden
    hidden_layer_output = tanh(hidden_layer_input)
    output_layer_input = np.dot(hidden_layer_output, weights_hidden_output) + biases_output
    output_layer_output = tanh(output_layer_input)
    return hidden_layer_input, output_layer_input, hidden_layer_output, output_layer_output
    

In [120]:
def backPropagation_sigmoid(learning_rate, epochs, batch_size, X_train, y_train, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output, activation="sigmoid"):
    #Use Mini Batch Gradient descent for better results due to the size of the dataset
    num_samples = X_train.shape[0]
    for epoch in range(epochs):
        # Shuffle the data at the start of each epoch
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        X_train = X_train[indices]
        y_train = y_train[indices]
        # Iterate over mini-batches
        for start_id in range(0, num_samples, batch_size):
            end_id = min(start_id + batch_size, num_samples)
            X_batch = X_train[start_id:end_id]
            y_batch = y_train[start_id:end_id]
            
            # Feedforward pass
            if (activation=="sigmoid"):
                hidden_layer_input, output_layer_input, hidden_layer_output, output_layer_output = FeedForward_sigmoid(X_batch, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output)
                error_output = (output_layer_output - y_batch) * sigmoid_derivative(output_layer_input)
                error_hidden_layer = np.dot(error_output, weights_hidden_output.T) * sigmoid_derivative(hidden_layer_input)
            elif (activation=="tanh"):
                hidden_layer_input, output_layer_input, hidden_layer_output, output_layer_output = FeedForward_tanh(X_batch, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output)
                error_output = (output_layer_output - y_batch) * tanh_derivative(output_layer_input)
                error_hidden_layer = np.dot(error_output, weights_hidden_output.T) * tanh_derivative(hidden_layer_input)
            else:
                hidden_layer_input, output_layer_input, hidden_layer_output, output_layer_output = FeedForward_relu(X_batch, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output)
                error_output = (output_layer_output - y_batch) * relu_derivative(output_layer_input)
                error_hidden_layer = np.dot(error_output, weights_hidden_output.T) * relu_derivative(hidden_layer_input)
                
            # Calculate loss (MSE for the batch)
            loss = np.mean((y_batch - output_layer_output) ** 2)
            
            # Backpropagation
            # Output layer error
            #error_output = (output_layer_output - y_batch) * sigmoid_derivative(output_layer_input)
            gradient_weights_hidden_output = np.dot(hidden_layer_output.T, error_output) / batch_size
            gradient_biases_output = np.mean(error_output, axis=0)

            # Hidden layer error
            #error_hidden_layer = np.dot(error_output, weights_hidden_output.T) * sigmoid_derivative(hidden_layer_input)
            gradient_weights_input_hidden = np.dot(X_batch.T, error_hidden_layer) / batch_size
            gradient_biases_hidden = np.mean(error_hidden_layer, axis=0)

            # Update weights and biases
            weights_hidden_output -= learning_rate * gradient_weights_hidden_output
            biases_output -= learning_rate * gradient_biases_output
            weights_input_hidden -= learning_rate * gradient_weights_input_hidden
            biases_hidden -= learning_rate * gradient_biases_hidden
        
        # Print loss every 100 epochs
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
    return weights_input_hidden, biases_hidden, weights_hidden_output, biases_output


In [121]:
def calculate_accuracy(X, y, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output, activation1):
    # Feedforward pass on the test data
    if (activation1=="sigmoid"):
        _, _, _, output_layer_output = FeedForward_sigmoid(X, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output)
    elif (activation1=="tanh"):
        _, _, _, output_layer_output = FeedForward_tanh(X, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output)
    else:
        _, _, _, output_layer_output = FeedForward_relu(X, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output)
    # Convert output layer output to predicted class labels
    y_pred = np.argmax(output_layer_output, axis=1)
    y_true = np.argmax(y, axis=1)
    # Calculate accuracy
    accuracy = np.mean(y_pred == y_true) * 100
    return accuracy


In [137]:
# Sigmoid Implementation
# Set learning rate, epochs, and batch size
learning_rate = 0.01
epochs = 2500
batch_size = 32  # Set mini-batch size

# Initialize weights and biases
weights_input_hidden = np.random.rand(input_size, hidden_size)# * 0.01
biases_hidden = np.zeros(hidden_size)
weights_hidden_output = np.random.rand(hidden_size, output_size)# * 0.01
biases_output = np.zeros(output_size)

# Run backpropagation with mini-batch gradient descent
weights_input_hidden, biases_hidden, weights_hidden_output, biases_output = backPropagation_sigmoid(learning_rate, epochs, batch_size, X_train_scaled, y_train, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output, "sigmoid")
# Calculate accuracy on the test set
test_accuracy = calculate_accuracy(X_test_scaled, y_test, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output, "sigmoid")
print(f"Test Accuracy: {test_accuracy:.2f}%")


Epoch 0, Loss: 0.7132
Epoch 100, Loss: 0.4344
Epoch 200, Loss: 0.0786
Epoch 300, Loss: 0.0793
Epoch 400, Loss: 0.0724
Epoch 500, Loss: 0.0684
Epoch 600, Loss: 0.0594
Epoch 700, Loss: 0.0654
Epoch 800, Loss: 0.0553
Epoch 900, Loss: 0.0549
Epoch 1000, Loss: 0.0480
Epoch 1100, Loss: 0.0482
Epoch 1200, Loss: 0.0246
Epoch 1300, Loss: 0.0255
Epoch 1400, Loss: 0.0331
Epoch 1500, Loss: 0.0149
Epoch 1600, Loss: 0.0256
Epoch 1700, Loss: 0.0274
Epoch 1800, Loss: 0.0172
Epoch 1900, Loss: 0.0200
Epoch 2000, Loss: 0.0228
Epoch 2100, Loss: 0.0223
Epoch 2200, Loss: 0.0183
Epoch 2300, Loss: 0.0191
Epoch 2400, Loss: 0.0135
Test Accuracy: 94.17%


In [136]:
# Sigmoid Implementation
# Set learning rate, epochs, and batch size
learning_rate = 0.01
epochs = 2500
batch_size = 32  # Set mini-batch size

# Initialize weights and biases
weights_input_hidden = np.random.rand(input_size, hidden_size)# * 0.01
biases_hidden = np.zeros(hidden_size)
weights_hidden_output = np.random.rand(hidden_size, output_size)# * 0.01
biases_output = np.zeros(output_size)

# Run backpropagation with mini-batch gradient descent
weights_input_hidden, biases_hidden, weights_hidden_output, biases_output = backPropagation_sigmoid(learning_rate, epochs, batch_size, X_train_scaled, y_train, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output, "tanh")
# Calculate accuracy on the test set
test_accuracy = calculate_accuracy(X_test_scaled, y_test, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output, "tanh")
print(f"Test Accuracy: {test_accuracy:.2f}%")


Epoch 0, Loss: 1.0866
Epoch 100, Loss: 0.8416
Epoch 200, Loss: 0.4292
Epoch 300, Loss: 0.0631
Epoch 400, Loss: 0.0514
Epoch 500, Loss: 0.0533
Epoch 600, Loss: 0.0454
Epoch 700, Loss: 0.0445
Epoch 800, Loss: 0.0484
Epoch 900, Loss: 0.0456
Epoch 1000, Loss: 0.0372
Epoch 1100, Loss: 0.0409
Epoch 1200, Loss: 0.0405
Epoch 1300, Loss: 0.0474
Epoch 1400, Loss: 0.0403
Epoch 1500, Loss: 0.0344
Epoch 1600, Loss: 0.0336
Epoch 1700, Loss: 0.0294
Epoch 1800, Loss: 0.0447
Epoch 1900, Loss: 0.0345
Epoch 2000, Loss: 0.0315
Epoch 2100, Loss: 0.0316
Epoch 2200, Loss: 0.0316
Epoch 2300, Loss: 0.0313
Epoch 2400, Loss: 0.0375
Test Accuracy: 92.50%


In [138]:
# Sigmoid Implementation
# Set learning rate, epochs, and batch size
learning_rate = 0.01
epochs = 2500
batch_size = 32  # Set mini-batch size

# Initialize weights and biases
weights_input_hidden = np.random.rand(input_size, hidden_size)# * 0.01
biases_hidden = np.zeros(hidden_size)
weights_hidden_output = np.random.rand(hidden_size, output_size)# * 0.01
biases_output = np.zeros(output_size)

# Run backpropagation with mini-batch gradient descent
weights_input_hidden, biases_hidden, weights_hidden_output, biases_output = backPropagation_sigmoid(learning_rate, epochs, batch_size, X_train_scaled, y_train, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output, "relu")
# Calculate accuracy on the test set
test_accuracy = calculate_accuracy(X_test_scaled, y_test, weights_input_hidden, biases_hidden, weights_hidden_output, biases_output, "relu")
print(f"Test Accuracy: {test_accuracy:.2f}%")


Epoch 0, Loss: 0.1080
Epoch 100, Loss: 0.1000
Epoch 200, Loss: 0.1000
Epoch 300, Loss: 0.1000
Epoch 400, Loss: 0.1000
Epoch 500, Loss: 0.1000
Epoch 600, Loss: 0.1000
Epoch 700, Loss: 0.1000
Epoch 800, Loss: 0.1000
Epoch 900, Loss: 0.1000
Epoch 1000, Loss: 0.1000
Epoch 1100, Loss: 0.1000
Epoch 1200, Loss: 0.1000
Epoch 1300, Loss: 0.1000
Epoch 1400, Loss: 0.1000
Epoch 1500, Loss: 0.1000
Epoch 1600, Loss: 0.1000
Epoch 1700, Loss: 0.1000
Epoch 1800, Loss: 0.1000
Epoch 1900, Loss: 0.1000
Epoch 2000, Loss: 0.1000
Epoch 2100, Loss: 0.1000
Epoch 2200, Loss: 0.1000
Epoch 2300, Loss: 0.1000
Epoch 2400, Loss: 0.1000
Test Accuracy: 9.17%


In [None]:
#The best results given the hyperparemeters of learning_rate = 0.01, epochs = 2500, the best performing trial was using the sigmoid activation function with an accuracy value of 94.17%, 
#followed by tanh activation function with hyperparameters learning_rate = 0.01, epochs = 2500 with a result of 92.50% and finally the Relu activation function with a value of 9.17%
#

In [128]:
#Accuracy
#Do predict
#Find the index of the maximum value in the output later
#Compare the index to the index in the one-hot encoded Y array
