In [2]:
import numpy as np
import pandas as pd

testing_data = pd.read_csv("assets/testing10000.csv")
testing_data_labels = pd.read_csv("assets/testing10000_labels.csv")

training_data = pd.read_csv("assets/training60000.csv")
training_data_labels = pd.read_csv("assets/training60000_labels.csv")

#print("training data labels dimensions look like: ")
#print(training_data_labels.shape)
#print("The first 5 rows of the original labels: ")
#print(training_data_labels[:5])


#need functions to pull data from the csv, pass 2 dataframes
def load_from_csv(features, labels):
    X = features.values
    y = labels.values
    return X, y

#for final accuracy calc
def calculate_accuracy(predictions, ground_truth):
    correct_predictions = np.sum(predictions == ground_truth)
    total_samples = len(ground_truth)
    accuracy = correct_predictions / total_samples
    return accuracy * 100

#logistic activation function, also known as sigmoid
def logistic(x):
    return 1 / (1 + np.exp(-x))

#derivative of the logistic function as well
def logistic_deriv(x):
    return x * (1-x)

#softmax activation function
def softmax(x):
    exp = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp / np.sum(exp, axis=1, keepdims=True)

#mean squared error for the cost of prediction
def mean_squared_error(true, prediction):
    return np.mean((true, prediction)**2)

#and the derivative of the function
def mean_squared_error_deriv(true, prediction):
    return 2 * (prediction - true) / len(true)

#cross entropy
def cross_entropy_loss(true, prediction):
    #epsilon storage
    epsilon = 1e-8
    prediction = np.clip(prediction, epsilon, 1 - epsilon)
    return -np.sum(true * np.log(prediction)) / len(true)

#and the cross entropy derivative
def cross_entropy_loss_deriv(true, prediction):
    return (prediction - true) / len(true)



def initialize_weights(input, hidden, output):
    #maybe have to set a random seed here? keep it consistent
    np.random.seed(42)
    weights_input_hidden = np.random.randn(input, hidden)
    biases_input_hidden = np.zeros((1, hidden))
    weights_hidden_output = np.random.randn(hidden, output)
    biases_hidden_output = np.zeros((1, output))
    return weights_input_hidden, biases_input_hidden, weights_hidden_output, biases_hidden_output


def forward_propogation(X, weights_input, bias_input, weights_hidden_output, biases_hidden_output):
    #forward pass
    hidden_input = np.dot(X, weights_input) + bias_input
    hidden_output = logistic(hidden_input) #use the activation function on the dot product
    output_input = np.dot(hidden_output, weights_hidden_output) + biases_hidden_output #for next layer, dot product the activated layer and the weights + bias
    output_output = softmax(output_input) #apply softmax to change the values to probabilities, 0.0-1.0
    
    return hidden_output, output_output #return output of the hidden layer, as well as the probability output we continue passing 
    

#backpropogation
def Back_propogate(X, y, hidden_output, output_output, weights_input_hidden, weights_hidden_output, biases_input_hidden, biases_hidden_output, learning_rate):

    #find the derivative of the error of the output layer, first by output with cross entropy and then the dot product of the output error with the weights
    #to find what we need to change, multiplying it by the deriv of the sigmoid func
    output_error = cross_entropy_loss_deriv(y, output_output)
    hidden_error = np.dot(output_error, weights_hidden_output.T) * logistic_deriv(hidden_output)

    #weight updates with the learning rate
    weights_hidden_output -= learning_rate * np.dot(hidden_output.T, output_error)
    biases_hidden_output -= learning_rate * np.sum(output_error, axis=0, keepdims=True)
    weights_input_hidden -= learning_rate * np.dot(X.reshape(1, -1).T, hidden_error)
    biases_input_hidden -= learning_rate * np.sum(hidden_error, axis=0, keepdims=True)

def train_neural_network(X_train, Y_train, batch_size, input_size, hidden_size, output_size, learning_rate, epochs):
    #generate the weights and biases
    weights_input_hidden, biases_input_hidden, weights_hidden_output, biases_hidden_output = initialize_weights(input_size, hidden_size, output_size)

    #printed line just to know we entered and that data is being processed
    print("now processing the data, this will take a while...")
    
    #for tracking and printing
    correct_classifs = 0
    incorrect_classifs = 0

    #start doing training loops
    for epoch in range(epochs):
        indices = np.arange(len(X_train)) #create a var that spans the length of the X training set
        np.random.shuffle(indices) #shuffle those indices
        X_shuffle, Y_shuffle = X_train[indices], Y_train[indices] #take a data nugget from the y and x training set

        for i in range(len(X_shuffle)):
            #forward propogation
            hidden_output, output_output = forward_propogation(X_shuffle[i], weights_input_hidden, biases_input_hidden, weights_hidden_output, biases_hidden_output)
            
            #backwards propogation
            Back_propogate(X_shuffle[i], Y_shuffle[i], hidden_output, output_output, weights_input_hidden, weights_hidden_output, biases_input_hidden, biases_hidden_output, learning_rate)
            
            #passing X_shuffle into the forward prop is using the entire dataset, which obviously cant be used
            #calculating info loss for the batch
            if i % batch_size == 0:
                _, train_output = forward_propogation(X_shuffle[i], weights_input_hidden, biases_input_hidden, weights_hidden_output, biases_hidden_output)
                loss = cross_entropy_loss(Y_train, train_output)
                print(f"Epoch {epoch + 1}/{epochs}, Batch {i//batch_size + 1}/{len(X_shuffle)//batch_size}, Loss: {loss}")

    _, final_output = forward_propogation(X_train, weights_input_hidden, biases_input_hidden, weights_hidden_output, biases_hidden_output)
    predicted_labels = np.argmax(final_output, axis=1)
    ground_truth_labels = np.argmax(Y_train, axis=1)


    #print out the params of the process
    print(f"Output of a neural network with {hidden_size} neurons in a single hidden layer, a learning rate of {learning_rate}, data divided into {batch_size} and passed over {epochs} times:")
    
    #count the classifs
    correct_classifs += np.sum(predicted_labels == ground_truth_labels)
    incorrect_classifs += np.sum(predicted_labels != ground_truth_labels)
    print(f"Correct classifications made: {correct_classifs}")
    print(f"Incorrect classifications made: {incorrect_classifs}")

    # No need for one-hot encoding for ground truth labels in the accuracy calculation
    accuracy = calculate_accuracy(predicted_labels, ground_truth_labels)
    print(f"Overall Accuracy: {accuracy}%")

    return weights_input_hidden, biases_input_hidden, weights_hidden_output, biases_hidden_output

    
x_train, y_train = load_from_csv(training_data, training_data_labels)

#details of the ANN outlined by assignment + experiment numbers
inputs = 784 #logistic
hidden_layers = 30 #can change the number to experiment
output_layers = 10 #softmax
epochs = 25
batches = 500 #batch length, make it much larger for the actual data
learn_rate = .01

#gotta "one hot" the y graph to remove errors
def one_hot(y):
    one_hot_y = np.zeros((y.size, y.max() + 1))
    one_hot_y[np.arange(y.size), y.flatten()] = 1  # Use flatten to ensure correct indexing
    return one_hot_y
    
y_train_hot = one_hot(y_train)

#finally, call the program
trained_weights_input_hidden, trained_biases_input_hidden, trained_weights_hidden_output, trained_biases_hidden_output = train_neural_network(x_train, y_train_hot, batches, inputs, hidden_layers, output_layers, learn_rate, epochs) 

#stats for configs:
#baseline: 30 layers, 85 epochs, 500 batches, .01 learn rate
#Overall Accuracy: 0.9396823280388007 -baseline,
#Overall Accuracy: 0.9345655760929349 -15 epochs
#Overall Accuracy: 0.9185153085884765 -45 epochs
#Overall Accuracy: 0.9090151502525042 -55 epochs
#Overall Accuracy: 0.89504825080418 -65 epochs
#Overall Accuracy: 0.9027983799729995 -60 epochs, the magic number

#proceeding with 25 epochs:
# 250 batches = no difference in the loss jumping
# .1 learning rate = massive jump in percent, not in the loss jumping
#    proceeding to push down the pochs with learn rate at this value:
#    20 epochs = Overall Accuracy: 0.9587326455440924 
#    10 epochs = Overall Accuracy: 0.9430157169286155
#    1 epoch? = Overall Accuracy: 0.8652144202403373 
#
#    this is likely an overshooting issue, time to see learning rate .001 impact:
#    20 epochs = Overall Accuracy: 0.7585126418773647 
#    30 epochs = Overall Accuracy: 0.797046617443624 
#    85 epoochs = Overall Accuracy: 0.8594476574609576
#    the least necessary epochs for the model to have 90% accuracy is much higher than .01,  
#    each epoch contributes much less accuracy in this learning rate, implying its too low a number
#

now processing the data, this will take a while...
Epoch 1/25, Batch 1/119, Loss: 5.777004842681532
Epoch 1/25, Batch 2/119, Loss: 5.474618732066435
Epoch 1/25, Batch 3/119, Loss: 4.368222795705219
Epoch 1/25, Batch 4/119, Loss: 3.9884882048629318
Epoch 1/25, Batch 5/119, Loss: 4.1596952655205675
Epoch 1/25, Batch 6/119, Loss: 3.4474067150013346
Epoch 1/25, Batch 7/119, Loss: 3.132475710847291
Epoch 1/25, Batch 8/119, Loss: 5.439696426771947
Epoch 1/25, Batch 9/119, Loss: 3.398846669118386
Epoch 1/25, Batch 10/119, Loss: 2.9195501285236336
Epoch 1/25, Batch 11/119, Loss: 3.2056809842661695
Epoch 1/25, Batch 12/119, Loss: 3.1532243785456213
Epoch 1/25, Batch 13/119, Loss: 3.2605686279976966
Epoch 1/25, Batch 14/119, Loss: 4.328222415939447
Epoch 1/25, Batch 15/119, Loss: 2.7476509477826756
Epoch 1/25, Batch 16/119, Loss: 3.7320204102386323
Epoch 1/25, Batch 17/119, Loss: 2.6869778071334167
Epoch 1/25, Batch 18/119, Loss: 3.8981357898625033
Epoch 1/25, Batch 19/119, Loss: 4.2367848714931