In [None]:
#Olof Persson
#Project 2 - Neural Network Grid Search Implementation

In [34]:
import pickle as cPickle, gzip
# Load the dataset
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f,encoding='latin1')
f.close()

In [88]:
import numpy as np
def relu_derivative(x):
    return np.where(x > 0, 1, 0)
def softmax(x):
    exp_scores = np.exp(x)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
def relu(x):
    return np.maximum(0, x)
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))


In [155]:
# THE NEURAL NETWORK - this is gradient descent with batching to the data to create a more accurate gradient
def train_nn(X, y, hidden_dim, learning_rate, reg_strength, num_iters):
#     Train a neural network on the given data.
#     X (numpy.ndarray): Input data, shape (N, D).
#     y (numpy.ndarray): Labels, shape (N,).
#     hidden_dim (int): Number of neurons in the hidden layer.
#     learning_rate (float): Learning rate for gradient descent.
#     reg_strength (float): Regularization strength.
#     num_iters (int): Number of iterations for gradient descent.
    batch_size = 1000
    num_classes = len(np.unique(y))
    num_examples, input_dim = X.shape

    # Initialize weights
    W1 = np.random.randn(input_dim, hidden_dim) / np.sqrt(input_dim / 2)
    b1 = np.zeros((1, hidden_dim))
    W2 = np.random.randn(hidden_dim, num_classes) / np.sqrt(hidden_dim / 2)
    b2 = np.zeros((1, num_classes))

    # Gradient descent loop
    loss_history = []
    for i in range(num_iters):
            # Shuffle the data after every epoch
            shuffled_indices = np.random.permutation(num_examples)
            x_shuffled = X[shuffled_indices]
            y_shuffled = y[shuffled_indices]
            
            for i in range(0, num_examples, batch_size):
        # Forward pass
                z1 = np.dot(X, W1) + b1
                a1 = sigmoid(z1)
                z2 = np.dot(a1, W2) + b2
                scores = sigmoid(z2)

        # Compute loss and regularization
                data_loss = np.mean(-np.log(scores[range(num_examples), y]))
                reg_loss = 0.5 * reg_strength * (np.sum(W1 * W1) + np.sum(W2 * W2))
                loss = data_loss + reg_loss
                loss_history.append(loss)

        # Backward pass
                dscores = scores
                dscores[range(num_examples), y] -= 1
                dscores /= num_examples

                dW2 = np.dot(a1.T, dscores)
                db2 = np.sum(dscores, axis=0, keepdims=True)

                da1 = np.dot(dscores, W2.T)
                dz1 = da1 * sigmoid_derivative(a1)

                dW1 = np.dot(X.T, dz1)
                db1 = np.sum(dz1, axis=0, keepdims=True)

        # Add regularization
                dW2 += reg_strength * W2
                dW1 += reg_strength * W1

        # Perform parameter update
                W1 -= learning_rate * dW1
                b1 -= learning_rate * db1
                W2 -= learning_rate * dW2
                b2 -= learning_rate * db2

    # Store weights in a dictionary
    weights = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

    return weights, loss_history[-1]


In [105]:
def predict(test_data, weights, biases):
    activations = test_data
    for i in range(len(weights)):
        activations = np.dot(activations, weights[i]) + biases[i]
        activations = sigmoid(activations)
    predicted_labels = np.argmax(activations, axis=1)
    return predicted_labels


In [163]:
import numpy as np
# Divide the data into training, validation, and test set x and y values
X_train, y_train = train_set
X_val, y_val = valid_set
X_test, y_test = test_set

# Define the hyperparameters to search over
learning_rates = [0.001, 0.01]
hidden_size_sets = [64, 128]

best_loss_acc = 10000000
best_hyperparams = {}
# Loop over all hyperparameter combinations and evaluate validation accuracy
# TASK 1 the grid search and the calling of the neural network in each pass of the gird search
# THE GRID SEARCH ALGORITHM - it is simplified with less hyper parameters to search through because it already takes a super long time to run

for lr in learning_rates:
    for hidden_sizes in hidden_size_sets:
        #model, val_acc = train_nn1(X_train, y_train, hidden_sizes1, output_size1, lr)
        model, loss_acc = train_nn(X_train, y_train,hidden_sizes, lr, 0, 10)
        print("Done with hidden size: ", hidden_sizes, " and learning rate: ", lr)
        print('Loss accuracy:', loss_acc)
        if loss_acc < best_loss_acc:
            best_loss_acc = loss_acc
            best_hyperparams = {'lr': lr, 'hidden_sizes': hidden_sizes}

# TASK 2 the training of the neural network with the best hyper parameters and predicting the test set and then reporting the accuracy

# Train a new model with the best hyperparameters on the combined training and validation sets
best_model, val_acc = train_nn(X_val, y_val, best_hyperparams['hidden_sizes'], best_hyperparams['lr'], 0, 10)
y_pred = predict(X_test, [best_model['W1'], best_model['W2']], [best_model['b1'], best_model['b2']])
# Test the performance of the model on the test set
test_acc = np.mean(y_pred == y_test)
print('Test accuracy:', test_acc)


Done with hidden size:  64  and learning rate:  0.001
Loss accuracy: 1.872101641603634
Done with hidden size:  128  and learning rate:  0.001
Loss accuracy: 2.0818422159091616
Done with hidden size:  64  and learning rate:  0.01
Loss accuracy: 1.8341151031556326
Done with hidden size:  128  and learning rate:  0.01
Loss accuracy: 1.7969256283495365
Test accuracy: 0.4028


In [None]:
#The example above was done with the number of iterations down to 10 vs the 100 used to get the 70% accuracy on
#single neural network tests mentioned below, becuase with 10 iterations it took 27 minutes to run the grid search on
#the computer. That is why the accuracy is so low. The accuracy with 100 iterations is 70% and more. I hope that is example 
#showing the grid search is enough to show that the grid search works and such. 

In [159]:
#Comparing to Assignment 3, Question 2
#The result compared to question 2 in assignment 3 is a little bit worse in that this one has around a 70-75% accuracy, 
#whereas the gridsearchCV algorithm and the MLPclassifier in the sklearn package resulted in a 96% accuracy.

In [160]:
#Discuss my performance
#The MLPclassifier only had one hidden layer like my algorithm, but the algorithm is definitely more accurate. 
#One way that I could improve the accuracy of my algorithm is to add more hidden layers, but that would take a lot more 
#time to run and increase the complexity of the algorithm. I could also try to add more hyper parameters to search through
#in the grid search algorithm. Something else I could also do would be to change the batch size in the gradient descent to
#be smaller or larger depending what makes the algorithm give a greater accuracy even if the smaller batch size takes longer.
#Lastly, I can think about tuning the regularization strength to see if the algorithm needed to be adjusted for overfitting. 
#All of those options could increase the accuracy of the algorithm, but it could make it take 
#a lot longer to run on an algorithm that already times a lot of time to run compared to the built in MLPclassifier. 