In [36]:
# Problem 3
import numpy as np
from sklearn.model_selection import train_test_split

In [37]:
# Load data
starting_x_tr = np.reshape(np.load("fashion_mnist_train_images.npy"), (-1, 28*28))
starting_y_tr = np.load("fashion_mnist_train_labels.npy")
x_te = np.reshape(np.load("fashion_mnist_test_images.npy"), (-1, 28*28))
y_te = np.load("fashion_mnist_test_labels.npy")

In [38]:
# Normalize data
starting_x_tr = starting_x_tr/255
x_te = x_te/255

In [39]:
# One hot encoding function
def to_one_hot(y):
    shape = (y.size, int(np.max(y) + 1))
    rows = np.arange(y.size)
    one_hot = np.zeros(shape)
    one_hot[rows, y] = 1.
    return one_hot

In [40]:
# Split training data to create validation datasets
x_train, x_val, y_train, y_val = train_test_split(starting_x_tr, starting_y_tr, train_size=0.8)
print("X_train size:", x_train.shape)
print("X_val size:", x_val.shape)
print("y_train size:", y_train.shape)
print("y_val size:", y_val.shape)

X_train size: (48000, 784)
X_val size: (12000, 784)
y_train size: (48000,)
y_val size: (12000,)


In [41]:
# Converting y training, validation, and testing sets to one-hot encoded
y_train_one_hot = to_one_hot(y_val)
print(y_train_one_hot.shape)

y_val_one_hot = to_one_hot(y_val)
print(y_val_one_hot.shape)

y_te_one_hot = to_one_hot(y_te)
print(y_te_one_hot.shape)

(12000, 10)
(12000, 10)
(10000, 10)


In [42]:
def softmax(z):
    z = z - np.max(z)
    bottom = np.sum(np.exp(z), axis=1, keepdims=True)
    return np.exp(z) / bottom

In [43]:
# Calculate cross-entropy loss
def crossentropy(y, y_hat):
    return -np.mean(np.sum(np.log(y_hat) * y, axis=1))

In [44]:
# Calculate percentage of correct predictions
def computeAccuracy(y, y_hat):
    return np.mean(np.argmax(y_hat, axis=1) == y)

In [45]:
# Perform the gradient step
def gradient_descent(y_hat, x, y, b, w, learn_rate, m, alpha):
    cost = y_hat - y
    reg = alpha * w
    gradW = 1/m * x.T.dot(cost) + reg
    gradB = 1/m * np.sum(cost, axis=0)
    w = w - learn_rate * gradW
    b = b - learn_rate * gradB
    return w, b

In [46]:
# Loop through the samples based on the batch_size hyperparameter
def batch_loop(x, y, size):
	if(len(x) == len(y)):
		random_x = x[np.random.permutation(x.shape[0])]
		random_y = y[np.random.permutation(y.shape[0])]
		for i in np.arange(0, y.shape[0], size):
			yield random_x[i:i + size], random_y[i:i + size]

In [47]:
# Perform SGD
def stochastic_gradient_descent(x, y, b, w, learn_rate, num_of_epoch, batch_size, alpha):
	
	for n in range(num_of_epoch - 1):
		for mini_batch_x, mini_batch_y in batch_loop(x, y, batch_size):
			z = mini_batch_x[:batch_size].dot(w)
			y_hat = softmax(z)
			w, b = gradient_descent(y_hat, mini_batch_x[:batch_size], mini_batch_y[:batch_size], b, w, learn_rate, batch_size, alpha)
		
	# return w, b 
	return w, b

In [48]:
# Find the lowest error by performing SGD
def train_w_and_b(x, y, learn_rate, num_of_epoch, batch_size, alpha):
	w = np.random.randn(x.shape[1], len(np.unique(y)))
	b = np.random.rand(len(np.unique(y)))
	y = to_one_hot(y)
	w_trained, b_trained = stochastic_gradient_descent(x, y, b, w, learn_rate, num_of_epoch, batch_size, alpha)
	return w_trained, b_trained

In [49]:
# Implementation of grid_search to tune our hyperparameters by looping through the various values we have for each
def grid_search():
	hyperparameters = {
		"learn_rate": [0.25, 0.1, 0.01],
		"num_of_epoch": [30, 40, 50],
		"batch_size": [50, 100, 200],
		"alpha": [0.25, 0.1, 0.01]	
	}
	for a in range(len(hyperparameters["num_of_epoch"])):
		for b in range(len(hyperparameters["batch_size"])):
			for c in range(len(hyperparameters["learn_rate"])):
				for d in range(len(hyperparameters["alpha"])):
						yield hyperparameters["num_of_epoch"][a], hyperparameters["batch_size"][b], hyperparameters["learn_rate"][c], hyperparameters["alpha"][d]

In [50]:
# Initialize best hyperparameter values as the worst they could be
best_CE_loss = 1000000
best_accuracy = -1
best_num_of_epoch = -1
best_batch_size = -1
best_learn_rate = -1
best_alpha = -1
y_val_one_hot = to_one_hot(y_val)

print("Problem 3 Output:\n")

# Loop through each combination of the hyperparameters in grid_search() to find the best combination to minimize MSE
for num_of_epoch, batch_size, learn_rate, alpha in grid_search():

	# Print number of epochs used for each loop
	print("Number of epochs: ", num_of_epoch)

	# w_trained, b_trained = find_lowest_error(x_train, y_train, learn_rate, num_of_epoch, batch_size, alpha)
	w_trained, b_trained = train_w_and_b(x_train, y_train, learn_rate, num_of_epoch, batch_size, alpha)

	# Calculate the cross-entropy loss and accuracy
	z = x_val@(w_trained)
	y_hat = softmax(z)
	CE_loss = crossentropy(y_val_one_hot, y_hat)
	accuracy = computeAccuracy(y_val, y_hat)
	print("Cross-entropy loss: ", CE_loss)
	print("Percent of correctly classified examples: ", accuracy) 

	# Store the hyperparameters that led to reduced error in the following variables
	if CE_loss < best_CE_loss:
		best_CE_loss = CE_loss
		best_accuracy = accuracy
		best_learn_rate = learn_rate
		best_num_of_epoch = num_of_epoch
		best_batch_size = batch_size
		best_alpha = alpha

# Finally, calculate the error using the trained weights and biases
z = x_te@(w_trained)
y_hat = softmax(z)
CE_loss = crossentropy(y_te_one_hot, y_hat)
accuracy = computeAccuracy(y_te, y_hat)
print("\n")
print("Results of training:")
print("best cross-entropy loss from validation dataset: ", best_CE_loss)
print("best accuracy from validation dataset: ", best_accuracy)
print("best learning rate: ", best_learn_rate)
print("best number of epochs: ", best_num_of_epoch)
print("best batch size: ", best_batch_size)
print("best reg term: ", best_alpha)
print("Cross-entropy loss from test dataset: ", CE_loss)
print("Percent of correctly classified examples from test dataset: ", accuracy)

Problem 3 Output:

Number of epochs:  30


Cross-entropy loss:  4.148856882081146
Percent of correctly classified examples:  0.14016666666666666
Number of epochs:  30
Cross-entropy loss:  3.114516978707293
Percent of correctly classified examples:  0.18633333333333332
Number of epochs:  30
Cross-entropy loss:  3.6030455363868406
Percent of correctly classified examples:  0.10591666666666667
Number of epochs:  30
Cross-entropy loss:  2.554560461904809
Percent of correctly classified examples:  0.12241666666666666
Number of epochs:  30
Cross-entropy loss:  2.3254771277535196
Percent of correctly classified examples:  0.18258333333333332
Number of epochs:  30
Cross-entropy loss:  2.5040727554269933
Percent of correctly classified examples:  0.12766666666666668
Number of epochs:  30
Cross-entropy loss:  2.3002580769682934
Percent of correctly classified examples:  0.11275
Number of epochs:  30
Cross-entropy loss:  2.2937962316550315
Percent of correctly classified examples:  0.10525
Number of epochs:  30
Cross-entropy loss:  2.3054

In [51]:
# Problem 4

import numpy as np

W1 = np.array([[1, 2], [0, 1], [-1, 0]])
b1 = np.array([[0],[0], [3]])
W2 = np.array([1, -2, 1/4])
b2 = [0]
L = [[W1, b1], [W2, b2]]
x = np.array([[-1], [1]])

def affineTransformation(W, b, x):
    y = np.matmul(W,x) + b
    return y

def composition(L, x):
    f = []
    for i in range(len(L)):
        y = affineTransformation(L[i][0], L[i][1], x)
        f.append(y)
        x = y
    return f

def computeGradients(L, z, x):
    W = list(zip(*L))[0]
    gradw = np.matmul(np.transpose(W[0]), W[1].reshape(-1,1))
    gradb = np.matmul(np.transpose(W[0]), W[1].reshape(-1,1))*x
    return gradw, gradb

print("Problem 4 Output:")
print("z = ", '\n', composition(L, x))
print("gradients of z: ", '\n', computeGradients(L,composition(L,x),x))

Problem 4 Output:
z =  
 [array([[1],
       [1],
       [4]]), array([0.])]
gradients of z:  
 (array([[0.75],
       [0.  ]]), array([[-0.75],
       [ 0.  ]]))
