### Download dataset from Kaggle, fashion_mnist

todo: explain why we choose to use fashion_mnist

In [84]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

training_set = pd.read_csv('Dataset/fashion_data/fashion-mnist_train.csv')
test_set = pd.read_csv('Dataset/fashion_data/fashion-mnist_test.csv')

m, n = training_set.shape

print(training_set.shape)
print(test_set.shape)

training_set = np.array(training_set)
test_set = np.array(test_set)


(60000, 785)
(10000, 785)


In [85]:
# X is pixels, y is labels

test_set = test_set.T

y_test = test_set[0]
X_test = test_set[1:]
X_test = X_test / 255

training_set = training_set.T

y_train = training_set[0]
X_train = training_set[1:]

X_train = X_train / 255


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(784, 60000)
(60000,)
(784, 10000)
(10000,)


In [86]:
def init_param():
    w1 = np.random.rand(10, 784) * 0.01
    b1 = np.random.rand(10, 1)
    w2 = np.random.rand(10, 10) * 0.01
    b2 = np.random.rand(10, 1)
    
    return w1, b1, w2, b2

w1, b1, w2, b2 = init_param()

In [87]:
# activation functions and its derivatives
def ReLU(x):
    t= x * (x > 0)
    return t

def ReLU_derivative(x):
    return (x >= 0) * 1

def sigmoid(x):
    s=1/(1+np.exp(-x))
    return s

def sigmoid_derivative(x):
    s=1/(1+np.exp(-x))
    ds=s*(1-s)  
    return ds
    
def softmax(x):
    return np.exp(x)/sum(np.exp(x))

In [88]:
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y    

In [89]:
def forward_prop(w1, b1, w2, b2, X):

    z1 = w1.dot(X) + b1
    a1 = ReLU(z1)
    z2 = w2.dot(a1) + b2
    a2 = softmax(z2)
    
    return z1, a1, z2, a2

def neg_log_likelihood(y, y_hat):
    return -1/len(y) * np.sum(np.sum(y * np.log(y_hat)))

In [90]:
def back_prop(z1, a1, z2, a2, w2, X, Y):
    
    one_hot_y = one_hot(Y)
    
    dz2 = a2 - one_hot_y
    
    dw2 = 1/m * dz2.dot(a1.T)
    db2 = 1/m * np.sum(dz2)
    
    dz1 = w2.T.dot(dz2) * ReLU_derivative(z1)
    
    dw1 = 1 / m * dz1.dot(X.T)
    db1 = 1 / m * np.sum(dz1)
        

    return dw1, db1, dw2, db2


In [93]:
def update_param(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1    
    W2 = W2 - alpha * dW2  
    b2 = b2 - alpha * db2    
    return W1, b1, W2, b2

def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    
    w1, b1, w2, b2 = init_param()
    
    for i in range(iterations):
        z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, X)
        dW1, db1, dW2, db2 = back_prop(z1, a1, z2, a2, w2, X, Y)
        w1, b1, w2, b2 = update_param(w1, b1, w2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(a2)
            print(get_accuracy(predictions, Y))
            
    return w1, b1, w2, b2

In [95]:
w1, b1, w2, b2 = gradient_descent(X_train, y_train, 0.1, 650)

Iteration:  0
[8 8 8 ... 8 8 8] [2 9 6 ... 8 8 7]
0.1
Iteration:  10
[2 8 2 ... 4 2 4] [2 9 6 ... 8 8 7]
0.09893333333333333
Iteration:  20
[2 8 2 ... 4 2 4] [2 9 6 ... 8 8 7]
0.09993333333333333
Iteration:  30
[2 8 2 ... 4 4 8] [2 9 6 ... 8 8 7]
0.10665
Iteration:  40
[2 8 2 ... 4 4 9] [2 9 6 ... 8 8 7]
0.24226666666666666
Iteration:  50
[2 9 2 ... 4 9 9] [2 9 6 ... 8 8 7]
0.3942
Iteration:  60
[8 9 4 ... 4 8 9] [2 9 6 ... 8 8 7]
0.4518666666666667
Iteration:  70
[8 9 4 ... 4 8 9] [2 9 6 ... 8 8 7]
0.5192833333333333
Iteration:  80
[8 9 4 ... 8 8 7] [2 9 6 ... 8 8 7]
0.54105
Iteration:  90
[8 9 4 ... 8 8 7] [2 9 6 ... 8 8 7]
0.55515
Iteration:  100
[8 9 4 ... 8 8 7] [2 9 6 ... 8 8 7]
0.5679
Iteration:  110
[8 9 4 ... 8 8 7] [2 9 6 ... 8 8 7]
0.5813166666666667
Iteration:  120
[8 9 4 ... 8 8 7] [2 9 6 ... 8 8 7]
0.5969333333333333
Iteration:  130
[8 9 4 ... 8 8 7] [2 9 6 ... 8 8 7]
0.6125
Iteration:  140
[8 9 4 ... 8 8 7] [2 9 6 ... 8 8 7]
0.6288666666666667
Iteration:  150
[8 9 4 ... 