### Download dataset from Kaggle, fashion_mnist

todo: explain why we choose to use fashion_mnist

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

training_set = pd.read_csv('Dataset/fashion_data/fashion-mnist_train.csv')
test_set = pd.read_csv('Dataset/fashion_data/fashion-mnist_test.csv')

m, n = training_set.shape

print(training_set.shape)
print(test_set.shape)

training_set = np.array(training_set)
test_set = np.array(test_set)


(60000, 785)
(10000, 785)


In [4]:
# X is pixels, y is labels

test_set = test_set.T

y_test = test_set[0]
X_test = test_set[1:]
X_test = X_test / 255

training_set = training_set.T

y_train = training_set[0]
X_train = training_set[1:]

X_train = X_train / 255


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(784, 60000)
(60000,)
(784, 10000)
(10000,)


In [39]:
class NeuralNetwork():
    
    def __init__(self, X_train, y_train, learning_rate, epochs, dropout_rate=0.2, dropout=False):
        
        self.X_train = X_train
        self.y_train = y_train
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.dropout_rate = dropout_rate
        self.dropout = dropout    
        
    def init_param(self):
        w1 = np.random.rand(10, 784) * 0.01
        b1 = np.random.rand(10, 1)
        w2 = np.random.rand(10, 10) * 0.01
        b2 = np.random.rand(10, 1)
        
        return w1, b1, w2, b2
        
    # activation functions and its derivatives
    def ReLU(self, x):
        t= x * (x > 0)
        return t
    
    def ReLU_derivative(self, x):
        return (x >= 0) * 1
    
    def sigmoid(self, x):
        s=1/(1+np.exp(-x))
        return s
    
    def sigmoid_derivative(self, x):
        s=1/(1+np.exp(-x))
        ds=s*(1-s)  
        return ds
        
    def softmax(self, x):
        return np.exp(x)/sum(np.exp(x))
        def one_hot(Y):
            one_hot_Y = np.zeros((Y.size, Y.max() + 1))
            one_hot_Y[np.arange(Y.size), Y] = 1
            one_hot_Y = one_hot_Y.T
            return one_hot_Y  

    def forward_prop(self, w1, b1, w2, b2, X, dropout_rate=0.2, dropout=True):
    
        z1 = w1.dot(X) + b1
        a1 = self.ReLU(z1)
    
        if dropout:
            dropout_mask = (np.random.rand(*a1.shape) < dropout_rate) / dropout_rate
            a1 = a1 * dropout_mask
        else:
            dropout_mask = None
        
        z2 = w2.dot(a1) + b2
        a2 = self.softmax(z2)
        
        return z1, a1, z2, a2, dropout_mask
        

    def one_hot(self, Y):
        one_hot_Y = np.zeros((Y.size, Y.max() + 1))
        one_hot_Y[np.arange(Y.size), Y] = 1
        one_hot_Y = one_hot_Y.T
        return one_hot_Y  
    
    def neg_log_likelihood(self, y, y_hat):
        return -1/len(y) * np.sum(np.sum(y * np.log(y_hat)))
        

    def back_prop(self, z1, a1, z2, a2, w2, X, Y, dropout_mask):
        
        one_hot_y = self.one_hot(Y)
        
        dz2 = a2 - one_hot_y
        
        dw2 = 1/m * dz2.dot(a1.T)
        db2 = 1/m * np.sum(dz2)
        
        dz1 = w2.T.dot(dz2) * self.ReLU_derivative(z1)
    
        dz1 = dz1 * dropout_mask if dropout_mask is not None else dz1
        
        dw1 = 1 / m * dz1.dot(X.T)
        db1 = 1 / m * np.sum(dz1)
        
        return dw1, db1, dw2, db2

    def update_param(self, W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
        W1 = W1 - alpha * dW1
        b1 = b1 - alpha * db1    
        W2 = W2 - alpha * dW2  
        b2 = b2 - alpha * db2    
        return W1, b1, W2, b2
        
    
    def get_predictions(self, A2):
        return np.argmax(A2, 0)
        

    def get_accuracy(self, predictions, Y):
        print(predictions, Y)
        return np.sum(predictions == Y) / Y.size
        
    
    def gradient_descent(self, X, Y, alpha, iterations, dropout_rate=0.2, dropout=False):
    
        w1, b1, w2, b2 = self.init_param()
        
        for i in range(iterations):
            z1, a1, z2, a2, dropout_mask = self.forward_prop(w1, b1, w2, b2, X, dropout_rate, dropout=True)
            dW1, db1, dW2, db2 = self.back_prop(z1, a1, z2, a2, w2, X, Y, dropout_mask)
            w1, b1, w2, b2 = self.update_param(w1, b1, w2, b2, dW1, db1, dW2, db2, alpha)
            if i % 10 == 0:
                print("Iteration: ", i)
                predictions = self.get_predictions(a2)
                print(self.get_accuracy(predictions, Y))
            
        return w1, b1, w2, b2

    def train(self):
        self.gradient_descent(self.X_train, self.y_train, self.learning_rate, self.epochs, self.dropout_rate, self.dropout)


In [42]:
model = NeuralNetwork(X_train, y_train, 0.1, 500, 0.2, False)
model.train()

Iteration:  0
[8 8 8 ... 8 0 8] [2 9 6 ... 8 8 7]
0.09886666666666667
Iteration:  10
[2 8 2 ... 2 8 8] [2 9 6 ... 8 8 7]
0.107
Iteration:  20
[4 8 4 ... 0 4 0] [2 9 6 ... 8 8 7]
0.10963333333333333
Iteration:  30
[0 8 0 ... 2 0 8] [2 9 6 ... 8 8 7]
0.10995
Iteration:  40
[0 8 8 ... 8 0 8] [2 9 6 ... 8 8 7]
0.11403333333333333
Iteration:  50
[1 8 4 ... 0 0 8] [2 9 6 ... 8 8 7]
0.12271666666666667
Iteration:  60
[2 8 1 ... 2 9 8] [2 9 6 ... 8 8 7]
0.1485
Iteration:  70
[0 8 0 ... 0 8 9] [2 9 6 ... 8 8 7]
0.15995
Iteration:  80
[0 8 2 ... 0 8 8] [2 9 6 ... 8 8 7]
0.16981666666666667
Iteration:  90
[0 9 2 ... 0 8 8] [2 9 6 ... 8 8 7]
0.17751666666666666
Iteration:  100
[0 8 0 ... 8 8 8] [2 9 6 ... 8 8 7]
0.18903333333333333
Iteration:  110
[4 8 8 ... 0 0 8] [2 9 6 ... 8 8 7]
0.19333333333333333
Iteration:  120
[8 8 2 ... 0 8 9] [2 9 6 ... 8 8 7]
0.19743333333333332
Iteration:  130
[2 8 0 ... 2 8 8] [2 9 6 ... 8 8 7]
0.20381666666666667
Iteration:  140
[8 9 4 ... 8 9 9] [2 9 6 ... 8 8 7]
0.