# Assignment #3
## P556: Applied Machine Learning

More often than not, we will use a deep learning library (Tensorflow, Pytorch, or the wrapper known as Keras) to implement our models. However, the abstraction afforded by those libraries can make it hard to troubleshoot issues if we don't understand what is going on under the hood. In this assignment you will implement a fully-connected and a convolutional neural network from scratch. To simplify the implementation, we are asking you to implement static architectures, but you are free to support variable number of layers/neurons/activations/optimizers/etc. We recommend that you make use of private methods so you can easily troubleshoot small parts of your model as you develop them, instead of trying to figure out which parts are not working correctly after implementing everything. Also, keep in mind that there is code from your fully-connected neural network that can be re-used on the CNN. 

Problem #1.1 (40 points): Implement a fully-connected neural network from scratch. The neural network will have the following architecture:

- Input layer
- Dense hidden layer with 512 neurons, using relu as the activation function
- Dropout with a value of 0.2
- Dense hidden layer with 512 neurons, using relu as the activation function
- Dropout with a value of 0.2
- Output layer, using softmax as the activation function

The model will use categorical crossentropy as its loss function. 
We will optimize the gradient descent using RMSProp, with a learning rate of 0.001 and a rho value of 0.9.
We will evaluate the model using accuracy.

Why this architecture? We are trying to reproduce from scratch the following [example from the Keras documentation](https://keras.io/examples/mnist_mlp/). This means that you can compare your results by running the Keras code provided above to see if you are on the right track.

In [0]:
class NeuralNetwork(object):
  def __init__(self,epochs,learning_rate,droupout):
    self.learning_rate=learning_rate
    self.epochs=epochs
    self.dropout=droupout

  
  def fit(self,x,y1):
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import normalize
    def feed_forward(w1,w2,w3,b1,b2,b3,X,y):
    #     ReLU activation function
        def relu(l):
            return np.maximum(l,0)
    #     Cross Entropy error
        def cross_entropy(a,y):
          return -np.sum(y*np.log(a.clip(min=0.000001)))

    #     Softmax activation function
        def softmax(a):
          a=normalize(a)
          return np.exp(a)/np.sum(np.exp(a), axis=0)
          
    #     The approch in this function is referred from: https://ml-cheatsheet.readthedocs.io/en/latest/backpropagation.html
    #     Hidden layer 1
        ip1=np.dot(X,w1)+b1
        op1=relu(ip1)
        m1=np.random.randn(op1.shape[0],op1.shape[1])<(1-self.dropout)
        op1*=m1

    #     Hidden layer 2
        ip2=np.dot(op1,w2)+b2
        op2=relu(ip2)
        m2=np.random.randn(op2.shape[0],op2.shape[1])<(1-self.dropout)
        op2*=m2    
        
    #     Output Layer
        ip3=np.dot(op2,w3)+b3
        op=softmax(ip3)
        loss=cross_entropy(op,y1)
        return op,ip1,ip2,ip3,op1,op2
          
    def relu_der(l):
      for i in range(len(l)):
        l[i]=[1 if j>0 else 0 for j in l[i]]
      return l
      
    def soft_der(y_train,y_pred):
      return y_pred-y_train
    
    global wh1,wh2,wo,bh1,bh2,bo
    wh1=np.random.randn(x.shape[1],512)/np.sqrt(x.shape[1])
    bh1=np.random.randn(1,512)*0
    wh2=np.random.randn(512,512)/np.sqrt(512)
    bh2=np.random.randn(1,512)*0
    wo=np.random.randn(512,y1.shape[1])/np.sqrt(512)
    bo=np.random.randn(1,10)*0 
    
    start=len(x)//4
    l=[start*i for i in range(4)]
    change_o,change_bo,change_h2,change_bh2,change_h1,change_bh1=0,0,0,0,0,0    
    for iteration in range(self.epochs):
      acc=[]
      for z in l:
        x_batch,y1_batch=x[z:z+start],y1[z:z+start]
        y_pred,ip1,ip2,ip3,op1,op2=feed_forward(wh1,wh2,wo,bh1,bh2,bo,x_batch,y1_batch)    

    #     Calculating errors for each layer
        eo=(y1_batch-y_pred)*soft_der(y1_batch,ip3)
        eh2=(eo@wo.T)*relu_der(ip2)
        eh1=(eh2@wh2.T)*relu_der(ip1)

    #     Updating weights
        dwo=op2.T@eo
        dbo=np.sum(eo,axis=0)
        change_o=(0.9*change_o)+(0.1*dwo*dwo)
        change_bo=(0.9*change_bo)+(0.1*dbo*dbo)
        wo-=(self.learning_rate/np.sqrt(change_o.clip(min=0.000001)))*dwo
        bo=(self.learning_rate/np.sqrt(change_bo.clip(min=0.000001)))*dbo

        dwh2=op1.T@eh2
        dbh2=np.sum(eh2,axis=0)
        change_h2=(0.9*change_h2)+(0.1*dwh2*dwh2)
        change_bh2=(0.9*change_bh2)+(0.1*dbh2*dbh2)
        wh2-=(self.learning_rate/np.sqrt(change_h2.clip(min=0.000001)))*dwh2
        bh2=(self.learning_rate/np.sqrt(change_bh2.clip(min=0.000001)))*dbh2

        dwh1=x_batch.T@eh1
        dbh1=np.sum(eh1,axis=0)
        change_bh1=(0.9*change_bh1)+(0.1*dbh1*dbh1)
        change_h1=(0.9*change_h1)+(0.01*dwh1*dwh1)
        wh1-=(self.learning_rate/np.sqrt(change_h1.clip(min=0.000001)))*dwh1
        bh1=(self.learning_rate/np.sqrt(change_bh1.clip(min=0.000001)))*dbh1

        for i in range(len(y_pred)):
          a=max(y_pred[i])
          y_pred[i]=[1 if j==a else 0 for j in y_pred[i]]
  

  def evaluate(self,x,w1,w2,w3,b1,b2,b3):
    from sklearn.preprocessing import normalize
    def relu(l):
      return np.maximum(l,0)

    def softmax(a):
      a=normalize(a)
      return np.exp(a)/np.sum(np.exp(a), axis=0)

    ip1=np.dot(x,w1)+b1
    op1=relu(ip1)
    ip2=np.dot(op1,w2)+b2
    op2=relu(ip2)
    ip3=np.dot(op2,w3)+b3
    op=softmax(ip3)

    for i in range(len(op)):
      a=max(op[i])
      op[i]=[1 if j==a else 0 for j in op[i]]

    return op

    

Problem #1.2 (10 points): Train your fully-connected neural network on the Fashion-MNIST dataset using 5-fold cross validation. Report accuracy on the folds, as well as on the test set.

In [18]:
# To simplify the usage of our dataset, we will be importing it from the Keras 
# library. Keras can be installed using pip: python -m pip install keras

# Original source for the dataset:
# https://github.com/zalandoresearch/fashion-mnist

# Reference to the Fashion-MNIST's Keras function: 
# https://keras.io/datasets/#fashion-mnist-database-of-fashion-articles

from keras.datasets import fashion_mnist
import keras
import numpy as np
from sklearn.metrics import accuracy_score


# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
# print(x_train.shape[0], 'train samples')
# print(x_test.shape[0], 'test samples')


# convert class vectors to binary class matrices
num_classes = 10

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

nn=NeuralNetwork(20,0.001,0.2)

# Referred from: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
from sklearn.model_selection import KFold as kf
cv=kf(n_splits=5)
a,w1,w2,w3,b1,b2,b3=[],[],[],[],[],[],[]
for tr_ind, test_ind in cv.split(x_train):
  x_tr=x_train[tr_ind]
  y_tr=y_train[tr_ind]
  x_valid=x_train[test_ind]
  y_valid=y_train[test_ind]  
  
  nn.fit(x_tr,y_tr)
     
  w1.append(wh1)
  w2.append(wh2)
  w3.append(wo)
  b1.append(bh1)
  b2.append(bh2)
  b3.append(bo)    
  y_pred=nn.evaluate(x_valid,w1[-1],w2[-1],w3[-1],b1[-1],b2[-1],b3[-1])
    
  a.append(accuracy_score(y_valid, y_pred))

index=np.argmax(a) 
y_pred=nn.evaluate(x_test,w1[index],w2[index],w3[index],b1[index],b2[index],b3[index])
print(y_pred)
accuracy=max(a)*100
print('Validation Accuracy: ',"%.4f" % accuracy,'%')
accuracy=accuracy_score(y_test,y_pred)*100
print('Testing Accuracy: ',"%.4f" % accuracy,'%')

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]
Validation Accuracy:  57.9500 %
Testing Accuracy:  57.4300 %
