# In [None]:  (Jupyter cell prompt left over from the notebook export)
# Do not import any additional 3rd party external libraries
import numpy as np
import os
import matplotlib.pyplot as plt


class Activation(object):

    """
    Interface for activation functions (non-linearities).

    Subclasses override ``forward`` and ``derivative``; calling an instance
    is shorthand for ``forward``. ``state`` is a slot subclasses may use to
    remember whatever they need between the two calls.
    """

    # No additional work is needed for this class, as it acts like an abstract base class for the others

    def __init__(self):
        self.state = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """Apply the non-linearity to ``x``. Must be overridden."""
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produces a confusing TypeError instead of the intended signal.
        raise NotImplementedError

    def derivative(self):
        """Return the derivative of the non-linearity. Must be overridden."""
        raise NotImplementedError


class Identity(Activation):

    """
    Pass-through activation: the output equals the input and the slope is
    the constant 1.0 (already implemented).
    """

    # This class is a gimme as it is already implemented for you as an example (do not change)

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Remember the input, then hand it back untouched.
        self.state = x
        return self.state

    def derivative(self):
        # Slope of f(x) = x.
        return 1.0

class Sigmoid(Activation):

    """
    Sigmoid non-linearity: 1 / (1 + exp(-x)), applied element-wise.
    """
    def __init__(self):
        super(Sigmoid, self).__init__()

    def forward(self, x):
        """Return sigmoid(x) and cache it in ``self.state``."""
        # hint: save the useful data for back propagation
        self.state = 1 / (1 + np.exp(-np.asarray(x)))
        return self.state

    def derivative(self, x=None):
        """
        Return the sigmoid slope s * (1 - s).

        x: optional pre-activation input; when given, s = sigmoid(x) is
           recomputed from it. When omitted, the output cached by the last
           ``forward`` call is used, which matches the argument-less
           ``derivative()`` of the Activation interface.

        Raises ValueError if ``x`` is omitted before any ``forward`` call.
        """
        if x is None:
            if self.state is None:
                raise ValueError("Sigmoid.derivative() needs a prior forward() call or an explicit x")
            sig = self.state
        else:
            # Evaluate the sigmoid once instead of twice.
            sig = 1 / (1 + np.exp(-np.asarray(x)))
        return sig * (1 - sig)

class Tanh(Activation):

    """
    Tanh non-linearity, applied element-wise.
    """

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        """Return tanh(x) and cache it in ``self.state``."""
        self.state = np.tanh(x)
        return self.state

    def derivative(self, x=None):
        """
        Return the tanh slope 1 - tanh(x)**2.

        x: optional pre-activation input. When omitted, the output cached by
           the last ``forward`` call is used, which matches the argument-less
           ``derivative()`` of the Activation interface.

        Raises ValueError if ``x`` is omitted before any ``forward`` call.
        """
        if x is None:
            if self.state is None:
                raise ValueError("Tanh.derivative() needs a prior forward() call or an explicit x")
            out = self.state
        else:
            out = np.tanh(x)
        return 1 - out ** 2

class ReLU(Activation):

    """
    ReLU non-linearity: negative entries become 0, the rest pass through.
    """

    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, x):
        """Return relu(x) and cache the *input* in ``self.state``."""
        x = np.asarray(x)
        # The slope depends on the sign of the input, so that is what is kept.
        self.state = x
        return np.where(x < 0, 0, x)

    def derivative(self, x=None):
        """
        Return the ReLU slope: 0 where the input is negative, 1 elsewhere
        (an input of exactly 0 yields 1).

        x: optional pre-activation input. When omitted, the input cached by
           the last ``forward`` call is used, which matches the argument-less
           ``derivative()`` of the Activation interface.

        Raises ValueError if ``x`` is omitted before any ``forward`` call.
        """
        if x is None:
            if self.state is None:
                raise ValueError("ReLU.derivative() needs a prior forward() call or an explicit x")
            x = self.state
        return np.where(np.asarray(x) < 0, 0, 1)

class Criterion(object):

    """
    Interface for loss functions.

    Calling an instance with ``(x, y)`` is shorthand for ``forward(x, y)``.
    ``logits``, ``labels`` and ``loss`` are slots subclasses may fill in.
    """

    # Nothing needs done to this class, it's used by the following Criterion classes

    def __init__(self):
        self.logits = None
        self.labels = None
        self.loss = None

    def __call__(self, x, y):
        return self.forward(x, y)

    def forward(self, x, y):
        """Compute the loss. Must be overridden."""
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produces a confusing TypeError instead of the intended signal.
        raise NotImplementedError

    def derivative(self):
        """Return the gradient of the loss. Must be overridden."""
        raise NotImplementedError


class SoftmaxCrossEntropy(Criterion):

    """
    Softmax cross-entropy loss.

    Both methods read the network outputs from ``self.act[-1]`` (one row per
    sample). A freshly constructed instance has no ``act`` attribute, so the
    caller must make sure ``self`` exposes one before calling either method.
    """

    def __init__(self):
        super(SoftmaxCrossEntropy, self).__init__()
        # you can add variables if needed

    def forward(self, labels):
        """
        Return the cross-entropy between ``labels`` and softmax(scores),
        summed over the batch, where scores are the rows of ``self.act[-1]``.

        labels: 2-D array with one target row per sample (one-hot rows in
                this file). Its row count defines the batch size; only that
                many rows of ``self.act[-1]`` are used.
        """
        targets = np.asarray(labels, dtype=float)
        scores = np.asarray(self.act[-1], dtype=float)[:targets.shape[0]]
        # log-softmax with the row maximum subtracted first so exp() cannot
        # overflow. The normaliser is the sum over the *scores*; the earlier
        # code summed exp(labels), which made it a constant.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        # No extra 1/2 factor: cross-entropy is -sum(labels * log(softmax)).
        return -np.sum(targets * log_probs)

    def derivative(self,labels,b):
        """
        Return ``self.act[-1][b] - labels[b]`` as a (1, n_classes) array for
        sample ``b``.

        NOTE(review): this treats ``self.act[-1]`` as probabilities, while
        ``forward`` applies a softmax to the same values. Confirm which
        convention the training code is meant to follow.
        """
        return np.array([self.act[-1][b]]) - np.array([labels[b]])


# randomly initialize the weight matrix with dimension d0 x d1 via Normal distribution
def random_normal_weight_init(d0, d1):
    """Return a (d0, d1) array of samples drawn with ``np.random.randn``."""
    shape = (d0, d1)
    weights = np.random.randn(*shape)
    return weights


# initialize a d-dimensional bias vector with all zeros
def zeros_bias_init(d):
    """Return a (1, d) row vector filled with zeros."""
    bias = np.zeros((1, d))
    return bias

class MLP(object):

    """
    A simple multilayer perceptron
    (feel free to add class functions if needed)

    Layer ``i`` computes ``z[i] = input @ W[i] + b[i]`` and
    ``act[i] = activations[i].forward(z[i])``; both lists are refreshed by
    every ``forward`` call and reused by ``backward``, ``get_loss`` and
    ``get_error``.

    The criterion is used through the protocol of the loss class in this
    file: before each use the current activation list is attached to it as
    ``criterion.act``, then ``criterion.forward(labels)`` or
    ``criterion.derivative(labels, b)`` is called.
    """

    def __init__(self, input_size, output_size, hiddens, activations, weight_init_fn, bias_init_fn, criterion, lr):

        # Don't change this -->
        self.train_mode = True
        self.nlayers = len(hiddens) + 1
        self.input_size = input_size
        self.output_size = output_size
        self.activations = activations
        self.criterion = criterion
        self.lr = lr
        # <---------------------

        # Don't change the name of the following class attributes
        # list() lets callers pass the hidden sizes as a tuple as well
        self.nn_dim = [input_size] + list(hiddens) + [output_size]
        # list containing Weight matrices of each layer, each should be a np.array
        self.W = [weight_init_fn(self.nn_dim[i], self.nn_dim[i+1]) for i in range(self.nlayers)]
        # list containing derivative of Weight matrices of each layer, each should be a np.array
        self.dW = [np.zeros_like(weight) for weight in self.W]
        # list containing bias vector of each layer, each should be a np.array
        self.b = [bias_init_fn(self.nn_dim[i+1]) for i in range(self.nlayers)]
        # list containing derivative of bias vector of each layer, each should be a np.array
        self.db = [np.zeros_like(bias) for bias in self.b]

        # Per-layer caches filled by forward(): pre-activations and outputs
        self.z = []
        self.act = []

    def forward(self, x):
        """
        Run a batch through every layer and return the list of layer outputs.

        x: array with one sample per row (a single 1-D sample is promoted to
           a one-row batch). Any batch size and any number of layers work.

        Returns ``self.act``; the last entry is the network output.
        """
        layer_input = np.atleast_2d(np.asarray(x, dtype=float))
        self.z = []
        self.act = []
        for i in range(self.nlayers):
            pre_activation = layer_input @ self.W[i] + self.b[i]
            # Layer i uses activations[i] (the earlier code used
            # activations[i-1] for every layer after the first).
            layer_input = np.asarray(self.activations[i].forward(pre_activation))
            self.z.append(pre_activation)
            self.act.append(layer_input)
        return self.act

    def zero_grads(self):
        """Reset every weight and bias gradient to zeros and return them."""
        self.dW = [np.zeros_like(weight) for weight in self.W]
        self.db = [np.zeros_like(bias) for bias in self.b]
        return self.dW, self.db

    def step(self):
        """
        Apply one gradient-descent update, ``param -= lr * grad``, to every
        layer. Returns the updated weight and bias of the last layer.
        """
        # update the W and b on each layer
        for i in range(self.nlayers):
            self.W[i] = self.W[i] - self.lr*self.dW[i]
            self.b[i] = self.b[i] - self.lr*self.db[i]
        return self.W[-1], self.b[-1]

    def _attach_activations(self):
        """Expose the current layer outputs to the criterion as ``criterion.act``."""
        self.criterion.act = self.act

    def _output_delta(self, labels, batch_size):
        """Stack the criterion's per-sample derivatives into a (batch, n_out) array."""
        self._attach_activations()
        rows = [np.reshape(self.criterion.derivative(labels, b), (1, -1)) for b in range(batch_size)]
        return np.concatenate(rows, axis=0)

    def backward(self, labels, x):
        """
        Back-propagate through the batch last given to ``forward`` and store
        the batch-averaged gradients in ``self.dW`` / ``self.db``.

        labels: targets, one row per sample, forwarded to the criterion.
        x:      the same input batch that was passed to ``forward``.

        Gradients are overwritten (not accumulated) on every call. In
        evaluation mode nothing is computed. Hidden activations must accept
        the pre-activation as ``derivative(z)``.

        Returns ``(self.dW, self.db)``.
        Raises ValueError if ``forward`` was not run on a batch of this size.

        NOTE(review): the criterion derivative is used directly as the error
        signal at the last layer's pre-activation, i.e. the output
        activation's own derivative is not applied. This keeps the behaviour
        of the previous implementation; confirm it is the intended model.
        """
        if not self.train_mode:
            return self.dW, self.db

        inputs = np.atleast_2d(np.asarray(x, dtype=float))
        batch_size = inputs.shape[0]
        if len(self.act) != self.nlayers or self.act[-1].shape[0] != batch_size:
            raise ValueError("backward() requires forward() to be called on the same batch first")

        delta = self._output_delta(labels, batch_size)
        for i in range(self.nlayers - 1, -1, -1):
            layer_input = inputs if i == 0 else self.act[i - 1]
            # The matrix product sums the per-sample outer products; dividing
            # by the batch size keeps the step scale of a single sample.
            self.dW[i] = layer_input.T @ delta / batch_size
            self.db[i] = (delta.sum(axis=0) / batch_size).reshape(np.shape(self.b[i]))
            if i > 0:
                # Chain rule through layer i's weights and layer i-1's
                # non-linearity, evaluated at that layer's pre-activation.
                delta = (delta @ self.W[i].T) * self.activations[i - 1].derivative(self.z[i - 1])
        return self.dW, self.db

    def __call__(self, x):
        return self.forward(x)

    def train(self):
        # training mode
        self.train_mode = True

    def eval(self):
        # evaluation mode
        self.train_mode = False

    def get_loss(self, labels):
        """Return the criterion's loss for the batch last given to ``forward``."""
        self._attach_activations()
        return self.criterion.forward(labels)

    def get_error(self, labels):
        """
        Return how many samples of the last forwarded batch are misclassified,
        comparing the arg-max of each output row with the arg-max of the
        matching ``labels`` row. ``labels`` needs one row per forwarded sample.
        """
        predicted = np.argmax(self.act[-1], axis=1)
        expected = np.argmax(np.atleast_2d(labels), axis=1)
        return int(np.count_nonzero(predicted != expected))

    def save_model(self, path='p1_model.npz'):
        # save the parameters of MLP (do not change)
        # NOTE(review): only the first layer's weight and bias are written.
        np.savez(path, self.W[0], self.b[0])


# Don't change this function
def get_training_stats(mlp, dset, nepochs, batch_size):
    """
    Train ``mlp`` for ``nepochs`` epochs, printing per-epoch statistics, then
    evaluate once on the test split.

    mlp:        object providing train(), eval(), __call__(x), backward(y, x),
                step(), get_loss(y) and get_error(y).
    dset:       three (inputs, labels) pairs: train, validation, test.
    nepochs:    number of passes over the training split.
    batch_size: number of consecutive samples per batch (no shuffling).

    Returns (training_losses, training_errors, validation_losses,
    validation_errors), each a list with one per-sample average per epoch.
    """
    (trainx, trainy), (valx, valy), (testx, testy) = dset

    idxs = np.arange(len(trainx))  # kept from the original; never read

    training_losses, training_errors = [], []
    validation_losses, validation_errors = [], []

    for epoch in range(nepochs):
        print("epoch: ", epoch)
        train_loss = train_error = 0
        val_loss = val_error = 0
        num_train = len(trainx)
        num_val = len(valx)

        # Training pass: forward, backward, parameter update, then statistics
        for start in range(0, num_train, batch_size):
            stop = start + batch_size
            mlp.train()
            batch_x = trainx[start:stop]
            batch_y = trainy[start:stop]
            mlp(batch_x)
            mlp.backward(batch_y, batch_x)
            mlp.step()
            train_loss += mlp.get_loss(batch_y)
            train_error += mlp.get_error(batch_y)
        training_losses.append(train_loss/num_train)
        training_errors.append(train_error/num_train)
        print("training loss: ", train_loss/num_train)
        print("training error: ", train_error/num_train)

        # Validation pass: forward only
        for start in range(0, num_val, batch_size):
            stop = start + batch_size
            mlp.eval()
            batch_y = valy[start:stop]
            mlp(valx[start:stop])
            val_loss += mlp.get_loss(batch_y)
            val_error += mlp.get_error(batch_y)
        validation_losses.append(val_loss/num_val)
        validation_errors.append(val_error/num_val)
        print("validation loss: ", val_loss/num_val)
        print("validation error: ", val_error/num_val)

    # Single test pass after the last epoch
    test_loss = 0
    test_error = 0
    num_test = len(testx)
    for start in range(0, num_test, batch_size):
        stop = start + batch_size
        mlp.eval()
        batch_y = testy[start:stop]
        mlp(testx[start:stop])
        test_loss += mlp.get_loss(batch_y)
        test_error += mlp.get_error(batch_y)
    test_loss /= num_test
    test_error /= num_test
    print("test loss: ", test_loss)
    print("test error: ", test_error)

    return (training_losses, training_errors, validation_losses, validation_errors)


# get one-hot encoding of the label
def get_one_hot(in_array, one_hot_dim):
    """
    Return a (n, one_hot_dim) float array whose row i has a 1 in the column
    given by label i and 0 elsewhere.

    in_array:    array of n labels, shaped (n,) or (n, 1); values are
                 truncated to integers.
    one_hot_dim: number of columns (classes).

    Raises IndexError if a label falls outside the column range.
    """
    dim = in_array.shape[0]
    # Flatten explicitly: calling int() on a size-1 array (a row of an
    # (n, 1) input) is deprecated in recent NumPy releases.
    class_idx = np.asarray(in_array).reshape(dim).astype(int)
    out_array = np.zeros((dim, one_hot_dim))
    out_array[np.arange(dim), class_idx] = 1
    return out_array


def main(lr=.05,num_epochs=100,hiddens=(6000,2000)):
    """
    Train an MLP on MNIST read from CSV files, save it, and plot the loss
    and error curves.

    lr:         learning rate handed to the MLP.
    num_epochs: number of training epochs.
    hiddens:    sizes of the hidden layers (any sequence of ints).

    Reads ``mnist/mnist_train.csv`` and ``mnist/mnist_test.csv``; column 0 of
    each row is used as the label and the remaining columns as pixels. The
    first 50000 training rows are used for training and the rest for
    validation.
    """
    # load the mnist dataset from csv files
    image_size = 28 # width and length of mnist image
    num_labels = 10 #  i.e. 0, 1, 2, 3, ..., 9
    image_pixels = image_size * image_size
    data_path = "mnist/"
    train_data = np.loadtxt(data_path + "mnist_train.csv", delimiter=",")
    test_data = np.loadtxt(data_path + "mnist_test.csv", delimiter=",")

    # rescale image from 0-255 to 0-1
    # (np.asarray(..., dtype=float) replaces np.asfarray, which NumPy 2.0 removed)
    fac = 1.0 / 255
    train_imgs = np.asarray(train_data[:50000, 1:], dtype=float) * fac
    val_imgs = np.asarray(train_data[50000:, 1:], dtype=float) * fac
    test_imgs = np.asarray(test_data[:, 1:], dtype=float) * fac
    train_labels = np.asarray(train_data[:50000, :1], dtype=float)
    val_labels = np.asarray(train_data[50000:, :1], dtype=float)
    test_labels = np.asarray(test_data[:, :1], dtype=float)

    # convert labels to one-hot-key encoding
    train_labels = get_one_hot(train_labels, num_labels)
    val_labels = get_one_hot(val_labels, num_labels)
    test_labels = get_one_hot(test_labels, num_labels)

    print(train_imgs.shape)
    print(train_labels.shape)
    print(val_imgs.shape)
    print(val_labels.shape)
    print(test_imgs.shape)
    print(test_labels.shape)

    dataset = [
        [train_imgs, train_labels],
        [val_imgs, val_labels],
        [test_imgs, test_labels]
    ]

    # These are only examples of parameters you can start with
    # you can tune these parameters to improve the performance of your MLP
    # this is the only part you need to change in main() function
    hiddens = list(hiddens)  # MLP concatenates this with other lists
    activations = [Sigmoid(), Sigmoid(), Sigmoid()]
    #print("Learning rate is: " + str(lr))
    batch_size = 25

    # build your MLP model
    mlp = MLP(
        input_size=image_pixels, 
        output_size=num_labels, 
        hiddens=hiddens, 
        activations=activations, 
        weight_init_fn=random_normal_weight_init, 
        bias_init_fn=zeros_bias_init, 
        criterion=SoftmaxCrossEntropy(), 
        lr=lr
    )

    # train the neural network
    losses = get_training_stats(mlp, dataset, num_epochs, batch_size)

    # save the parameters
    mlp.save_model()

    # visualize the training and validation loss with epochs
    training_losses, training_errors, validation_losses, validation_errors = losses

    fig, (ax1, ax2) = plt.subplots(1, 2)

    ax1.plot(training_losses, color='blue', label="training")
    ax1.plot(validation_losses, color='red', label='validation')
    ax1.set_title('Loss during training')
    ax1.set_xlabel('epoch')
    ax1.set_ylabel('loss')
    ax1.legend()

    ax2.plot(training_errors, color='blue', label="training")
    ax2.plot(validation_errors, color='red', label="validation")
    ax2.set_title('Error during training')
    ax2.set_xlabel('epoch')
    ax2.set_ylabel('error')
    ax2.legend()

    plt.show()

# Run main() with its default arguments only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
