## Implementation details :

1) We use a multilayer perceptron (MLP) to classify MNIST digits. Consider an MLP with H hidden units, K outputs, and
inputs of size D.

2) According to the above diagram, the dimensions of the network are:
```
 W - H * (D+1)   -- Weights from input layer to hidden layer
 V - K * (H+1)   -- Weights from hidden layer to output layer
 X - N * (D+1)   -- Input data (one row per sample, bias term included)
 Y - N * K       -- Output label data (one row per sample)
 Z - (H+1) * 1   -- Hidden layer activations (bias term included)
 O - K * 1       -- Output layer activations
```
3) Note that the +1 in the above notation accounts for the bias term.

4) tanh is used as the activation function.

5) The forward pass is:

$\mathbf{z}=\tanh (\mathbf{W} \mathbf{x})$

and $O_{i}=\frac{\exp v_{i}^{T} z}{\sum_{k=1}^{K} \exp v_{k}^{T} z}$

Overall loss function will be :

Total loss = $-\sum_{n=1}^{N} \sum_{i=1}^{K} y_{n i} \log O_{n i}$


The dataset is included in the zip file ("data.txt" and "label.txt"). The number of hidden-layer units is set to 500 and the learning rate to 0.01.

 In order to update the weights during backpropagation we will use a modified version of stochastic gradient descent in which, instead of updating the weights after each data point, the updates are made once per batch of input data. Let batch size = 25 and number of epochs = 100.


In [0]:
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.model_selection import train_test_split

####################################################################################
# Derivative of weights as defined in Lab3.a	####################################
####################################################################################
####################################################################################

# Dimensions of all the matrices used
# W - H * (D+1)   -- Weights from input layer to hidden layer
# V - K * (H+1)   -- Weights from hidden layer to output layer
# X - N * (D+1)   -- Input data (one row per sample, bias term included)
# Y - N * K       -- Output label data (one row per sample)
# Z - (H+1) * 1   -- Hidden layer activations (bias term included)
# O - K * 1       -- Output layer activations


# Function to read data and store it in form of 2D array
def read_data(file_name) :
    """Load a comma-separated numeric text file into a NumPy array.

    Args:
        file_name: Path (or any source accepted by ``np.loadtxt``) of the
            comma-delimited file to read.

    Returns:
        The array produced by ``np.loadtxt`` for that file.
    """
    return np.loadtxt(file_name, delimiter=',')


# Implementation of Forward pass function
def forward_pass(data_points,W,V,Y) :
    """Run one forward pass and score it against the targets.

    The pass chains the helpers of this file: hidden activations from
    ``compute_Z_values``, output scores from ``compute_O_values``, class
    probabilities from ``compute_softmax`` and finally the cross-entropy
    from ``calculate_error``.

    Args:
        data_points: Input rows (bias column already included).
        W: Input-to-hidden weights.
        V: Hidden-to-output weights.
        Y: Target rows matching ``data_points``.

    Returns:
        Tuple ``(error, O_softmax)``: the cross-entropy error and the
        softmax output for the given rows.
    """
    hidden = compute_Z_values(W, data_points)
    probabilities = compute_softmax(compute_O_values(V, hidden))
    return calculate_error(probabilities, Y), probabilities


# Implementation of Cross Entropy Error Function takes as input O_softmax, Y
def calculate_error(predictions, targets, epsilon=1e-10):
    """Return the mean cross-entropy between predictions and targets.

    For every row the loss is ``-sum(targets * log(predictions))``; the
    returned value is the mean of those per-row losses.

    Args:
        predictions: 2-D array of predicted probabilities, one row per
            sample (e.g. the softmax output).
        targets: 2-D array of the same shape holding the target values.
        epsilon: Lower bound applied to ``predictions`` before taking the
            logarithm. Without it a probability of exactly 0 yields
            ``log(0) = -inf`` and ``0 * -inf = nan``, which poisons the
            whole loss.

    Returns:
        The mean cross-entropy over the rows as a scalar.
    """
    # Floor the probabilities so the logarithm is always finite.
    safe_predictions = np.maximum(np.asarray(predictions, dtype=float), epsilon)
    per_sample_loss = -np.sum(np.multiply(targets, np.log(safe_predictions)), axis=1)
    return np.mean(per_sample_loss, axis=0)


# Implementation of the Softmax function; takes as input the output matrix O
def compute_softmax(output_matrix) :
    """Apply a row-wise softmax to a 2-D score matrix.

    Args:
        output_matrix: 2-D array-like of raw output scores, one row per
            sample. The input is not modified.

    Returns:
        A new float array of the same shape in which every row is
        non-negative and sums to 1.
    """
    scores = np.array(output_matrix, dtype=float)
    if scores.size == 0:
        # Nothing to normalise; avoid reducing over an empty axis.
        return scores
    # Subtracting the row maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing to inf (which would produce nan rows).
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=1, keepdims=True)


# Implementation of Backward pass using Backpropagation Algorithm to calculate V_new, W_new, bias_v
def backward_pass(O_softmax,Y,V,Z,W,X,bias_z):
    """Compute the weight gradients for one batch via backpropagation.

    Args:
        O_softmax: Softmax output of the forward pass for the batch.
        Y: Target rows for the batch.
        V: Hidden-to-output weights.
        Z: Hidden activations for the batch as produced by
            ``compute_Z_values`` (bias unit in column 0).
        W: Input-to-hidden weights. Not used by the computation; kept so
            the call signature stays unchanged.
        X: Input rows for the batch (bias column included).
        bias_z: Column of ones with one entry per batch row.

    Returns:
        Tuple ``(W_new, V_new, bias_v)`` - note the order: the gradient
        with respect to W first, then the gradient with respect to V, then
        the gradient of the output bias.
    """
    V_new, bias_v = gradient_hidden_to_output(O_softmax, Y, Z, bias_z)
    # One gradient row per column of Z, i.e. including the bias unit.
    W_with_bias_row = gradient_input_to_hidden(O_softmax, Y, V, Z, X)
    # compute_Z_values places the bias unit in column 0, so row 0 is the
    # one that has no counterpart in W. Dropping the last row instead (as
    # the hard-coded index 500 did) shifts every gradient row by one hidden
    # unit and only works for exactly 500 hidden units.
    W_new = np.delete(W_with_bias_row, 0, axis=0)
    return W_new, V_new, bias_v


#Implementation of Gradient back propogation from Hidden to output Layer
def gradient_hidden_to_output(O_softmax,Y,Z,bias_z) :
    """Gradient of the loss with respect to the hidden-to-output weights.

    Args:
        O_softmax: Softmax output for the batch.
        Y: Target rows for the batch.
        Z: Hidden activations for the batch.
        bias_z: Bias column for the batch.

    Returns:
        Tuple ``(V_grad, bias_v)`` where ``V_grad`` is
        ``(O_softmax - Y)^T . Z`` and ``bias_v`` is
        ``(O_softmax - Y)^T . bias_z``.
    """
    # Output error, transposed once and reused for both products.
    residual_t = np.subtract(O_softmax, Y).T
    return residual_t.dot(Z), residual_t.dot(bias_z)

#Implementation of Gradient backpropagation from Input to Hidden Layer
def gradient_input_to_hidden(O_softmax,Y,V,Z,X) :
    """Gradient of the loss with respect to the input-to-hidden weights.

    Args:
        O_softmax: Softmax output for the batch.
        Y: Target rows for the batch.
        V: Hidden-to-output weights.
        Z: Hidden (tanh) activations for the batch.
        X: Input rows for the batch.

    Returns:
        Matrix with one row per column of ``Z`` and one column per column
        of ``X``.
    """
    # Output error pushed back through V: one row per sample.
    back_signal = np.dot(np.subtract(O_softmax, Y), V)
    # Derivative of tanh expressed through its output: 1 - tanh^2.
    tanh_slope = 1 - np.square(Z)
    hidden_delta = np.multiply(back_signal, tanh_slope)
    return np.dot(hidden_delta.T, X)




# Function to calculate Z values during forward pass
def compute_Z_values(weights,data_points) :
    """Compute the hidden-layer activations for a batch of inputs.

    Args:
        weights: Input-to-hidden weight matrix, one row per hidden unit.
        data_points: Input rows, one per sample.

    Returns:
        ``tanh(data_points . weights^T)`` with a column of ones prepended
        as column 0 (the bias unit of the hidden layer).
    """
    activations = np.tanh(np.dot(data_points, np.transpose(weights)))
    bias_column = np.ones((len(activations), 1))
    return np.concatenate((bias_column, activations), axis=1)


# Function to Calculate output matrix during forward pass
def compute_O_values(weights,z_values) :
    """Compute the raw output-layer scores for a batch.

    Args:
        weights: Hidden-to-output weight matrix, one row per output unit.
        z_values: Hidden activations, one row per sample.

    Returns:
        ``z_values . weights^T``: one row of output scores per sample.
    """
    weights_t = np.transpose(weights)
    return np.dot(z_values, weights_t)



# Function to prepend the bias term (a column of ones) to the data
def initilaise_weights(data) :
    """Prepend a bias column of ones to a 2-D data array.

    Despite its name this function does not create any weights; it only
    augments ``data``.

    Args:
        data: 2-D NumPy array, one row per sample.

    Returns:
        A new array with one extra leading column filled with 1.0.
    """
    ones_column = np.ones((data.shape[0], 1))
    return np.concatenate((ones_column, data), axis=1)



# To intiliase random weights to Matrices such as W, V
def random_weights(number_of_rows,number_of_columns) :
    """Create a randomly initialised weight matrix.

    Args:
        number_of_rows: Number of rows of the matrix.
        number_of_columns: Number of columns of the matrix.

    Returns:
        Array of shape ``(number_of_rows, number_of_columns)`` holding
        standard-normal samples shifted by -0.5.
    """
    # NOTE(review): the -0.5 shift gives the weights a mean of -0.5; that
    # shift is typical for uniform [0, 1) samples - confirm it is intended
    # for normal samples.
    shape = (number_of_rows, number_of_columns)
    return np.random.standard_normal(size=shape) - 0.5


# To divide the data into test train data
def train_test1_split(X,Y,fraction) :
    """Split the data into train, validation and test parts.

    The same ratio is applied twice: first ``1 - fraction`` of all samples
    is set aside as test data, then ``1 - fraction`` of the remaining
    samples is set aside as validation data. With ``fraction = 0.8`` this
    gives 64% train, 16% validation and 20% test.

    The validation share used to be hard-coded to 0.2, which matched the
    intended "same ratio" behaviour only for ``fraction = 0.8``.

    Args:
        X: Input rows.
        Y: Target rows aligned with ``X``.
        fraction: Share of the samples kept at each of the two splits.

    Returns:
        Tuple ``(data_train_x, data_train_y, validation_data_x,
        validation_data_y, test_data_x, test_data_y)``.
    """
    held_out_share = 1 - fraction

    # First split: carve the test set out of the full data.
    data_train_x, test_data_x, data_train_y, test_data_y = train_test_split(
        X, Y, test_size=held_out_share, random_state=1)

    # Second split: carve the validation set out of what is left.
    data_train_x, validation_data_x, data_train_y, validation_data_y = train_test_split(
        data_train_x, data_train_y, test_size=held_out_share, random_state=1)

    return data_train_x,data_train_y,validation_data_x,validation_data_y,test_data_x,test_data_y


# Shuffle in same order for X,Y
def shuffle(a, b, seed):
    """Shuffle two aligned sequences with the same seeded permutation.

    Args:
        a: First sequence (e.g. the input rows).
        b: Second sequence aligned element-by-element with ``a``.
        seed: Seed that determines the permutation.

    Returns:
        Tuple ``(a_shuffled, b_shuffled)`` of two tuples in which element
        ``i`` of one still belongs to element ``i`` of the other. Two empty
        tuples are returned when there is nothing to shuffle.
    """
    pairs = list(zip(a, b))
    if not pairs:
        # zip(*[]) yields nothing to unpack, so handle empty input here.
        return (), ()
    # A private generator gives the same permutation as seeding the module
    # but does not reseed the global ``random`` state of the caller.
    random.Random(seed).shuffle(pairs)
    shuffled_a, shuffled_b = zip(*pairs)
    return shuffled_a, shuffled_b

if __name__ == "__main__" :
    # Script entry point: trains the MLP for several independent trials and
    # plots the mean / variance of the per-epoch training and validation
    # errors across those trials.

    # Only the entry point needs os (to create the plot directory).
    import os

    data = read_data("data.txt")
    Y = read_data("label.txt")
    X = initilaise_weights(data)   # prepends the bias column to the inputs

    # Hyper-parameters.
    learning_rate = 0.01
    batch_size = 25
    number_of_trials = 5
    number_of_epocs = 100
    number_of_hidden = 500
    # W must match the input width and V the label width, so take both
    # from the data instead of hard-coding 401 and 10.
    number_of_inputs = X.shape[1]
    number_of_outputs = Y.shape[1]

    # Rows [0, train_data_len) are used for training and the following
    # validation_data_len rows for validation (a 0.8 ratio applied twice
    # to 5000 samples); the remaining rows are left untouched as test data.
    train_data_len = 3200
    validation_data_len = 800
    test_data_len = 1000
    validation_end = train_data_len + validation_data_len
    if len(X) < validation_end:
        raise ValueError("data.txt must contain at least " + str(validation_end) + " rows")

    # One entry per (trial, epoch), appended in that order.
    train_error_epoch = []
    validation_error_epoch = []

    print("Started calculating Training error in 5 Trails for each epoch with Batch size = 25 ")
    print("Started calculating Validation error in 5 Trails for each epoch with Batch size = 25 ")
    for k in range(number_of_trials) :
        # Every trial starts from fresh weights and a fresh data order.
        W = random_weights(number_of_hidden, number_of_inputs)
        V = random_weights(number_of_outputs, number_of_hidden + 1)
        seed = random.randint(10000,10000000)
        X,Y = shuffle(X,Y,seed)
        X = np.array(X)
        Y = np.array(Y)
        print("Training Error for 100th epoch for Trail Number : "+str(k+1))
        for j in range(number_of_epocs) :
            error_train = 0.0
            batch_count = 0
            for start in range(0, train_data_len, batch_size) :
                # Never let a batch run past the end of the training rows.
                stop = min(start + batch_size, train_data_len)
                X_batch = X[start:stop, :]
                Y_batch = Y[start:stop, :]
                rows_in_batch = len(X_batch)

                error,O_softmax = forward_pass(X_batch, W, V, Y_batch)
                # Backpropagation needs the hidden activations of THIS batch
                # under the CURRENT weights. forward_pass does not return
                # them, so they are recomputed here; reusing activations
                # computed once before training gives wrong gradients.
                Z_batch = compute_Z_values(W, X_batch)
                bias_z = np.ones((rows_in_batch, 1))
                W_new,V_new,bias_v = backward_pass(O_softmax,Y_batch,V,Z_batch,W,X_batch,bias_z)

                W = W - (learning_rate/rows_in_batch)*W_new
                # Column 0 of Z_batch is the bias unit, so V_new already
                # carries the bias gradient in column 0 and lines up with V
                # column by column. It must not be shifted, and the bias
                # column must not be scaled by the learning rate a second
                # time.
                V = V - (learning_rate/rows_in_batch)*V_new
                error_train += error
                batch_count += 1

            # forward_pass already returns a per-row mean, so the validation
            # error needs no further division.
            error_validation, O_softmax = forward_pass(
                X[train_data_len:validation_end, :], W, V, Y[train_data_len:validation_end, :])
            # Average of the per-batch mean errors seen during this epoch.
            error_train = error_train/batch_count
            print("Training error after  epoch : "+str(j+1)+" here every batch size = 25")
            print(error_train)
            print("Validation error after  epoch : "+str(j+1)+" here batch size = 25")
            print(error_validation)
            train_error_epoch.append(error_train)
            validation_error_epoch.append(error_validation)
    print("\n")
    print("Final Training Errors after 5 trails and 100 Epocs : ")
    print(train_error_epoch)
    print("\n")
    print("Final Validation Errors after 5 trails and 100 Epocs : ")
    print(validation_error_epoch)

    # One row per trial, one column per epoch; the statistics are taken
    # across trials for every epoch.
    train_error_epoch = np.reshape(train_error_epoch,(number_of_trials,number_of_epocs))
    validation_error_epoch = np.reshape(validation_error_epoch,(number_of_trials,number_of_epocs))
    mean_training = np.mean(train_error_epoch, axis=0)
    mean_validation = np.mean(validation_error_epoch,axis=0)
    variance_training = np.var(train_error_epoch,axis=0)
    variance_validation = np.var(validation_error_epoch,axis=0)
    epochs = list(range(1, number_of_epocs+1))

    print("\n")
    print("Plots are started to Generate In Figures Folder : ")
    # savefig does not create missing directories.
    os.makedirs("./figures", exist_ok=True)

    # Combined plot: mean training and mean validation error together.
    plt.plot(epochs,mean_training, color='red', label='Training')
    plt.xlabel("Epoch values")
    plt.plot(epochs, mean_validation,color='blue', label='Validation')
    plt.ylabel("Training,Validation Error values")
    plt.title("Plot for Training,Validation Error Vs Epochs")
    plt.legend(loc='best')
    location = "./figures/lab3.a_TrainingAndValidationError" + ".png"
    plt.savefig(location)
    plt.close()

    # Four single-curve plots: (values, colour, legend label, y label,
    # title, output file).
    single_curve_plots = [
        (mean_training, 'red', 'Training',
         "Mean Training Error values",
         "Plot for Mean Training Error Vs Epochs",
         "./figures/lab3.a_MeanTrainingError" + ".png"),
        (mean_validation, 'blue', 'Validation',
         " Mean Validation Error values",
         "Plot for Mean Validation Error Vs Epochs",
         "./figures/lab3.a_MeanValidationError" + ".png"),
        (variance_training, 'red', 'Training',
         " Variance Training Error values",
         "Plot for Variance Training Error Vs Epochs",
         "./figures/lab3.a_VarianceTrainingError" + ".png"),
        (variance_validation, 'blue', 'Validation',
         "Variance Validation Error values",
         "Plot for Variance Validation Error Vs Epochs",
         "./figures/lab3.a_VarianceValidationError" + ".png"),
    ]
    for values, curve_color, curve_label, y_label, plot_title, location in single_curve_plots :
        plt.plot(epochs, values, color=curve_color, label=curve_label)
        plt.xlabel("Epoch values")
        plt.ylabel(y_label)
        plt.title(plot_title)
        plt.legend(loc='best')
        plt.savefig(location)
        plt.close()
    print("\n")
    print("Plots are Generated Successfully In Figures folder")

