In [1]:
import numpy as np 
class NeuralNetwork:
    """Three-layer (input, hidden, output) feed-forward network with sigmoid activations.

    Samples are passed as flat sequences (or as rows of a 2-D sequence) and are
    turned into column vectors internally, so every weight matrix has the shape
    (nodes in next layer, nodes in previous layer).
    """

    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and draw the initial weights."""
        self.inp_nodes = input_nodes
        self.hid_nodes = hidden_nodes
        self.out_nodes = output_nodes
        self.lr = learning_rate
        # Initial weights are sampled from a normal distribution with mean 0.0 and
        # a standard deviation of (number of nodes) ** -0.5.
        # Input -> hidden weights have shape (hid_nodes, inp_nodes) so that
        # np.dot(weights, inputs) maps an input column onto a hidden column.
        self.input_weights = np.random.normal(
            0.0, pow(self.hid_nodes, -0.5), (self.hid_nodes, self.inp_nodes))
        # Hidden -> output weights, applied before the final activation function.
        self.output_weights = np.random.normal(
            0.0, pow(self.out_nodes, -0.5), (self.out_nodes, self.hid_nodes))

    def sigmoid(self, inputs):
        """Return the element-wise logistic function 1 / (1 + exp(-inputs))."""
        return 1 / (1 + np.exp(-inputs))

    def train(self, input_list, target_list):
        """Run one forward pass and one gradient-descent weight update.

        Parameters:
            input_list: input values for one sample (flat sequence).
            target_list: desired output values for the same sample.

        The network outputs of this call are kept in ``self.final_outputs`` so
        that ``cost_function`` can be evaluated afterwards.
        """
        inputs = np.array(input_list, ndmin=2).T
        targets = np.array(target_list, ndmin=2).T

        # Forward pass.
        hidden_output = self.sigmoid(np.dot(self.input_weights, inputs))
        self.final_outputs = self.sigmoid(np.dot(self.output_weights, hidden_output))

        # Output error, and that error shared back across the hidden -> output links.
        # hidden_errors is computed before output_weights is modified below.
        output_errors = targets - self.final_outputs
        hidden_errors = np.dot(self.output_weights.T, output_errors)

        # The sigmoid derivative expressed through its own output is a * (1 - a);
        # hidden_output and final_outputs already ARE sigmoid outputs, so sigmoid
        # must not be applied to them a second time.
        self.input_weights += self.lr * np.dot(
            hidden_errors * hidden_output * (1.0 - hidden_output), inputs.T)
        self.output_weights += self.lr * np.dot(
            output_errors * self.final_outputs * (1.0 - self.final_outputs),
            hidden_output.T)

    def query(self, input_list):
        """Forward pass only: return the output-layer activations as a column array."""
        inputs = np.array(input_list, ndmin=2).T
        # Leaving the hidden layer, the weighted sums go through the activation function.
        hidden_outs = self.sigmoid(np.dot(self.input_weights, inputs))
        # The hidden outputs are weighted and summed again for the output layer,
        # then passed through the activation function once more.
        return self.sigmoid(np.dot(self.output_weights, hidden_outs))

    def performance(self, test_data_list):
        """Return the fraction of records whose highest output matches their label.

        Parameters:
            test_data_list: iterable of comma-separated strings; the first value
                is the integer label, the remaining values are pixel intensities
                in 0-255.

        Returns 0.0 when there are no (non-blank) records.
        """
        scorecard = []
        for record in test_data_list:
            record = record.strip()
            if not record:
                # Blank line, e.g. a trailing newline at the end of the file.
                continue
            all_values = record.split(',')
            # The correct answer is the first value.
            correct_label = int(all_values[0])
            # Scale and shift the inputs from 0-255 into 0.01-1.0.
            inputs = (np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99) + 0.01
            outputs = self.query(inputs)
            # The index of the highest output is the predicted label.
            label = np.argmax(outputs)
            scorecard.append(1 if label == correct_label else 0)

        if not scorecard:
            return 0.0
        # The score is computed once, after every record has been evaluated.
        scorecard_array = np.asarray(scorecard)
        return scorecard_array.sum() / scorecard_array.size

    def cost_function(self, target):
        """Return half the summed squared error of the last ``train`` call as a float.

        ``target`` must be in the same format as the ``target_list`` given to
        ``train``; it is converted to a column so it lines up with
        ``self.final_outputs`` instead of broadcasting against it.
        """
        targets = np.array(target, ndmin=2).T
        return 0.5 * float(np.sum((self.final_outputs - targets) ** 2))
        

In [None]:
from os.path import expanduser
import matplotlib.pyplot as plot 
home = expanduser("~")
home = home.replace("\\","/")+"/"
# Read every training record; the with-block closes the file once the lines are in memory.
with open(home+"Downloads/Python_Artificial/MNIST_dataset/mnist_train.csv") as train_file:
    data = train_file.readlines()
# Preview the first record: the first value is the label, the rest are the 28x28 pixels.
all_values = data[0].split(",")
# np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the equivalent call.
image_array = np.asarray(all_values[1:], dtype=float).reshape(28,28)
# 'gray' is accepted by every Matplotlib release; the 'grey' spelling only by newer ones.
plot.imshow(image_array, cmap='gray', interpolation=None)
# Weight initialization rule of thumb: if each node has 3 links into it, the
# initial weights should be in the range 1/(√3) = 0.577. If each node has 100
# incoming links, the weights should be in the range 1/(√100) = 0.1.

In [None]:

# Scale the raw inputs from 0-255 into 0.01-1.0 and train the network on the dataset.
input_nodes = 784
hidden_nodes = 100
output_nodes = 10
learning_rate = 0.02  # It can be anything between 0.01 and 0.5; finding the sweet spot takes time.
n = NeuralNetwork(input_nodes, hidden_nodes, output_nodes, learning_rate)
# n.query([1, 0.5, -1.5]) would simply show the outputs produced by the random initial weights.
# A query is exactly the same as a forward pass through the network.
# Training of the actual network:
training_data = data
cost = []
for i in range(10000):
    print(i)
    for record in training_data:
        all_values = record.split(",")
        # Skip blank or truncated records before trying to parse their values.
        if len(all_values) - 1 < input_nodes:
            continue
        # Divide by 255 first and then multiply by 0.99, so the result spans
        # 0.01-1.0; dividing by (255 * 0.99) would overshoot 1.0.
        inputs = (np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99) + 0.01
        # NOTE(review): the off-target value is 0.1 here; 0.01 would mirror the
        # lower bound used for the inputs - confirm which one is intended.
        targets = np.zeros(output_nodes) + 0.1
        targets[int(all_values[0])] = 0.99
        n.train(inputs, targets)
        # One cost value is recorded per trained record.
        cost.append(n.cost_function(targets))
            
           

In [None]:
# Read the small test set; the with-block closes the file once the lines are in memory.
with open(home+"Downloads/Python_Artificial/MNIST_dataset/mnist_test_10.csv") as test_file:
    test_data = test_file.readlines()
# First record: the first value is the label, the rest are the 28x28 pixels.
test_values = test_data[0].split(",")
# np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the equivalent call.
img_array = np.asarray(test_values[1:], dtype=float).reshape(28,28)
# 'gray' is accepted by every Matplotlib release; the 'grey' spelling only by newer ones.
plot.imshow(img_array, cmap="gray", interpolation=None)
# Scale 0-255 into 0.01-1.0: divide by 255 first, then multiply by 0.99.
# The scaled pixels get their own name so test_data keeps holding the raw records.
test_inputs = (np.asarray(test_values[1:], dtype=float) / 255.0 * 0.99) + 0.01
output = n.query(test_inputs)
# The index of the highest output is the predicted digit.
print(np.argmax(output))


In [None]:
#Reversing a neural net to understand its mind

In [None]:
plot.grid()
# Use one x position per recorded cost value; a hard-coded length raises a
# shape-mismatch error whenever len(cost) differs from it.
plot.plot(range(len(cost)), cost)

plot.title('Cost Function')
plot.xlabel('Training Iterations')
plot.ylabel('Cost')

In [7]:
#Adagrad Optimizer
class NeuralNetworkWithAdagrad:
    """One-hidden-layer classifier (ReLU hidden layer, softmax output) trained with Adagrad.

    Inputs and targets are passed as flat sequences for a single sample, or as
    2-D sequences with one sample per row. Internally every sample is a column,
    so weights have shape (next layer, previous layer) and biases (nodes, 1).
    """

    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the sizes and learning rate; create weights, biases and accumulators."""
        # Layer sizes and step size.
        self.inp_nodes = input_nodes
        self.hid_nodes = hidden_nodes
        self.out_nodes = output_nodes
        self.lr = learning_rate
        # Small constant that keeps the Adagrad denominator away from zero.
        self.eps = 1e-8

        # Weights and biases. Biases are columns so they add onto the column
        # activations without broadcasting them into square matrices.
        self.inptohid_weights = np.random.normal(
            0.0, pow(self.hid_nodes, -0.5), (self.hid_nodes, self.inp_nodes))
        self.bias1 = np.zeros((self.hid_nodes, 1))
        self.hidtoout_weights = np.random.normal(
            0.0, pow(self.out_nodes, -0.5), (self.out_nodes, self.hid_nodes))
        self.bias2 = np.zeros((self.out_nodes, 1))

        # Adagrad state: running sum of squared gradients for each parameter.
        # Each accumulator belongs to the parameter it is shaped like
        # (grad_inputtohid_W2 -> inptohid_weights, grad_hidtoout_W1 -> hidtoout_weights).
        self.grad_inputtohid_W2 = np.zeros_like(self.inptohid_weights)
        self.grad_bias1 = np.zeros_like(self.bias1)
        self.grad_hidtoout_W1 = np.zeros_like(self.hidtoout_weights)
        self.grad_bias2 = np.zeros_like(self.bias2)

    def train(self, X, Y):
        """Do one forward pass, backward pass and Adagrad update.

        Returns the cross-entropy loss measured before the update.
        """
        self.forward_pass(X)
        loss = self.cost_function(Y)
        self.Adagrad_Update(*self.backward_pass(Y, X))
        return loss

    def forward_pass(self, X):
        """Compute and return the class probabilities, one column per sample.

        The intermediate values z1, a1 and z2 are kept on the instance because
        ``backward_pass`` needs them.
        """
        x = np.array(X, ndmin=2, dtype=float).T
        self.z1 = np.dot(self.inptohid_weights, x) + self.bias1
        self.a1 = self.relU(self.z1)
        self.z2 = np.dot(self.hidtoout_weights, self.a1) + self.bias2
        self._pred_y = self.softmax(self.z2)
        return self._pred_y

    def backward_pass(self, Y, X):
        """Return the gradients (dW1, db1, dW2, db2) of the cross-entropy loss.

        Must be called after ``forward_pass`` with the same ``X``; ``Y`` holds
        the targets in the same row-per-sample layout as ``X``.

        Gradient descent subtracts these gradients from the parameters: a
        positive gradient means increasing the parameter increases the loss.
        """
        x = np.array(X, ndmin=2, dtype=float).T
        true_y = np.array(Y, ndmin=2, dtype=float).T

        # For a softmax output combined with cross-entropy loss the chain rule
        # dL/da2 * da2/dz2 collapses to (a2 - y).
        delta2 = self._pred_y - true_y
        # dz2/dW2 = a1, hence dL/dW2 = delta2 . a1^T.
        dW2 = np.dot(delta2, self.a1.T)
        # Sum over samples (columns); keeps the (out_nodes, 1) bias shape.
        db2 = np.sum(delta2, axis=1, keepdims=True)

        # Send delta2 back through the hidden -> output weights and gate it with
        # the ReLU derivative: delta1 = (W2^T . delta2) * relu'(z1).
        delta1 = np.dot(self.hidtoout_weights.T, delta2) * self.relU_derivative(self.z1)
        # dz1/dW1 = x, hence dL/dW1 = delta1 . x^T.
        dW1 = np.dot(delta1, x.T)
        db1 = np.sum(delta1, axis=1, keepdims=True)
        return dW1, db1, dW2, db2

    def relU(self, X):
        """Element-wise max(0, X)."""
        return np.maximum(0, X)

    def relU_derivative(self, X):
        """Element-wise derivative of ReLU: 1 where X > 0, otherwise 0."""
        return np.where(X > 0, 1, 0)

    def softmax(self, z2):
        """Softmax over the class axis (axis 0), so every column sums to 1."""
        # Subtracting the column maximum leaves the result unchanged and keeps
        # np.exp from overflowing.
        shifted = z2 - np.max(z2, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def cost_function(self, y_true):
        """Cross-entropy between ``y_true`` and the last ``forward_pass`` prediction."""
        y = np.array(y_true, ndmin=2, dtype=float).T
        # Clip so that log never receives an exact zero.
        return -np.sum(y * np.log(np.clip(self._pred_y, 1e-12, 1.0)))

    def Adagrad_Update(self, dW1, db1, dW2, db2):
        """Accumulate the squared gradients and take one Adagrad step per parameter."""
        # dW1/db1 belong to the input -> hidden layer, dW2/db2 to hidden -> output.
        self.grad_inputtohid_W2 += dW1 ** 2
        self.grad_bias1 += db1 ** 2
        self.grad_hidtoout_W1 += dW2 ** 2
        self.grad_bias2 += db2 ** 2

        # Each step is scaled by the root of that parameter's accumulated squares.
        self.inptohid_weights -= self.lr * dW1 / (np.sqrt(self.grad_inputtohid_W2) + self.eps)
        self.bias1 -= self.lr * db1 / (np.sqrt(self.grad_bias1) + self.eps)
        self.hidtoout_weights -= self.lr * dW2 / (np.sqrt(self.grad_hidtoout_W1) + self.eps)
        self.bias2 -= self.lr * db2 / (np.sqrt(self.grad_bias2) + self.eps)

        
        

In [5]:
#Adam Optimizer
class NeuralNetworkwithAdamOptimizer:
    """One-hidden-layer classifier (ReLU hidden layer, softmax output) trained with Adam.

    Inputs and targets are passed as flat sequences for a single sample, or as
    2-D sequences with one sample per row. Internally every sample is a column,
    so weights have shape (next layer, previous layer) and biases (nodes, 1).
    """

    def __init__(self,input_nodes,output_nodes,hidden_nodes,learning_rate):
        """Store the sizes and learning rate; create parameters and Adam state.

        NOTE: the positional order is (input, output, hidden, learning_rate);
        ``output_nodes`` comes BEFORE ``hidden_nodes``. The order is kept as it
        was so existing calls keep their meaning.
        """
        self.inp_nodes = input_nodes
        self.hid_nodes = hidden_nodes
        self.out_nodes = output_nodes
        # Kept on the instance so Adam_Optimizer does not rely on a global name.
        self.lr = learning_rate

        # Biases are columns so they add onto the column activations without
        # broadcasting them into square matrices.
        self.W1 = np.random.normal(0.0, pow(self.hid_nodes, -0.5), (self.hid_nodes, self.inp_nodes))
        self.b1 = np.zeros((self.hid_nodes, 1))
        self.W2 = np.random.normal(0.0, pow(self.out_nodes, -0.5), (self.out_nodes, self.hid_nodes))
        self.b2 = np.zeros((self.out_nodes, 1))

        # Adam hyper-parameters.
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.epsilon = 1e-8
        # First (m*) and second (v*) moment estimates for every parameter.
        self.mW1 = np.zeros_like(self.W1)
        self.vW1 = np.zeros_like(self.W1)
        self.mb1 = np.zeros_like(self.b1)
        self.vb1 = np.zeros_like(self.b1)
        self.mW2 = np.zeros_like(self.W2)
        self.vW2 = np.zeros_like(self.W2)
        self.mb2 = np.zeros_like(self.b2)
        self.vb2 = np.zeros_like(self.b2)
        # Number of Adam steps taken so far; used for bias correction.
        self.t = 0

    def train(self, X, Y):
        """Do one forward pass, backward pass and Adam update.

        Returns the cross-entropy loss measured before the update.
        """
        y_pred = self.forward_pass(X)
        loss = self.cost_function(Y, y_pred)
        self.Adam_Optimizer(*self.backward_propagation(Y, y_pred, X))
        return loss

    def relu(self, X):
        """Element-wise max(0, X)."""
        return np.maximum(0, X)

    def relu_der(self, X):
        """Element-wise derivative of ReLU: 1 where X > 0, otherwise 0."""
        return np.where(X > 0, 1, 0)

    def softmax(self, X):
        """Softmax over the class axis (axis 0), so every column sums to 1."""
        # Subtracting the column maximum leaves the result unchanged and keeps
        # np.exp from overflowing.
        shifted = X - np.max(X, axis=0, keepdims=True)
        e_X = np.exp(shifted)
        return e_X / np.sum(e_X, axis=0, keepdims=True)

    def softmax_der(self, y_pred):
        """Jacobian of softmax for ONE probability vector: diag(s) - s . s^T."""
        softmax_output = np.asarray(y_pred, dtype=float).reshape(-1, 1)
        return np.diagflat(softmax_output) - np.dot(softmax_output, softmax_output.T)

    def cost_function(self, y_true, y_pred):
        """Cross-entropy between the targets and the output of ``forward_pass``."""
        y = np.array(y_true, ndmin=2, dtype=float).T
        # Clip so that log never receives an exact zero.
        return -np.sum(y * np.log(np.clip(y_pred, 1e-12, 1.0)))

    def forward_pass(self, X):
        """Compute and return the class probabilities, one column per sample.

        z1 and a1 are kept on the instance because ``backward_propagation``
        needs them.
        """
        X = np.array(X, ndmin=2, dtype=float).T
        self.z1 = np.dot(self.W1, X) + self.b1
        self.a1 = self.relu(self.z1)
        z2 = np.dot(self.W2, self.a1) + self.b2
        return self.softmax(z2)

    def backward_propagation(self, y_true, y_pred, X):
        """Return the gradients (dW1, db1, dW2, db2) of the cross-entropy loss.

        Must be called after ``forward_pass`` with the same ``X``; ``y_pred`` is
        that call's return value and ``y_true`` uses the same row-per-sample
        layout as ``X``.
        """
        X = np.array(X, ndmin=2, dtype=float).T
        y_true = np.array(y_true, ndmin=2, dtype=float).T

        # Softmax + categorical cross-entropy: dL/dz2 = y_pred - y_true.
        delta2 = y_pred - y_true
        # 1. dL/dW2 = (dL/dy_pred)*(dy_pred/dz2)*(dz2/dW2) = delta2 . a1^T
        dW2 = np.dot(delta2, self.a1.T)
        # Sum over samples (columns); keeps the (out_nodes, 1) bias shape.
        db2 = np.sum(delta2, axis=1, keepdims=True)
        # 2. Back-propagate the error: delta1 = (W2^T . delta2) * relu'(z1)
        delta1 = np.dot(self.W2.T, delta2) * self.relu_der(self.z1)
        dW1 = np.dot(delta1, X.T)
        db1 = np.sum(delta1, axis=1, keepdims=True)
        return dW1, db1, dW2, db2

    def Adam_Optimizer(self, dW1, db1, dW2, db2):
        """Take one Adam step on every parameter using the given gradients."""
        self.t += 1
        # Both correction factors depend only on the step count.
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t

        # Input -> hidden layer: update the moment estimates, then step.
        self.mW1 = self.beta1 * self.mW1 + (1 - self.beta1) * dW1
        self.vW1 = self.beta2 * self.vW1 + (1 - self.beta2) * (dW1 ** 2)
        self.mb1 = self.beta1 * self.mb1 + (1 - self.beta1) * db1
        self.vb1 = self.beta2 * self.vb1 + (1 - self.beta2) * (db1 ** 2)
        mW1_hat = self.mW1 / correction1
        vW1_hat = self.vW1 / correction2
        mb1_hat = self.mb1 / correction1
        vb1_hat = self.vb1 / correction2
        self.W1 -= self.lr * mW1_hat / (np.sqrt(vW1_hat) + self.epsilon)
        self.b1 -= self.lr * mb1_hat / (np.sqrt(vb1_hat) + self.epsilon)

        # Hidden -> output layer.
        self.mW2 = self.beta1 * self.mW2 + (1 - self.beta1) * dW2
        self.vW2 = self.beta2 * self.vW2 + (1 - self.beta2) * (dW2 ** 2)
        self.mb2 = self.beta1 * self.mb2 + (1 - self.beta1) * db2
        self.vb2 = self.beta2 * self.vb2 + (1 - self.beta2) * (db2 ** 2)
        mW2_hat = self.mW2 / correction1
        vW2_hat = self.vW2 / correction2
        mb2_hat = self.mb2 / correction1
        vb2_hat = self.vb2 / correction2
        self.W2 -= self.lr * mW2_hat / (np.sqrt(vW2_hat) + self.epsilon)
        self.b2 -= self.lr * mb2_hat / (np.sqrt(vb2_hat) + self.epsilon)

        return
