<a href="https://colab.research.google.com/github/Petrichoeur/Neural_Net_from_scratch/blob/master/NN_Adam_optimizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import 

In [0]:
    import numpy as np 
    import pandas as pd  
    from collections import OrderedDict

## Neural Network from scratch

In [0]:
def relu_prime(x): 
    ''' Useful function for later 
     All derivative are kinda easy , but Relu derivative is not well fit for numpy wise simple calculation , 
     so i have to huse a pre_made function to make it easier ''' 
    tmp=x
    tmp[x<=0] = 0
    tmp[x>0] = 1
    return tmp 







class DenseLayer() : 

    def __init__(self,size,activation='tanh',out=False):  
        
        self.out= out # For make some prediction later
        self.size = size   # Size of the layer
        self.activation=activation # Type of activation
        self.weights=None  # Weights
        self.weights_shape=None # Input_shape of weights
        self.delta=None   # Delta for backward pass
        self.output=None  # Output activation for the current layer 
        self.bias=None   # Bias 
        self.derivative=None # Derivative of the output activation 
        self.momentum= [0,0]
        self.momentum_bias= [0,0]
      
    def weight_init(self): 
        np.random.seed(0) # Seed so we always have the same initialisation, so we can compare different activation function and different optimizer
        self.weights=2*np.random.rand(self.weights_shape,self.size)-1  # Init the weights beetween [-1,1]
        self.bias=np.ones((1,self.size)) 

    def Output(self,input_):  
        '''Input_ = Input_ from the previous layer for forward pass    
           At each pass the output is calculated but the derivative of the output too  
           It's a way of having always the specific activation and derivative function for each layer. 
             '''
        self.function=self.activation  # Wich type of activation we will use 
        #self.output= 1/(1+np.exp(-(np.dot(input_,self.weights)+self.bias))).reshape(1,self.size) 
        self.dot_activation=np.dot(input_,self.weights)+self.bias # We get the input dot the weights with add of the bias 

        if self.function=='tanh' :    # Hyperbolique tangente 
            self.output=np.tanh(self.dot_activation)  
            self.derivative= (1-self.output**2)
        elif self.function =='sigmoid' : # Sigmoid
            self.output = 1/(1+np.exp(-1.0*self.dot_activation))  
            self.derivative = self.output*(1-self.output)
        #elif self.function == 'Relu' :  # Rectified Linear Unit
         #   self.output= np.maximum(self.dot_activation,0)  
          #  self.derivative=relu_prime(self.output)    # Some issues with that one !
        elif self.function == 'Lecun_tanh':   # Lecun hyperbolic tangente activation 
            self.output= 1.7159*np.tanh((2/3)*self.dot_activation)
            self.derivative =1.7159*(2/3)*(1-((self.output / 1.7159)**2)) 
        elif self.function=='nothing': 
            self.output = self.dot_activation 
            self.derivative= 1  
        else :  
             raise ValueError('Activation function unknown')
         


class NeuralNet():   # Class for Neural Net , It initialize the neural net when call .
    def __init__(self,input_shape,batch_size=1): 
        self.input_shape=input_shape # Shape of the input
        self.batch_size = 1  #Batch size for further implementation of Mini Batch Descend and other optimizer than SGD
        self.layers={}  # A dic of Layer, better than list for memory use 
        self.input_layer=np.empty((1,input_shape)) # Input_layer size .
        self.nn_size=0   # Number of Layers, Input doesn't count.
        self.layer_shape=[input_shape] # Shape of each layer 
        self.forward_count=0 # To keep the count , just in case 
        self.pred=None  # To get the output after a forward pass
        self.output_error=None # The error to minimize  
        self.backward_count=0 # Number of backward pass  

    def add(self,layer): 
        self.layers[self.nn_size]=layer   # We add a layer
        self.layers[self.nn_size].weights_shape=self.layer_shape[self.nn_size] # We initialize the shape to match other layers 
        self.layers[self.nn_size].weight_init() # We initialize the weights .
        self.layer_shape.append(self.layers[self.nn_size].size) # We keep the size for next layer use .
        self.nn_size += 1  # We are getting bigger !!!!
    
    def forward(self,X,return_=False): 
        self.forward_count += 1 # Keep the count ! 
        self.input_layer=X # First input layer is the data to use 
        self.layers=OrderedDict(self.layers.items()) # You don't have to, but it's a way to preventing some index mistakes.
        for i in range(self.nn_size) :   # Loop for passing the information through the network 
            if i==0:
                self.layers[i].Output(self.input_layer)  # We calculate the first output
            else : 
                self.layers[i].Output(self.layers[i-1].output) # We calculate the output depending the previous output 
        self.pred=self.layers[self.nn_size-1].output[0]  # The last outputs, aka the prediction.
        if return_ == True :  # If you want to keep only the prediction
            return  self.pred 
    def adam_op(self,momentum,grad,timestep,beta1=0.9,beta2=0.99,eps=1e-8): # function to calculate the update-rule with adam optimisation
        m1=momentum[0] # First order  momentum  || Moving average of the gradient 
        m2=momentum[1] # Second order momentum, || Uncentered variance of the gradient
        m1_ = beta1*m1 + (1-beta1)*grad	 # Update  and calculation of the moving average depending the last iteration
        m2_ = beta2*m2 + (1-beta2)*np.square(grad) # Update  and calculation of the uncentered variance depending the last iteration
        m1_hat = m1_/(1-(beta1**timestep))	# Bias correction for the estimation of the first order  momentum 
        m2_hat = m2_/(1-(beta2**timestep))	# Bias correction for the estimation of the second order  momentum 		 

        return m1_,m2_,np.divide(m1_hat,np.sqrt(m2_hat)+eps)
    def Backward(self,y,X_train,alpha=0.02,optimizer='SGD'): 
        ''' FOr this function i won't go into mathematics details because you can easily find all the 
        maths you need on backward pass , gradient descend and weights updates. All you have to remember is : 
         for each layer , the delta is :  
                Layer_error =(delta of the next layer dot the weights of the actual layer ) 
              (Not exactly but i use this for  my algorithm)  Layer _ delta =   Layer_error *( the derivative of the output of the previous layer )  
              ( It eventually do the same stuff as basic SGD, but i keep the bias of the previous layer in each layer , so i have to do it differently)
                update_on_the_weights_of_the_actual_layer == The weights + learning_rate*Previous_layers_output*Next_layers_delta  
                
                '''
        self.X_train = X_train # Crazy train on the station !!! The data we will use for backward pass
        self.output_error= y-self.pred  # The error on prediction.
        self.output_delta=self.output_error*self.layers[self.nn_size-1].derivative # The delta of the output !!
        for i in range(self.nn_size-1,0,-1):  # Reversed range , we start from the end
            if i==self.nn_size-1:
                self.layers[i].error=self.output_delta.dot(self.layers[i].weights.T)  #First layer error is special, output_delta it is 
            else : 
                self.layers[i].error=self.layers[i+1].delta.dot(self.layers[i].weights.T) #others layers errors act the same , mainstream layers .
            self.layers[i].delta=self.layers[i].error*self.layers[i-1].derivative  # We get the delta values for each layers
        if optimizer == 'SGD':
            for i in range(self.nn_size-1) :  
                if i ==0 : 
                    self.layers[i].weights += alpha*self.X_train.T.dot(self.layers[i+1].delta) # Update the weights , special because it's the input
                    self.layers[i].bias += alpha*self.layers[i+1].delta # Update the bias , special too
                elif i ==self.nn_size-1 :  
                    self.layers[i].weights +=alpha*self.layers[i-1].output.T.dot(self.output_delta) # Update the weights, special because 
                                                                                                                #it's the end of the backwardpass journey
                    self.layers[i].bias += alpha*self.output_delta # Update the bias 
                else : 
                    self.layers[i].weights += alpha*self.layers[i-1].output.T.dot(self.layers[i+1].delta)  # Update the weights
                    self.layers[i].bias += alpha*self.layers[i+1].delta # Update the bias  
        if optimizer =='Adam':   
            
            for i in range(self.nn_size-1):  
                m =self.layers[i].momentum  # We take the momentums from weights
                m_b= self.layers[i].momentum_bias # We take the momentums from bias 
                timestep=self.backward_count # numbers of iterations done on the train data set .
                
                if i ==0 : 

                    m[0],m[1],ADAM= self.adam_op(m,self.X_train.T.dot(self.layers[i+1].delta),timestep) # Get the new momentum and the update function for weights
                    m_b[0],m_b[1],ADAM_bias= self.adam_op(m_b,self.layers[i+1].delta,timestep) # Get the new momentum and the update function for bias 
                    self.layers[i].weights += alpha*ADAM# Update the weights , special because it's the input
                    self.layers[i].bias += alpha*ADAM_bias # Update the bias , special too   

                  
                                
                elif i ==self.nn_size-1 : 
                    m[0],m[1],ADAM= self.adam_op(m,self.layers[i-1].output.T.dot(self.output_delta),timestep) # Get the new momentum and the update function for weights
                    m_b[0],m_b[1],ADAM_bias=self.adam_op(m_b,self.output_delta,timestep)  # Get the new momentum and the update function for bias 
                    self.layers[i].weights +=alpha*ADAM # Update the weights, special because 
                                                                    #it's the end of the backwardpass journey
                    self.layers[i].bias += alpha*ADAM_bias # Update the bias 

                else : 
                    m[0],m[1],ADAM= self.adam_op(m,self.layers[i-1].output.T.dot(self.layers[i+1].delta),timestep)# Get the new momentum and the update function for weights
                    m_b[0],m_b[1],ADAM_bias= self.adam_op(m_b,self.layers[i+1].delta,timestep)  # Get the new momentum and the update function for bias 


                    self.layers[i].weights += alpha*ADAM  # Update the weights
                    self.layers[i].bias += alpha*ADAM_bias # Update the bias




    def train(self,y,X,epoch=5, optimizer='SGD',history='False',alpha=0.01):  # Okay so , we can train now 
        if optimizer =='SGD':
            for _ in range(epoch): # Number of epochs or how many time the entire train set is being used for minimizing the error .
                for idx,el in enumerate(X): 
                        el=el.reshape(1,X.shape[1]) # Reshape, i don't want shape errors .
                        self.forward(el) # One pass forward
                        self.Backward(y[idx],el,alpha=alpha) # One pass backward   
        if optimizer=='ADAM':  
            for _ in range(epoch): # Number of epochs or how many time the entire train set is being used for minimizing the error . 
                self.backward_count += 1  # increment the number of backward pass done .
                for idx,el in enumerate(X): 
                    
                    el=el.reshape(1,X.shape[1]) # Reshape, i don't want shape errors .
                    self.forward(el) # One pass forward 
                    self.Backward(y[idx],el,alpha=alpha,optimizer='Adam')  #One pass backward but with adam_optimizer
                

                
                    


            

In [0]:
test = NeuralNet(5) # Input_size = 5
test.add(DenseLayer(15,activation='Lecun_tanh')) # I want to try a Lecun activation Function ! 
test.add(DenseLayer(10,activation='Lecun_tanh'))# Lecun_tanh is great 
test.add(DenseLayer(1,activation='Lecun_tanh',out=True))


test2 = NeuralNet(5) # Input_size = 5
test2.add(DenseLayer(15,activation='Lecun_tanh')) # I want to try a Lecun activation Function ! 
test2.add(DenseLayer(10,activation='Lecun_tanh'))# Lecun_tanh is great 
test2.add(DenseLayer(1,activation='Lecun_tanh',out=True))

In [0]:
from sklearn.datasets import make_classification # for generating a data set 
from sklearn.model_selection import train_test_split # For getting validation data and shuffling the data too
X, Y = make_classification(n_samples=1000 ,n_features=5, n_redundant=0, n_informative=2,
                             n_classes=2)  # we get X with features and Y with label
x_train,x_test,y_train,y_test = train_test_split(X,Y,shuffle=True,test_size=0.3) # Splitting for train and test 

In [0]:

test.train(y_train,x_train,epoch=5,optimizer='ADAM')  # Let's train our data with ADAM  with 5 epochs !! 
test2.train(y_train,x_train,epoch=5,optimizer='SGD') # Let's train our data with SGD for compare  with 5 epochs !! 


In [0]:
def pred_to_binary(x): 
    for ind,val in enumerate(x): 
        if val >= 0.5:
            x[ind]= 1 
        else : 
            x[ind]=0
    return x


In [138]:

testy=[] 
for i in range(len(x_test)):
    prediction= test.forward(x_test[i],return_=True)
    testy.append(prediction) 
testy=pred_to_binary(testy)
from sklearn.metrics import classification_report  


testy2=[] 
for i in range(len(x_test)):
    prediction= test2.forward(x_test[i],return_=True)
    testy2.append(prediction) 
testy2=pred_to_binary(testy2)
from sklearn.metrics import classification_report 


print("===========================================")
print("With Adam optimization")
print("===========================================")
print(classification_report(y_test,np.round(testy).reshape(300)))   
print("===========================================")
print("With Stochastic Gradient Descent optimization")
print("===========================================")
print(classification_report(y_test, np.round(testy2).reshape(300)))

With Adam optimization
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       144
           1       0.93      0.96      0.95       156

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.94      0.94      0.94       300

With Stochastic Gradient Descent optimization
              precision    recall  f1-score   support

           0       0.88      0.93      0.91       144
           1       0.93      0.88      0.91       156

    accuracy                           0.91       300
   macro avg       0.91      0.91      0.91       300
weighted avg       0.91      0.91      0.91       300



# Conclusion on Adam Optimizer 
You can see that on the same amount of epochs (5), the adam optimizer converge faster to the good update of the weights. For a big amount of epochs the SGD and the ADAM optimizer will have nearly the same efficiency. But on a big amount of data , with complex structure, time is important, and with Adam you are faster on optimization ! 