In [1]:
import numpy as np

In [2]:
from tqdm import tqdm

In [3]:
def get_activation(activation):
    """Look up an activation function by name.

    Parameters
    ----------
    activation : str
        One of ``'sigmoid'``, ``'softmax'`` or ``'tanh'``.

    Returns
    -------
    callable
        A function mapping a NumPy array of pre-activations to activations.
        ``sigmoid`` and ``tanh`` are element-wise; ``softmax`` normalises
        along axis 0.

    Raises
    ------
    ValueError
        If ``activation`` is not one of the supported names.
    """
    def sigmoid(x):
        # exp(-|x|) never overflows; it equals exp(-x) for x >= 0 and exp(x)
        # for x < 0, so both np.where branches stay finite and warning-free.
        e = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + e), e / (1 + e))

    def softmax(x):
        # Subtracting the per-column maximum leaves the result unchanged but
        # keeps the exponentials from overflowing.
        z = x - np.max(x, axis=0)
        e = np.exp(z)
        return e / np.sum(e, axis=0)

    if activation == 'sigmoid':
        return sigmoid
    elif activation == 'softmax':
        return softmax
    elif activation == 'tanh':
        # Return the function itself; it is called later with the actual input.
        return np.tanh
    raise ValueError("Unknown activation: %r" % (activation,))

In [4]:
def get_activation_derivative(activation):
    """Look up the element-wise derivative of an activation function by name.

    Parameters
    ----------
    activation : str
        One of ``'sigmoid'``, ``'softmax'`` or ``'tanh'``.

    Returns
    -------
    callable
        A function mapping an array of pre-activations ``x`` to an array of
        the same shape holding d g(x_i) / d x_i for each entry.

    Raises
    ------
    ValueError
        If ``activation`` is not one of the supported names.
    """
    def sigmoid_d(x):
        # Overflow-safe sigmoid: exp(-|x|) is always in (0, 1].
        e = np.exp(-np.abs(x))
        sig = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
        return sig * (1 - sig)

    def softmax_d(x):
        # Only the diagonal of the softmax Jacobian is returned:
        #   d S(x_i) / d x_j = S(x_i) * (kronecker_delta(i, j) - S(x_j))
        # so d S(x_i) / d x_i = S(x_i) * (1 - S(x_i)).
        # This is enough when only dh_k,j / da_k,j is needed. A loss that
        # needs the cross terms (e.g. squared error on top of softmax) has to
        # handle them itself.
        z = x - np.max(x, axis=0)
        e = np.exp(z)
        soft = e / np.sum(e, axis=0)
        return soft * (1 - soft)

    def tanh_d(x):
        return 1 - np.tanh(x) ** 2

    if activation == 'sigmoid':
        return sigmoid_d
    elif activation == 'softmax':
        return softmax_d
    elif activation == 'tanh':
        return tanh_d
    raise ValueError("Unknown activation: %r" % (activation,))

In [62]:
def get_loss(loss):
    """Return the loss function selected by ``loss``.

    ``"SE"`` selects the summed squared error; every other value selects
    cross-entropy. Both returned functions take two arrays of identical shape
    and return a scalar.
    """

    def SE(P,Q):
        """Sum of squared element-wise differences between P and Q."""
        assert(P.shape==Q.shape), "Inputs must be of same shape"

        residual=P-Q
        return np.sum(np.square(residual))

    def crossentropy(P,Q):
        """Cross-entropy (log base 2) of Q relative to P, summed over columns."""
        assert(P.shape==Q.shape), "Inputs must be of same shape"

        # One term per column: -<P[:, col], log2(Q[:, col])>
        per_column=[]
        for col in range(P.shape[1]):
            per_column.append(-np.dot(P[:,col],np.log2(Q[:,col])))
        return np.sum(per_column)

    return SE if loss=="SE" else crossentropy
    
    
      
    
    
    
    

In [63]:
def get_loss_derivative(loss):
    """Return the gradient of the loss with respect to the softmax pre-activations.

    Parameters
    ----------
    loss : str
        ``"cross-entropy"`` selects the cross-entropy gradient; every other
        value selects the squared-error gradient.

    Returns
    -------
    callable
        ``f(y, y_pred)`` where ``y`` is the target column and ``y_pred`` the
        softmax output column for one example.
    """
    def SE_d(y_in,y_pred_in):
        '''
        Gradient of sum_i (y_pred_i - y_i)^2 with respect to the softmax
        inputs a, where y_pred = softmax(a).

        Every output contributes to every input through the softmax Jacobian
            d y_pred_i / d a_j = y_pred_i * (delta_ij - y_pred_j)
        so
            dL/da_j = sum_i 2*(y_pred_i - y_i) * y_pred_i * (delta_ij - y_pred_j)
                    = y_pred_j * (g_j - <g, y_pred>),   g = 2*(y_pred - y)

        Returns a column array of shape (n, 1).
        '''
        assert(y_in.shape[0]==y_pred_in.shape[0]),"Inputs must contain same number of examples"

        y=y_in.ravel()
        y_pred=y_pred_in.ravel()

        g=2*(y_pred-y)  # dL/dy_pred
        return (y_pred*(g-np.dot(g,y_pred))).reshape(-1,1)

    def crossentropy_d(y,y_pred):
        '''Gradient of cross-entropy through softmax: y_pred - y.'''
        return -(y-y_pred)

    if loss=="cross-entropy":
        return crossentropy_d
    return SE_d
    

In [64]:
class layer:
    """One fully connected layer: h = g(W @ inputs + b).

    Stores the parameters (W, b), the last pre-activation ``a`` and
    activation ``h``, and gradient buffers ``d_W``, ``d_b``, ``d_a``, ``d_h``.
    """

    def __init__(self,input_size,output_size,activation='sigmoid'):
        '''
        input_size  : number of inputs j feeding the layer
        output_size : number of neurons i in the layer
        activation  : name passed to get_activation / get_activation_derivative
        '''
        column=(output_size,1)

        # Parameters and forward-pass caches, all drawn from a standard normal.
        self.W=np.random.randn(output_size,input_size)  # shape (i, j)
        self.b=np.random.randn(*column)                 # shape (i, 1)
        self.a=np.random.randn(*column)                 # shape (i, 1)
        self.h=np.random.randn(*column)                 # shape (i, 1)
        self.g=get_activation(activation)

        # Gradient buffers start at zero.
        self.d_a=np.zeros(column)
        self.d_h=np.zeros(column)
        self.d_W=np.zeros((output_size,input_size))
        self.d_b=np.zeros(column)
        self.d_g=get_activation_derivative(activation)

    def forward(self, inputs):
        """Compute and cache the pre-activation and activation; return the activation."""
        pre_activation=np.matmul(self.W,inputs)+self.b
        self.a=pre_activation
        self.h=self.g(pre_activation)
        return self.h

    def reset(self):
        """Replace every gradient buffer with zeros of the same shape."""
        for buffer_name in ('d_a','d_h','d_W','d_b'):
            shape=np.shape(getattr(self,buffer_name))
            setattr(self,buffer_name,np.zeros(shape))
        


In [65]:
class Model:
    """Feed-forward network: hidden ``layer`` objects followed by a softmax output layer.

    Inputs and targets are column-oriented: one example is a column of
    shape (features, 1).
    """

    def __init__(self,X_size,Y_size,hidden_layer_sizes=[4],hidden_layer_activations=['sigmoid'],loss='cross-entropy',lamdba_m=0):
        '''
        X_size                   : number of input features
        Y_size                   : number of outputs of the final softmax layer
        hidden_layer_sizes       : neurons per hidden layer, in order
        hidden_layer_activations : activation name per hidden layer (zipped with
                                   hidden_layer_sizes, so extra entries are ignored)
        loss                     : name passed to get_loss / get_loss_derivative
        lamdba_m                 : L2 coefficient added to every weight gradient in
                                   backward(); callers pass lambda / batch_size
        '''
        self.input_size=X_size
        self.output_size=Y_size
        self.hidden_layer_sizes=hidden_layer_sizes
        self.layers=[]

        prev_size=self.input_size
        for size,activation in zip(hidden_layer_sizes,hidden_layer_activations):
            self.layers.append(layer(prev_size,size,activation))
            prev_size=size
        # Wire the output layer to prev_size, not the loop variable: with no
        # hidden layers the loop variable is never bound, and prev_size still
        # correctly holds the input size.
        self.layers.append(layer(prev_size,self.output_size,'softmax'))

        self.loss=get_loss(loss)  # without regularization term
        self.loss_d=get_loss_derivative(loss)
        self.lamdba_m=lamdba_m

    def forward(self,x):
        """Propagate ``x`` through every layer in order and return the final output."""
        output=x
        for lyr in self.layers:
            output=lyr.forward(output)
        return output

    def reset(self):
        """Zero the gradient buffers of every layer."""
        for lyr in self.layers:
            lyr.reset()

    def backward(self,x,y,y_pred):
        """Accumulate parameter gradients for one example.

        Must be called after ``forward(x)`` so each layer's cached ``a`` and
        ``h`` correspond to ``x``. ``d_W`` and ``d_b`` are added to (not
        overwritten), so repeated calls sum gradients until ``reset()``.
        """
        layers=self.layers

        # The loss derivative is used directly as the gradient w.r.t. the
        # output layer's pre-activation, so that layer's d_h is never needed.
        layers[-1].d_a=self.loss_d(y,y_pred)

        for idx in range(len(layers)-1,0,-1):  # layers L..2; layer 1 is handled below
            current,below=layers[idx],layers[idx-1]

            # gradient w.r.t. this layer's parameters (plus L2 term)
            current.d_W+=np.matmul(current.d_a,np.transpose(below.h))+self.lamdba_m*current.W
            current.d_b+=current.d_a

            # gradient w.r.t. the activation of the layer below
            below.d_h=np.matmul(np.transpose(current.W),current.d_a)

            # element-wise product with the derivative of the lower layer's activation
            below.d_a=below.d_h*below.d_g(below.a)

        first=layers[0]
        first.d_W+=np.matmul(first.d_a,np.transpose(x))+self.lamdba_m*first.W
        first.d_b+=first.d_a

    def predict(self,Xtest):
        """Run ``forward`` on each column of ``Xtest`` and stack the outputs column-wise."""
        preds=[]
        for i in range(Xtest.shape[1]):
            preds.append(self.forward(Xtest[:,[i]]))

        ytest_pred=np.hstack(preds)
        return ytest_pred
    
        
        
                    
                
                
            
            
            
        
    
            
            
        
            
            

In [66]:
# Smoke test: 4 inputs, 4 outputs, three sigmoid hidden layers of 3, 4 and 3 neurons.
temp_model=Model(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])

In [67]:
class optimizers(Model):
    """A ``Model`` that carries its own training loops.

    Every training method walks the columns of ``X``/``Y`` one example at a
    time, accumulates gradients with ``backward`` and applies an update after
    every ``batch_size`` examples (and once more at the end of the data if a
    partial batch remains). Because ``backward`` adds into the gradient
    buffers, each update uses the sum of the per-example gradients in the batch.
    Each method runs exactly ``max_iters`` passes over the data.
    """

    def __init__(self,X_size,Y_size,hidden_layer_sizes=[4],hidden_layer_activations=['sigmoid']):
        """Build the network; loss and regularisation keep ``Model``'s defaults."""
        super().__init__(X_size,Y_size,hidden_layer_sizes,hidden_layer_activations)

    def _run_epochs(self,X,Y,batch_size,max_iters,apply_update):
        """Shared mini-batch loop.

        ``apply_update(step)`` is called once per completed batch with the
        1-based count of updates so far; gradients are reset right after it.
        """
        n_points=np.shape(X)[1]
        step=0
        for _ in range(max_iters):
            for i in range(n_points):
                x=X[:,[i]]
                y=Y[:,[i]]
                self.backward(x,y,self.forward(x))
                seen=i+1

                # update when a batch is complete, or when the data ends
                if seen%batch_size==0 or seen==n_points:
                    step+=1
                    apply_update(step)
                    self.reset()  # start the next batch from zero gradients

    def _zero_state(self):
        """Return (W-shaped zeros per layer, b-shaped zeros per layer)."""
        zeros_W=[np.zeros(np.shape(lyr.d_W)) for lyr in self.layers]
        zeros_b=[np.zeros(np.shape(lyr.d_b)) for lyr in self.layers]
        return zeros_W,zeros_b

    def batch_gradient_descent(self,X,Y,eta=1,batch_size=1,max_iters=1000):
        '''
        Mini-batch gradient descent.
        batch_size=1 behaves like SGD; batch_size=np.shape(X)[1] behaves like
        full-batch gradient descent. eta is the learning rate.
        '''
        def update(_step):
            for lyr in self.layers:
                lyr.W=lyr.W-eta*lyr.d_W
                lyr.b=lyr.b-eta*lyr.d_b

        self._run_epochs(X,Y,batch_size,max_iters,update)

    def stochastic_gradient_descent(self,X,Y,eta=1,max_iters=1000):
        '''
        Stochastic gradient descent: one update per example.
        Gradients are reset after every update so each step uses only the
        current example's gradient.
        '''
        self.batch_gradient_descent(X,Y,eta=eta,batch_size=1,max_iters=max_iters)

    def Momentum_based(self,X,Y,eta=1,beta=0.9,batch_size=100,max_iters=1000):
        '''Gradient descent with momentum: u = beta*u + grad; param -= eta*u.'''
        u_W,u_b=self._zero_state()

        def update(_step):
            for i,lyr in enumerate(self.layers):
                u_W[i]=beta*u_W[i]+lyr.d_W
                u_b[i]=beta*u_b[i]+lyr.d_b
                lyr.W=lyr.W-eta*u_W[i]
                lyr.b=lyr.b-eta*u_b[i]

        self._run_epochs(X,Y,batch_size,max_iters,update)

    def rmsprop(self,X,Y,eta=1,beta=0.9,batch_size=100,max_iters=1000):
        '''RMSProp: scale each step by a running average of squared gradients.'''
        v_W,v_b=self._zero_state()
        epsilon=1e-10

        def update(_step):
            for i,lyr in enumerate(self.layers):
                v_W[i]=beta*v_W[i]+(1-beta)*lyr.d_W**2
                v_b[i]=beta*v_b[i]+(1-beta)*lyr.d_b**2
                lyr.W=lyr.W-(eta/np.sqrt(v_W[i]+epsilon))*lyr.d_W
                lyr.b=lyr.b-(eta/np.sqrt(v_b[i]+epsilon))*lyr.d_b

        self._run_epochs(X,Y,batch_size,max_iters,update)

    def adam(self,X,Y,eta=1,beta1=0.9, beta2=0.999, batch_size=100,max_iters=1000):
        '''Adam with bias correction indexed by the number of updates applied.'''
        m_W,m_b=self._zero_state()
        v_W,v_b=self._zero_state()
        epsilon=1e-10

        def update(step):
            correction1=1-np.power(beta1,step)
            correction2=1-np.power(beta2,step)
            for i,lyr in enumerate(self.layers):
                # first and second moment estimates
                m_W[i]=beta1*m_W[i]+(1-beta1)*lyr.d_W
                m_b[i]=beta1*m_b[i]+(1-beta1)*lyr.d_b
                v_W[i]=beta2*v_W[i]+(1-beta2)*lyr.d_W**2
                v_b[i]=beta2*v_b[i]+(1-beta2)*lyr.d_b**2

                m_W_hat=m_W[i]/correction1
                m_b_hat=m_b[i]/correction1
                v_W_hat=v_W[i]/correction2
                v_b_hat=v_b[i]/correction2

                lyr.W=lyr.W-(eta*m_W_hat)/(np.sqrt(v_W_hat)+epsilon)
                lyr.b=lyr.b-(eta*m_b_hat)/(np.sqrt(v_b_hat)+epsilon)

        self._run_epochs(X,Y,batch_size,max_iters,update)

    def NAG(self,X,Y,eta=1,beta=0.9,batch_size=100,max_iters=1000):
        '''
        Nesterov-style momentum in its look-ahead-free form:
            m = beta*m + eta*grad;  param -= beta*m + eta*grad
        '''
        m_W,m_b=self._zero_state()

        def update(_step):
            for i,lyr in enumerate(self.layers):
                m_W[i]=beta*m_W[i]+eta*lyr.d_W
                m_b[i]=beta*m_b[i]+eta*lyr.d_b

                # use the whole gradient array, not row i of it
                lyr.W=lyr.W-(beta*m_W[i]+eta*lyr.d_W)
                lyr.b=lyr.b-(beta*m_b[i]+eta*lyr.d_b)

        self._run_epochs(X,Y,batch_size,max_iters,update)

    def NAdam(self,X,Y,eta=1,beta=0.9,batch_size=100,max_iters=1000,beta2=0.999):
        '''
        Adam with a Nesterov look-ahead on the first moment.
        ``beta`` is the first-moment decay and ``beta2`` the second-moment decay.
        '''
        m_W,m_b=self._zero_state()
        v_W,v_b=self._zero_state()
        epsilon=1e-10

        def update(step):
            correction1=1-np.power(beta,step)
            correction2=1-np.power(beta2,step)
            for i,lyr in enumerate(self.layers):
                m_W[i]=beta*m_W[i]+(1-beta)*lyr.d_W
                m_b[i]=beta*m_b[i]+(1-beta)*lyr.d_b
                v_W[i]=beta2*v_W[i]+(1-beta2)*lyr.d_W**2
                v_b[i]=beta2*v_b[i]+(1-beta2)*lyr.d_b**2

                m_W_hat=m_W[i]/correction1
                m_b_hat=m_b[i]/correction1
                v_W_hat=v_W[i]/correction2
                v_b_hat=v_b[i]/correction2

                lyr.W=lyr.W-(eta/(np.sqrt(v_W_hat)+epsilon))*\
                    (beta*m_W_hat+((1-beta)/correction1)*lyr.d_W)
                lyr.b=lyr.b-(eta/(np.sqrt(v_b_hat)+epsilon))*\
                    (beta*m_b_hat+((1-beta)/correction1)*lyr.d_b)

        self._run_epochs(X,Y,batch_size,max_iters,update)
                

                
        
            



        

        
    

In [95]:
class optimizers_beta:
    """Training driver that owns a ``Model`` (``self.model``) instead of subclassing it.

    Every training method walks the columns of ``X``/``Y`` one example at a
    time, accumulates gradients with ``self.model.backward`` and applies an
    update after every ``batch_size`` examples (and once more at the end of
    the data if a partial batch remains).
    """

    def __init__(self,X_size,Y_size,hidden_layer_sizes=[4],hidden_layer_activations=['sigmoid'],
                 loss='cross-entropy',optimizer='adam',lamdba=0,batch_size=5,epochs=10,eta=1e-3):
        """Store the hyper-parameters and build the model.

        The model receives ``lamdba / batch_size`` as its L2 coefficient.
        ``optimizer`` is only stored; no method in this class reads it.
        """
        self.batch_size=batch_size
        self.epochs=epochs
        self.train_loss=0
        self.val_loss=0
        self.model=Model(X_size,Y_size,hidden_layer_sizes,hidden_layer_activations,loss,lamdba/self.batch_size)
        self.learning_rate=eta
        self.optimizer=optimizer

    def _run_epoch(self,X,Y,batch_size,apply_update,step=0):
        """One pass over the data; returns the running update count.

        ``apply_update(step)`` is called once per completed batch with the
        1-based count of updates so far; gradients are reset right after it.
        """
        model=self.model
        n_points=np.shape(X)[1]
        for i in range(n_points):
            x=X[:,[i]]
            y=Y[:,[i]]
            model.backward(x,y,model.forward(x))
            seen=i+1

            # update when a batch is complete, or when the data ends
            if seen%batch_size==0 or seen==n_points:
                step+=1
                apply_update(step)
                model.reset()  # start the next batch from zero gradients
        return step

    def _zero_state(self):
        """Return (W-shaped zeros per layer, b-shaped zeros per layer)."""
        zeros_W=[np.zeros(np.shape(lyr.d_W)) for lyr in self.model.layers]
        zeros_b=[np.zeros(np.shape(lyr.d_b)) for lyr in self.model.layers]
        return zeros_W,zeros_b

    def batch_gradient_descent(self,traindat,testdat,verbose=False):
        '''
        Mini-batch gradient descent using self.batch_size, self.epochs and
        self.learning_rate.

        traindat : (X, Y) training pair, one example per column
        testdat  : (Xval, Yval) validation pair, one example per column
        verbose  : when True, report the losses and the L2 penalty after each epoch

        After every epoch self.train_loss and self.val_loss hold the
        (unregularised) loss on the respective data set.
        '''
        X,Y=traindat
        Xval,Yval=testdat
        eta=self.learning_rate

        def update(_step):
            for lyr in self.model.layers:
                lyr.W=lyr.W-eta*lyr.d_W
                lyr.b=lyr.b-eta*lyr.d_b

        step=0
        for t in tqdm(range(self.epochs)):
            step=self._run_epoch(X,Y,self.batch_size,update,step)

            # end of epoch
            self.train_loss=self.model.loss(Y,self.model.predict(X))
            self.val_loss=self.model.loss(Yval,self.model.predict(Xval))
            if verbose:
                regularization=1/2*self.model.lamdba_m*np.sum([np.sum(lyr.W**2) for lyr in self.model.layers])
                tqdm.write(f"epoch {t+1}: train_loss={self.train_loss:.6g} "
                           f"val_loss={self.val_loss:.6g} l2_penalty={regularization:.6g}")

    def stochastic_gradient_descent(self,X,Y,eta=1e-3,max_iters=10):
        '''Stochastic gradient descent: one update (and gradient reset) per example.'''
        def update(_step):
            for lyr in self.model.layers:
                lyr.W=lyr.W-eta*lyr.d_W
                lyr.b=lyr.b-eta*lyr.d_b

        step=0
        for _ in tqdm(range(max_iters)):
            step=self._run_epoch(X,Y,1,update,step)

    def Momentum(self,X,Y,eta=1e-3,beta=0.9,batch_size=100,max_iters=10):
        '''Gradient descent with momentum: u = beta*u + grad; param -= eta*u.'''
        u_W,u_b=self._zero_state()

        def update(_step):
            for i,lyr in enumerate(self.model.layers):
                u_W[i]=beta*u_W[i]+lyr.d_W
                u_b[i]=beta*u_b[i]+lyr.d_b
                lyr.W=lyr.W-eta*u_W[i]
                lyr.b=lyr.b-eta*u_b[i]

        step=0
        for _ in tqdm(range(max_iters)):
            step=self._run_epoch(X,Y,batch_size,update,step)

    def rmsprop(self,X,Y,eta=1e-3,beta=0.9,batch_size=100,max_iters=1000):
        '''RMSProp: scale each step by a running average of squared gradients.'''
        v_W,v_b=self._zero_state()
        epsilon=1e-10

        def update(_step):
            for i,lyr in enumerate(self.model.layers):
                v_W[i]=beta*v_W[i]+(1-beta)*lyr.d_W**2
                v_b[i]=beta*v_b[i]+(1-beta)*lyr.d_b**2
                lyr.W=lyr.W-(eta/np.sqrt(v_W[i]+epsilon))*lyr.d_W
                lyr.b=lyr.b-(eta/np.sqrt(v_b[i]+epsilon))*lyr.d_b

        step=0
        for _ in tqdm(range(max_iters)):
            step=self._run_epoch(X,Y,batch_size,update,step)

    def Adam(self,X,Y,eta=1e-3,beta1=0.9, beta2=0.999, batch_size=100,max_iters=1000):
        '''Adam with bias correction indexed by the number of updates applied.'''
        m_W,m_b=self._zero_state()
        v_W,v_b=self._zero_state()
        epsilon=1e-10

        def update(step):
            correction1=1-np.power(beta1,step)
            correction2=1-np.power(beta2,step)
            for i,lyr in enumerate(self.model.layers):
                # first and second moment estimates
                m_W[i]=beta1*m_W[i]+(1-beta1)*lyr.d_W
                m_b[i]=beta1*m_b[i]+(1-beta1)*lyr.d_b
                v_W[i]=beta2*v_W[i]+(1-beta2)*lyr.d_W**2
                v_b[i]=beta2*v_b[i]+(1-beta2)*lyr.d_b**2

                m_W_hat=m_W[i]/correction1
                m_b_hat=m_b[i]/correction1
                v_W_hat=v_W[i]/correction2
                v_b_hat=v_b[i]/correction2

                lyr.W=lyr.W-(eta*m_W_hat)/(np.sqrt(v_W_hat)+epsilon)
                lyr.b=lyr.b-(eta*m_b_hat)/(np.sqrt(v_b_hat)+epsilon)

        step=0
        for _ in tqdm(range(max_iters)):
            step=self._run_epoch(X,Y,batch_size,update,step)

    def NAG(self,X,Y,eta=1e-3,beta=0.9,batch_size=100,max_iters=1000):
        '''
        Nesterov-style momentum in its look-ahead-free form:
            m = beta*m + eta*grad;  param -= beta*m + eta*grad
        '''
        m_W,m_b=self._zero_state()

        def update(_step):
            for i,lyr in enumerate(self.model.layers):
                m_W[i]=beta*m_W[i]+eta*lyr.d_W
                m_b[i]=beta*m_b[i]+eta*lyr.d_b

                # use the whole gradient array, not row i of it
                lyr.W=lyr.W-(beta*m_W[i]+eta*lyr.d_W)
                lyr.b=lyr.b-(beta*m_b[i]+eta*lyr.d_b)

        step=0
        for _ in tqdm(range(max_iters)):
            step=self._run_epoch(X,Y,batch_size,update,step)

    def NAdam(self,X,Y,eta=1e-3,beta1=0.9, beta2=0.999, batch_size=100,max_iters=1000):
        '''Adam with a Nesterov look-ahead on the first moment.'''
        m_W,m_b=self._zero_state()
        v_W,v_b=self._zero_state()
        epsilon=1e-10

        def update(step):
            correction1=1-np.power(beta1,step)
            correction2=1-np.power(beta2,step)
            for i,lyr in enumerate(self.model.layers):
                # first and second moment estimates
                m_W[i]=beta1*m_W[i]+(1-beta1)*lyr.d_W
                m_b[i]=beta1*m_b[i]+(1-beta1)*lyr.d_b
                v_W[i]=beta2*v_W[i]+(1-beta2)*lyr.d_W**2
                v_b[i]=beta2*v_b[i]+(1-beta2)*lyr.d_b**2

                m_W_hat=m_W[i]/correction1
                m_b_hat=m_b[i]/correction1
                v_W_hat=v_W[i]/correction2
                v_b_hat=v_b[i]/correction2

                lyr.W=lyr.W-(eta/(np.sqrt(v_W_hat)+epsilon))*\
                    (beta1*m_W_hat+((1-beta1)/correction1)*lyr.d_W)
                lyr.b=lyr.b-(eta/(np.sqrt(v_b_hat)+epsilon))*\
                    (beta1*m_b_hat+((1-beta1)/correction1)*lyr.d_b)

        step=0
        for _ in tqdm(range(max_iters)):
            step=self._run_epoch(X,Y,batch_size,update,step)
            

                
        
            



        

        
    

In [69]:
class optimizers_alpha:
    """Minimal training driver that owns a ``Model`` and trains it with mini-batch gradient descent."""

    def __init__(self,X_size,Y_size,hidden_layer_sizes=[4],hidden_layer_activations=['sigmoid'],
                 loss='cross-entropy',optimizer='adam',lamdba=0,batch_size=5,epochs=10,eta=1e-3):
        """Store the hyper-parameters and build the model.

        The model receives ``lamdba / batch_size`` as its L2 coefficient.
        ``optimizer`` is only stored; no method in this class reads it.
        """
        self.batch_size=batch_size
        self.epochs=epochs
        self.train_loss=0
        self.val_loss=0
        # Build the model once with the requested configuration, rather than
        # building a default model and re-running __init__ on it.
        self.model=Model(X_size,Y_size,hidden_layer_sizes,hidden_layer_activations,loss,lamdba/self.batch_size)
        self.learning_rate=eta
        self.optimizer=optimizer

    def batch_gradient_descent(self,X,Y):
        '''
        Mini-batch gradient descent for self.epochs passes over the data.
        batch_size=1 behaves like SGD; batch_size=np.shape(X)[1] behaves like
        full-batch gradient descent. The learning rate is self.learning_rate.
        X and Y hold one example per column.
        '''
        batch_size=self.batch_size
        eta=self.learning_rate
        n_points=np.shape(X)[1]

        for _ in range(self.epochs):
            for i in range(n_points):
                x=X[:,[i]]
                y=Y[:,[i]]
                y_pred=self.model.forward(x)
                self.model.backward(x,y,y_pred)
                seen=i+1

                # update when a batch is complete, or when the data ends
                if seen%batch_size==0 or seen==n_points:
                    for lyr in self.model.layers:
                        lyr.W=lyr.W-eta*lyr.d_W
                        lyr.b=lyr.b-eta*lyr.d_b
                    self.model.reset()  # start the next batch from zero gradients

In [70]:
# 'ay' is not a recognised loss name; get_loss returns cross-entropy for anything other than "SE".
ce=get_loss('ay')

In [71]:
# NOTE(review): Y and ypred are first assigned in a later cell of this document, so this
# cell only works with leftover kernel state and fails on Restart & Run All.
ce(Y,ypred)

0.0003623997906173437

In [72]:
# Toy data set: three examples stored as columns, 4 features each.
X=np.transpose([[1,2,-3,4],[4,-5,6,7],[2,3,4,-5]])

# One-hot targets, one column per example.
Y=np.transpose([[0,0,1,0],[0,1,0,0],[0,0,0,1]])

test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])

# Exactly one training call is active; the others are alternative optimizers to try.
# test_NN.stochastic_gradient_descent(X,Y)
# test_NN.Momentum_based(X,Y)
test_NN.batch_gradient_descent(X,Y,batch_size=3,max_iters=10000)
# test_NN.rmsprop(X,Y)
# test_NN.adam(X,Y,eta=0.001,batch_size=2,max_iters=10000)
# test_NN.adam(X,Y,eta=0.001,batch_size=2,max_iters=10000)
# test_NN.NAG(X,Y,eta=0.001,beta=0.9,batch_size=2,max_iters=10000)






# Predict each training column separately and stack the results column-wise.
ypred0=test_NN.forward(X[:,[0]])
ypred1=test_NN.forward(X[:,[1]])
ypred2=test_NN.forward(X[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))



# Cross-entropy on the training data; `ce` comes from an earlier cell.
ce(Y,ypred)


0.000570604765189598

In [79]:
# Fresh, untrained network with randomly initialised weights (replaces any earlier test_NN).
test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])

In [80]:
# Same architecture wrapped in the composition-based driver; eta keeps its 1e-3 default here.
test_NN_beta=optimizers_beta(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'],batch_size=3,epochs=10000)

In [91]:
# Inspect the inherited predict method (the recorded output is the bound Model.predict).
test_NN.predict

<bound method Model.predict of <__main__.optimizers object at 0x7fa80887b7c0>>

In [89]:
# Inspect the wrapped model (the recorded output is the Model instance repr).
test_NN_beta.model

<__main__.Model at 0x7fa80887b130>

In [99]:
# Toy data set: three examples stored as columns, 4 features each.
X=np.transpose([[1,2,-3,4],[4,-5,6,7],[2,3,4,-5]])

# One-hot targets, one column per example.
Y=np.transpose([[0,0,1,0],[0,1,0,0],[0,0,0,1]])

test_NN_beta=optimizers_beta(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'],batch_size=3,epochs=10000,eta=1)


# The training data doubles as the validation data.
test_NN_beta.batch_gradient_descent((X,Y),(X,Y))


# Predict each training column separately and stack the results column-wise.
ypred0=test_NN_beta.model.forward(X[:,[0]])
ypred1=test_NN_beta.model.forward(X[:,[1]])
ypred2=test_NN_beta.model.forward(X[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))

yy=test_NN_beta.model.predict(X)

# Compare the arrays element-wise; `a.all()==b.all()` only compares two booleans.
assert np.allclose(yy,ypred), "predict() disagrees with column-by-column forward()"

# Cross-entropy on the training data; `ce` comes from an earlier cell.
ce(Y,ypred)

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 2690.48it/s]


0.00043434702379719557

In [78]:
# Toy data set: three examples stored as columns, 4 features each.
X=np.transpose([[1,2,-3,4],[4,-5,6,7],[2,3,4,-5]])

# One-hot targets, one column per example.
Y=np.transpose([[0,0,1,0],[0,1,0,0],[0,0,0,1]])

# eta is not passed, so the driver uses its default learning rate of 1e-3.
test_NN_alpha=optimizers_alpha(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'],batch_size=3,epochs=10000)


test_NN_alpha.batch_gradient_descent(X,Y)







# Predict each training column separately and stack the results column-wise.
ypred0=test_NN_alpha.model.forward(X[:,[0]])
ypred1=test_NN_alpha.model.forward(X[:,[1]])
ypred2=test_NN_alpha.model.forward(X[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))

ypred

# Cross-entropy on the training data; `ce` comes from an earlier cell.
# NOTE(review): the recorded loss is far higher than in the eta=1 run above, presumably
# because of the much smaller learning rate — confirm.
ce(Y,ypred)

3.6831427213860777