In [1]:
import numpy as np

In [2]:
from tqdm import tqdm

In [3]:
from keras.datasets import fashion_mnist

2023-03-13 19:08:33.387279: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-13 19:08:33.460427: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-13 19:08:33.477675: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-13 19:08:33.731065: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: li

In [4]:
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

In [5]:
X_train.shape

(60000, 28, 28)

In [6]:
y_train.shape

(60000,)

In [7]:
np.random.seed(42)#sets a seed, used for reproducability

In [8]:
def one_hot(inarray): #converts to one hot encoding
    outarray = np.zeros((inarray.size, inarray.max() + 1))
    outarray[np.arange(inarray.size), inarray] = 1
    return outarray

In [9]:
def Preprocess(X,y):
      
    '''Unrolls X,y, rehsapes into column vectors, one hots y'''
    assert(X.shape[0]==y.shape[0]),"Inputs must contain same number of examples, stored in rows" #checks if same dim
    X_processed=[]
    y_processed=[]
    
    for i in range(np.shape(X)[0]):
        X_processed.append(X_train[i].ravel())
    y_processed=one_hot(y_train).T
    return np.array(X_processed).T,y_processed
        
    

In [10]:
X_train_clean,y_train_clean=Preprocess(X_train,y_train)

In [11]:
Xtest,ytest=Preprocess(X_test,y_test)

In [12]:
def tran_val_split(X,y,split=0.1):
    assert(X.shape[1]==y.shape[1]), "Inputs must contain same number of examples, stored in columns"# as vectors are now stored in cols, do check if no of elemnts are equal
    len_split=int(np.shape(X)[1]*split)
    np.random.shuffle(X)
    np.random.shuffle(y)
    X_val=X[:,:len_split]
    y_val=y[:,:len_split]
    
    X_train=X[:,len_split:]
    y_train=y[:,len_split:]
    
    return (X_train,y_train),(X_val,y_val)
    
        

In [13]:
(Xtrain,ytrain),(Xval,yval)=tran_val_split(X_train_clean,y_train_clean)

In [14]:

def sigmoid(x):
    return np.where(x >= 0, 
                    1 / (1 + np.exp(-x)), 
                    np.exp(x) / (1 + np.exp(x)))
def softmax(x):

    z=x-np.max(x,axis=0) #doing this for numerical stability, prevents over/undeflow
    return np.exp(z)/np.sum(np.exp(z),axis=0)

In [15]:
def get_activation(activation):#maybe getderivatives here iteself ?
    def sigmoid(x):
        return np.where(x >= 0, 
                        1 / (1 + np.exp(-x)), 
                        np.exp(x) / (1 + np.exp(x)))
    def softmax(x):
        z=x-np.max(x,axis=0)
        return np.exp(z)/np.sum(np.exp(z),axis=0)
    if activation=='sigmoid':
        return sigmoid
    elif activation=='softmax':
        return softmax
    elif activation== 'tanh':
        return np.tanh(x)

In [92]:
def get_activation_derivative(activation):#maybe getderivatives here iteself ?
    def sigmoid_d(x):
        sig= np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
        return sig*(1-sig)
    def softmax_d(x):
        z=x-np.max(x,axis=0)
        soft=np.exp(z)/np.sum(np.exp(z),axis=0)
        return soft*(1-soft)
    if activation=='sigmoid':
        return sigmoid_d
    elif activation=='softmax':
        '''
        need to think more, not required for backprop as we look directly at dL/da_l
        But still, for the sake of completeness, and if user wants softmax in the middle?
        d S(x_i) /d x_j= S(x_i)*(kronecker delta_i,j -S(x_j))
        But we care about only dh_k,j/da_k,j So no need to implement d S(x_i) /d x_j
        d S(x_i) /d x_i should suffice
        so we get array of [ d S(x_1) /d x_1, d S(x_2) /d x_2, ....]
        
        For MSE loss after softmax, we need cross terms...
        '''
        
        return softmax_d
    elif activation=='tanh':
        return 1-tanh(x)**2

In [17]:
def get_loss(loss):
    
   
    
        
    def crossentropy(P,Q):
        assert(P.shape==Q.shape), "Inputs must be of same shape"

        return np.sum([-np.dot(P[:,i],np.log2(Q[:,i])) for i in range(P.shape[1])])
    def SE(P,Q):
        assert(P.shape==Q.shape), "Inputs must be of same shape"

        return np.square(P-Q)
    
    if loss=="cross-entropy":
        return crossentropy
    return SE
    
    
      
    
    
    
    

In [18]:
def get_loss_derivative(loss):
    def SE_d(y_in,y_pred_in):
        '''
        derivative of MSE after softmax is used to get probabs from a_L:
        We need indicator because the all terms of y_true are required unlike cross-entropy where only y_pred[l] is required
        Thus transforming the stacked indicator to y_true, not here...
        
        '''

        def indicator(i,j):
                if i==j:
                    return 1
                return 0


        assert(y_in.shape[0]==y_pred_in.shape[0]),"Inputs must contain same number of examples"

        y=y_in.ravel()
        y_pred=y_pred_in.ravel()


        return np.array([
            [2*np.sum([(y_pred[i]-y[i])*y[i]*(indicator(i,j) - y_pred[j]) for i in range(y.shape[0])])]
            for j in range(len(y))
        ])    
   
    
        
    def crossentropy_d(y,y_pred):
        

        return -(y-y_pred)
    
    
    if loss=="cross-entropy":
        return crossentropy_d
    return SE_d
    

In [19]:
# def SE_d(y_in,y_pred_in):
    
#     def indicator(i,j):
#             if i==j:
#                 return 1
#             return 0
        
        
#     assert(y_in.shape[0]==y_pred_in.shape[0]),"Inputs must contain same number of examples"

#     y=y_in.ravel()
#     y_pred=y_pred_in.ravel()


#     return np.array([
#         [2*np.sum([(y_pred[i]-y[i])*y[i]*(indicator(i,j) - y_pred[j]) for i in range(y.shape[0])])]
#         for j in range(len(y))
#     ])

In [20]:
class layer:
    def __init__(self,input_size,output_size,activation='sigmoid'):
            
        ''' 
        output size number of neurons i
        input size j
        
        '''
        self.W=np.random.randn(output_size,input_size) #size ixj
        self.b=np.random.randn(output_size,1)           #size i
        self.a=np.random.randn(output_size,1)           #size i
        self.h=np.random.randn(output_size,1)           #size i
        self.g=get_activation(activation)
        
        self.d_a=np.zeros((output_size,1))
        self.d_h=np.zeros((output_size,1))
        self.d_W=np.zeros((output_size,input_size))
        self.d_b=np.zeros((output_size,1))
        self.d_g=get_activation_derivative(activation)
        
        
        
    def forward(self, inputs):
        self.a=self.b+np.matmul(self.W,inputs)
        self.h=self.g(self.a)
        return self.h
    def reset(self):
        self.d_a=np.zeros(np.shape(self.d_a))
        self.d_h=np.zeros(np.shape(self.d_h))
        self.d_W=np.zeros(np.shape(self.d_W))
        self.d_b=np.zeros(np.shape(self.d_b))
        


In [83]:
class Model:
    def __init__(self,X_size,Y_size,hidden_layer_sizes=[4],hidden_layer_activations=['sigmoid'],loss='cross-entropy'):
        '''
        '''   
        
        self.input_size=X_size
        self.output_size=Y_size
        self.hidden_layer_sizes=hidden_layer_sizes
        self.layers=[]
        
        prev_size=self.input_size
    
        for size,activation in zip(hidden_layer_sizes,hidden_layer_activations):
            self.layers.append(layer(prev_size,size,activation))
            prev_size=size
        self.layers.append(layer(size,self.output_size,'softmax'))
        
        self.loss=get_loss(loss)
        self.loss_d=get_loss_derivative(loss)
        
    def forward(self,x):
        output=x
        # print(output.shape)
        for layer in  self.layers:
            # print('W',layer.W.shape)
            output=layer.forward(output)
            # print(output.shape)   
        return output
    
    def reset(self):
        for layer in  self.layers:
            # resets the dWs
            layer.reset()
        
    
          
            
    def backward(self,x,y,y_pred):
        # self.layers[-1].d_h is not needed as d_h is used to calculate d_a and self.layers[-1].h is softmax
        self.layers[-1].d_a=self.loss_d
            
        
        
        for idx in range(len(self.layers)-1,0,-1): #goes from L->2, for l=1 we do outside
            
            
            #compute gradient wrt parameters
            self.layers[idx].d_W+=np.matmul(self.layers[idx].d_a,np.transpose(self.layers[idx-1].h))
            self.layers[idx].d_b+=self.layers[idx].d_a
            
            #compute gradient wrt layer below -- will help in next layer iter
            self.layers[idx-1].d_h=np.matmul(np.transpose(self.layers[idx].W),self.layers[idx].d_a)
            
            #compute gradient -- element wise multiplivation, derivative of the activation function of layer idx-1
            self.layers[idx-1].d_a=self.layers[idx-1].d_h*self.layers[idx-1].d_g(self.layers[idx-1].a)
                        
        self.layers[0].d_W+=np.matmul(self.layers[0].d_a,np.transpose(x))
        self.layers[0].d_b+=self.layers[0].d_a 
        
    def predict(self,Xtest):
        preds=[]
        for i in range(Xtest.shape[1]):
            preds.append(self.forward(Xtest[:,[i]]))
        
        ytest_pred=np.hstack(preds)
        return ytest_pred
        
        
                    
                
                
            
            
            
        
    
            
            
        
            
            

In [87]:
temp_model=Model(784,10,[32],['sigmoid'])

In [90]:
temp_model.predict(Xtest).shape

  1 / (1 + np.exp(-x)),
  np.exp(x) / (1 + np.exp(x)))
  np.exp(x) / (1 + np.exp(x)))


(10, 10000)

In [None]:
class optimizers(Model):
    def __init__(self,X_size,Y_size,hidden_layer_sizes=[4],hidden_layer_activations=['sigmoid'],loss='cross-entropy'):
        super().__init__(X_size,Y_size,hidden_layer_sizes,hidden_layer_activations,loss)

    def batch_gradient_descent(self,X,Y,eta=1e-3,batch_size=1,max_iters=1000):
        
        '''
        Mini-Batch Gradient Descent
        at batchsize=1, behaves like sgd, batchsize=np.shape(X), behaves as gd
        eta is the learning rate
        '''
        
        
        

        for t in tqdm(range(max_iters)):
            number_points_seen=0
            for i in range(np.shape(X)[1]):
                x=X[:,[i]]
                y=Y[:,[i]]
                y_pred=self.forward(x)
                self.backward(x,y,y_pred)
                number_points_seen+=1
                
                #update if the number of points seen==batch size, or if data ends
                if number_points_seen%batch_size==0 or number_points_seen==np.shape(X)[1]:
                    for layer in self.layers:
                        layer.W=layer.W-eta*layer.d_W
                        layer.b=layer.b-eta*layer.d_b
                    self.reset() #reset grads before new epoch
            
            
    def stochastic_gradient_descent(self,X,Y,eta=1e-3,max_iters=10):
        
        
        '''
        stochastic Gradient Descent
        '''
                
        

        for t in tqdm(range(max_iters)):
            for i in range(np.shape(X)[1]):
                x=X[:,[i]]
                y=Y[:,[i]]
                y_pred=self.forward(x)
                self.backward(x,y,y_pred)
                
                #update
                for layer in self.layers:
                    layer.W=layer.W-eta*layer.d_W
                    layer.b=layer.b-eta*layer.d_b
                self.reset() #reset grads before new update
                    
            
        
        
    def Momentum(self,X,Y,eta=1e-3,beta=0.9,batch_size=100,max_iters=10):
        ''''''
        u_W=[np.zeros(np.shape(layer.d_W)) for layer in self.layers]
        u_b=[np.zeros(np.shape(layer.d_b)) for layer in self.layers]

        



        for t in tqdm(range(max_iters)):
            number_points_seen=0
            for i in range(np.shape(X)[1]):
                x=X[:,[i]]
                y=Y[:,[i]]
                y_pred=self.forward(x)
                self.backward(x,y,y_pred)
                number_points_seen+=1
                



            #update if the number of points seen==batch size, or if data ends
            if number_points_seen%batch_size==0 or number_points_seen==np.shape(X)[1]:
                for i in range(len(self.layers)):
                    layer=self.layers[i]
                    u_W[i]=beta*u_W[i]+layer.d_W
                    u_b[i]=beta*u_b[i]+layer.d_b
                    layer.W=layer.W-eta*u_W[i]
                    layer.b=layer.b-eta*u_b[i]
                self.reset() #reset grads before new update
                


    def rmsprop(self,X,Y,eta=1e-3,beta=0.9,batch_size=100,max_iters=1000):
        ''''''
        v_W=[np.zeros(np.shape(layer.d_W)) for layer in self.layers]
        v_b=[np.zeros(np.shape(layer.d_b)) for layer in self.layers]
        t=0
        epsilon=1e-10



        for t in tqdm(range(max_iters)):
            number_points_seen=0
            for i in range(np.shape(X)[1]):
                x=X[:,[i]]
                y=Y[:,[i]]
                y_pred=self.forward(x)
                self.backward(x,y,y_pred)
                number_points_seen+=1
                


    

            #update if the number of points seen==batch size, or if data ends
            if number_points_seen%batch_size==0 or number_points_seen==np.shape(X)[1]:
                for i in range(len(self.layers)):
                    layer=self.layers[i]
                    v_W[i]=beta*v_W[i]+(1-beta)*layer.d_W**2
                    v_b[i]=beta*v_b[i]+(1-beta)*layer.d_b**2
                    layer.W=layer.W-(eta/np.sqrt(v_W[i]+epsilon))*layer.d_W
                    layer.b=layer.b-(eta/np.sqrt(v_b[i]+epsilon))*layer.d_b
                self.reset() #reset grads before new epoch
                


            
    def Adam(self,X,Y,eta=1e-3,beta1=0.9, beta2=0.999, batch_size=100,max_iters=1000):
        ''''''
        m_W=[np.zeros(np.shape(layer.d_W)) for layer in self.layers]
        v_W=[np.zeros(np.shape(layer.d_W)) for layer in self.layers]
        m_b=[np.zeros(np.shape(layer.d_b)) for layer in self.layers]
        v_b=[np.zeros(np.shape(layer.d_b)) for layer in self.layers]
        t=0
        epsilon=1e-10



        for t in tqdm(range(max_iters)):
            number_points_seen=0
            for i in range(np.shape(X)[1]):
                x=X[:,[i]]
                y=Y[:,[i]]
                y_pred=self.forward(x)
                self.backward(x,y,y_pred)
                number_points_seen+=1
                


    

            #update if the number of points seen==batch size, or if data ends
            if number_points_seen%batch_size==0 or number_points_seen==np.shape(X)[1]:
                for i in range(len(self.layers)):
                    layer=self.layers[i]
                    #updating momentum, velocity
                    m_W[i]=beta1*m_W[i]+(1-beta1)*layer.d_W
                    m_b[i]=beta1*m_b[i]+(1-beta1)*layer.d_b
                    
                    v_W[i]=beta2*v_W[i]+(1-beta2)*layer.d_W**2
                    v_b[i]=beta2*v_b[i]+(1-beta2)*layer.d_b**2
                    
                    m_W_hat=m_W[i]/(1-np.power(beta1,t+1))
                    m_b_hat=m_b[i]/(1-np.power(beta1,t+1))
                    v_W_hat=v_W[i]/(1-np.power(beta2,t+1))
                    v_b_hat=v_b[i]/(1-np.power(beta2,t+1))
                    
                    
                    
                    layer.W=layer.W-(eta*m_W_hat)/(np.sqrt(v_W_hat)+epsilon)
                    layer.b=layer.b-(eta*m_b_hat)/(np.sqrt(v_b_hat)+epsilon)
                self.reset() #reset grads before new epoch
    
    def NAG(self,X,Y,eta=1e-3,beta=0.9,batch_size=100,max_iters=1000):
        ''''''
        m_W=[np.zeros(np.shape(layer.d_W)) for layer in self.layers]
        m_b=[np.zeros(np.shape(layer.d_b)) for layer in self.layers]

        



        for t in tqdm(range(max_iters)):
            number_points_seen=0
            for i in range(np.shape(X)[1]):
                x=X[:,[i]]
                y=Y[:,[i]]
                y_pred=self.forward(x)
                self.backward(x,y,y_pred)
                number_points_seen+=1
                



            #update if the number of points seen==batch size, or if data ends
            if number_points_seen%batch_size==0 or number_points_seen==np.shape(X)[1]:
                for i in range(len(self.layers)):
                    layer=self.layers[i]
                    m_W[i]=beta*m_W[i]+eta*layer.d_W
                    m_b[i]=beta*m_b[i]+eta*layer.d_b

                    
                    layer.W=layer.W-(beta*m_W[i]+eta*layer.d_W[i])
                    layer.b=layer.b-(beta*m_b[i]+eta*layer.d_b[i])
                self.reset() #reset grads before new epoch
                
                
    
    def NAdam(self,X,Y,eta=1e-3,beta1=0.9, beta2=0.999, batch_size=100,max_iters=1000):
        ''''''
        m_W=[np.zeros(np.shape(layer.d_W)) for layer in self.layers]
        v_W=[np.zeros(np.shape(layer.d_W)) for layer in self.layers]
        m_b=[np.zeros(np.shape(layer.d_b)) for layer in self.layers]
        v_b=[np.zeros(np.shape(layer.d_b)) for layer in self.layers]
        t=0
        epsilon=1e-10



        for t in tqdm(range(max_iters)):
            number_points_seen=0
            for i in range(np.shape(X)[1]):
                x=X[:,[i]]
                y=Y[:,[i]]
                y_pred=self.forward(x)
                self.backward(x,y,y_pred)
                number_points_seen+=1
                


    

            #update if the number of points seen==batch size, or if data ends
            if number_points_seen%batch_size==0 or number_points_seen==np.shape(X)[1]:
                for i in range(len(self.layers)):
                    layer=self.layers[i]
                    #updating momentum, velocity
                    m_W[i]=beta1*m_W[i]+(1-beta1)*layer.d_W
                    m_b[i]=beta1*m_b[i]+(1-beta1)*layer.d_b
                    
                    v_W[i]=beta2*v_W[i]+(1-beta2)*layer.d_W**2
                    v_b[i]=beta2*v_b[i]+(1-beta2)*layer.d_b**2
                    
                    m_W_hat=m_W[i]/(1-np.power(beta1,t+1))
                    m_b_hat=m_b[i]/(1-np.power(beta1,t+1))
                    v_W_hat=v_W[i]/(1-np.power(beta2,t+1))
                    v_b_hat=v_b[i]/(1-np.power(beta2,t+1))
                    
                    
                    
                    layer.W=layer.W-(eta/(np.sqrt(v_W_hat)+epsilon))*\
                    (beta1*m_W_hat+((1-beta1)/(1-np.power(beta1,t+1)))*layer.d_W)
                    layer.b=layer.b-(eta/(np.sqrt(v_b_hat)+epsilon))*\
                    (beta1*m_b_hat+((1-beta1)/(1-np.power(beta1,t+1)))*layer.d_b)
                    
                self.reset() #reset grads before new epoch
            

                
        
            



        

        
    

In [None]:
# test_NN=optimizers(784,10,[32,32,32],['sigmoid','sigmoid','sigmoid'])

In [None]:
# # test_NN.stochastic_gradient_descent(X,Y)
# # test_NN.Momentum_based(X,Y)
# # test_NN.batch_gradient_descent(X,Y,batch_size=3,max_iters=10000)
# # test_NN.rmsprop(X,Y)
# # test_NN.adam(X,Y,eta=0.001,batch_size=2,max_iters=10000)
# test_NN.stochastic_gradient_descent(Xtrain,ytrain,eta=0.001,max_iters=10)
# # test_NN.adam(Xtrain,ytrain,eta=0.001,batch_size=64,max_iters=10)
# # test_NN.NAdam(Xtrain,ytrain,eta=0.001,batch_size=64,max_iters=10)




In [None]:
# ypred0=test_NN.forward(Xtrain[:,[0]])
# ypred1=test_NN.forward(Xtrain[:,[1]])
# ypred2=test_NN.forward(Xtrain[:,[2]])
# ypred=np.hstack((ypred0,ypred1,ypred2))

In [None]:
# Y=np.hstack((ytrain[:,[0]],ytrain[:,[1]],ytrain[:,[2]]))

In [None]:
#keep small laerning rate, batch size<shape(X)

In [None]:
a=np.transpose([[1,2,-3,4],[4,-5,6,7],[2,3,4,-5]])

In [None]:
b=np.transpose([[0,0,1,0],[0,1,0,0],[0,0,0,1]])

In [None]:
test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])
test_NN.stochastic_gradient_descent(a,b,eta=0.001,max_iters=100000)
ypred0=test_NN.forward(a[:,[0]])
ypred1=test_NN.forward(a[:,[1]])
ypred2=test_NN.forward(a[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))
print('AE:-',np.sum(np.abs(ypred-b)))
print(ypred,'\n',b)

In [None]:
test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])
test_NN.batch_gradient_descent(a,b,eta=0.001,batch_size=2,max_iters=100000)
ypred0=test_NN.forward(a[:,[0]])
ypred1=test_NN.forward(a[:,[1]])
ypred2=test_NN.forward(a[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))
np.sum(np.abs(ypred-b))

print('AE:-',np.sum(np.abs(ypred-b)))
print(ypred,'\n',b)

In [None]:
test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])
test_NN.Momentum_based(a,b,eta=0.001,batch_size=2,max_iters=100000)
ypred0=test_NN.forward(a[:,[0]])
ypred1=test_NN.forward(a[:,[1]])
ypred2=test_NN.forward(a[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))
np.sum(np.abs(ypred-b))

print('AE:-',np.sum(np.abs(ypred-b)))
print(ypred,'\n',b)

In [None]:
test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])
test_NN.rmsprop(a,b,eta=0.001,batch_size=2,max_iters=100000)
ypred0=test_NN.forward(a[:,[0]])
ypred1=test_NN.forward(a[:,[1]])
ypred2=test_NN.forward(a[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))
np.sum(np.abs(ypred-b))

print('AE:-',np.sum(np.abs(ypred-b)))
print(ypred,'\n',b)

In [None]:
test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])
test_NN.adam(a,b,eta=0.001,batch_size=2,max_iters=100000)
ypred0=test_NN.forward(a[:,[0]])
ypred1=test_NN.forward(a[:,[1]])
ypred2=test_NN.forward(a[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))
np.sum(np.abs(ypred-b))

print('AE:-',np.sum(np.abs(ypred-b)))
print(ypred,'\n',b)

In [None]:
test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])
test_NN.NAG(a,b,eta=0.001,batch_size=2,max_iters=100000)
ypred0=test_NN.forward(a[:,[0]])
ypred1=test_NN.forward(a[:,[1]])
ypred2=test_NN.forward(a[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))
np.sum(np.abs(ypred-b))

print('AE:-',np.sum(np.abs(ypred-b)))
print(ypred,'\n',b)

In [None]:
test_NN=optimizers(4,4,[3,4,3],['sigmoid','sigmoid','sigmoid'])
test_NN.NAdam(a,b,eta=0.001,batch_size=2,max_iters=100000)
ypred0=test_NN.forward(a[:,[0]])
ypred1=test_NN.forward(a[:,[1]])
ypred2=test_NN.forward(a[:,[2]])
ypred=np.hstack((ypred0,ypred1,ypred2))
np.sum(np.abs(ypred-b))

print('AE:-',np.sum(np.abs(ypred-b)))
print(ypred,'\n',b)