In [7]:
# from keras.datasets import fashion_mnist



In [8]:
import numpy as np

In [9]:
# (x_train, y_train), (x_test, y_test)=fashion_mnist.load_data()

In [124]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def softmax(x):
    """Softmax along axis 0 (each column is normalised independently for 2-D input)."""
    # Subtracting the per-column maximum leaves the result unchanged but keeps
    # np.exp from overflowing/underflowing on large-magnitude inputs.
    shifted = x - np.max(x, axis=0)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=0)


In [90]:
def get_activation(activation):
    """Return the activation function registered under the name ``activation``.

    Supported names are 'sigmoid', 'softmax' and 'tanh'. The returned object
    is a callable taking an array ``x`` and returning an array of the same
    shape. Softmax normalises along axis 0.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    # TODO: consider returning the matching derivative from here as well.
    def sigmoid(x):
        return 1/(1+np.exp(-x))
    def softmax(x):
        # Shift by the column maximum so np.exp cannot overflow.
        z=x-np.max(x,axis=0)
        exp_z=np.exp(z)
        return exp_z/np.sum(exp_z,axis=0)
    if activation=='sigmoid':
        return sigmoid
    elif activation=='softmax':
        return softmax
    elif activation== 'tanh':
        # Return the function itself; it is applied later by the caller.
        return np.tanh
    raise ValueError(
        "unknown activation %r; expected 'sigmoid', 'softmax' or 'tanh'" % (activation,)
    )

In [18]:
def get_activation_derivative(activation):
    """Return the element-wise derivative of the activation named ``activation``.

    Supported names are 'sigmoid', 'softmax' and 'tanh'. The returned callable
    takes the pre-activation array ``x`` and returns an array of the same
    shape holding d g(x_i) / d x_i for every element.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    def sigmoid_d(x):
        sig=1/(1+np.exp(-x))
        return sig*(1-sig)
    def softmax_d(x):
        # Same max-shift as the forward softmax: without it np.exp overflows
        # for large inputs and the ratio becomes inf/inf = nan.
        z=x-np.max(x,axis=0)
        exp_z=np.exp(z)
        soft=exp_z/np.sum(exp_z,axis=0)
        return soft*(1-soft)
    def tanh_d(x):
        return 1-np.tanh(x)**2
    if activation=='sigmoid':
        return sigmoid_d
    elif activation=='softmax':
        # Backprop through a softmax *output* layer does not need this, because
        # the gradient is taken directly with respect to the pre-activation a_L.
        # It is provided for completeness, e.g. softmax used in a hidden layer.
        # The full Jacobian is d S(x_i)/d x_j = S(x_i) * (delta_ij - S(x_j));
        # only its diagonal, [d S(x_1)/d x_1, d S(x_2)/d x_2, ...], is returned.
        # NOTE(review): for a softmax hidden layer the off-diagonal terms also
        # contribute to dL/da, so using the diagonal alone is an approximation.
        return softmax_d
    elif activation=='tanh':
        return tanh_d
    raise ValueError(
        "unknown activation %r; expected 'sigmoid', 'softmax' or 'tanh'" % (activation,)
    )

In [13]:
class layer:
    """One fully connected layer: a = b + W @ inputs, h = g(a)."""

    def __init__(self,input_size,output_size,activation='sigmoid'):
        '''
        input_size  (j): number of values fed into the layer
        output_size (i): number of neurons in the layer
        activation     : name passed to get_activation / get_activation_derivative
        '''
        weight_shape = (output_size, input_size)   # i x j
        column_shape = (output_size, 1)            # i x 1

        # Parameters, followed by the forward-pass caches. Everything starts
        # as standard-normal noise; a and h are overwritten by forward().
        self.W = np.random.randn(*weight_shape)
        self.b = np.random.randn(*column_shape)
        self.a = np.random.randn(*column_shape)
        self.h = np.random.randn(*column_shape)
        self.g = get_activation(activation)

        # Gradient slots mirroring the attributes above; they are random
        # placeholders until something assigns real gradients to them.
        self.d_a = np.random.randn(*column_shape)
        self.d_h = np.random.randn(*column_shape)
        self.d_W = np.random.randn(*weight_shape)
        self.d_b = np.random.randn(*column_shape)
        self.d_g = get_activation_derivative(activation)

    def forward(self, inputs):
        """Compute and cache the pre-activation (a) and activation (h); return h."""
        pre_activation = self.b + np.matmul(self.W, inputs)
        self.a = pre_activation
        self.h = self.g(pre_activation)
        return self.h


In [14]:
# Two equal but independent scratch lists for trying out iteration patterns.
temp = list(range(1, 5))
temp1 = list(range(1, 5))

In [15]:
# Step through both lists in lockstep and show each aligned pair.
for i, j in zip(temp, temp1):
    print(f"{i} {j}")

1 1
2 2
3 3
4 4


In [16]:
# Walk from the last index down to 1, showing each element with its predecessor.
for i in reversed(range(1, len(temp))):
    print(temp[i], temp[i - 1])

4 3
3 2
2 1


In [186]:
class NeuralNetwork:
    """Feed-forward classifier: hidden ``layer`` objects followed by a softmax output layer.

    Data is laid out one sample per column: inputs are (X_size, n) and
    targets / predictions are (Y_size, n).
    """

    def __init__(self,X_size,Y_size,hidden_layer_sizes=[4],hidden_layer_activations=['sigmoid']):
        '''
        Build the layer stack.

        X_size: number of input features.
        Y_size: number of output units of the final softmax layer.
        hidden_layer_sizes: neurons per hidden layer, in order; may be empty.
        hidden_layer_activations: activation name for each hidden layer.
            Sizes and activations are paired with zip, so surplus entries in
            the longer of the two lists are silently ignored.

        The list defaults are only read, never mutated.
        '''
        self.input_size=X_size
        self.output_size=Y_size
        self.hidden_layer_sizes=hidden_layer_sizes
        self.layers=[]

        prev_size=self.input_size
        for size,activation in zip(hidden_layer_sizes,hidden_layer_activations):
            self.layers.append(layer(prev_size,size,activation))
            prev_size=size
        # Use prev_size rather than the loop variable: with no hidden layers the
        # loop never binds `size`, and the output layer must read from the input.
        self.layers.append(layer(prev_size,self.output_size,'softmax'))

    def forward(self,x):
        """Run ``x`` through every layer in order and return the last layer's output."""
        output=x
        for current in self.layers:
            output=current.forward(output)
        return output

    def backward(self,x,y,y_pred):
        """Fill d_W / d_b (and d_a / d_h) on every layer for the batch ``x``.

        Must be called right after ``forward(x)``, because it reads the ``a``
        and ``h`` values each layer cached during that pass.

        x: inputs, one sample per column.
        y: targets with the same shape as ``y_pred``.
        y_pred: the output of ``forward(x)``.

        Gradients are summed (not averaged) over the columns of the batch.
        """
        # Output layer: the gradient with respect to the pre-activation is taken
        # directly as (y_pred - y), the softmax + cross-entropy form, so the
        # output layer's d_h is never needed.
        self.layers[-1].d_a=-(y-y_pred)

        for idx in range(len(self.layers)-1,0,-1): # layers L .. 2; the first layer is handled below
            upper=self.layers[idx]
            lower=self.layers[idx-1]

            # Gradients with respect to this layer's parameters. The matmul
            # sums over samples; d_b is summed explicitly so it keeps the
            # (units, 1) shape of b even when the batch has several columns.
            upper.d_W=np.matmul(upper.d_a,np.transpose(lower.h))
            upper.d_b=np.sum(upper.d_a,axis=1,keepdims=True)

            # Gradient with respect to the output of the layer below ...
            lower.d_h=np.matmul(np.transpose(upper.W),upper.d_a)

            # ... and, through that layer's activation derivative (element-wise
            # product), with respect to its pre-activation.
            lower.d_a=lower.d_h*lower.d_g(lower.a)

        # The first layer's "layer below" is the raw input.
        first=self.layers[0]
        first.d_W=np.matmul(first.d_a,np.transpose(x))
        first.d_b=np.sum(first.d_a,axis=1,keepdims=True)

    def _apply_gradients(self,eta):
        """Take one gradient-descent step on every layer using its stored d_W / d_b."""
        for current in self.layers:
            current.W=current.W-eta*current.d_W
            current.b=current.b-eta*current.d_b

    def batch_gradient_descent(self,X,Y,eta=1,batch_size=1,max_iters=1000):
        '''
        Mini-batch gradient descent.

        X, Y: inputs and targets with one sample per column.
        eta: learning rate.
        batch_size: number of samples per update. 1 behaves like SGD; the
            number of columns of X behaves like full-batch gradient descent.
            A shorter final batch is still used for an update.
        max_iters: number of passes (epochs) over the data.

        Each update uses the gradient summed over the samples of the batch,
        so batch_size=1 matches a per-sample update exactly.

        Raises ValueError if batch_size is smaller than 1.
        '''
        if batch_size<1:
            raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

        sample_count=np.shape(X)[1]
        for _ in range(max_iters):
            for start in range(0,sample_count,batch_size):
                stop=start+batch_size
                x=X[:,start:stop]
                y=Y[:,start:stop]
                y_pred=self.forward(x)
                self.backward(x,y,y_pred)
                # One update per batch, computed from weights held fixed
                # for the whole batch.
                self._apply_gradients(eta)

    def stochastic_gradient_descent(self,X,Y,eta=1,batch_size=1,max_iters=1000):
        '''
        Stochastic gradient descent: one update per sample.

        batch_size is accepted for signature compatibility but ignored; this
        is mini-batch gradient descent with a batch of exactly one sample.
        '''
        self.batch_gradient_descent(X,Y,eta=eta,batch_size=1,max_iters=max_iters)
                
                
                
            
            
            
        
    
            
            
        
            
            

In [121]:
# Scratch check: multiplying an ndarray column by a scalar scales every element
# (the output is 5x the second column of X).
# NOTE(review): X is only assigned further down the notebook, so this cell
# fails on a fresh kernel run top to bottom.
X[:,1]*5

array([20, 25, 30, 35])

In [122]:
# Scratch check: [X[:,0]]*5 is *list* repetition (five copies of the first
# column stacked as rows), not scalar multiplication. array_split with axis=1
# then cuts that 5x4 array into two column groups of 3 and 1 columns.
np.array_split([X[:,0]]*5,2,axis=1)

[array([[1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]]),
 array([[4],
        [4],
        [4],
        [4],
        [4]])]

In [84]:
# Scratch check: axis=0 reduces over rows, giving one maximum per column.
# This is the reduction softmax uses for its stability shift.
np.max(X,axis=0)

array([4, 7])

In [11]:
# Scratch list for the reverse-iteration check in the next cell.
temp = list(range(1, 5))

In [12]:
# Walk the list from its last element down to index 1; index 0 is never visited.
for idx in reversed(range(1, len(temp))):
    print(temp[idx])

4
3
2


In [202]:
# Toy network: 4 inputs, 5 softmax outputs, and two sigmoid hidden layers of
# 3 and 1 units. Weights are drawn at random, so results vary between runs.
test_NN=NeuralNetwork(4,5,[3,1],['sigmoid','sigmoid'])

In [203]:
# Toy inputs: two samples with four features each, stored one sample per column (shape 4x2).
X = np.array([[1, 2, 3, 4], [4, 5, 6, 7]]).T

In [204]:
# Toy targets: one 5-entry target vector per sample, stored as columns (shape 5x2).
# NOTE(review): each target has several 1s, so it is not a one-hot vector.
Y = np.array([[0, 0, 1, 1, 1], [1, 1, 0, 0, 0]]).T

In [206]:
# Train on the toy data with the method's defaults (learning rate eta=1).
test_NN.stochastic_gradient_descent(X,Y)

In [208]:
# Pre-activation cached by the first hidden layer during its most recent forward pass.
test_NN.layers[0].a

array([[10.39801483],
       [11.45417734],
       [16.58635686]])

In [209]:
# Pre-activation (logits) cached by the output layer during its most recent
# forward pass; the values shown below have grown to roughly 1.2e3.
test_NN.layers[-1].a

array([[1197.27883803],
       [1197.27883801],
       [1198.65485279],
       [1198.65485278],
       [1198.65485278]])

In [212]:
# Naive softmax on the raw logits: np.exp overflows to inf for inputs this
# large, so the ratio is inf/inf and every entry comes out as nan. This is
# why the softmax used by the network subtracts the column maximum first.
np.exp(test_NN.layers[-1].a)/np.sum(np.exp(test_NN.layers[-1].a))

  np.exp(test_NN.layers[-1].a)/np.sum(np.exp(test_NN.layers[-1].a))
  np.exp(test_NN.layers[-1].a)/np.sum(np.exp(test_NN.layers[-1].a))


array([[nan],
       [nan],
       [nan],
       [nan],
       [nan]])

In [210]:
# Output cached by the softmax layer for the same logits: finite values that
# sum to 1, because the layer's softmax shifts by the maximum before np.exp.
test_NN.layers[-1].h

array([[0.07206025],
       [0.07206025],
       [0.28529317],
       [0.28529317],
       [0.28529317]])