# This is an exercise to delve into the architecture of the CNN.

This exercise uses only numpy; no autograd schemes are used. Backpropagation is carried out manually to ensure a full understanding of the architecture. 

In [10]:
import numpy as np
import os 
import math
import time
import matplotlib.pyplot as plt

## Start by defining parameter and computation tree classes

### Each parameter will keep track of:   
1) Its current value  
2) The gradient of the loss with respect to itself

### Each tree will be made up of a sequence of layers, and each layer will contain:  
1) Its parameters (Object)  
2) A forward pass (Method)  
3) A backward pass to calculate gradients wrt the previous layer (Method)  
4) A backward pass to calculate gradients wrt its parameters (Method)  
5) A gradient update for its parameters (Method)  

In [11]:
class parameter:
    """Holder for one trainable quantity and the loss gradient associated with it.

    Both values are stored exactly as given and exposed as plain attributes
    (``data`` and ``gradient``) so that layers can read and reassign them.
    """

    def __init__(self, data, gradient):
        # data: current value; gradient: gradient of the loss w.r.t. that value
        self.data, self.gradient = data, gradient
        

class tree:
    """Sequential computation graph: an ordered list of layers capped by a loss layer.

    Every layer is expected to expose ``forward``/``backward`` dicts plus the
    methods ``forward_pass``, ``backward_pass``, ``calculate_gradient`` and
    ``update_parameters``; the loss layer takes ``(X, y)`` in its forward pass
    and no argument in its backward pass.
    """

    def __init__(self, list_layers, loss_layer):
        self.layers = list_layers
        self.loss_layer = loss_layer
        self.loss = 0

    def forward_pass(self, X, y):
        """Push X through every layer, then store the prediction and the loss."""
        activation = X
        for layer in self.layers:
            layer.forward_pass(activation)
            activation = layer.forward['out']

        # the predicted class is the row index of the largest final activation
        self.prediction = np.argmax(activation, axis=0)

        self.loss_layer.forward_pass(activation, y)
        self.loss = self.loss_layer.forward['out']

    def backward_pass(self, X, y):
        """Propagate the loss gradient from the last layer back to the first.

        X and y are accepted for symmetry with forward_pass but are not read.
        """
        self.loss_layer.backward_pass()
        upstream = self.loss_layer.backward['t1']

        for layer in reversed(self.layers):
            layer.backward_pass(upstream)
            upstream = layer.backward['t1']

    def calculate_gradients(self):
        """Ask each layer to compute the gradients of its own parameters."""
        for layer in self.layers:
            layer.calculate_gradient()

    def update_parameters(self, epsilon):
        """Apply one update with step size epsilon in every layer."""
        for layer in self.layers:
            layer.update_parameters(epsilon=epsilon)

## Download the data (MNIST) and preprocess

In [12]:
# ORIGINAL MNIST DATASET 
#######################################################################
# Each column of the *_vec arrays is one flattened image (it is reshaped to
# 28x28 below); the division by 255 presumably maps 8-bit pixels to [0, 1].
# Sample counts are taken from the label files instead of being hard-coded.

X_train_vec = np.load("mnist.npz_FILES/train.npy")/255
y_train = np.squeeze(np.load("mnist.npz_FILES/train_labels.npy"))
num_train = len(y_train)

# one-hot encode the labels: one row per class (10), one column per sample
temp_y = np.zeros([10, num_train])
temp_y[y_train.astype(int), np.arange(num_train)] = 1
y_train = temp_y

#shuffle the dataset (same permutation for images and labels)
index = np.random.choice(num_train, num_train, replace=False)
X_train_vec = X_train_vec[:,index]
y_train = y_train[:,index]

# column i of X_train_vec becomes image X_train[i]; done in one vectorised
# reshape instead of a Python loop over every sample
X_train = np.zeros((num_train, 28, 28))
X_train[:] = X_train_vec.T.reshape((num_train, 28, 28))


X_test_vec = np.load("mnist.npz_FILES/test.npy")/255
y_test = np.squeeze(np.load("mnist.npz_FILES/test_labels.npy"))
num_test = len(y_test)

temp_y = np.zeros([10, num_test])
temp_y[y_test.astype(int), np.arange(num_test)] = 1
y_test = temp_y

X_test = np.zeros((num_test, 28, 28))
X_test[:] = X_test_vec.T.reshape((num_test, 28, 28))

#######################################################################

## This allows us to implement convolution as a matrix multiplication

In [13]:
def im2col(A, b, stepsize=1):
    """Gather every sliding window of a 4-D batch into columns.

    A is unpacked as (depth, M, N, batch). b is any indexable whose first three
    entries give the window's depth, height and width (further entries are
    ignored). stepsize thins the window origins along both spatial axes.

    Returns an array of shape (b[0], b[1]*b[2], number_of_windows, batch).

    NOTE(review): the flat offsets are built with a stride of N while the data
    is flattened in Fortran order; this appears to be consistent only for
    square images (M == N) - confirm before using other shapes.
    """
    # move depth behind the two spatial axes -> (M, N, depth, batch)
    A = np.moveaxis(A, 0, 2)
    M, N, depth, batch = A.shape
    win_h, win_w = b[1], b[2]

    # flat offsets of the cells inside a single window
    within_window = (np.arange(win_h)[:, None] * N + np.arange(win_w)).ravel()

    # flat offsets of every window origin, thinned by stepsize on both axes
    origin_rows = np.arange(M - win_h + 1)[::stepsize, None]
    origin_cols = np.arange(N - win_w + 1)[::stepsize]
    window_origins = (origin_rows * N + origin_cols).ravel()

    # flat offsets that select the channel and the image within the batch
    channel_shift = np.arange(b[0]) * M * N
    image_shift = np.arange(batch) * (M * N * depth)

    # broadcast the four offset families into (cell, window, channel, image)
    indices = (within_window[:, None, None, None]
               + window_origins[None, :, None, None]
               + channel_shift[None, None, :, None]
               + image_shift[None, None, None, :])

    gathered = np.take(A.ravel('F'), indices)

    # channel axis goes first: (channel, cell, window, image)
    return np.moveaxis(gathered, 2, 0)


## There are three different kinds of convolutions we need to consider:

1) Forward convolution

2) Backward convolution (for the images)

3) Backward convolution (for the weights)

In [126]:
def pad_zeros_4d(image, zeros):
    """Surround the two middle axes of a 4-D array with a border of zeros.

    image is unpacked as (depth, I, J, batch); zeros is the scalar border
    thickness added on each side of axes 1 and 2.

    Returns a new float array of shape (depth, I + 2*zeros, J + 2*zeros, batch).
    Height and width are padded independently, so non-square inputs are
    handled (the width used to be derived from the height).
    """
    depth, I, J, batch = image.shape
    padded = np.zeros([depth, I + 2 * zeros, J + 2 * zeros, batch])
    padded[:, zeros:(I + zeros), zeros:(J + zeros), :] = image
    return padded
def convolution_forward(Images, Kernels):
    """Forward convolution of a batch, expressed as one im2col + einsum.

    Images is unpacked as (depth, M, N, batch) and Kernels as
    (depth, k, k, num_kernels). The input is zero-padded by int((k-1)/2) on
    every side before the windows are extracted.

    Returns an array of shape (num_kernels, side, side, batch), where side is
    the integer square root of the number of windows (a square output is
    assumed).
    """
    k_depth, _, k_size, k_num = Kernels.shape

    padded = pad_zeros_4d(Images, int((k_size - 1) / 2))
    batch = padded.shape[3]

    # only (depth, k, k) is needed to describe one window
    patches = im2col(padded, Kernels.shape[0:3])
    flat_kernels = Kernels.reshape((k_depth, k_size**2, k_num), order='F')

    # contract over channel and window cell: one response per kernel/window/image
    response = np.einsum('ijkl,ijm->mkl', patches, flat_kernels)

    # fold the window axis back into two spatial axes
    side = int(np.sqrt(response.shape[1]))
    return response.reshape((k_num, side, side, batch), order='F')


def convolution_param(Images, in_grad, pad=1):
    """Gradient of the loss with respect to the convolution kernels.

    Images is the layer input, unpacked as (depth, M, N, batch); in_grad is the
    gradient arriving from the next layer, shaped
    (n_kernels, height, width, batch_size).

    pad is the zero border added to Images before correlating with in_grad. It
    should match the padding used in the forward pass, i.e. (k - 1) // 2 for a
    k x k kernel; the default of 1 corresponds to the previously hard-coded
    3x3 case.

    Returns an array of shape
    (depth, M + 2*pad - height + 1, N + 2*pad - width + 1, n_kernels).
    """
    Images = pad_zeros_4d(image=Images, zeros=pad)

    depth, M, N, _ = Images.shape
    n_ker, M_grad, N_grad, batch = in_grad.shape

    # each window of the padded input has the spatial size of in_grad
    patches = im2col(Images, (depth, M_grad, N_grad))

    flat_grad = in_grad.reshape((n_ker, M_grad * N_grad, batch), order='F')

    # sum over window cells and over the batch
    gradient = np.einsum('ijkl,mjl->ikm', patches, flat_grad)

    return gradient.reshape((depth, M - M_grad + 1, N - N_grad + 1, n_ker), order='F')


def convolution_backward(in_grad, Kernels):
    """Gradient of the loss with respect to the convolution layer's input.

    in_grad is unpacked (after padding) as (n_kernels, M, N, batch) and Kernels
    as (depth, k, k, num_kernels). The incoming gradient is zero-padded by
    int((k-1)/2) and correlated with the spatially flipped kernels.

    Returns an array of shape (depth, M - k + 1, N - k + 1, batch), with M and
    N measured on the padded gradient.
    """
    k_depth, _, k_size, k_num = Kernels.shape

    padded_grad = pad_zeros_4d(image=in_grad, zeros=int((k_size - 1) / 2))
    n_ker, M_grad, N_grad, batch = padded_grad.shape

    grad_patches = im2col(padded_grad, (n_ker, k_size, k_size))

    # reverse both spatial axes of the kernels before flattening them
    flipped = np.flip(Kernels, axis=(1, 2))
    flat_kernels = flipped.reshape((k_depth, k_size * k_size, k_num), order='F')

    # contract over the kernel index and the window cell
    gradient = np.einsum('ijkl,mji->mkl', grad_patches, flat_kernels)

    out_shape = (k_depth, M_grad - k_size + 1, N_grad - k_size + 1, batch)
    return gradient.reshape(out_shape, order='F')



## These are the layers required to construct the CNN, along with the cross-entropy loss layer. 

In [128]:
class flatten_batch:
    """Layer that merges all leading axes into one while keeping the last (batch) axis."""

    def __init__(self):
        self.forward, self.backward = {}, {}

    def forward_pass(self, x):
        """Store the input shape and emit x reshaped to (-1, batch)."""
        self.shape = x.shape
        self.forward["out"] = x.reshape((-1, self.shape[-1]))

    def backward_pass(self, in_grad):
        """Reshape the incoming gradient back to the remembered input shape."""
        self.backward["t1"] = in_grad.reshape(self.shape)

    def calculate_gradient(self):
        """Nothing to compute: this layer has no parameters."""
        return None

    def update_parameters(self, epsilon):
        """Nothing to update: this layer has no parameters."""
        return None
    

class linear_layer_batch:
    """Fully connected layer ``out = W @ X + b`` over a batch stored column-wise.

    ``M['W']`` has shape (out_dim, in_dim) and ``M['b']`` has shape (out_dim, 1);
    X is expected as (in_dim, batch).
    """

    def __init__(self, in_dim, out_dim):
        self.M = {}
        self.M['W'] = parameter(data= 0.1*np.random.normal(size = [out_dim, in_dim]), 
                                gradient= np.zeros([out_dim, in_dim]))
        self.M['b'] = parameter(data= 0.1*np.random.normal(size = [out_dim, 1]), 
                                gradient= np.zeros([out_dim, 1]))
        self.forward = {}
        self.backward = {}
        self.X = 0
    
    def forward_pass(self, X):
        """Compute W @ X (stored as 'f1') and add the bias to every column ('out')."""
        self.X = X
        self.forward['f1'] = np.matmul(self.M['W'].data, self.X)
        # the (out_dim, 1) bias broadcasts across the batch columns
        self.forward['out'] = self.forward['f1'] + self.M['b'].data
        
    def backward_pass(self, in_grad):
        """Store the incoming gradient ('t2') and the gradient w.r.t. the input ('t1')."""
        self.backward['t2'] = in_grad
        self.backward['t1'] = np.matmul(self.M['W'].data.T, self.backward['t2'])
    
    def calculate_gradient(self):
        """Accumulate the parameter gradients of the current batch.

        The original used ``=+`` (assignment of a unary plus), which overwrote
        the stored gradient instead of adding to it; gradients are now
        accumulated until update_parameters resets them.
        """
        self.M['W'].gradient = self.M['W'].gradient + np.matmul(self.backward['t2'], self.X.T)
        self.M['b'].gradient = self.M['b'].gradient + np.sum(self.backward['t2'], axis = 1, keepdims= True)

    def update_parameters(self, epsilon):
        """Take one gradient step averaged over the batch, then zero the gradients."""
        # batch size is read from the column count of the last forward result
        batch_size = self.forward['f1'].shape[1]
        for param in self.M.values():
            assert (param.gradient).shape == (param.data).shape

            param.data = param.data - epsilon * param.gradient/batch_size
            #zero the gradients
            param.gradient = np.zeros(np.shape(param.gradient))
class relu_batch:
    """Element-wise rectifier layer: negative inputs are clamped to zero."""

    def __init__(self):
        self.forward = {}
        self.backward = {}
        self.X = 0
        self.prediction = 0

    def forward_pass(self, X):
        """Remember the input and store max(X, 0) as 'out'."""
        self.X = X
        self.forward['out'] = np.maximum(X, 0)

    def backward_pass(self, in_grad):
        """Pass the gradient through only where the forward output was positive."""
        active = self.forward['out'] > 0
        self.backward['t1'] = active * in_grad

    def calculate_gradient(self):
        """Nothing to compute: this layer has no parameters."""
        return None

    def update_parameters(self, epsilon):
        """Nothing to update: this layer has no parameters."""
        return None

def maxpool_batch(x, pool_size):
    """Non-overlapping max pooling over the two middle axes of a 4-D array.

    x is unpacked as (depth, height, width, batch). Windows of
    pool_size x pool_size are taken with a stride of pool_size (the stride and
    the output size used to be hard-coded to 2, which is only right for
    pool_size == 2).

    Returns an array of shape
    (depth, height // pool_size, width // pool_size, batch).
    """
    depth, height, width, batch = np.shape(x)

    # one column per pooling window; the stride equals the window size
    windows = im2col(A=x, b=(depth, pool_size, pool_size, batch), stepsize=pool_size)

    # axis 1 enumerates the cells inside each window
    pooled = np.max(windows, axis=1)
    return pooled.reshape((depth, height // pool_size, width // pool_size, batch), order='F')


class max_pool_batch:
    """Max-pooling layer operating on (depth, height, width, batch) arrays."""

    def __init__(self, pool_size):
        self.pool_size = pool_size
        self.backward = {}
        self.forward = {}

    def forward_pass(self, image):
        """Pool the image and remember what the backward pass needs."""
        self.in_size = image.shape
        self.image = image
        # number of pooling cells along one spatial axis, rounded up
        self.aug_size = int(np.ceil(self.in_size[1] / self.pool_size))

        self.forward['out'] = maxpool_batch(image, self.pool_size)
        self.aug_matrix = np.zeros(image.shape)

    def backward_pass(self, in_grad):
        """Route the incoming gradient to the positions that held each window's max.

        Both the incoming gradient ('t2') and the pooled output ('out') are
        upsampled back to the input size by repeating every entry pool_size
        times along each spatial axis. The gradient is kept wherever the input
        matches the upsampled maximum to within 10e-8, so tied maxima all
        receive it.
        """
        # row i has ones in columns i*pool_size .. (i+1)*pool_size - 1
        self.aug_matrix = np.repeat(np.eye(self.aug_size), self.pool_size, axis=1)
        expand = self.aug_matrix

        depth, height, width, batch = self.in_size

        def upsample(pooled):
            # repeat along both spatial axes, then crop to the input size
            full = np.einsum("ij,njkm,kl->nilm", expand.T, pooled, expand)
            return full[:, 0:height, 0:width]

        self.backward["t2"] = upsample(in_grad)
        self.backward["out"] = upsample(self.forward['out'])

        is_max = abs(self.image - self.backward["out"]) < 10e-8
        self.backward["t1"] = is_max * self.backward["t2"]

    def calculate_gradient(self):
        """Nothing to compute: this layer has no parameters."""
        return None

    def update_parameters(self, epsilon):
        """Nothing to update: this layer has no parameters."""
        return None

    
class entropylosswithlogits:
    """Softmax followed by cross-entropy, summed over classes and over the batch.

    Logits X and one-hot targets y are both laid out as (classes, batch).
    """

    def __init__(self):
        self.forward = {}
        self.backward = {}
        self.X = 0
        self.y = 0
    
    def forward_pass(self, X, y):
        """Store the softmax ('f1'), per-entry losses ('f2') and total loss ('out').

        The per-column maximum is subtracted before exponentiating so that
        large logits cannot overflow, and the log-softmax is formed directly
        rather than as log(softmax) to avoid log(0). The number of classes is
        taken from X instead of being fixed at 10.
        """
        self.X = X
        self.y = y
        shifted = self.X - np.max(self.X, axis=0, keepdims=True)
        log_norm = np.log(np.sum(np.exp(shifted), axis=0, keepdims=True))
        log_softmax = shifted - log_norm
        self.forward['f1'] = np.exp(log_softmax)
        self.forward['f2'] = -y * log_softmax
        self.forward['out'] = np.sum(self.forward['f2'])
        
    def backward_pass(self):
        """Gradient of the summed loss w.r.t. the logits: softmax minus targets."""
        self.backward['t1'] = self.forward['f1'] - self.y

    def calculate_gradient(self):
        """Nothing to compute: the loss layer has no parameters."""
        pass

    def update_parameters(self, epsilon):
        """Nothing to update: the loss layer has no parameters."""
        pass

class conv_layer_batch:
    """Convolution layer whose kernels have shape (depth, k, k, num_kernels)."""

    def __init__(self, num_kernels, kernel_size, depth):
        kernel_shape = [depth, kernel_size, kernel_size, num_kernels]
        self.M = {
            'kernels': parameter(data=0.1*np.random.normal(size=kernel_shape),
                                 gradient=np.zeros(kernel_shape)),
        }
        self.kernel_size = kernel_size
        self.forward = {}
        self.backward = {}

    def forward_pass(self, image):
        """Remember the input and store the convolved result as 'out'."""
        self.image = image
        self.forward['out'] = convolution_forward(Images=image,
                                                  Kernels=self.M['kernels'].data)

    def backward_pass(self, in_grad):
        """Remember the incoming gradient and store the input gradient as 't1'."""
        self.in_grad = in_grad
        self.backward['t1'] = convolution_backward(in_grad=in_grad,
                                                   Kernels=self.M['kernels'].data)

    def calculate_gradient(self):
        """Add this batch's kernel gradient to the accumulated gradient."""
        kernels = self.M['kernels']
        kernels.gradient += convolution_param(Images=self.image, in_grad=self.in_grad)

    def update_parameters(self, epsilon):
        """Take one step scaled by 1/batch_size, then reset the gradient to zero."""
        kernels = self.M['kernels']
        # the batch size is the last axis of the most recent input
        step = epsilon/self.image.shape[-1]
        kernels.data = kernels.data - step * kernels.gradient
        kernels.gradient = np.zeros(kernels.gradient.shape)

## Now we can train the CNN

In [1]:
# Network: two 3x3 convolutions, 2x2 max pooling, flatten, then a
# 32-unit hidden layer with ReLU and a 10-way output layer.
conv_1 = conv_layer_batch(num_kernels = 8, kernel_size = 3, depth = 1)
conv_2 = conv_layer_batch(num_kernels = 4, kernel_size = 3, depth = 8)
maxpool1 = max_pool_batch(pool_size=2)
flatten1 = flatten_batch()
linear1 = linear_layer_batch(in_dim= (4*(14**2)) ,out_dim=32)
relu1 = relu_batch()
linear2 = linear_layer_batch(in_dim= 32, out_dim = 10)


ELWL = entropylosswithlogits()

model1 = tree([conv_1, conv_2, maxpool1, flatten1, linear1 ,relu1, linear2], ELWL)

batch_size = 10
epochs = 5
report_every = 20  # number of batches between progress reports
for epoch in range(epochs):
    loss = 0
    num_batches = int(y_train.shape[1]/batch_size)
    for i in range(num_batches):
        start = batch_size*i
        end = batch_size*(i+1)
        # (batch, 28, 28) -> (1, 28, 28, batch): depth first, batch last
        X = np.rollaxis(X_train[start:end,:,:], 0, 3)[None,:,:,:]
        y = y_train[:,start:end]
        
        model1.forward_pass(X = X, y = y)
        model1.backward_pass(X = X, y = y)
        model1.calculate_gradients()
        loss = loss + model1.loss
           
        if i % report_every == report_every - 1:
            # conv_layer_batch stores its kernels under 'kernels'
            # (the former 'D_Kernel' key does not exist and raised KeyError)
            print(np.sum(conv_1.M['kernels'].gradient))
            # loss was accumulated over report_every batches, so average over
            # exactly that many samples
            print("epoch: ",epoch,',', i ," out of ", num_batches, " batches completed, loss = ", loss/(report_every*batch_size))
            loss = 0
        model1.update_parameters(epsilon = 0.01) 

            


## This is a function to test our model

In [3]:
def test(model, X_test, Y_test,batch_size):
    """Print how many test samples the model classifies correctly.

    model must provide forward_pass(X, y) and expose the predicted class
    indices as ``model.prediction``. X_test is sliced along its first axis into
    batches. Y_test is transposed on entry and then sliced by column, so the
    caller passes it with samples along the first axis.

    The number of batches is derived from the size of X_test (it used to be
    fixed at 100); a trailing partial batch is skipped.
    """
    count = 0
    Y_test = Y_test.T
    num_batches = X_test.shape[0] // batch_size
    for i in range(num_batches):
        start = batch_size*i
        end = batch_size*(i+1)
        # move the batch axis last and add a leading depth axis of size 1
        X = np.rollaxis(X_test[start:end,:,:], 0, 3)[None,:,:,:]
        y = Y_test[:,start:end]
        model.forward_pass(X, y)
        y_pred = model.prediction

        # true class = row index of the largest entry in each label column
        correct = np.argmax(y, axis = 0)

        count += np.sum(correct == y_pred)
    print(count)

# Extra: CNN, except now with separable convolutions

In [271]:
def conv_forward_sep(Images, D_Kernels, P_Kernels):
    """Forward pass of a separable convolution: depthwise, then pointwise.

    Images is unpacked as (depth, M, N, batch), D_Kernels as (depth, k, k) and
    P_Kernels is contracted as (num_kernels, depth).

    Returns a tuple (depthwise_result, final_result): the first has shape
    (depth, M, N, batch), the second (num_kernels, M, N, batch).
    """
    k_depth, _, k_size = D_Kernels.shape
    depth, M, N, batch = Images.shape

    padded = pad_zeros_4d(Images, int((k_size - 1) / 2))
    patches = im2col(padded, D_Kernels.shape)
    flat_kernels = D_Kernels.reshape((k_depth, k_size**2), order='F')

    # depthwise: each channel is filtered by its own kernel only
    per_channel = np.einsum('ijkl,ij->ikl', patches, flat_kernels)
    depthwise = np.reshape(per_channel, (depth, M, N, batch), order='F')

    # pointwise: mix channels at every pixel
    pointwise = np.einsum('ijkl,mi->mjkl', depthwise, P_Kernels)
    return (depthwise, pointwise)

def conv_backward_sep(In_grad, D_Kernels, P_Kernels):
    """Backward pass of a separable convolution with respect to its input.

    In_grad is unpacked as (num_kernels, M, N, batch), D_Kernels as
    (depth, k, k) and P_Kernels as (num_kernels, depth).

    Returns a tuple (input_gradient, depthwise_output_gradient): the second is
    In_grad pulled back through the pointwise step, the first is that result
    pulled back through the depthwise step and reshaped to (depth, M, N, batch).
    """
    _, M, N, batch = In_grad.shape
    k_depth, _, k_size = D_Kernels.shape
    n_ker, depth = P_Kernels.shape

    # undo the pointwise channel mixing
    out_grad_2 = np.einsum('ijkl,in->njkl', In_grad, P_Kernels)

    padded = pad_zeros_4d(out_grad_2, int((k_size - 1) / 2))
    patches = im2col(padded, D_Kernels.shape)

    # reverse both spatial axes of the depthwise kernels before flattening
    flipped = np.flip(D_Kernels, axis=(1, 2))
    flat_kernels = flipped.reshape((depth, k_size**2), order='F')

    out_grad = np.einsum('ijkl,ij->ikl', patches, flat_kernels)
    out_grad = out_grad.reshape((k_depth, M, N, batch), order='F')
    return (out_grad, out_grad_2)

def conv_param_sep(Image, In_grad2, Image2, In_grad, k_size=3):
    """Parameter gradients of a separable convolution.

    Image is the layer input, Image2 the depthwise output saved in the forward
    pass, In_grad the gradient arriving at the layer output and In_grad2 the
    gradient at the depthwise output. All four are 4-D with the batch last.

    k_size is the (odd) spatial size of the depthwise kernels. It sets both the
    padding, (k_size - 1) // 2, and the shape of the returned depthwise
    gradient; the default of 3 matches the previously hard-coded values.

    Returns a tuple (P_grad, D_grad): P_grad has shape (num_kernels, depth) and
    D_grad has shape (depth, k_size, k_size).
    """
    depth = Image2.shape[0]
    _, M, N, batch = In_grad.shape

    # pointwise kernels: correlate the depthwise output with the output gradient
    P_grad = np.einsum('ijkl,mjkl->mi', Image2, In_grad)

    # depthwise kernels: correlate the padded input with In_grad2, per channel
    padded = pad_zeros_4d(Image, (k_size - 1) // 2)
    patches = im2col(padded, In_grad2.shape)
    flat_grad = In_grad2.reshape((depth, M*N, batch), order = 'F')

    D_grad = np.einsum('ijkl,ijl->ik', patches, flat_grad)
    D_grad = D_grad.reshape((depth, k_size, k_size), order = 'F')

    return(P_grad, D_grad)
    

In [365]:

class conv_layer_sep_batch:
    """Separable convolution layer: per-channel k x k kernels, then a 1x1 channel mix.

    ``M['D_Kernels']`` has shape (depth, k, k) and ``M['P_Kernels']`` has shape
    (num_kernels, depth).
    """

    def __init__(self, num_kernels, kernel_size, depth):
        self.M = {}
        self.M['D_Kernels'] = parameter(data=0.5*np.random.normal(size = [depth,kernel_size,kernel_size]),
                                           gradient = np.zeros([depth ,kernel_size,kernel_size]))
        self.M['P_Kernels'] = parameter(data=0.5*np.random.normal(size = [num_kernels, depth]),
                                       gradient = np.zeros([num_kernels,depth]))
        self.kernel_size = kernel_size
        self.forward = {}
        self.backward = {}
        
    def forward_pass(self, image):
        """Remember the input; store the depthwise result ('f1') and the output ('out')."""
        self.image = image
        self.forward['f1'],self.forward['out'] = conv_forward_sep(image, 
                                                                  self.M['D_Kernels'].data, 
                                                                  self.M['P_Kernels'].data)

    def backward_pass(self, in_grad):
        """Store the input gradient ('t1') and the depthwise-output gradient ('t2')."""
        self.in_grad = in_grad
        self.backward['t1'],self.backward['t2'] = conv_backward_sep(in_grad, 
                                                                    self.M['D_Kernels'].data, 
                                                                    self.M['P_Kernels'].data)

    def calculate_gradient(self):
        """Compute both kernel gradients for the current batch."""
        #note that there is no gradient accum here
        self.M['P_Kernels'].gradient,self.M['D_Kernels'].gradient   =  conv_param_sep(self.image, 
                                                                                      self.backward['t2'], 
                                                                                      self.forward['f1'], 
                                                                                      self.in_grad)

    def update_parameters(self, epsilon):
        """Take one step scaled by 1/batch_size, then reset the gradients.

        The step was previously not divided by the batch size even though the
        comment asked for it and the other parametrised layers in this file
        (conv_layer_batch, linear_layer_batch) do so.
        """
        # the batch size is the last axis of the most recent input
        batch_size = self.image.shape[-1]
        self.M['D_Kernels'].data -= epsilon * self.M['D_Kernels'].gradient / batch_size
        self.M['P_Kernels'].data -= epsilon * self.M['P_Kernels'].gradient / batch_size
        #set to zero
        self.M['D_Kernels'].gradient = np.zeros(self.M['D_Kernels'].gradient.shape)
        self.M['P_Kernels'].gradient = np.zeros(self.M['P_Kernels'].gradient.shape)

In [2]:
# Same architecture as the plain CNN, with separable convolution layers.
conv_1 = conv_layer_sep_batch(num_kernels = 8, kernel_size = 3, depth = 1)
conv_2 = conv_layer_sep_batch(num_kernels = 4, kernel_size = 3, depth = 8)
maxpool1 = max_pool_batch(pool_size=2)
flatten1 = flatten_batch()
linear1 = linear_layer_batch(in_dim= (4*(14**2)) ,out_dim=32)
relu1 = relu_batch()
linear2 = linear_layer_batch(in_dim= 32, out_dim = 10)


ELWL = entropylosswithlogits()

model1 = tree([conv_1, conv_2,  maxpool1, flatten1, linear1 ,relu1, linear2], ELWL)

batch_size = 100
epochs = 5
report_every = 20  # number of batches between progress reports
for epoch in range(epochs):
    loss = 0
    num_batches = int(y_train.shape[1]/batch_size)
    for i in range(num_batches):
        start = batch_size*i
        end = batch_size*(i+1)
        # (batch, 28, 28) -> (1, 28, 28, batch): depth first, batch last
        X = np.rollaxis(X_train[start:end,:,:], 0, 3)[None,:,:,:]
        y = y_train[:,start:end]
        
        model1.forward_pass(X = X, y = y)
        model1.backward_pass(X = X, y = y)
        model1.calculate_gradients()
        loss = loss + model1.loss
           
        if i % report_every == report_every - 1:
            print(np.sum(conv_1.M['P_Kernels'].gradient))
            # loss was accumulated over report_every batches (not 100), so
            # average over exactly that many samples
            print("epoch: ",epoch,',', i ,
                  " out of ", num_batches, 
                  " batches completed, loss = ", loss/(report_every*batch_size))
            loss = 0
        model1.update_parameters(epsilon = 0.001) 

In [4]:
# Evaluate the trained model on the test set. The labels are passed transposed
# because test() transposes Y_test again before slicing it by column.
test(model1, X_test=X_test, Y_test=y_test.T,batch_size = 100)

# Extra: Separable convolutions with depth filters being an outer product

In [369]:
# Small integer-valued arrays: a (5, 6, 6, 2) image batch, two (5, 3) vectors,
# a (4, 5) pointwise kernel matrix and a (depth, k, k) shape triple.
# NOTE(review): none of these names is referenced by the other code visible in
# this file - they look like leftover shape-checking fixtures; confirm before
# removing.
img2 = np.arange(5*36*2).reshape((5,6,6,2))
vec1 = np.arange(5*3).reshape((5,3))
vec2 = np.arange(5*3).reshape((5,3))
P_Kernels = np.arange(4*5).reshape((4,5))
b = np.array([5,3,3])

class conv_layer_new_batch:
    """Separable convolution whose depthwise kernels are rank-1 outer products.

    For every channel i the k x k depthwise kernel is ``outer(vec1[i], vec2[i])``;
    ``M['vec1']`` and ``M['vec2']`` have shape (depth, k) and ``M['P_Kernels']``
    has shape (num_kernels, depth).
    """

    def __init__(self, num_kernels, kernel_size, depth):
        self.M = {}
        self.M['vec1'] = parameter(data=0.5*np.random.normal(size = [depth,kernel_size]),
                                           gradient = np.zeros([depth ,kernel_size]))
        self.M['vec2'] = parameter(data=0.5*np.random.normal(size = [depth,kernel_size]),
                                           gradient = np.zeros([depth ,kernel_size]))
        self.M['P_Kernels'] = parameter(data=0.5*np.random.normal(size = [num_kernels, depth]),
                                       gradient = np.zeros([num_kernels,depth]))
        self.kernel_size = kernel_size
        self.forward = {}
        self.backward = {}

    def _depth_kernels(self):
        # per-channel outer product of vec1 and vec2 -> (depth, k, k)
        return np.einsum('ij,ik->ijk', self.M['vec1'].data, self.M['vec2'].data)
        
    def forward_pass(self, image):
        """Remember the input; store the depthwise result ('f1') and the output ('out')."""
        self.image = image
        self.forward['f1'],self.forward['out'] = conv_forward_sep(image, 
                                                                  self._depth_kernels(), 
                                                                  self.M['P_Kernels'].data)

    def backward_pass(self, in_grad):
        """Store the input gradient ('t1') and the depthwise-output gradient ('t2')."""
        self.in_grad = in_grad
        self.backward['t1'],self.backward['t2'] = conv_backward_sep(in_grad, 
                                                                    self._depth_kernels(), 
                                                                    self.M['P_Kernels'].data)

    def calculate_gradient(self):
        """Compute the gradients of P_Kernels, vec1 and vec2 for the current batch."""
        #note that there is no gradient accum here
        self.M['P_Kernels'].gradient, D_Kernels_grad = conv_param_sep(self.image, 
                                                                      self.backward['t2'], 
                                                                      self.forward['f1'], 
                                                                      self.in_grad)
        # chain rule through the outer product: contract the depthwise-kernel
        # gradient with the other factor
        self.M['vec1'].gradient = np.einsum('ijk,ik->ij', D_Kernels_grad, self.M['vec2'].data)
        self.M['vec2'].gradient = np.einsum('ijk,ij->ik', D_Kernels_grad, self.M['vec1'].data)
        
    def update_parameters(self, epsilon):
        """Take one step scaled by 1/batch_size, then reset the gradients.

        The step was previously not divided by the batch size even though the
        comment asked for it and the other parametrised layers in this file
        (conv_layer_batch, linear_layer_batch) do so.
        """
        # the batch size is the last axis of the most recent input
        batch_size = self.image.shape[-1]
        for key in ('P_Kernels', 'vec1', 'vec2'):
            param = self.M[key]
            param.data -= epsilon * param.gradient / batch_size
            #set to zero
            param.gradient = np.zeros(param.gradient.shape)


In [6]:
# Same architecture once more, now with outer-product depthwise kernels.
conv_1 = conv_layer_new_batch(num_kernels=8, kernel_size=3, depth=1)
conv_2 = conv_layer_new_batch(num_kernels=4, kernel_size=3, depth=8)
maxpool1 = max_pool_batch(pool_size=2)
flatten1 = flatten_batch()
linear1 = linear_layer_batch(in_dim=(4*(14**2)), out_dim=32)
relu1 = relu_batch()
linear2 = linear_layer_batch(in_dim=32, out_dim=10)

ELWL = entropylosswithlogits()

model1 = tree([conv_1, conv_2, maxpool1, flatten1, linear1, relu1, linear2], ELWL)

batch_size = 100
epochs = 5
for epoch in range(epochs):
    # running loss since the last progress report
    loss = 0
    num_batches = int(y_train.shape[1]/batch_size)
    for i in range(num_batches):
        start, end = batch_size*i, batch_size*(i+1)
        # (batch, 28, 28) -> (1, 28, 28, batch): depth first, batch last
        X = np.rollaxis(X_train[start:end,:,:], 0, 3)[None,:,:,:]
        y = y_train[:,start:end]

        model1.forward_pass(X=X, y=y)
        model1.backward_pass(X=X, y=y)
        model1.calculate_gradients()
        loss = loss + model1.loss

        # report after every 100th batch, averaging over the samples seen since
        if (i + 1) % 100 == 0:
            print("epoch: ",epoch,',', i ," out of ", num_batches, " batches completed, loss = ", loss/(100*batch_size))
            loss = 0
        model1.update_parameters(epsilon=0.01)