In [649]:
import numpy as np
np.random.seed(1)

# Model Declaration

In [1]:
import numpy as np
np.random.seed(1)
from utils import *

class model():
	'''Fully-connected feed-forward network with a binary cross-entropy cost.

	State kept on the instance:
		parameters : dict with keys "W1","b1",...,"W<layers>","b<layers>"
		cache      : dict filled by forward_propagate with "A0".."A<layers>"
		             and "Z1".."Z<layers>"
		grads      : dict filled by back_propagate_model with "dA<l>","dZ<l>",
		             "dW<l>","db<l>"

	The activation helpers relu, sigmoid, relu_backward and sigmoid_backward are
	not defined here; they must already be in scope (the notebook brings them in
	with `from utils import *`).
	'''

	def __init__(self,layers,dims,init_type,activation_tup):
		'''layers         : number of layers in the network, excluding the input layer
			dims           : size of each layer, including the input layer
			                 (indexed from 0 to layers)
			init_type      : weight initialisation scheme, "HE" or "Xavier"
			activation_tup : activation name of each layer, "relu" or "sigmoid"
		'''
		self.layers=layers #excluding the input layer as by convention
		self.dims=dims
		self.init_type=init_type
		self.activation_tup=activation_tup
		self.parameters=self.initialize_parameters(dims,init_type)
		self.cache={}
		self.cost=0
		self.grads={}

	def initialize_parameters(self,dims,init_type):
		''' INPUT 	: dims=dimension of each layer, list or tuple
			          init_type="HE" or "Xavier"
			OUTPUT  : parameters dictionary; W<l> has shape (dims[l],dims[l-1])
			          and b<l> is a zero column of shape (dims[l],1)
			RAISES  : ValueError when init_type is not a known scheme
		'''
		# Both schemes draw standard-normal weights and rescale them by
		# sqrt(numerator/fan_in). He uses 2 instead of 1 to compensate for relu
		# zeroing roughly half of the pre-activations.
		if(init_type=="HE"):
			numerator=2.0
		elif(init_type=="Xavier"):
			numerator=1.0
		else:
			raise ValueError("Unknown init_type {!r}; expected 'HE' or 'Xavier'".format(init_type))

		parameters={}
		for l in range(self.layers):
			fan_in=dims[l]
			fan_out=dims[l+1]
			parameters["W"+str(l+1)]=np.random.randn(fan_out,fan_in)*np.sqrt(numerator/fan_in)
			parameters["b"+str(l+1)]=np.zeros((fan_out,1))

		return parameters

	def forward_propagate(self,X,Y):
		''' X: Training Data with examples in column
			Y: Training label with examples in column (not used by the forward
			   pass itself; kept in the signature for existing callers)

			Stores a fresh cache on the model: "A0" is X, then "Z<l>" and
			"A<l>" for every layer.
			RAISES: ValueError when a layer's activation name is unknown
		'''
		cache={"A0":X}

		#Looping through the layers computing the values.
		for l in range(1,self.layers+1):
			name=self.activation_tup[l-1]
			Z=np.dot(self.parameters["W"+str(l)],cache["A"+str(l-1)])+self.parameters["b"+str(l)]
			if(name=="relu"):
				A=relu(Z)
			elif(name=="sigmoid"):
				A=sigmoid(Z)
			else:
				# Without this the layer output would silently be missing from the cache.
				raise ValueError("Unknown activation {!r} for layer {}".format(name,l))
			cache["Z"+str(l)]=Z
			cache["A"+str(l)]=A
		self.cache=cache

	def calculate_cost(self,Y):
		''' INPUT : Y: Training label with examples in column
			OUTPUT: cross-entropy cost averaged over the batch, computed from
			        the last layer's activation stored by forward_propagate
		'''
		batch_size=Y.shape[1]
		AL=self.cache["A"+str(self.layers)]

		log_probs=Y*np.log(AL)+(1-Y)*np.log(1-AL)
		cost=(-1/batch_size)*np.squeeze(np.sum(log_probs))

		return cost

	def back_propagate_layer(self,m,layer):
		'''INPUT:  m: is batch size
			   layer: is exact layer number (1 .. layers)

			Reads grads["dA<layer>"] and the cached Z/A values, then writes
			"dZ<layer>", "dW<layer>", "db<layer>" and "dA<layer-1>" into grads.
			RAISES: ValueError when the layer's activation name is unknown
		'''
		activation=self.activation_tup[layer-1] #activation_tup is 0-indexed, layers are 1-indexed

		dA=self.grads["dA"+str(layer)]
		Z=self.cache["Z"+str(layer)]
		A_prev=self.cache["A"+str(layer-1)]
		W=self.parameters["W"+str(layer)]

		#Activation Backpropagate
		if(activation=="sigmoid"):
			dZ=sigmoid_backward(dA,Z)
		elif(activation=="relu"):
			dZ=relu_backward(dA,Z)
		else:
			raise ValueError("Unknown activation {!r} for layer {}".format(activation,layer))
		self.grads["dZ"+str(layer)]=dZ

		#Linear Backpropagate
		self.grads["dW"+str(layer)]=(1/m)*np.dot(dZ,A_prev.T)
		self.grads["db"+str(layer)]=(1/m)*np.sum(dZ,axis=1,keepdims=True)

		self.grads["dA"+str(layer-1)]=np.dot(W.T,dZ)

	def back_propagate_model(self,Y):
		''' INPUT: Y: Training Label
			OUTPUT: updates the model variable grads

			forward_propagate must have been called first so that the cache
			holds the activations for the current parameters.
		'''
		layers=self.layers
		m=Y.shape[1]

		# Derivative of the cross-entropy cost w.r.t. the output activation.
		AL=self.cache["A"+str(layers)]
		self.grads["dA"+str(layers)]=np.divide(-1*Y,AL)+np.divide(1-Y,1-AL)

		# Walk from the output layer back to layer 1.
		for l in reversed(range(1,layers+1)):
			self.back_propagate_layer(m,l)

	def _stack_columns(self,store,weight_prefix,bias_prefix):
		'''Flatten store[weight_prefix+l], store[bias_prefix+l] for l=1..layers
			(in that order) into a single column vector.
		'''
		pieces=[]
		for l in range(1,self.layers+1):
			pieces.append(np.reshape(store[weight_prefix+str(l)],(-1,1)))
			pieces.append(np.reshape(store[bias_prefix+str(l)],(-1,1)))
		return np.concatenate(pieces,axis=0)

	def param_to_vector(self):
		'''OUTPUT: column vector holding W1,b1,W2,b2,... flattened row by row.'''
		return self._stack_columns(self.parameters,"W","b")

	def grads_to_vector(self):
		'''OUTPUT: column vector holding dW1,db1,dW2,db2,... flattened row by row.

			The ordering matches param_to_vector so that entry i of both
			vectors refers to the same parameter.
		'''
		return self._stack_columns(self.grads,"dW","db")

	def vector_to_param(self,theta):
		''' INPUT : theta: column vector laid out as produced by param_to_vector
			OUTPUT: new parameters dictionary; shapes are taken from the
			        model's current parameters
		'''
		parameters={}

		start=0
		for l in range(1,self.layers+1):
			n1,n2=self.parameters["W"+str(l)].shape
			end=start+n1*n2
			parameters["W"+str(l)]=theta[start:end].reshape((n1,n2))
			start=end
			end=start+n1
			parameters["b"+str(l)]=theta[start:end].reshape((n1,1))
			start=end
		return parameters

	def gradient_checking(self,epsilon,X,Y):
		''' INPUT : epsilon: step used for the central difference
			        X,Y    : data and labels with examples in column
			OUTPUT: prints and returns
			        ||grad-approx|| / (||grad|| + ||approx||)

			back_propagate_model must have been called first so that grads
			holds the analytic gradient. The model's parameters and cache are
			restored before returning, even if a forward pass fails.
		'''
		param_vector=self.param_to_vector()
		gradient_vector=self.grads_to_vector()
		num_param=param_vector.shape[0]
		grad_approx=np.zeros((num_param,1))

		saved_parameters=self.parameters
		saved_cache=self.cache
		try:
			for i in range(num_param):

				#Calculating the right sided value for Central Difference
				theta_plus=np.copy(param_vector) #fresh copy so the base vector stays untouched
				theta_plus[i][0]=theta_plus[i][0]+epsilon
				self.parameters=self.vector_to_param(theta_plus)
				self.forward_propagate(X,Y)
				J_plus=self.calculate_cost(Y)

				#Calculating the left side value for Central Difference
				theta_minus=np.copy(param_vector)
				theta_minus[i][0]=theta_minus[i][0]-epsilon
				self.parameters=self.vector_to_param(theta_minus)
				self.forward_propagate(X,Y)
				J_minus=self.calculate_cost(Y)

				grad_approx[i][0]=(J_plus-J_minus)/(2*epsilon)
		finally:
			#Re-assigning the initial parameters (and their cache) as model state
			self.parameters=saved_parameters
			self.cache=saved_cache

		numerator=np.linalg.norm(gradient_vector-grad_approx)
		denominator=np.linalg.norm(gradient_vector)+np.linalg.norm(grad_approx)

		# Both gradients being exactly zero means they agree; avoid 0/0.
		if(denominator==0):
			difference=0.0
		else:
			difference=numerator/denominator
		print ("difference in grad",difference)

		return difference

	def update_parameter(self,learningRate):
		'''
		Gradient-descent step: every W<l> and b<l> is moved against its
		gradient dW<l> / db<l> scaled by learningRate. The entries of the
		existing parameters dictionary are replaced in place.
		'''
		parameters=self.parameters
		grads=self.grads

		for l in range(1,self.layers+1):
			parameters["W"+str(l)]=parameters["W"+str(l)]-learningRate*grads["dW"+str(l)]
			parameters["b"+str(l)]=parameters["b"+str(l)]-learningRate*grads["db"+str(l)]


# Checking Forward Propagation

In [2]:
# Build a small 3-2-1 network (relu hidden layer, sigmoid output) with "HE"
# initialisation and print the shape of every parameter.
net=model(2,[3,2,1],"HE",("relu","sigmoid"))
print (net.parameters["W1"].shape)
print (net.parameters["b1"].shape)
print (net.parameters["W2"].shape)
print (net.parameters["b2"].shape)

(2, 3)
(2, 1)
(1, 2)
(1, 1)


In [3]:
# Toy batch: X is 3 features x 3 examples (examples in columns).
X=np.random.randn(3,3)
Y=np.random.randn(1,3)
# Threshold the random draws to get boolean labels of shape (1,3).
Y=Y<0.5
print (X)
print (Y)

[[ 0.3190391  -0.24937038  1.46210794]
 [-2.06014071 -0.3224172  -0.38405435]
 [ 1.13376944 -1.09989127 -0.17242821]]
[[ True  True False]]


In [4]:
# Run the forward pass; the results are stored in net.cache.
net.forward_propagate(X,Y)

In [5]:
# Inspect the cached activations (A*) and pre-activations (Z*).
net.cache

{'A0': array([[ 0.3190391 , -0.24937038,  1.46210794],
        [-2.06014071, -0.3224172 , -0.38405435],
        [ 1.13376944, -1.09989127, -0.17242821]]),
 'A1': array([[ 0.96322835,  0.30464196,  2.2053472 ],
        [ 0.        ,  2.05756044,  0.        ]]),
 'A2': array([[ 0.84299087,  0.26217657,  0.97912109]]),
 'Z1': array([[ 0.96322835,  0.30464196,  2.2053472 ],
        [-3.86578036,  2.05756044, -1.22826341]]),
 'Z2': array([[ 1.68065215, -1.03468633,  3.84791574]])}

In [6]:
# Recompute the first layer's pre-activation by hand and compare with the cache.
# `param` is an alias of net.parameters (same dict object), not a copy.
# NOTE(review): exact `==` on floats only works because the identical operations
# are repeated; np.allclose would be the tolerant comparison.
param=net.parameters
Z1=np.dot(param["W1"],X)+param["b1"]
print (Z1==net.cache["Z1"])

[[ True  True  True]
 [ True  True  True]]


In [7]:
# Apply relu to the hand-computed Z1 and compare with the cached A1.
A1=relu(Z1)
print (A1==net.cache["A1"])

[[ True  True  True]
 [ True  True  True]]


In [8]:
# Recompute the output layer's pre-activation and compare with the cached Z2.
Z2=np.dot(param["W2"],A1)+param["b2"]
print (Z2==net.cache["Z2"])

[[ True  True  True]]


In [9]:
# Apply sigmoid to the hand-computed Z2 and compare with the cached A2.
A2=sigmoid(Z2)
print (A2==net.cache["A2"])

[[ True  True  True]]


In [10]:
# Cost computed by the model from the cached output activation.
net.calculate_cost(Y)

1.7928506394467114

In [11]:
# Same cross-entropy cost computed by hand; 3 is the batch size (columns of Y).
log=Y*np.log(A2)+(1-Y)*np.log(1-A2)
np.sum(log)*(-1/3)

1.7928506394467114

# Checking Backpropagation

In [12]:
# Run back-propagation; fills net.grads using the cache from the forward pass.
net.back_propagate_model(Y)

In [13]:
# NOTE(review): this shows the parameter's shape, not the gradient's; presumably
# the reference shape that net.grads["dW2"] should match - confirm.
param["W2"].shape

(1, 2)

In [14]:
# Shape of the output-layer bias (presumably the reference for db2 - confirm).
param["b2"].shape

(1, 1)

In [15]:
# Shape of the hidden-layer weights (presumably the reference for dW1 - confirm).
param["W1"].shape

(2, 3)

In [16]:
# Shape of the hidden-layer bias (presumably the reference for db1 - confirm).
param["b1"].shape

(2, 1)

# Gradient checking

In [17]:
# Scratch example: a 2x3 matrix used to illustrate flattening to a column.
a=np.random.randn(2,3)
print (a)

[[-1.10061918  1.14472371  0.90159072]
 [ 0.50249434  0.90085595 -0.68372786]]


In [18]:
# reshape(-1,1) flattens row by row into a single column.
b=a.reshape(-1,1)
print (b)

[[-1.10061918]
 [ 1.14472371]
 [ 0.90159072]
 [ 0.50249434]
 [ 0.90085595]
 [-0.68372786]]


In [19]:
# Reshaping the column back to (2,3) recovers the original layout.
b.reshape(2,3)

array([[-1.10061918,  1.14472371,  0.90159072],
       [ 0.50249434,  0.90085595, -0.68372786]])

In [20]:
# NOTE(review): `{1,2,3}` is a set literal, and binding it to `dict` shadows the
# builtin `dict` for the rest of the kernel session.
dict={1,2,3}

In [21]:
# Iterate over the set defined in the previous cell.
# NOTE(review): the loop variable `i` stays bound in the kernel namespace afterwards.
for i in dict:
    print (i)

1
2
3


In [22]:
# Current model parameters, for comparison with the flattened vector below.
net.parameters

{'W1': array([[ 1.32627244, -0.49949702, -0.43125043],
        [-0.87607521,  0.70660237, -1.87919848]]),
 'W2': array([[ 1.74481176, -0.7612069 ]]),
 'b1': array([[ 0.],
        [ 0.]]),
 'b2': array([[ 0.]])}

In [23]:
# Flatten all parameters into one column in the order W1, b1, W2, b2.
theta=net.param_to_vector()
print (theta)

[[ 1.32627244]
 [-0.49949702]
 [-0.43125043]
 [-0.87607521]
 [ 0.70660237]
 [-1.87919848]
 [ 0.        ]
 [ 0.        ]
 [ 1.74481176]
 [-0.7612069 ]
 [ 0.        ]]


In [24]:
# Print the parameters so they can be compared with the round-trip result.
print (net.parameters)

{'W1': array([[ 1.32627244, -0.49949702, -0.43125043],
       [-0.87607521,  0.70660237, -1.87919848]]), 'b1': array([[ 0.],
       [ 0.]]), 'W2': array([[ 1.74481176, -0.7612069 ]]), 'b2': array([[ 0.]])}


In [25]:
# Rebuild the parameter dictionary from the flattened vector.
params=net.vector_to_param(theta)

In [26]:
# The rebuilt dictionary should match net.parameters printed above.
print (params)

{'W1': array([[ 1.32627244, -0.49949702, -0.43125043],
       [-0.87607521,  0.70660237, -1.87919848]]), 'b1': array([[ 0.],
       [ 0.]]), 'W2': array([[ 1.74481176, -0.7612069 ]]), 'b2': array([[ 0.]])}


In [27]:
# Analytic gradients produced by back_propagate_model.
print(net.grads["dW1"])
print(net.grads["db1"])
print(net.grads["dW2"])
print(net.grads["db2"])

[[ 0.91048929  0.10777828  0.27026279]
 [-0.04668515 -0.0603604  -0.20591295]]
[[ 0.04902254]
 [ 0.1872121 ]]
[[ 0.59443144 -0.50603877]]
[[ 0.02809618]]


In [28]:
# Compare the analytic gradients with a central-difference estimate that uses
# step size epsilon; the method prints the relative difference between them.
epsilon=1e-7
net.gradient_checking(epsilon,X,Y)

difference in grad 4.73352029244e-09


In [678]:
# Scratch cell: rebinds `a` (previously a NumPy array) to a plain list.
a=[1,2,3]

In [679]:
# Mutates the list in place, so re-running this cell appends another 5.
a.append(5)

In [680]:
# Display the list after the append.
a

[1, 2, 3, 5]