<a href="https://colab.research.google.com/github/Voland24/AndrewNGDeepLearningCourse/blob/main/Week2GradCheck.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Helper methods provided by the course


In [1]:
import numpy as np

def sigmoid(x):
    """
    Apply the logistic sigmoid function element-wise.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    The value 1 / (1 + exp(-x)), with the same shape as x.
    """
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def relu(x):
    """
    Apply the rectified linear unit element-wise.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    The element-wise maximum of 0 and x.
    """
    return np.maximum(0, x)

def dictionary_to_vector(parameters):
    """
    Flatten the parameter dictionary into one column vector.

    The entries "W1", "b1", "W2", "b2", "W3", "b3" are each reshaped to a
    column and stacked in that order.

    Return:
    theta -- array of shape (total_number_of_entries, 1)
    keys -- list with one item per row of theta, naming the parameter that
            row came from
    """
    ordered_names = ("W1", "b1", "W2", "b2", "W3", "b3")

    # One (n_i, 1) column per parameter, in the fixed order above.
    columns = [np.reshape(parameters[name], (-1, 1)) for name in ordered_names]

    keys = []
    for name, column in zip(ordered_names, columns):
        keys.extend([name] * column.shape[0])

    theta = np.concatenate(columns, axis=0)
    return theta, keys

def vector_to_dictionary(theta):
    """
    Split a flat parameter vector back into the parameter dictionary.

    The layout is fixed: the first 47 entries of theta are cut into
    W1 (5,4), b1 (5,1), W2 (3,5), b2 (3,1), W3 (1,3) and b3 (1,1),
    in that order.
    """
    # (name, start, stop, shape) for every parameter, in storage order.
    layout = (
        ("W1", 0, 20, (5, 4)),
        ("b1", 20, 25, (5, 1)),
        ("W2", 25, 40, (3, 5)),
        ("b2", 40, 43, (3, 1)),
        ("W3", 43, 46, (1, 3)),
        ("b3", 46, 47, (1, 1)),
    )
    return {
        name: theta[start:stop].reshape(shape)
        for name, start, stop, shape in layout
    }

def gradients_to_vector(gradients):
    """
    Flatten the gradient dictionary into one column vector.

    The entries "dW1", "db1", "dW2", "db2", "dW3", "db3" are each reshaped
    to a column and stacked in that order; any other entries are ignored.

    Return:
    Array of shape (total_number_of_entries, 1).
    """
    ordered_names = ("dW1", "db1", "dW2", "db2", "dW3", "db3")
    columns = [np.reshape(gradients[name], (-1, 1)) for name in ordered_names]
    return np.concatenate(columns, axis=0)

The idea here is to confirm that the gradient we compute analytically with the chain rule (i.e. by backpropagation) agrees, to within a small tolerance, with a numerical estimate of the gradient obtained from finite differences of the cost.

# Case of 1D grad check

In [5]:
def forward_propagation(x, theta):
    """
    Evaluate the linear cost J(theta) = theta * x.

    Arguments:
    x -- input value
    theta -- parameter value

    Return:
    The product np.dot(theta, x).
    """
    return np.dot(theta, x)

In [4]:
def backward_propagation(x, theta):
    """
    Return the derivative of J(theta) = theta * x with respect to theta.

    Arguments:
    x -- input value
    theta -- parameter value (unused: the derivative does not depend on it)

    Return:
    x, the derivative dJ/dtheta.
    """
    return x

In [6]:
def gradient_check(x, theta, epsilon = 1e-7):
    """
    Compare the gradient from backward_propagation with a numerical estimate.

    The numerical estimate is the centered difference
    (J(theta + epsilon) - J(theta - epsilon)) / (2 * epsilon), where J is
    forward_propagation. The two gradients are compared through the relative
    difference ||grad - gradapprox|| / (||grad|| + ||gradapprox||).

    Arguments:
    x -- input value
    theta -- parameter value at which the gradient is checked
    epsilon -- size of the shift applied to theta

    Return:
    difference -- the relative difference described above (0.0 when both
                  gradients are exactly zero)
    """
    # Numerical gradient from a centered finite difference.
    J_plus = forward_propagation(x, theta + epsilon)
    J_minus = forward_propagation(x, theta - epsilon)
    gradapprox = (J_plus - J_minus) / (2 * epsilon)

    # Gradient from the chain rule.
    grad = backward_propagation(x, theta)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    # A zero denominator means both gradients are exactly zero, so they
    # agree; avoid the 0/0 = NaN that would report a correct gradient as wrong.
    difference = numerator / denominator if denominator != 0 else 0.0

    if difference < 1e-7:
        print("Grad is correct")
    else:
        print("Grad is NOT correct")

    return difference

# N-dim grad check

In [10]:
def forward_propagation_n(X, Y, params):
    """
    Run the forward pass LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID and
    compute the cross-entropy cost.

    Arguments:
    X -- input data; the number of examples m is read from X.shape[1]
    Y -- labels, combined element-wise with the sigmoid output
    params -- dictionary holding "W1", "b1", "W2", "b2", "W3", "b3"

    Return:
    cost -- cross-entropy cost averaged over the m examples
    cache -- tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
    """
    m = X.shape[1]
    W1, b1 = params["W1"], params["b1"]
    W2, b2 = params["W2"], params["b2"]
    W3, b3 = params["W3"], params["b3"]

    # Hidden layer 1
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    # Hidden layer 2
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    # Output layer
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    # Per-example cross-entropy: -y*log(a) - (1-y)*log(1-a)
    positive_term = -np.log(A3) * Y
    negative_term = -np.log(1 - A3) * (1 - Y)
    cost = (1. / m) * np.sum(positive_term + negative_term)

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
    return cost, cache


In [11]:
def backward_propagation_n(X, Y , cache):
    """
    Backpropagate the cross-entropy cost through the network
    LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

    Arguments:
    X -- input data; the number of examples m is read from X.shape[1]
    Y -- labels, same shape as the network output A3
    cache -- tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) as
             produced by the forward pass

    Return:
    gradients -- dictionary with the gradient of the cost with respect to
                 each activation ("dA1", "dA2"), pre-activation
                 ("dZ1", "dZ2", "dZ3") and parameter
                 ("dW1", "db1", "dW2", "db2", "dW3", "db3")
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    # Output layer: sigmoid combined with cross-entropy gives A3 - Y.
    dZ3 = A3 - Y
    dW3 = (1. / m) * np.dot(dZ3, A2.T)
    db3 = (1. / m) * np.sum(dZ3, axis=1, keepdims=True)

    # Second hidden layer: the ReLU passes gradient only where A2 > 0.
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, A2 > 0)
    dW2 = (1. / m) * np.dot(dZ2, A1.T)
    db2 = (1. / m) * np.sum(dZ2, axis=1, keepdims=True)

    # First hidden layer: mask with this layer's own activation A1, and
    # multiply by the layer's input X to obtain dW1.
    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, A1 > 0)
    dW1 = (1. / m) * np.dot(dZ1, X.T)
    db1 = (1. / m) * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                 "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                 "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients


In [12]:
def gradient_check_n(params, gradients, X, Y, epsilon = 1e-7):
    """
    Compare backpropagated gradients with a numerical estimate, one
    parameter at a time.

    Each entry of the flattened parameter vector is shifted by +epsilon and
    -epsilon in turn, the cost is re-evaluated with forward_propagation_n,
    and the centered difference (J_plus - J_minus) / (2 * epsilon) is used as
    the numerical gradient. The two gradient vectors are compared through
    the relative difference ||grad - gradapprox|| / (||grad|| + ||gradapprox||).

    Arguments:
    params -- dictionary holding "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- dictionary holding "dW1", "db1", "dW2", "db2", "dW3", "db3"
    X -- input data passed to forward_propagation_n
    Y -- labels passed to forward_propagation_n
    epsilon -- size of the shift applied to each parameter

    Return:
    difference -- the relative difference described above (0.0 when both
                  gradient vectors are exactly zero)
    """
    params_values, _ = dictionary_to_vector(params)
    grad = gradients_to_vector(gradients)
    num_params = params_values.shape[0]
    J_plus = np.zeros((num_params, 1))
    J_minus = np.zeros((num_params, 1))
    gradapprox = np.zeros((num_params, 1))

    for i in range(num_params):
        # Cost with only parameter i nudged upwards.
        thetaplus = np.copy(params_values)
        thetaplus[i, 0] += epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        # Cost with only parameter i nudged downwards.
        thetaminus = np.copy(params_values)
        thetaminus[i, 0] -= epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    # A zero denominator means both gradient vectors are exactly zero, so
    # they agree; avoid dividing 0 by 0.
    difference = numerator / denominator if denominator != 0 else 0.0

    if difference > 1e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
