# Numerical Gradient Checking

We highly recommend reading `neural_networks.grad_check.check_gradients` (imported in this notebook via `neural_networks.utils`) and making sure you understand how numerical gradient checking is carried out. This function is used throughout the notebook to check the gradients of the neural network layers you write. Check the gradient of each layer as soon as you finish implementing it.

The function returns the relative error of the numerical gradient (approximated using finite differences) with respect to the analytical gradient (computed via backpropagation). Correct implementations should get very small errors, usually less than `1e-8` for 64-bit float matrices (the default).

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from neural_networks.utils import check_gradients
from neural_networks.layers import FullyConnected
from neural_networks.activations import Linear, ReLU, SoftMax
from neural_networks.losses import CrossEntropy

## Gradient Checks for Activation Functions

### Linear Activation

In [2]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a linear activation
# and perform a forward and backward pass
linear_activation = Linear()
_ = linear_activation.forward(X)
grad = linear_activation.backward(X, dLdY)

# check the analytical gradient w.r.t. the input X
# (an activation has no parameters of its own to check here)
print(
    "Relative error for linear activation:",
    check_gradients(
        fn=linear_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient of the loss w.r.t. the output of fn
    )
)

Relative error for linear activation: 1.7257824629296002e-11


### ReLU Activation

In [3]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a ReLU activation
# and perform a forward and backward pass
relu_activation = ReLU()
_ = relu_activation.forward(X)  # output is not needed, only the pass itself
grad = relu_activation.backward(X, dLdY)

# check the analytical gradient w.r.t. the input X
# (an activation has no parameters of its own to check here)
print(
    "Relative error for relu activation:",
    check_gradients(
        fn=relu_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient of the loss w.r.t. the output of fn
    )
)

Relative error for relu activation: 2.0922241164025358e-11


### Softmax Activation

In [4]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a softmax activation
# and perform a forward and backward pass
softmax_activation = SoftMax()
_ = softmax_activation.forward(X)
grad = softmax_activation.backward(X, dLdY)

# check the analytical gradient w.r.t. the input X
# (an activation has no parameters of its own to check here)
print(
    "Relative error for softmax activation:",
    check_gradients(
        fn=softmax_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient of the loss w.r.t. the output of fn
    )
)

Relative error for softmax activation: 1.3479786999838347e-10


## Gradient Checks for Full Layers

### Fully Connected Layer (Linear Activation)

In [17]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 4)

# Build a fully connected layer with 4 outputs and a linear activation, then
# run one forward and one backward pass before reading fc_layer.gradients.
fc_layer = FullyConnected(n_out=4, activation="linear")
_ = fc_layer.forward(X)
_ = fc_layer.backward(dLdY)

# Compare the numerical and analytical gradient for every layer parameter.
for param in fc_layer.parameters:
    forward_fn = fc_layer.forward_with_param(param, X)  # function being checked
    analytic_grad = fc_layer.gradients[param]           # gradient from backward()
    param_value = fc_layer.parameters[param]            # variable we differentiate w.r.t.
    rel_error = check_gradients(
        fn=forward_fn,
        grad=analytic_grad,
        x=param_value,
        dLdf=dLdY,  # gradient of the loss w.r.t. the layer output
    )
    print(f"Relative error for {param}:", rel_error)

Relative error for W: 1.9973623964560514e-11
Relative error for b: 9.456720680851424e-12


### Fully Connected Layer (ReLU Activation) 

In [21]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 4)

# Build a fully connected layer with 4 outputs and a ReLU activation, then
# run one forward and one backward pass before reading fc_layer.gradients.
# NOTE(review): the output saved with this cell is a matmul shape-mismatch
# ValueError, although the cell differs from the passing linear-activation
# check only in activation="relu". The fault is presumably in the
# neural_networks implementation rather than in this cell -- confirm.
fc_layer = FullyConnected(n_out=4, activation="relu")
_ = fc_layer.forward(X)
_ = fc_layer.backward(dLdY)

# Compare the numerical and analytical gradient for every layer parameter.
for param in fc_layer.parameters:
    forward_fn = fc_layer.forward_with_param(param, X)  # function being checked
    analytic_grad = fc_layer.gradients[param]           # gradient from backward()
    param_value = fc_layer.parameters[param]            # variable we differentiate w.r.t.
    rel_error = check_gradients(
        fn=forward_fn,
        grad=analytic_grad,
        x=param_value,
        dLdf=dLdY,  # gradient of the loss w.r.t. the layer output
    )
    print(f"Relative error for {param}:", rel_error)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 4)

## Gradient Checks for Loss Functions

### Cross Entropy Loss

In [12]:
random_indices = np.random.randint(0, 4, size=5)
Y = np.eye(4)[random_indices]  # one-hot rows selected by random_indices, shape (5, 4)
Y_hat = np.random.rand(5, 4)   # uniform values in [0, 1); rows are not normalized

# initialize the cross-entropy loss
# and perform a forward and backward pass
cross_entropy = CrossEntropy("cross_entropy")
_ = cross_entropy.forward(Y, Y_hat)
grad = cross_entropy.backward(Y, Y_hat)

# check the analytical gradient w.r.t. the predictions Y_hat
print(
    "Relative error for cross entropy loss:",
    check_gradients(
        fn=lambda x: cross_entropy.forward(Y, x),  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=Y_hat,    # the variable w.r.t. which we are taking the gradient
        dLdf=1,     # fn is the loss itself, so the gradient of the loss w.r.t. fn is 1
    )
)

[[-0.26768035 -0.55211475 -0.70422281 -1.54014043]
 [-0.0984885  -0.60650146 -0.19807481 -0.49601839]
 [-2.40378043 -0.01705365 -2.75745295 -2.04424083]
 [-2.59292463 -1.54270305 -1.08336151 -1.82341383]
 [-0.90320179 -0.81215962 -0.27920956 -1.1795523 ]]
[[-0.26767904 -0.55211475 -0.70422281 -1.54014043]
 [-0.0984885  -0.60650146 -0.19807481 -0.49601839]
 [-2.40378043 -0.01705365 -2.75745295 -2.04424083]
 [-2.59292463 -1.54270305 -1.08336151 -1.82341383]
 [-0.90320179 -0.81215962 -0.27920956 -1.1795523 ]]
[[-0.26768166 -0.55211475 -0.70422281 -1.54014043]
 [-0.0984885  -0.60650146 -0.19807481 -0.49601839]
 [-2.40378043 -0.01705365 -2.75745295 -2.04424083]
 [-2.59292463 -1.54270305 -1.08336151 -1.82341383]
 [-0.90320179 -0.81215962 -0.27920956 -1.1795523 ]]
[[-0.26768035 -0.55211302 -0.70422281 -1.54014043]
 [-0.0984885  -0.60650146 -0.19807481 -0.49601839]
 [-2.40378043 -0.01705365 -2.75745295 -2.04424083]
 [-2.59292463 -1.54270305 -1.08336151 -1.82341383]
 [-0.90320179 -0.81215962 -0