# Debugging

在开始正式debug你的代码实现之前，我们推荐先尝试看懂 `neural_networks.utils.grad_check.check_gradients`。在这个notebook中，我们使用这个函数检查你的神经网络各层之间传递的梯度与正确梯度之间的差值。

这个函数返回的是数值梯度（用有限差分近似计算的梯度）与解析梯度（通过反向传播求得的梯度）之间的相对误差。如果你的代码实现正确，这个函数返回的误差会很小，一般小于 `1e-7`。

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from neural_networks.utils.grad_check import check_gradients
from neural_networks.layers import FullyConnected
from neural_networks.layers.activations import Identity, Sigmoid, TanH, ReLU, SoftMax

## Gradient Checks for Activation Functions

### Identity Activation

In [2]:
# Random input and a random upstream gradient of the same shape.
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# Instantiate the identity activation, then run one forward and one
# backward pass; `grad` is the analytically computed gradient that is
# checked below.
identity_activation = Identity()
_ = identity_activation.forward(X)
grad = identity_activation.backward(dLdY)

# Check the analytic gradient w.r.t. the input X.
print(
    "Relative error for identity activation:",
    check_gradients(
        fn=identity_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,  # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # same upstream gradient that was passed to backward()
    ),
)

Relative error for identity activation: 4.0254750997459915e-11


### Sigmoid Activation

In [3]:
# Random input and a random upstream gradient of the same shape.
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# Instantiate the sigmoid activation, then run one forward and one
# backward pass; `grad` is the analytically computed gradient that is
# checked below.
sigmoid_activation = Sigmoid()
_ = sigmoid_activation.forward(X)
grad = sigmoid_activation.backward(dLdY)

# Check the analytic gradient w.r.t. the input X.
print(
    "Relative error for sigmoid activation:",
    check_gradients(
        fn=sigmoid_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,  # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # same upstream gradient that was passed to backward()
    ),
)

Relative error for sigmoid activation: 7.722641918597325e-11


### Tanh Activation

In [4]:
# Random input and a random upstream gradient of the same shape.
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# Instantiate the tanh activation, then run one forward and one
# backward pass; `grad` is the analytically computed gradient that is
# checked below.
tanh_activation = TanH()
_ = tanh_activation.forward(X)
grad = tanh_activation.backward(dLdY)

# Check the analytic gradient w.r.t. the input X.
print(
    "Relative error for tanh activation:",
    check_gradients(
        fn=tanh_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,  # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # same upstream gradient that was passed to backward()
    ),
)

Relative error for tanh activation: 1.159334155031177e-11


### ReLU Activation

In [5]:
# Random input and a random upstream gradient of the same shape.
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# Instantiate the ReLU activation, then run one forward and one
# backward pass; `grad` is the analytically computed gradient that is
# checked below. The forward output itself is not needed here, so it is
# discarded like in the other activation checks.
relu_activation = ReLU()
_ = relu_activation.forward(X)
grad = relu_activation.backward(dLdY)

# Check the analytic gradient w.r.t. the input X.
print(
    "Relative error for relu activation:",
    check_gradients(
        fn=relu_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,  # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # same upstream gradient that was passed to backward()
    ),
)

Relative error for relu activation: 1.432676695557825e-11


### Softmax Activation

In [16]:
# Random input and a random upstream gradient of the same shape.
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# Instantiate the softmax activation, then run one forward and one
# backward pass; `grad` is the analytically computed gradient that is
# checked below.
softmax_activation = SoftMax()
_ = softmax_activation.forward(X)
grad = softmax_activation.backward(dLdY)

# Check the analytic gradient w.r.t. the input X.
print(
    "Relative error for softmax activation:",
    check_gradients(
        fn=softmax_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,  # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # same upstream gradient that was passed to backward()
    ),
)

Relative error for softmax activation: 1.1369402983163086e-10


## Gradient Checks for Full Layers

### Fully Connected Layer

In [16]:
# Random input batch and a random upstream gradient; the last dimension
# of dLdY matches n_out of the layer built below.
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 4)

# Build a fully connected layer with a tanh activation, then run one
# forward and one backward pass so that `fc_layer.gradients` is filled in.
fc_layer = FullyConnected(n_out=4, activation="tanh")
_ = fc_layer.forward(X)
_ = fc_layer.backward(dLdY)

# Check the gradient w.r.t. each parameter of the layer.
# NOTE: `gradients` is indexed with the same keys as `parameters`; an
# earlier version used dw / db here, which were the wrong keys.
for param in fc_layer.parameters:
    rel_error = check_gradients(
        # the function we are checking, viewed as a function of `param`
        fn=fc_layer.forward_with_param(param, X),
        # the analytically computed gradient
        grad=fc_layer.gradients[param],
        # the variable w.r.t. which we are taking the gradient
        x=fc_layer.parameters[param],
        # same upstream gradient that was passed to backward()
        dLdf=dLdY,
    )
    print(f"Relative error for {param}:", rel_error)

Relative error for W: 3.520273906693607e-10
Relative error for b: 2.9313301936537695e-11


### Cross Entropy

In [9]:
from neural_networks.layers import CrossEntropyLoss

num_pts = 5
num_classes = 6

# One-hot encoded labels: one randomly chosen class index per point.
y_idxs = np.random.randint(0, num_classes, (num_pts,))
y = np.zeros((num_pts, num_classes))
y[range(num_pts), y_idxs] = 1

# Predictions normalized so that every row sums to 1.
scores = np.random.uniform(0, 1, size=(num_pts, num_classes))
y_hat = scores / scores.sum(axis=1, keepdims=True)

cross_entropy_loss = CrossEntropyLoss()


def forward_fn(Y_hat, Y):
    """Return the loss as a one-argument function of the predictions.

    The labels ``Y`` are fixed by the closure, so the returned callable
    can be handed to ``check_gradients`` as a function of the predictions
    only.

    The outer ``Y_hat`` argument is not used: the inner function takes
    its own ``Y_hat``, which shadows it. It is kept so that the call
    signature stays ``forward_fn(Y_hat, Y)``.
    """

    def inner_forward(Y_hat):
        return cross_entropy_loss.forward(Y_hat, Y)

    return inner_forward


# Run the loss once and take its analytic gradient; backward() takes no
# argument here.
loss = cross_entropy_loss(y_hat, y)
grad = cross_entropy_loss.backward()

print(
    "Relative error for cross entropy loss:",
    check_gradients(
        fn=forward_fn(y_hat, y),  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=y_hat,  # the variable w.r.t. which we are taking the gradient
        dLdf=1,  # constant 1 is used as the upstream gradient for the loss
    ),
)

Relative error for cross entropy loss: 5.916086875247166e-10
