# Bachpropagation

In [37]:
import numpy as np

# define MSE criterion
class nn_MSECriterion:
    def forward(self, prediction, target):
        return np.sum(np.square(prediction - target)) 
    
    def backward(self, prediction, target):
        return 2 * (prediction - target)
    
# define loss function
class nn_Sigmoid:
    def forward(self, x):
        return 1 / (1 + np.exp(-x))
    
    def backward(self, x, grad_output):
        return np.multiply(self.forward(x) * (1 - self.forward(x)), grad_output)
    
# define neural network
class nn_Linear:
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(input_size, output_size) * 0.01
        self.bias = np.random.randn(output_size)
        self.grad_weights = np.zeros_like(self.weights) * 0.01
        self.grad_bias = np.zeros_like(self.bias)
        
    def forward(self, x):
        return np.dot(x, self.weights) + self.bias
    
    def backward(self, x, grad_output):
        self.grad_weights = np.dot(x.T, grad_output)
        self.grad_bias = np.copy(grad_output)
        return np.dot(grad_output, self.weights.T)
    
    def get_params(self):
        params = [self.weights, self.bias]
        grad_params = [self.grad_weights, self.grad_bias]
        return params, grad_params

Let's test some dummy inputs for a full pass of forward and backward propagation.

In [38]:
x1 = np.array([[1, 2, 2, 3]])
y1 = np.array([[0.25, 0.25, 0.25]])

# Define the operations.
linear = nn_Linear(4, 3)  # h(W, b)
sigmoid = nn_Sigmoid()  # g(v)
loss = nn_MSECriterion()  # f(u)

# Forward-propagation.
lin = linear.forward(x1)
y_hat = sigmoid.forward(lin)
loss_val = loss.forward(y_hat, y1) # Loss function.

# Backward-propagation.
dy_hat = loss.backward(y_hat, y1)
dlin = sigmoid.backward(lin, dy_hat)
dx1 = linear.backward(x1, dlin)

# Gradient checking

In [42]:
# We will compute derivatives with respect to a single data pair (x,y)
x = np.array([[2.34, 3.8, 34.44, 5.33]])
y = np.array([[3.2, 4.2, 5.3]])

hidden_state_size = 5;

model = {}
model['linear1'] = nn_Linear(4, hidden_state_size)  # 第一个线性层，输出维度改为5
model['sigmoid1'] = nn_Sigmoid()    # 第一个 Sigmoid 激活层
model['linear2'] = nn_Linear(hidden_state_size, 3)  # 第二个线性层，输入维度为5，输出维度为3
model['sigmoid2'] = nn_Sigmoid()    # 第二个 Sigmoid 激活层
model['loss'] = nn_MSECriterion()

gradWeight1 = model['linear1'].grad_weights
gradBias1 = model['linear1'].grad_bias
gradWeight2 = model['linear2'].grad_weights
gradBias2 = model['linear2'].grad_bias

approxGradWeight1 = np.zeros_like(model['linear1'].weights)
approxGradBias1 = np.zeros_like(model['linear1'].bias)
approxGradWeight2 = np.zeros_like(model['linear2'].weights)
approxGradBias2 = np.zeros_like(model['linear2'].bias)

epsilon = 1e-4
for i in range(0, linear.weights.shape[0]):
    for j in range(0, linear.weights.shape[1]):
        # Forward-propagation.
        a0 = model['linear1'].forward(x)
        a1 = model['sigmoid1'].forward(a0)  # 第一个 Sigmoid 激活层
        a2 = model['linear2'].forward(a1)   # 第二个线性层
        a3 = model['sigmoid2'].forward(a2)  # 第二个 Sigmoid 激活层
        loss1 = model['loss'].forward(a3, y)
        shift_weight1 = np.copy(model['linear1'].weights)
        shift_weight2 = np.copy(model['linear2'].weights)
        shift_weight1[i, j] += epsilon
        shift_weight2[i, j] += epsilon
        shift_linear1 = nn_Linear(4, hidden_state_size)
        shift_linear1.bias = model['linear1'].bias
        shift_linear1.weights = shift_weight1
        shift_linear2 = nn_Linear(hidden_state_size, 3)
        shift_linear2.bias = model['linear2'].bias
        shift_linear2.weights = shift_weight2
        shift_a0 = shift_linear1.forward(x)
        shift_a1 = sigmoid.forward(shift_a0)
        shift_a2 = shift_linear2.forward(shift_a1)
        shift_a3 = sigmoid.forward(shift_a2)
        loss2 = model['loss'].forward(shift_a3, y)
        approxGradWeight[i, j] = (loss2 - loss1) / epsilon
        
# These two outputs should be similar up to some precision.
print('gradWeight: ' + str(gradWeight))
print('\napproxGradWeight: ' + str(approxGradWeight))

gradWeight: [[ -3.18346091  -2.34519463  -3.43486793]
 [ -5.16972285  -3.80843573  -5.57799065]
 [-46.85401443 -34.51645431 -50.55421004]
 [ -7.25121652  -5.34183221  -7.82386584]]

approxGradWeight: [[ -3.18347075  -2.34538141  -3.43461622]
 [ -5.16974876  -3.8089283   -5.57732685]
 [-46.85610065 -34.55692648 -50.4996925 ]
 [ -7.25126744  -5.34280131  -7.8225599 ]]
