In [61]:
import numpy as np
from collections import OrderedDict
from typing import Optional  # 类型注释

# 误差反向传播法的实现

In [62]:
class Affine:
    """Fully connected layer that computes ``np.dot(x, W) + b``.

    Attributes:
        W, b: weight matrix and bias passed in by the caller. They are stored
            by reference (no copy), so in-place changes made by the caller are
            seen by this layer.
        x: input of the most recent ``forward`` call; ``None`` before that.
        dW, db: gradients of the loss w.r.t. ``W`` and ``b``; set by ``backward``.
    """

    def __init__(self, W, b) -> None:
        self.W, self.b = W, b
        self.x: Optional[np.ndarray] = None
        self.dW = self.db = None

    def forward(self, x: np.ndarray):
        """Remember ``x`` for the backward pass and return the affine output."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``db``/``dW`` for the cached input and return the gradient w.r.t. ``x``.

        Must be called after ``forward``, because it reads the cached ``self.x``.
        """
        cached_x = self.x
        # The bias is added to every row, so its gradient sums over axis 0.
        self.db = np.sum(dout, axis=0)
        self.dW = np.dot(cached_x.T, dout)  # type: ignore
        return np.dot(dout, self.W.T)

class Relu():
    """ReLU activation layer; it has no trainable parameters.

    Attributes:
        mask: boolean array marking the inputs that were ``<= 0`` in the most
            recent ``forward`` call; ``None`` before that.
    """

    def __init__(self) -> None:
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every non-positive entry set to 0."""
        self.mask = x <= 0

        activated = x.copy()
        np.putmask(activated, self.mask, 0)
        return activated

    def backward(self, dout):
        """Return a copy of ``dout`` zeroed wherever the forward input was ``<= 0``."""
        # Work on a copy so the caller's upstream gradient is left untouched.
        grad = dout.copy()
        np.putmask(grad, self.mask, 0)
        return grad

def softmax(x):
    """Row-wise softmax.

    A 1-D input is treated as a single sample and promoted to shape ``(1, n)``,
    so the result always has at least two dimensions. Normalisation runs along
    axis 1.
    """
    if np.ndim(x) == 1:
        x = x.reshape(1, -1)

    # Subtracting the per-row maximum keeps np.exp from overflowing and does
    # not change the result.
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exps / np.sum(exps, axis=1, keepdims=True)

def cross_entropy_error(y, t):
    """Mean cross-entropy loss for integer class labels.

    Args:
        y: predicted probabilities, shape ``(batch, classes)``, or ``(classes,)``
            for a single sample.
        t: integer class label(s): one per row of ``y`` (a scalar or a
            length-1 array is accepted for a single sample).

    Returns:
        The average of ``-log(y[i, t[i]] + 1e-6)`` over the batch.
    """
    y = np.asarray(y)
    # Fix: the original compared the function object `np.ndim` with 1, which is
    # always False, so a single 1-D sample was never promoted to a batch of one.
    if np.ndim(y) == 1:
        y = y.reshape(1, -1)

    # Pick the probability assigned to the correct class of every sample.
    picked = y[np.arange(y.shape[0]), t]
    # The small constant avoids log(0) = -inf.
    loss = -np.mean(np.log(picked + 1e-6))

    return loss

class SoftmaxWithLoss:
    """Final layer: softmax followed by mean cross-entropy loss.

    Targets may be integer class labels (0-D or 1-D) or one-hot rows (2-D, same
    shape as the softmax output). A 1-D target is always read as labels.

    Attributes:
        y: softmax output of the most recent ``forward`` call.
        t: targets of the most recent ``forward`` call, stored as passed in.
    """

    def __init__(self) -> None:  # the layer has no parameters to store
        self.y = None
        self.t = None

    def forward(self, x, t):
        """Return the scalar loss (not the softmax output) for scores ``x`` and targets ``t``."""
        self.y = softmax(x)
        self.t = t
        # cross_entropy_error indexes with integer labels, so one-hot targets
        # are converted to class indices first.
        labels = np.argmax(t, axis=1) if np.ndim(t) >= 2 else t
        loss = cross_entropy_error(self.y, labels)

        return loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the ``x`` given to ``forward``.

        Args:
            dout: upstream gradient of the scalar loss. The default of 1 is the
                usual case, since this is the last layer.

        Must be called after ``forward``.
        """
        y = self.y
        t = np.asarray(self.t)
        batch_size = y.shape[0]  # type: ignore

        if t.ndim >= 2:
            # Fix: one-hot targets previously left t_onehot undefined.
            t_onehot = t.reshape(y.shape)  # type: ignore
        else:
            t_onehot = np.zeros_like(y)
            t_onehot[np.arange(batch_size), t.reshape(-1)] = 1

        # The loss is a mean over the batch, so the gradient is divided by the
        # batch size as well.
        dx = dout * (y - t_onehot) / batch_size

        return dx

def numerical_gradient(x, f):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    ``x`` is perturbed in place one element at a time and restored afterwards.
    ``f`` is called with ``x`` itself, twice per element.

    Returns:
        An array shaped like ``x`` holding ``(f(x + h) - f(x - h)) / (2 * h)``
        for every element.
    """
    # 1e-4 is a compromise: a larger step increases the truncation error of
    # the difference quotient, a smaller one increases rounding error.
    step = 1e-4
    grad = np.zeros_like(x)

    for pos in np.ndindex(x.shape):
        original = x[pos]

        x[pos] = original + step
        loss_plus = f(x)

        x[pos] = original - step
        loss_minus = f(x)

        grad[pos] = (loss_plus - loss_minus) / (2 * step)
        x[pos] = original  # restore the element before moving on

    return grad


In [63]:
class TwoLayerNet:
    """Two-layer network: Affine -> ReLU -> Affine, with a softmax/cross-entropy loss.

    Attributes:
        params_key: parameter names in a fixed order ('W1', 'b1', 'W2', 'b2'),
            kept so gradient dictionaries can be iterated uniformly.
        params: dict of the parameter arrays; the Affine layers are given the
            very same array objects.
        layers: ordered mapping of the layers used by ``predict``.
        lastLayer: the loss layer; it is not part of ``predict``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01) -> None:
        self.params_key = ['W1', 'b1', 'W2', 'b2']

        # Weights are random so hidden units do not all start out identical;
        # biases start at zero.
        self.params = {
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in ``self.layers`` and return the raw scores."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss layer's output for inputs ``x`` and targets ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows whose arg-max score equals ``t``."""
        predicted = np.argmax(self.predict(x), axis=1)
        return np.mean(predicted == t)

    def numerical_gradient(self, x, t):
        """Return central-difference gradients for every parameter.

        Only meant as a reference to compare against ``gradient``; the result
        is returned and not stored on the instance.
        """
        def loss_W(W):
            # The argument is unused: the parameter arrays are perturbed in
            # place, so recomputing the loss already reflects the change.
            return self.loss(x, t)

        return {
            key: numerical_gradient(self.params[key], loss_W)
            for key in self.params_key
        }

    def gradient(self, x, t):
        """Return backprop gradients for every parameter.

        Runs a forward pass first so each layer caches what its backward pass
        needs.
        """
        self.loss(x, t)

        dout = self.lastLayer.backward()
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }

# 梯度确认

In [64]:
# 读入数据
from torchvision import datasets, transforms

def _dataset_to_arrays(dataset):
    """Convert an iterable of ``(image, label)`` pairs into two NumPy arrays.

    Returns:
        ``(x, t)`` where ``x`` stacks the images and flattens each one into a
        single row, and ``t`` holds the labels in the same order.
    """
    images, labels = [], []
    for image, label in dataset:
        images.append(image)
        labels.append(label)

    x = np.array(images)
    x = x.reshape(x.shape[0], -1)  # one flat row per sample
    t = np.array(labels)
    return x, t


# The same transform is shared by the training and the test split.
transform = transforms.Compose([
    transforms.ToTensor()
])

# download=False: the files are expected to be present under ../data already.
train_dataset = datasets.MNIST(
    root='../data',
    train=True,
    download=False,
    transform=transform
)

test_dataset = datasets.MNIST(
    root='../data',
    train=False,
    download=False,
    transform=transform
)

# The per-sample lists live only inside the helper, so they are released once
# the arrays have been built.
x_train, t_train = _dataset_to_arrays(train_dataset)
x_test, t_test = _dataset_to_arrays(test_dataset)

In [65]:
import time

# Gradient check: compare the backprop gradients with central-difference
# numerical gradients computed on the same tiny batch.
network = TwoLayerNet(input_size=784, hidden_size=100, output_size=10)

# Only three samples are used: the numerical gradient evaluates the loss twice
# for every single parameter element, so it is very slow.
x_batch = x_train[:3]
t_batch = t_train[:3]

# Time the numerical (reference) gradient.
start_time = time.perf_counter()
grad_numerical = network.numerical_gradient(x_batch, t_batch)
cost_time = time.perf_counter() - start_time
print(f"numerical time: {cost_time}")

# Time the backprop gradient on the same batch.
start_time = time.perf_counter()
grad_backprop = network.gradient(x_batch, t_batch)
cost_time = time.perf_counter() - start_time
print(f"backprop time: {cost_time}")

# Mean absolute difference per parameter; values near zero mean the two
# methods agree.
for key in network.params_key:
    diff = np.mean(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(f"{key}: {diff}")

numerical time: 14.532416099915281
backprop time: 0.0005848999135196209
W1: 3.8261000777673905e-09
b1: 2.316471321025512e-08
W2: 4.8825749413789305e-08
b2: 1.404590186858401e-06


# 使用反向传播训练

In [69]:
# Train a fresh network with mini-batch gradient descent using backprop gradients.
network = TwoLayerNet(784, 100, 10)

iters_num = 1000
train_size = x_train.shape[0]
batch_size = 1000
learning_rate = 0.1
train_loss_list = []
train_acc_list = []
test_acc_list = []
# Number of iterations counted as one "epoch". Batches are sampled at random
# below, so this is an iteration count and not a guaranteed full pass over the data.
iter_per_epoch = max(train_size // batch_size, 1)

# Baseline metrics of the untrained network (printed only, not appended to the lists).
train_loss = network.loss(x_train, t_train)
train_acc = network.accuracy(x_train, t_train)
test_acc = network.accuracy(x_test, t_test)
print(f"train_loss: {train_loss}")
print(f"train_acc: {train_acc}")
print(f"test_acc: {test_acc}")

for i in range(iters_num):
    # Random mini-batch; np.random.choice samples with replacement by default.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute gradients
    grad = network.gradient(x_batch, t_batch)  # gradient() runs the forward pass internally (PyTorch seems to keep forward and backward as separate calls?)

    # Update parameters. The update must be in place (-=): the Affine layers
    # hold references to these same arrays, so rebinding would not reach them.
    for key in network.params_key:
        network.params[key] -= learning_rate * grad[key]

    # Loss on the current batch, measured after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    if (i + 1) % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)  # note: evaluated on the full training set, not the mini-batch
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(f"epoch {(i + 1) // iter_per_epoch} train_acc: {train_acc}")
        print(f"epoch {(i + 1) // iter_per_epoch} test_acc: {test_acc}") 

train_loss: 2.3032641411171646
train_acc: 0.05341666666666667
test_acc: 0.0521
epoch 1 train_acc: 0.68065
epoch 1 test_acc: 0.6886
epoch 2 train_acc: 0.8096833333333333
epoch 2 test_acc: 0.815
epoch 3 train_acc: 0.8566833333333334
epoch 3 test_acc: 0.8632
epoch 4 train_acc: 0.8765333333333334
epoch 4 test_acc: 0.8813
epoch 5 train_acc: 0.8876666666666667
epoch 5 test_acc: 0.8927
epoch 6 train_acc: 0.8951
epoch 6 test_acc: 0.8973
epoch 7 train_acc: 0.8989
epoch 7 test_acc: 0.9028
epoch 8 train_acc: 0.9022833333333333
epoch 8 test_acc: 0.9064
epoch 9 train_acc: 0.9061
epoch 9 test_acc: 0.9083
epoch 10 train_acc: 0.9085833333333333
epoch 10 test_acc: 0.9114
epoch 11 train_acc: 0.9115833333333333
epoch 11 test_acc: 0.9133
epoch 12 train_acc: 0.9131833333333333
epoch 12 test_acc: 0.9151
epoch 13 train_acc: 0.91555
epoch 13 test_acc: 0.9188
epoch 14 train_acc: 0.9171333333333334
epoch 14 test_acc: 0.9197
epoch 15 train_acc: 0.9196166666666666
epoch 15 test_acc: 0.9214
epoch 16 train_acc: 0.9