In [None]:
import numpy as np

In [None]:
def sigmoid(a):
    """Element-wise logistic function 1 / (1 + exp(-a)).

    Works on scalars and on arrays of any shape; the result has the same
    shape as the input.
    """
    denominator = 1 + np.exp(-a)
    return 1 / denominator

def relu(a):
    """Element-wise rectifier: negative entries become 0, the rest pass through.

    Like the other element-wise activations here it accepts input of any
    shape; only softmax needs special handling of dimensions.
    """
    floor = 0
    return np.maximum(a, floor)

# Earlier drafts of softmax, kept for reference only.
#
# def softmax(a):  # handles 1-D input only
#     return np.exp(a) / np.sum(np.exp(a))

# def softmax(a):  # handles 2-D input only
#     sum_exp = np.sum(np.exp(a), axis=1, keepdims=True)
#     return np.exp(a) / sum_exp

def softmax(a):
    """Row-wise softmax for 1-D or 2-D input.

    A 1-D vector is treated as a single row, so the result is always 2-D:
    shape (1, n) for a vector of length n, otherwise the same shape as `a`.
    """
    rows = a.reshape(1, -1) if a.ndim == 1 else a

    # Subtract each row's maximum before exponentiating so that large scores
    # cannot overflow; the shift cancels out in the ratio below.
    shifted = rows - np.max(rows, axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=1, keepdims=True)

def numerical_gradient(f, x, h=1e-6):
    """Estimate the gradient of `f` at `x` with central differences.

    Each element of `x` is nudged by +h and -h *in place*, `f(x)` is
    evaluated for both, and the element is restored afterwards. Because the
    perturbation is written into `x` itself, `f` may read the array through
    any other reference to it.

    Args:
        f: Callable taking `x` and returning a scalar.
        x: Array to differentiate with respect to; modified temporarily.
        h: Step size of the finite difference.

    Returns:
        Array shaped like `x` holding (f(x + h) - f(x - h)) / (2 * h) per element.
    """
    grad = np.zeros_like(x)

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])  # type: ignore
    for _ in it:
        idx = it.multi_index
        original = x[idx]

        # Both perturbed evaluations must be taken before x[idx] is restored.
        x[idx] = original + h
        f_plus = f(x)
        x[idx] = original - h
        f_minus = f(x)

        grad[idx] = (f_plus - f_minus) / (2 * h)
        x[idx] = original

    return grad

class TwoLayerNet:
    """Fully connected two-layer classifier: affine -> ReLU -> affine -> softmax.

    All learnable arrays live in ``self.params`` under the keys ``'W1'``,
    ``'b1'``, ``'W2'`` and ``'b2'``.
    """

    # Added inside the logarithm in `loss` so that a probability of exactly 0
    # yields a large finite loss instead of inf.
    _LOG_EPS = 1e-7

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=1.0) -> None:
        """Create random weights and zero biases.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            weight_init_std: Factor applied to the standard-normal weight
                draws. The default of 1.0 reproduces the unscaled
                initialisation; smaller values (e.g. 0.01) shrink the initial
                pre-activations.
        """
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        """Run the forward pass and return class probabilities.

        Args:
            x: A single sample of shape (input_size,) or a batch of shape
                (batch, input_size).

        Returns:
            2-D array with one row of probabilities per sample; a single
            1-D sample yields one row because `softmax` promotes it to 2-D.
        """
        a1 = np.dot(x, self.params['W1']) + self.params['b1']
        z1 = relu(a1)  # sigmoid(a1) is the alternative hidden activation
        a2 = np.dot(z1, self.params['W2']) + self.params['b2']
        return softmax(a2)

    def loss(self, x, t):
        """Mean cross-entropy between the predictions for `x` and labels `t`.

        Args:
            x: A single sample (1-D) or a batch (2-D).
            t: Integer class index per sample (a scalar for a single sample);
                one-hot labels are not supported.

        Returns:
            Scalar loss averaged over the samples.
        """
        if x.ndim == 1:
            x = x.reshape(1, -1)

        y = self.predict(x)
        # Pick, for every row, the probability assigned to its true class.
        true_class_prob = y[np.arange(y.shape[0]), t]
        # A saturated softmax can return exactly 0 here; the epsilon keeps the
        # logarithm (and every finite difference built on it) finite.
        return -np.mean(np.log(true_class_prob + self._LOG_EPS))

    def accuracy(self, x, t):
        """Fraction of samples whose most probable class equals the label `t`."""
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        return np.sum(y == t) / y.shape[0]

    def numerical_gradient(self, x, t):
        """Finite-difference gradient of the loss for every parameter.

        Returns:
            Dict with the same keys as ``self.params``; each value has the
            shape of the corresponding parameter array.
        """
        def loss_W(_):
            # The argument is ignored on purpose: the module-level
            # numerical_gradient perturbs the parameter array in place, and
            # self.loss reads that same array through self.params.
            return self.loss(x, t)

        grads = {}
        for key in ('W1', 'b1', 'W2', 'b2'):
            grads[key] = numerical_gradient(loss_W, self.params[key])
        return grads

In [None]:
from torchvision import datasets, transforms

# Preprocessing: ToTensor is the only transform applied to each image.
transform = transforms.Compose([
    transforms.ToTensor()
])

train_dataset = datasets.MNIST(
    root='../data',
    train=True,
    download=True,
    transform=transform
)

# train=False selects the held-out test split. Passing train=True here would
# load the training images a second time and make any "test" accuracy
# meaningless.
test_dataset = datasets.MNIST(
    root='../data',
    train=False,
    download=True,
    transform=transform
)

In [None]:
# Pull every (image, label) pair out of the dataset and turn them into NumPy
# arrays: one flattened row per image in x_train, one label per row in t_train.
x_train_list, t_train_list = map(list, zip(*train_dataset))

x_train = np.array(x_train_list)
x_train = x_train.reshape(len(x_train_list), -1)
t_train = np.array(t_train_list)

In [None]:
# Build an untrained network and sanity-check the output shapes and the
# accuracy on the training set.
net = TwoLayerNet(input_size=784, hidden_size=100, output_size=10)

y_train = net.predict(x_train)
print(y_train.shape)
y_train = y_train.argmax(axis=1)
print(y_train.shape)

initial_accuracy = net.accuracy(x_train, t_train)
print(initial_accuracy)

In [None]:
# Single-sample check: report the untrained network's output for the first
# training image.
x = x_train[0]
t = t_train[0]

print("epoch: 0")
y = net.predict(x)
print(f"softmax: {y}")
# Take the argmax exactly once, on the probabilities. Applying argmax to an
# already-reduced scalar always gives 0, whatever the prediction was.
print(f"y: {np.argmax(y)}")
print(f"loss: {net.loss(x, t)}")

In [None]:
# One gradient-descent step on the single sample, then report the new output.
for i in range(1):
    grad = net.numerical_gradient(x, t)
    for key, step in grad.items():
        net.params[key] -= 0.01 * step
    y = net.predict(x)
    predicted = np.argmax(y)
    current_loss = net.loss(x, t)
    print(f"epoch: {i + 1}")
    print(f"softmax: {y}")
    print(f"y: {predicted}")
    print(f"loss: {current_loss}")

In [None]:
# Batch check: the first ten training samples, evaluated before any further
# updates in this cell.
x, t = x_train[:10], t_train[:10]
print(x.shape)

print("epoch: 0")
y = net.predict(x)
print(f"softmax: {y}")
predicted = np.argmax(y, axis=1)
print(f"y: {predicted}")
print(f"t: {t}")
batch_loss = net.loss(x, t)
print(f"loss: {batch_loss}")

In [None]:
# Ten gradient-descent steps on the same fixed batch, reporting predictions
# and loss after each step.
for i in range(10):
    grad = net.numerical_gradient(x, t)
    for key, step in grad.items():
        net.params[key] -= 0.01 * step
    y = net.predict(x)
    predicted = np.argmax(y, axis=1)
    batch_loss = net.loss(x, t)
    print(f"epoch: {i + 1}")
    print(f"y: {predicted}")
    print(f"loss: {batch_loss}")

# mini-batch

In [None]:
# Start from a fresh network so this run does not inherit earlier updates.
net = TwoLayerNet(784, 100, 10)

train_loss_list = []

# Hyperparameters
iters_num = 10
train_size = x_train.shape[0]
batch_size = 10
learning_rate = 0.1

for i in range(iters_num):
    # Draw a random mini-batch of row indices.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch, t_batch = x_train[batch_mask], t_train[batch_mask]

    # Gradient of the loss on this mini-batch.
    grad = net.numerical_gradient(x_batch, t_batch)

    # Plain gradient-descent update, applied in place to every parameter.
    for key, step in grad.items():
        net.params[key] -= learning_rate * step

    # Record the post-update loss on the same mini-batch.
    loss = net.loss(x_batch, t_batch)
    train_loss_list.append(loss)

In [None]:
import matplotlib.pyplot as plt

# Loss per iteration, drawn on the current axes.
ax = plt.gca()
iterations = range(len(train_loss_list))
ax.plot(iterations, train_loss_list)
ax.axis('on')
ax.grid()
ax.set_xlabel('iteration')
ax.set_ylabel('loss')

# epoch

In [None]:
# Code example only: with numerical differentiation this loop is far too slow
# to actually run on a CPU.
net = TwoLayerNet(784, 100, 10)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Iterations that make up one epoch. Kept as an integer (and at least 1) so
# the modulo test below is exact and never divides by zero.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradient.
    grad = net.numerical_gradient(x_batch, t_batch)

    # Update the parameters in place.
    for key in ['W1', 'b1', 'W2', 'b2']:
        net.params[key] -= learning_rate * grad[key]

    # Record the learning progress.
    loss = net.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Evaluate accuracy once per epoch. The test must be a modulo: a division
    # compared with 0 is true only for i == 0. As written this fires on
    # iteration 0 and then every iter_per_epoch iterations; use
    # (i + 1) % iter_per_epoch == 0 to report only after each completed epoch.
    if i % iter_per_epoch == 0:
        train_acc = net.accuracy(x_train, t_train)
        # test_acc = net.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        # test_acc_list.append(test_acc)
        print(f"train acc | {train_acc}")  # f-string instead of the book's string concatenation

