In [1]:
import numpy as np

# 梯度

In [2]:
def numerical_gradient(f, x):
    h = 1e-4
    grad = np.zeros_like(x)  # 生成和 x 形状相同的数组

    for idx in range(x.size):
        tmp_val = x[idx]

        x[idx] = tmp_val + h
        fxh1 = f(x)

        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2 * h)

        x[idx] = tmp_val
    
    return grad

In [3]:
def function_2(x):
    return x[0] ** 2 + x[1] ** 2

print(numerical_gradient(function_2, np.array([3.0, 4.0])))
print(numerical_gradient(function_2, np.array([0.0, 2.0])))
print(numerical_gradient(function_2, np.array([3.0, 0.0])))

[6. 8.]
[0. 4.]
[6. 0.]


## 梯度法

In [4]:
def gradient_descend(f, init_x, lr=0.01, step_num=100):
    x = init_x
    
    for i in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad
        # print(f"step: {i}, x = {x}")
    return x

In [5]:
init_x = np.array([-3.0, 4.0])
gradient_descend(function_2, init_x)
# function_2(gradient_descend(function_2, np.array([1., 2.])))

array([-0.39785867,  0.53047822])

In [6]:
# 学习率过大的例子：lr=10
init_x = np.array([-3.0, 4.0])

gradient_descend(function_2, init_x, lr=10.0)

array([-2.58983747e+13, -1.29524862e+12])

In [7]:
print(init_x)

[-2.58983747e+13 -1.29524862e+12]


In [8]:
# 学习率过小的例子：lr=1e-10
init_x = np.array([-3.0, 4.0])
gradient_descend(function_2, init_x, lr=1e-10)

array([-2.99999994,  3.99999992])

In [9]:
def softmax(a):
    c = np.max(a)
    return np.exp(a - c) / np.sum(np.exp(a - c))

def cross_entropy_error(y, t):
    return -np.sum(y * np.log(t + 1e-6))

class simpleNet:
    def __init__(self) -> None:
        self.W = np.random.randn(2, 3)
    
    def predict(self, x):
        return np.dot(x, self.W)
    
    def loss(self, x, t):
        z = self.predict(x)
        y = softmax(z)
        loss = cross_entropy_error(y, t)
        return loss

In [10]:
net = simpleNet()
print(net.W)

[[-1.06577896  0.3928238   0.47935903]
 [ 0.3361168   0.07822983  0.5225732 ]]


In [11]:
x = np.array([0.6, 0.9])
p = net.predict(x)
print(p)

[-0.33696226  0.30610112  0.75793129]


In [12]:
np.argmax(p)

t = np.array([0, 0, 1])
print(net.loss(x, t))

6.806251848789094


In [None]:
# def f(W):
#     return net.loss(x, t)
f = lambda w: net.loss(x, t)

# 之前的数值梯度不支持多维数组，这里重新实现
def numerical_gradient(f, x):
    h = 1e-4
    grad = np.zeros_like(x)  # 生成和 x 形状相同的数组

    # 创建一个迭代器，能遍历 x 的每个元素，并记录其多为坐标
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])  # type: ignore

    while not it.finished:
        # 获取当前元素的多维索引
        idx = it.multi_index

        tmp_val = x[idx]

        x[idx] = tmp_val + h
        fxh1 = f(x)

        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2 * h)

        x[idx] = tmp_val

        it.iternext()
    
    return grad

In [None]:
dW = numerical_gradient(f, net.W)
print(dW)
print(net.loss(x, t))

[[ 0.71393168  1.35794886 -2.07188054]
 [ 1.07089752  2.03692329 -3.10782082]]
6.806394629466937


In [53]:
gradient_descend(f, net.W)
print(net.loss(x, t))

0.024158202298158123
