In [None]:
import torch

# Device handle passed as `device=` wherever tensors are created below: the CPU.
device = torch.device("cpu")

In [None]:
# Two-layer ReLU network trained with hand-written backpropagation.
#
# Seed the RNG first: x, y, w1 and w2 are all drawn with torch.randn, so
# without a fixed seed every run trains a different problem and the printed
# loss log cannot be regenerated. Seeding inside this cell also makes
# re-running the cell on its own start from the same state (on the same
# PyTorch build and hardware).
torch.manual_seed(0)

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Randomly generate the inputs x and the targets y
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Initialise the weights w1 and w2
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

# Step size of the gradient-descent update
learning_rate = 1e-6

# Run 500 gradient-descent steps, each one over the whole batch
for t in range(500):
  # Forward pass: y_pred = relu(x @ w1) @ w2
  h = x.mm(w1)
  h_relu = h.clamp(min=0)
  y_pred = h_relu.mm(w2)

  # Loss: sum of squared errors over every element of the batch
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Backward pass: gradients of the loss w.r.t. w1 and w2, derived by hand
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h_relu = grad_y_pred.mm(w2.t())
  # ReLU blocks the gradient wherever its input was negative; masked_fill
  # returns a new tensor, so grad_h_relu itself is left untouched.
  grad_h = grad_h_relu.masked_fill(h < 0, 0.0)
  grad_w1 = x.t().mm(grad_h)

  # Parameter update (in place)
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

0 36738736.0
1 37226320.0
2 43329408.0
3 45192048.0
4 36402568.0
5 20705342.0
6 9191917.0
7 3916275.0
8 2030506.625
9 1327425.375
10 1006494.0
11 817133.5625
12 683581.6875
13 580513.4375
14 497703.125
15 429833.3125
16 373548.65625
17 326436.84375
18 286729.0
19 253053.28125
20 224295.1875
21 199593.625
22 178303.203125
23 159882.875
24 143830.640625
25 129777.2109375
26 117381.3515625
27 106454.3359375
28 96786.5
29 88215.4140625
30 80591.453125
31 73793.5859375
32 67706.609375
33 62242.51171875
34 57319.21484375
35 52872.08984375
36 48843.125
37 45188.1875
38 41864.5859375
39 38835.93359375
40 36074.25
41 33551.73046875
42 31243.236328125
43 29127.37890625
44 27182.078125
45 25390.296875
46 23737.66796875
47 22212.04296875
48 20801.73046875
49 19493.54296875
50 18280.451171875
51 17155.001953125
52 16109.775390625
53 15137.78515625
54 14232.64453125
55 13389.1923828125
56 12603.8681640625
57 11870.7294921875
58 11186.064453125
59 10546.896484375
60 9948.8095703125
61 9389.33203125
6

In [None]:
import torch

# Device handle passed as `device=` wherever tensors are created below: the CPU.
device = torch.device("cpu")

In [None]:
# Two-layer ReLU network trained with autograd-computed gradients.
#
# Seed the RNG first: x, y, w1 and w2 are all drawn with torch.randn, so
# without a fixed seed every run trains a different problem and the printed
# loss log cannot be regenerated. Seeding inside this cell also makes
# re-running the cell on its own start from the same state (on the same
# PyTorch build and hardware).
torch.manual_seed(0)

# N: batch size
# D_in: input dimension
# H: hidden dimension
# D_out: output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Randomly generate the inputs x and the targets y
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Initialise the weights w1 and w2; requires_grad=True makes autograd track
# every operation on them so that loss.backward() can fill in their .grad
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

# Step size of the gradient-descent update
learning_rate = 1e-6

# Run 500 gradient-descent steps, each one over the whole batch
for t in range(500):
  # Forward pass: y_pred = relu(x @ w1) @ w2
  y_pred = x.mm(w1).clamp(min=0).mm(w2)

  # Loss: sum of squared errors over every element of the batch
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Backward pass: autograd computes d(loss)/d(w1) and d(loss)/d(w2) and
  # accumulates them into w1.grad and w2.grad
  loss.backward()

  # Parameter update: the update arithmetic itself must not be recorded by
  # autograd, hence torch.no_grad()
  with torch.no_grad():
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad

    # backward() adds into .grad rather than overwriting it, so reset both
    # gradients before the next step
    w1.grad.zero_()
    w2.grad.zero_()

0 33000836.0
1 29622686.0
2 28934048.0
3 26602988.0
4 21417406.0
5 14659774.0
6 8915223.0
7 5138581.5
8 3040978.5
9 1936605.25
10 1348920.75
11 1015139.125
12 807804.9375
13 666373.0
14 562333.875
15 481444.21875
16 416292.375
17 362537.15625
18 317527.21875
19 279460.71875
20 247034.8125
21 219168.4375
22 195116.78125
23 174280.78125
24 156113.03125
25 140228.671875
26 126277.875
27 113987.2578125
28 103126.59375
29 93506.765625
30 84955.6953125
31 77332.1484375
32 70516.921875
33 64408.08203125
34 58918.31640625
35 53980.37109375
36 49527.83984375
37 45504.08203125
38 41859.9375
39 38555.6953125
40 35552.8125
41 32818.5
42 30326.845703125
43 28054.037109375
44 25975.728515625
45 24072.8359375
46 22334.271484375
47 20740.904296875
48 19276.359375
49 17929.5390625
50 16689.9609375
51 15546.8525390625
52 14492.513671875
53 13518.7763671875
54 12618.478515625
55 11784.923828125
56 11013.056640625
57 10298.1875
58 9634.7353515625
59 9019.044921875
60 8448.25390625
61 7917.9853515625
62 74