# 几种实现反向传播的方法。
定义函数为 y = w2(relu(w1*x))  
定义损失函数为MSE

# 使用numpy实现bp算法

In [22]:
import numpy as np

In [23]:
# 定义batch_size为N，输入维度D_in,隐藏层维度H，输出维度D_out
N, D_in, H, D_out = 64, 1000, 100, 10

In [24]:
# 穿件随机输入和随机输出
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [25]:
# 随机初始化w1, w2
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [26]:
# 定义学习率
learning_rate = 1e-6

for t in range(50):
    
    h = x.dot(w1)
    h_relu = np.maximum(0,h)
    y_pred = h_relu.dot(w2)
    
    loss = np.square(y_pred-y).sum()
    
    print(loss)
    
    grad_y_pred = 2.0*(y_pred-y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # 更新梯度
    w1 = w1 - learning_rate*grad_w1
    w2 = w2 - learning_rate*grad_w2
    
    
    

29137396.07736845
28460708.009038877
29770181.943714008
28721528.031357482
23460233.758789394
15776357.434331018
9093584.37263605
4923889.67288959
2776072.4402635517
1739018.570503272
1224035.2491683057
943010.4965224364
769443.5836570002
649052.5496991801
558109.023374166
485572.14213041466
425828.7368483133
375635.7321650436
332917.07277771685
296248.35631350393
264757.8525684091
237359.0596224625
213408.78843776474
192413.02161619425
173923.82143803957
157582.48563384853
143091.83853786503
130203.3417063088
118698.43462595079
108403.16339571904
99164.57587213506
90860.8251040409
83374.71845300894
76617.39120252838
70510.27923313499
64974.692854958485
59950.73954390363
55383.839333425974
51222.46979528919
47426.0889724246
43955.324060709034
40777.509364980164
37864.743949306314
35192.94179413024
32738.627445542472
30488.554280577115
28417.916528066038
26509.122586211244
24747.04096173543
23119.798490388894


# 使用pytorch实现bp算法

In [42]:
import torch

In [43]:
x = torch.randn(N, D_in, dtype=torch.float)
y = torch.randn(N, D_out, dtype=torch.float)

In [44]:
w1 = torch.randn(D_in, H, dtype=torch.float)
w2 = torch.randn(H, D_out, dtype=torch.float)

In [45]:
learning_rate = 1e-6

In [47]:
for t in range(50):
    
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    loss = (y_pred-y).pow(2).sum()
    print(loss.item())
    
    # 反向传播算法
    grad_y_pred = 2 * (y_pred-y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h<0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    # 更新权重
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

21579106.0
16777476.0
14706097.0
13457490.0
12234658.0
10676590.0
8864967.0
6965629.5
5249217.0
3823383.5
2744703.5
1959924.125
1411418.0
1030913.375
769162.0625
587146.625
459244.3125
367420.5625
300155.5
249721.703125
211064.984375
180732.828125
156430.828125
136586.109375
120156.4609375
106373.6328125
94638.4375
84548.484375
75808.546875
68179.703125
61485.3671875
55578.0625
50345.10546875
45696.50390625
41552.44140625
37847.13671875
34527.125
31544.935546875
28859.8515625
26437.091796875
24248.5859375
22268.32421875
20474.65625
18844.515625
17362.14453125
16012.953125
14784.2041015625
13663.50390625
12638.5302734375
11700.40234375


# Autograd

In [53]:
import torch

dtype = torch.float
device = torch.device("cpu")

In [54]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [55]:
x = torch.randn(N, D_in, dtype=dtype)
y = torch.randn(N, D_out, dtype=dtype)

In [56]:
w1 = torch.randn(D_in, H, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, dtype=dtype, requires_grad=True)


In [57]:
learning_rate = 1e-6

In [58]:
for t in range(500):
    
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    loss = (y_pred-y).pow(2).sum()
    if t%99 == 0:
        print(t, loss.item())
        
    loss.backward()
    
    with torch.no_grad():
        
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        w1.grad.zero_()
        w2.grad.zero_()
        

0 46844300.0
99 847.3705444335938
198 7.620578289031982
297 0.09520117193460464
396 0.0016607196303084493
495 0.00014125907910056412


# 定义一个新的autograd函数

In [63]:
import torch

class MyReLU(torch.autograd.Function):
    
    @staticmethod
    def forward(ctx, input):
        
        ctx.save_for_backward(input)
        return input.clamp(min=0)
    
    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

In [64]:
dtype = torch.float
device = torch.device("cpu")

In [65]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [66]:
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [67]:
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [69]:
learning_rate = 1e-6

for t in range(500):
    
    relu = MyReLU.apply
    
    y_pred = relu(x.mm(w1)).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    if t%99 == 0:
        print(t, loss.item())
        
    loss.backward()
    
    with torch.no_grad():
        
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        w1.grad.zero_()
        w2.grad.zero_()
    

0 39793356.0
99 1192.75830078125
198 12.422260284423828
297 0.22724980115890503
396 0.005282993894070387
495 0.00035157607635483146


# nn module

In [73]:
import torch

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

loss_fn = torch.nn.MSELoss(reduction="sum")

In [76]:
learning_rate = 1e-4

for t in range(500):
    
    y_pred = model(x)
    
    loss = loss_fn(y_pred, y)
    
    if t % 100 == 99:
        print(loss.item())
    
    model.zero_grad()
    
    loss.backward()
    
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

3.5723540782928467
0.06650062650442123
0.0024717615451663733
0.00012031671212753281
6.711089554300997e-06


In [78]:
print(model.parameters())

<generator object Module.parameters at 0x000001AC6C9ECEB8>


# Pytorch:optim

In [79]:
import torch

N, D_in, H, D_out = 64, 1000, 100, 10

In [80]:
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

In [81]:
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

In [82]:
loss_fn = torch.nn.MSELoss(reduction="sum")

In [84]:
learning_rate = 1e-4

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [85]:
for t in range(500):
    y_pred = model(x)
    
    loss = loss_fn(y_pred, y)
    
    if t % 100 == 99:
        print(t, loss.item())
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


99 41.646183013916016
199 0.45663875341415405
299 0.0014213592512533069
399 2.8552392450365005e-06
499 3.609547993832507e-09


# Pytorch:Custom nn Modules

In [86]:
import torch

class TwoLayerNet(torch.nn.Module):
    
    def __init__(self, D_in, H, D_out):
        
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
    
    def forward(self, x):
        
        h = self.linear1(x)
        h_relu = torch.nn.functional.relu(h)
        y_pred = self.linear2(h)
        
        return y_pred
        

In [88]:
learning_rate = 1e-4
model = TwoLayerNet(D_in, H, D_out)
criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [89]:
for t in range(500):
    
    y_pred = model(x)
    
    loss = criterion(y_pred, y)
    
    if t % 100 == 99:
        print(t, loss.item())
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 0.09466313570737839
199 1.1030675523215905e-05
299 6.129178276026437e-10
399 2.0362100894288915e-11
499 7.452397206286765e-12
