In [1]:
import torch
import numpy as np

## 用numpy实现一个两层的网络，没有bias。从x来预测y，使用L2 loss
- $h = XW_1$
- $a = \max(0, h)$
- $\hat{y} = aW_2$


流程：先做前向传播得到预测值，然后计算loss，最后反向传播（backward）计算梯度并更新权重。

In [2]:
# Problem dimensions: batch size, input features, hidden units, output features.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random training data drawn from a standard normal distribution:
# inputs of shape (N, D_in) and targets of shape (N, D_out).
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)


In [3]:
# Step size used by the gradient-descent update in the training loop.
lr = 1e-6

# Weight matrices (the network has no bias terms), drawn from a standard
# normal distribution: w1 has shape (D_in, H), w2 has shape (H, D_out).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [4]:
for it in range(500):
    # Every iteration follows the same four steps.

    # 1. Forward pass.
    h = np.dot(x, w1)                    # (N, D_in) x (D_in, H)  -> (N, H)
    h_relu = np.maximum(h, 0)            # ReLU, still (N, H)
    y_pred = np.dot(h_relu, w2)          # (N, H) x (H, D_out)    -> (N, D_out)

    # 2. Sum-of-squares (L2) loss.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(it,"--->",loss)

    # 3. Backward pass: apply the chain rule to the loss by hand.
    grad_y_pred = 2 * residual                    # (N, D_out)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)       # (H, N) x (N, D_out) -> (H, D_out)
    grad_h_relu = np.dot(grad_y_pred, w2.T)       # (N, D_out) x (D_out, H) -> (N, H)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0                             # no gradient flows where ReLU was inactive
    grad_w1 = np.dot(x.T, grad_h)                 # (D_in, N) x (N, H) -> (D_in, H)
    # grad_w1 and grad_w2 have the same shapes as w1 and w2, as the update requires.

    # 4. Gradient-descent update, in place.
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2

0 ---> 35081789.60869064
1 ---> 31900537.363270983
2 ---> 30725287.38552878
3 ---> 27070431.01340557
4 ---> 20332977.82726998
5 ---> 12978208.808338672
6 ---> 7445682.859886989
7 ---> 4187988.2303313767
8 ---> 2492336.057964194
9 ---> 1629893.0580227645
10 ---> 1170579.4909543488
11 ---> 902501.3830266816
12 ---> 729108.3142524718
13 ---> 606200.3587908796
14 ---> 513192.9421440406
15 ---> 439616.9864155716
16 ---> 379880.1228071255
17 ---> 330453.3233080488
18 ---> 289121.4394092814
19 ---> 254131.0250379148
20 ---> 224329.16354182118
21 ---> 198766.2166929894
22 ---> 176692.0213337251
23 ---> 157575.01895759022
24 ---> 140926.05413218215
25 ---> 126352.66790089497
26 ---> 113562.96075790326
27 ---> 102291.88310178273
28 ---> 92332.60803599571
29 ---> 83496.12934797806
30 ---> 75633.66863094896
31 ---> 68624.29447342032
32 ---> 62364.20042792224
33 ---> 56757.417441834856
34 ---> 51719.77323561566
35 ---> 47187.468782880766
36 ---> 43100.56220524545
37 ---> 39411.80280062013
38 ---> 3

488 ---> 2.2265777374084895e-05
489 ---> 2.1393008964009998e-05
490 ---> 2.0554300281141732e-05
491 ---> 1.974848434341257e-05
492 ---> 1.897430264573235e-05
493 ---> 1.8230470308821842e-05
494 ---> 1.7515758684041617e-05
495 ---> 1.6829153721380064e-05
496 ---> 1.6169563404090878e-05
497 ---> 1.5535758595776984e-05
498 ---> 1.492676564577767e-05
499 ---> 1.434173237343121e-05


In [5]:
# Recompute the predictions from the current weights: ReLU(x . w1) . w2.
hidden = np.maximum(np.dot(x, w1), 0)
y_pred = np.dot(hidden, w2)

In [6]:
# Show the first ten rows of the targets.
print(y[0:10])

[[-4.40570645e-01  6.44646288e-01 -1.07580122e+00  6.03581792e-01
  -2.09709896e+00 -7.81883079e-01 -7.35352720e-01  7.58331762e-01
   4.07231258e-01  8.62610226e-01]
 [ 8.25440406e-01  5.38539513e-01  1.23103652e-01 -1.81620224e+00
  -7.63996576e-01  1.28739051e+00  8.92868939e-01  1.44055673e+00
  -1.40510423e+00  5.65986735e-01]
 [-1.44349809e+00  9.40705297e-01 -9.03139014e-01  3.35158334e+00
   1.71911416e+00 -8.60027108e-01  6.07478614e-01  1.05520644e+00
   1.45783710e+00 -1.27593600e+00]
 [ 1.22976774e+00 -1.00523943e+00 -1.26980912e+00 -6.83450151e-01
  -6.96551827e-01 -1.92074577e+00  7.44779531e-01 -1.05506130e+00
   8.90919191e-01 -5.20885993e-01]
 [ 9.30809916e-01 -8.22464326e-01 -7.06800857e-01 -5.00898845e-01
   3.19758969e-01 -6.61991186e-01 -9.22690832e-01 -5.67580802e-01
  -7.51288648e-01  2.75876247e-02]
 [ 1.39535766e+00  6.64403419e-02  2.40984115e+00  1.85297994e+00
   3.10306422e-01 -4.36375686e-01 -5.21450754e-01 -9.85722638e-01
  -1.43810601e+00 -7.12178579e-01

## 再使用PyTorch创建前向神经网络，计算损失函数，反向传播，我们一步步地使用更高级的接口

In [7]:
# The same two-layer network, now built from torch Tensors with a
# hand-written backward pass.

N, D_in, D_out, H = 64, 1000, 10, 100

# Random inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Randomly initialised weights (no bias terms).
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

lr = 1e-6

for it in range(500):

    # 1. Forward pass.
    h = torch.mm(x, w1)
    # With only `min` given, clamp raises values below min to min and
    # leaves everything else untouched, i.e. it acts as ReLU here.
    h_relu = h.clamp(min=0)
    y_pred = torch.mm(h_relu, w2)

    # 2. Sum-of-squares loss; .item() converts the tensor to a Python number.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(it,"->",loss)

    # 3. Gradients, derived by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0   # no gradient flows where ReLU was inactive
    grad_w1 = torch.mm(x.t(), grad_h)

    # 4. Gradient-descent update, in place.
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2

0 -> 39422268.0
1 -> 40456576.0
2 -> 42798956.0
3 -> 38033492.0
4 -> 26073248.0
5 -> 13734712.0
6 -> 6463477.0
7 -> 3254962.75
8 -> 1966052.75
9 -> 1395930.375
10 -> 1091178.375
11 -> 893943.75
12 -> 749648.1875
13 -> 636627.5625
14 -> 545012.6875
15 -> 469450.6875
16 -> 406528.1875
17 -> 353731.59375
18 -> 309065.96875
19 -> 271124.96875
20 -> 238710.578125
21 -> 210849.96875
22 -> 186791.21875
23 -> 165954.21875
24 -> 147823.65625
25 -> 132008.203125
26 -> 118161.53125
27 -> 106001.90625
28 -> 95300.546875
29 -> 85846.1328125
30 -> 77476.875
31 -> 70043.6484375
32 -> 63427.734375
33 -> 57530.6484375
34 -> 52265.58984375
35 -> 47549.125
36 -> 43317.984375
37 -> 39518.94921875
38 -> 36103.01953125
39 -> 33024.4609375
40 -> 30244.541015625
41 -> 27729.765625
42 -> 25451.068359375
43 -> 23386.6171875
44 -> 21513.251953125
45 -> 19809.833984375
46 -> 18258.48828125
47 -> 16842.890625
48 -> 15551.0615234375
49 -> 14370.265625
50 -> 13289.65625
51 -> 12300.197265625
52 -> 11393.0576171875
5

466 -> 0.00020376278553158045
467 -> 0.00020036417117808014
468 -> 0.00019657982920762151
469 -> 0.000192278967006132
470 -> 0.00018840709526557475
471 -> 0.0001862144999904558
472 -> 0.00018220054334960878
473 -> 0.0001788745285011828
474 -> 0.00017586475587449968
475 -> 0.00017190580547321588
476 -> 0.0001691650104476139
477 -> 0.00016578177746850997
478 -> 0.0001630854676477611
479 -> 0.00016043476352933794
480 -> 0.00015744850679766387
481 -> 0.00015507935313507915
482 -> 0.00015171809354797006
483 -> 0.0001498752972111106
484 -> 0.00014711846597492695
485 -> 0.00014445921988226473
486 -> 0.0001413628488080576
487 -> 0.00013949208369012922
488 -> 0.0001370643440168351
489 -> 0.00013455373118631542
490 -> 0.00013286458852235228
491 -> 0.00013068129192106426
492 -> 0.00012812501518055797
493 -> 0.0001260493154404685
494 -> 0.0001239393895957619
495 -> 0.00012267612328287214
496 -> 0.00012053517275489867
497 -> 0.00011873502808157355
498 -> 0.0001166041984106414
499 -> 0.0001149506861

## 使用autograd自动求梯度：只要定义好forward和loss，并在需要求梯度的Tensor上设置requires_grad=True，就可以用autograd自动求梯度（注意梯度是累加的，每次用完之后记得清零）

In [8]:
N, D_in, D_out, H = 64, 1000, 10, 100

# Random inputs and targets (call .cuda() on them to move them to a GPU).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Weights are created with requires_grad=True so autograd tracks them.
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

lr = 1e-6

for it in range(500):
    # Forward pass.
    hidden = torch.mm(x, w1).clamp(min=0)
    y_pred = torch.mm(hidden, w2)

    # The loss stays a tensor (no .item()) so backward() can be called on it.
    loss = (y_pred - y).pow(2).sum()
    print(it,"-->",loss.item())

    # Autograd fills w1.grad and w2.grad.
    loss.backward()

    # Tensor operations are normally recorded in the computation graph;
    # no_grad keeps the weight update itself out of it.
    with torch.no_grad():
        for w in (w1, w2):
            w -= lr * w.grad
            # Gradients accumulate across backward() calls, so reset them.
            w.grad.zero_()

0 --> 29516510.0
1 --> 24400676.0
2 --> 22556270.0
3 --> 20863988.0
4 --> 18003840.0
5 --> 14061174.0
6 --> 9967725.0
7 --> 6575863.0
8 --> 4201676.0
9 --> 2696023.0
10 --> 1790535.5
11 --> 1249827.375
12 --> 920722.125
13 --> 711837.25
14 --> 572089.375
15 --> 473232.71875
16 --> 399817.65625
17 --> 342893.75
18 --> 297244.0625
19 --> 259678.0625
20 --> 228229.609375
21 --> 201566.84375
22 --> 178750.375
23 --> 159063.984375
24 --> 141977.921875
25 --> 127071.9375
26 --> 114028.3515625
27 --> 102596.0390625
28 --> 92524.3203125
29 --> 83622.0
30 --> 75735.9453125
31 --> 68725.234375
32 --> 62470.09375
33 --> 56875.11328125
34 --> 51861.57421875
35 --> 47361.609375
36 --> 43321.3359375
37 --> 39678.75
38 --> 36389.32421875
39 --> 33416.19140625
40 --> 30721.68359375
41 --> 28277.939453125
42 --> 26058.078125
43 --> 24037.078125
44 --> 22195.7734375
45 --> 20516.26171875
46 --> 18982.4296875
47 --> 17579.845703125
48 --> 16295.2587890625
49 --> 15118.140625
50 --> 14038.8779296875
51 --

## 使用nn这个库来构建网络，可以构建一些简单的模型


In [9]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two bias-free linear layers with a ReLU in between.
model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),
)

# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = nn.MSELoss(reduction='sum')

# Initialisation affects how well training goes; re-draw both weight
# matrices from a standard normal distribution.
for layer in (model[0], model[2]):
    nn.init.normal_(layer.weight)

lr = 1e-6

for it in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(it,loss.item())

    # Backward pass.
    loss.backward()

    # Manual gradient-descent step, kept out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param -= lr * param.grad

    # Clear the gradients so they do not accumulate into the next iteration.
    model.zero_grad()

0 25143940.0
1 20617752.0
2 18912184.0
3 17639166.0
4 15721456.0
5 12943376.0
6 9787550.0
7 6891413.0
8 4646141.5
9 3089145.5
10 2081190.25
11 1447484.625
12 1050480.375
13 796638.6875
14 628798.5625
15 512978.8125
16 429170.65625
17 365818.875
18 316126.03125
19 275975.625
20 242724.15625
21 214712.46875
22 190798.203125
23 170214.296875
24 152322.21875
25 136696.578125
26 122976.609375
27 110899.3515625
28 100215.3125
29 90735.5390625
30 82297.265625
31 74767.9296875
32 68024.2890625
33 61976.67578125
34 56543.8046875
35 51649.12890625
36 47234.28125
37 43244.703125
38 39633.6015625
39 36362.81640625
40 33395.6953125
41 30698.296875
42 28245.34375
43 26010.390625
44 23972.771484375
45 22112.701171875
46 20412.41015625
47 18856.04296875
48 17430.677734375
49 16123.66796875
50 14924.2529296875
51 13822.2626953125
52 12809.70703125
53 11877.794921875
54 11020.2880859375
55 10231.408203125
56 9504.09765625
57 8833.0986328125
58 8213.4384765625
59 7641.0625
60 7111.81982421875
61 6622.362

## 使用optim，更加无脑

In [10]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),    # first linear layer, created without a bias term
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),   # second linear layer, also without bias
)

# Re-draw both weight matrices from a standard normal distribution.
for layer in (model[0], model[2]):
    nn.init.normal_(layer.weight)

loss_fn = nn.MSELoss(reduction='sum')

# The optimizer takes over the weight update that was previously written by hand.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)

for it in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(it,loss.item())

    # Clear the old gradients first, then backpropagate.
    optimizer.zero_grad()
    loss.backward()

    # Update the weights.
    optimizer.step()

0 32256864.0
1 28746376.0
2 28823990.0
3 27732028.0
4 23553652.0
5 16781176.0
6 10398491.0
7 5899673.0
8 3356058.0
9 2027321.0
10 1344639.375
11 974283.375
12 755888.5
13 613671.9375
14 512571.03125
15 435661.65625
16 374630.75
17 324813.4375
18 283479.5
19 248698.609375
20 219174.25
21 193919.0625
22 172201.109375
23 153430.453125
24 137127.84375
25 122910.921875
26 110471.7578125
27 99547.5
28 89935.4453125
29 81417.265625
30 73864.09375
31 67153.8203125
32 61171.90625
33 55826.2109375
34 51036.703125
35 46736.8046875
36 42874.7265625
37 39390.34765625
38 36241.7265625
39 33392.265625
40 30807.64453125
41 28459.17578125
42 26321.1953125
43 24373.08203125
44 22593.32421875
45 20966.015625
46 19476.1171875
47 18109.787109375
48 16855.7734375
49 15703.357421875
50 14643.560546875
51 13667.564453125
52 12767.6279296875
53 11936.4453125
54 11168.408203125
55 10457.56640625
56 9799.1865234375
57 9188.9091796875
58 8623.107421875
59 8097.6943359375
60 7609.2041015625
61 7154.83642578125
62 

465 0.002141070319339633
466 0.002080892911180854
467 0.002022720407694578
468 0.0019663602579385042
469 0.0019104036036878824
470 0.0018586008809506893
471 0.0018071301747113466
472 0.0017597249243408442
473 0.0017106386367231607
474 0.0016661023255437613
475 0.0016199421370401978
476 0.0015792213380336761
477 0.0015387344174087048
478 0.0014975122176110744
479 0.0014580360148102045
480 0.0014185578329488635
481 0.0013832737458869815
482 0.0013458880130201578
483 0.0013116782065480947
484 0.001277724513784051
485 0.0012453703675419092
486 0.0012146866647526622
487 0.001182680600322783
488 0.0011521107517182827
489 0.0011249820236116648
490 0.0010955242905765772
491 0.0010701753199100494
492 0.0010431609116494656
493 0.0010186450090259314
494 0.0009925717022269964
495 0.0009685553377494216
496 0.0009445027681067586
497 0.0009225892135873437
498 0.0009006670443341136
499 0.0008791736327111721


## 自定义模型：继承nn.Module

In [11]:
import torch.nn as nn

# Problem dimensions: batch size, input features, hidden units, output features.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random training data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

class TwoLayer(nn.Module):
    """Two-layer network without bias terms: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, D_out, H):
        """Create the layers.

        Args:
            D_in: number of input features.
            D_out: number of output features.
            H: number of hidden units.
        """
        super().__init__()
        self.l1 = nn.Linear(D_in, H, bias=False)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        """Return the prediction l2(relu(l1(x))) for input x."""
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)
    
# Build the custom module. No init.normal_ call is made here, so the
# weights keep whatever initialisation nn.Linear applies by default.
model = TwoLayer(D_in, D_out, H)

# The one step size for this cell. It is passed to the optimizer below, so
# changing it here really changes training (it used to be shadowed by a
# separate literal and had no effect).
learningrate = 1e-4
loss_fn = torch.nn.MSELoss(reduction='sum')

# Adam is the optimizer in use. SGD alternative, kept for reference:
# optimizer = torch.optim.SGD(model.parameters(), lr=learningrate)
optimizer = torch.optim.Adam(model.parameters(), lr=learningrate)

for it in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(it ,loss.item())

    # Clear the old gradients, then backpropagate.
    optimizer.zero_grad()
    loss.backward()

    # Update the weights.
    optimizer.step()

0 734.16357421875
1 716.1856079101562
2 698.7529296875
3 681.7464599609375
4 665.2605590820312
5 649.2026977539062
6 633.6846923828125
7 618.6154174804688
8 603.9371337890625
9 589.6676025390625
10 575.8157958984375
11 562.3776245117188
12 549.414306640625
13 536.7715454101562
14 524.4758911132812
15 512.5078125
16 500.7890625
17 489.3973083496094
18 478.3597717285156
19 467.605224609375
20 457.0960998535156
21 446.87945556640625
22 436.9722900390625
23 427.34210205078125
24 417.9286804199219
25 408.7381591796875
26 399.7740783691406
27 391.0140380859375
28 382.4493103027344
29 374.0716857910156
30 365.8894958496094
31 357.91424560546875
32 350.1434020996094
33 342.5483703613281
34 335.1246643066406
35 327.83599853515625
36 320.6709289550781
37 313.6605529785156
38 306.7863464355469
39 300.08013916015625
40 293.5085144042969
41 287.0650634765625
42 280.7584228515625
43 274.574462890625
44 268.5035705566406
45 262.53533935546875
46 256.668212890625
47 250.91307067871094
48 245.272598266

443 6.926930836925749e-06
444 6.529506663355278e-06
445 6.154690709081478e-06
446 5.798448455607286e-06
447 5.463874913402833e-06
448 5.14804605700192e-06
449 4.849351626035059e-06
450 4.56770521850558e-06
451 4.301757144276053e-06
452 4.050994448334677e-06
453 3.813770263150218e-06
454 3.5913014926336473e-06
455 3.379920599400066e-06
456 3.1810509426577482e-06
457 2.9935367820144165e-06
458 2.8174522412882652e-06
459 2.6505792902753456e-06
460 2.4939402010204503e-06
461 2.3457723727915436e-06
462 2.206212002420216e-06
463 2.0749521354446188e-06
464 1.950969817698933e-06
465 1.8344293266636669e-06
466 1.7240982970179175e-06
467 1.6209951354539953e-06
468 1.5232009218379972e-06
469 1.4309099469755893e-06
470 1.3446618822854361e-06
471 1.2634351378437714e-06
472 1.186710619549558e-06
473 1.114602696361544e-06
474 1.0468353366377414e-06
475 9.826879932006705e-07
476 9.229269721799938e-07
477 8.659824857204512e-07
478 8.129760544761666e-07
479 7.630217737641942e-07
480 7.159716801652394e-0