### pytorch核心两个主要特征
* 一个n维张量，类似于numpy，可以在GPU上运行
* 搭建和训练神经网络时的自动微分/求导机制

In [1]:
# 使用numpy实现网络
import numpy as np

# Problem sizes: N is the batch size, D_in the input dimension,
# H the hidden dimension and D_out the output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target data.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random initial weights for the two layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear yields the prediction.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss, printed at every step.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 48430887.60332304
1 53003082.68437226
2 50797687.812461056
3 35595150.2540987
4 17436979.06238401
5 7182537.435736881
6 3406036.600716468
7 2117041.2776306868
8 1573388.32168737
9 1263093.2103167141
10 1046002.3700033883
11 879207.2835343934
12 746629.1649076543
13 639067.795847856
14 550712.9540585221
15 477447.5297851258
16 416246.7520540961
17 364713.24402365554
18 320997.62171807105
19 283686.43652448454
20 251662.8926709124
21 224016.2336355261
22 200033.69944236596
23 179183.33680739292
24 160954.21579618717
25 144952.3200608657
26 130860.40521343824
27 118403.08866670117
28 107351.26303617409
29 97521.80358046686
30 88760.3090513252
31 80924.97818008739
32 73901.67410600944
33 67592.31138566544
34 61909.69300251823
35 56782.10238178429
36 52146.02737639065
37 47947.35471043534
38 44137.753900156466
39 40684.60836033322
40 37542.07456589822
41 34677.921093056284
42 32064.273457323427
43 29675.11952547681
44 27488.07864811567
45 25484.440306537603
46 23646.1282783082
47 21958.27

498 8.78487910249729e-06
499 8.404388693684037e-06


In [1]:
# pytorch: 张量
import torch

# Tensor settings shared by every tensor below.
dtype = torch.float
device = torch.device("cpu")

# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights for the two layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear yields the prediction.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss as a plain Python number, printed at every step.
    loss = torch.sum((y_pred - y) ** 2).item()
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0.0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 22961780.0
1 15306415.0
2 11052506.0
3 8282089.0
4 6303998.0
5 4830122.5
6 3716358.5
7 2871917.5
8 2234772.5
9 1754305.875
10 1391289.25
11 1116325.75
12 906378.4375
13 744400.75
14 618264.75
15 519005.75
16 439972.21875
17 376289.8125
18 324328.5625
19 281488.28125
20 245848.640625
21 215902.0625
22 190539.59375
23 168883.921875
24 150278.8125
25 134195.65625
26 120217.125
27 108010.265625
28 97299.21875
29 87864.0859375
30 79522.6875
31 72125.1640625
32 65544.1171875
33 59672.0546875
34 54424.265625
35 49724.6015625
36 45502.8359375
37 41701.1640625
38 38269.76953125
39 35166.15625
40 32353.359375
41 29801.677734375
42 27484.69140625
43 25375.1953125
44 23453.705078125
45 21701.34375
46 20099.787109375
47 18633.04296875
48 17288.34375
49 16055.951171875
50 14923.8515625
51 13882.7373046875
52 12924.1669921875
53 12043.3544921875
54 11233.953125
55 10486.3876953125
56 9795.0546875
57 9155.34375
58 8562.93359375
59 8014.0615234375
60 7505.05615234375
61 7032.1171875
62 6592.399414062

479 0.00031177978962659836
480 0.00030545308254659176
481 0.0002991395303979516
482 0.00029336425359360874
483 0.00028688955353572965
484 0.0002818231878336519
485 0.0002764143282547593
486 0.0002712804707698524
487 0.0002659138117451221
488 0.0002612559183035046
489 0.0002559831482358277
490 0.0002515024971216917
491 0.000246684328885749
492 0.00024208218383137137
493 0.00023792429419700056
494 0.0002327126421732828
495 0.00022828621149528772
496 0.00022401954629458487
497 0.0002208909427281469
498 0.00021691185247618705
499 0.00021281900990288705


In [2]:
# 自动求导
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on the GPU

# N is the batch size; D_in is the input dimension;
# H is the hidden dimension; D_out is the output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random tensors holding the input and the target output. requires_grad is
# left at its default of False, so no gradients are computed with respect to
# these tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random weight tensors. requires_grad=True asks autograd to compute
# gradients with respect to them during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute the prediction with tensor operations. Because
    # w1 and w2 have requires_grad=True, these operations build a
    # computational graph that allows gradients to be computed
    # automatically. The backward pass is no longer written by hand, so no
    # references to intermediate values are kept.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Compute and print the loss. loss is a tensor of shape (), and
    # loss.item() gives the Python number it holds.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Backward pass via autograd: computes the gradient of loss with respect
    # to every tensor with requires_grad=True. Afterwards w1.grad and w2.grad
    # hold the gradients of the loss with respect to w1 and w2.
    loss.backward()

    # Gradient-descent update. The weights are only modified in place here,
    # and torch.no_grad() keeps the update itself out of the graph.
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Manually reset the gradients to zero after the update.
        w1.grad.zero_()
        w2.grad.zero_()

0 23982104.0
1 18034956.0
2 14810042.0
3 12520715.0
4 10399256.0
5 8406129.0
6 6522384.0
7 4936807.5
8 3649067.75
9 2682286.0
10 1969717.25
11 1463460.25
12 1102766.25
13 847719.4375
14 664637.6875
15 531590.9375
16 432906.875
17 358489.78125
18 301092.5625
19 256016.75
20 219866.921875
21 190399.171875
22 166015.09375
23 145622.59375
24 128361.40625
25 113631.78125
26 100959.828125
27 89981.9140625
28 80419.078125
29 72054.3828125
30 64706.55078125
31 58223.24609375
32 52485.390625
33 47399.6953125
34 42873.140625
35 38840.21484375
36 35240.41796875
37 32020.0
38 29131.701171875
39 26536.05859375
40 24200.99609375
41 22096.255859375
42 20195.021484375
43 18475.81640625
44 16919.537109375
45 15509.0087890625
46 14228.87109375
47 13066.36328125
48 12010.0849609375
49 11048.21875
50 10170.7802734375
51 9370.103515625
52 8638.244140625
53 7969.33349609375
54 7357.10986328125
55 6796.357421875
56 6283.02001953125
57 5812.01171875
58 5379.5625
59 4982.033203125
60 4616.30126953125
61 4279.6

In [3]:
# 定义新的自动求导函数
import torch

class MyRelu(torch.autograd.Function):
    """ReLU with hand-written forward and backward passes.

    A custom autograd function is defined by subclassing
    ``torch.autograd.Function`` and implementing static ``forward`` and
    ``backward`` methods that operate on tensors.
    """

    @staticmethod
    def forward(ctx, x):
        """Return ``x`` with every entry below zero replaced by zero.

        The input is saved on ``ctx`` so that ``backward`` can read it.
        """
        ctx.save_for_backward(x)
        return torch.clamp(x, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        ``grad_output`` is passed through unchanged except where the saved
        input was negative; those entries are set to zero.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)
    
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Random weights; requires_grad=True so autograd tracks them.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass, with the custom MyRelu function as the non-linearity.
    y_pred = torch.mm(MyRelu.apply(torch.mm(x, w1)), w2)

    # Sum-of-squares loss, printed at every step.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Backward pass via autograd fills w1.grad and w2.grad.
    loss.backward()

    # Update the weights in place without recording the update in the graph.
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Reset the gradients to zero for the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

0 31067516.0
1 33150160.0
2 39132752.0
3 41995528.0
4 35597688.0
5 22225624.0
6 10673913.0
7 4702052.5
8 2331657.0
9 1432022.875
10 1042053.875
11 830390.25
12 689735.25
13 584302.6875
14 500220.53125
15 431290.84375
16 373920.625
17 325781.0625
18 285136.4375
19 250552.296875
20 220953.25
21 195487.0625
22 173481.53125
23 154384.15625
24 137753.96875
25 123204.671875
26 110444.171875
27 99217.9375
28 89310.1640625
29 80551.0234375
30 72766.0390625
31 65837.4765625
32 59658.69140625
33 54137.85546875
34 49197.5078125
35 44768.6640625
36 40788.21875
37 37205.265625
38 33974.39453125
39 31058.34765625
40 28420.4453125
41 26032.41015625
42 23868.771484375
43 21901.572265625
44 20112.353515625
45 18484.115234375
46 17000.203125
47 15647.111328125
48 14411.404296875
49 13281.8935546875
50 12248.587890625
51 11303.130859375
52 10436.4931640625
53 9641.595703125
54 8912.1103515625
55 8241.640625
56 7625.38330078125
57 7058.6240234375
58 6536.94775390625
59 6056.48828125
60 5615.05419921875
61

386 0.00021753583860117942
387 0.00021244017989374697
388 0.00020775740267708898
389 0.00020284796482883394
390 0.0001972307509277016
391 0.00019301052088849247
392 0.00018869641644414514
393 0.00018489702779334038
394 0.00018013945373240858
395 0.00017593511438462883
396 0.000172141328221187
397 0.00016821714234538376
398 0.00016452079580631107
399 0.000160682262503542
400 0.00015678633644711226
401 0.00015366057050414383
402 0.0001502768718637526
403 0.000146889899042435
404 0.0001439655461581424
405 0.00014078544336371124
406 0.0001381405454594642
407 0.00013525666145142168
408 0.00013248894538264722
409 0.00013033163850195706
410 0.00012774103379342705
411 0.0001251229114132002
412 0.000122964454931207
413 0.0001205864391522482
414 0.00011822178203146905
415 0.00011598059791140258
416 0.00011368367268005386
417 0.0001117430001613684
418 0.00010930243297480047
419 0.0001072137092705816
420 0.00010523081436986104
421 0.00010296954133082181
422 0.00010127369750989601
423 9.92131244856

计算图和autograd是十分强大的工具，可以定义复杂的操作并自动求导；然而对于大规模的网络，autograd太过于底层。在构建神经网络时，我们经常考虑将计算安排成层，其中一些具有可学习的参数，它们将在学习过程中优化。
在TensorFlow中，Keras、TensorFlow-Slim和TFLearn等包在底层计算图之上提供了更高层次的抽象接口，使得构建网络变得十分方便。
在pytorch中，包`nn`完成了同样的功能。`nn`包中定义了一组大致等价于层的模块。一个模块接受输入的tensor，计算输出的tensor，而且还保存了一些内部状态，比如需要学习的tensor参数等。`nn`包中也定义了一组损失函数（loss functions），用来训练神经网络。

In [6]:
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The model is a sequence of modules applied one after another:
# linear -> ReLU -> linear.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)

# Squared-error loss, summed rather than averaged over the elements.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: calling the model on x produces the prediction.
    y_pred = model(x)

    # Compute and print the loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the old gradients, then backpropagate to compute new ones for
    # every parameter of the model.
    model.zero_grad()
    loss.backward()

    # Gradient-descent step on each parameter, done in place and kept out
    # of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 664.9850463867188
1 613.5818481445312
2 569.7225341796875
3 531.5866088867188
4 498.0000305175781
5 468.1623229980469
6 441.0925598144531
7 416.4062194824219
8 393.54193115234375
9 372.3800964355469
10 352.62335205078125
11 334.0610046386719
12 316.5256652832031
13 299.89697265625
14 284.0648498535156
15 269.1027526855469
16 254.78720092773438
17 241.11595153808594
18 228.1142578125
19 215.7331085205078
20 203.90396118164062
21 192.62290954589844
22 181.86477661132812
23 171.60958862304688
24 161.85037231445312
25 152.55380249023438
26 143.69879150390625
27 135.28929138183594
28 127.28881072998047
29 119.67906951904297
30 112.48051452636719
31 105.65255737304688
32 99.20938873291016
33 93.1256332397461
34 87.3772964477539
35 81.95824432373047
36 76.86432647705078
37 72.07704162597656
38 67.57061767578125
39 63.33732223510742
40 59.376007080078125
41 55.66335678100586
42 52.18947219848633
43 48.934017181396484
44 45.88530349731445
45 43.032752990722656
46 40.36525344848633
47 37.86502

385 2.683288221305702e-05
386 2.5989493224187754e-05
387 2.5173781978082843e-05
388 2.4384700736845843e-05
389 2.3619149942533113e-05
390 2.2881646145833656e-05
391 2.2167194401845336e-05
392 2.1473424567375332e-05
393 2.0803223378607072e-05
394 2.015265818045009e-05
395 1.9524133676895872e-05
396 1.8916176486527547e-05
397 1.832593625294976e-05
398 1.775571217876859e-05
399 1.7204332834808156e-05
400 1.6669053366058506e-05
401 1.6151758245541714e-05
402 1.5650777640985325e-05
403 1.5164581782300957e-05
404 1.469480230298359e-05
405 1.4239501069823746e-05
406 1.3799586668028496e-05
407 1.3373328329180367e-05
408 1.2960555068275426e-05
409 1.256058385479264e-05
410 1.2172779861430172e-05
411 1.1797754268627614e-05
412 1.1434127372922376e-05
413 1.1081188858952373e-05
414 1.0741005098680034e-05
415 1.0410563845653087e-05
416 1.008966955851065e-05
417 9.781159860722255e-06
418 9.482283530815039e-06
419 9.192443030769937e-06
420 8.910729775379878e-06
421 8.640183295938186e-06
422 8.3764389

到目前为止，我们已经通过手动改变包含可学习参数的张量来更新模型的权重。对于随机梯度下降（SGD/stochastic gradient descent）等简单的优化算法来说，这不是一个很大的负担，但在实践中，我们经常使用AdaGrad，RMSProp，Adam等更复杂的优化器训练神经网络。

### 自定义nn模块

In [12]:
import torch

class TwoLayerNet(torch.nn.Module):
    """Fully connected network: linear -> ReLU -> linear."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers and register them as submodules.

        Args:
            D_in: Size of each input sample.
            H: Size of the hidden layer.
            D_out: Size of each output sample.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the prediction for the input tensor ``x``.

        The first linear layer's output is clamped at zero (ReLU) before
        being fed to the second linear layer.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)
    
# Batch size, input dimension, hidden dimension, output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Build the model by instantiating the TwoLayerNet class.
model = TwoLayerNet(D_in, H, D_out)

# Build the optimizer and the loss function. model.parameters(), passed to
# the SGD constructor, supplies the learnable parameters of the two
# nn.Linear modules that are members of the model.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
loss_fn = torch.nn.MSELoss(reduction='sum')

for t in range(500):
    # Forward pass: calling the model on x produces the prediction.
    y_pred = model(x)

    # Compute and print the loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients, backpropagate, then let the optimizer update the
    # weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 641.6544799804688
1 595.427734375
2 555.7215576171875
3 520.7649536132812
4 489.36627197265625
5 460.8077392578125
6 434.94573974609375
7 411.2552185058594
8 389.1241760253906
9 368.4479064941406
10 349.20440673828125
11 331.2275085449219
12 314.2814636230469
13 298.1758728027344
14 282.8568115234375
15 268.177490234375
16 254.1677703857422
17 240.79498291015625
18 228.01988220214844
19 215.87344360351562
20 204.2942352294922
21 193.25442504882812
22 182.74951171875
23 172.70529174804688
24 163.11306762695312
25 153.9816436767578
26 145.3169403076172
27 137.06378173828125
28 129.24407958984375
29 121.8182144165039
30 114.7815933227539
31 108.11707305908203
32 101.82164764404297
33 95.87322998046875
34 90.25837707519531
35 84.95199584960938
36 79.95011138916016
37 75.24560546875
38 70.81583404541016
39 66.65242004394531
40 62.74541473388672
41 59.08271408081055
42 55.6388053894043
43 52.40655517578125
44 49.374481201171875
45 46.518882751464844
46 43.836158752441406
47 41.318813323974

441 1.5843770597712137e-05
442 1.5424624507431872e-05
443 1.5015967619547155e-05
444 1.4618456589232665e-05
445 1.4232248759071808e-05
446 1.3857764315616805e-05
447 1.3491762729245238e-05
448 1.3136835150362458e-05
449 1.279021853406448e-05
450 1.2453956514946185e-05
451 1.2126029105274938e-05
452 1.1808120689238422e-05
453 1.1496797014842741e-05
454 1.1197151252417825e-05
455 1.0903806469286792e-05
456 1.0618036867526826e-05
457 1.0340671906305943e-05
458 1.0068455594591796e-05
459 9.806597518036142e-06
460 9.55103496380616e-06
461 9.300915735366289e-06
462 9.058294381247833e-06
463 8.822292329568882e-06
464 8.593776328780223e-06
465 8.369596798729617e-06
466 8.151451766025275e-06
467 7.939951501612086e-06
468 7.733515303698368e-06
469 7.5338498390919995e-06
470 7.338483555940911e-06
471 7.148042641347274e-06
472 6.963306532270508e-06
473 6.783615845051827e-06
474 6.607885552512016e-06
475 6.437539468606701e-06
476 6.271369784371927e-06
477 6.109175046731252e-06
478 5.951877938059624