### PyTorch 的两个核心特征
* 一个n维张量，类似于numpy，可以在GPU上运行
* 搭建和训练神经网络时的自动微分/求导机制

In [1]:
# 使用numpy实现网络
import numpy as np

# N: batch size; D_in: input dimension;
# H: hidden-layer dimension; D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear gives the prediction.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squared-errors loss, printed at every step.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w1 and w2,
    # derived by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 48430887.60332304
1 53003082.68437226
2 50797687.812461056
3 35595150.2540987
4 17436979.06238401
5 7182537.435736881
6 3406036.600716468
7 2117041.2776306868
8 1573388.32168737
9 1263093.2103167141
10 1046002.3700033883
11 879207.2835343934
12 746629.1649076543
13 639067.795847856
14 550712.9540585221
15 477447.5297851258
16 416246.7520540961
17 364713.24402365554
18 320997.62171807105
19 283686.43652448454
20 251662.8926709124
21 224016.2336355261
22 200033.69944236596
23 179183.33680739292
24 160954.21579618717
25 144952.3200608657
26 130860.40521343824
27 118403.08866670117
28 107351.26303617409
29 97521.80358046686
30 88760.3090513252
31 80924.97818008739
32 73901.67410600944
33 67592.31138566544
34 61909.69300251823
35 56782.10238178429
36 52146.02737639065
37 47947.35471043534
38 44137.753900156466
39 40684.60836033322
40 37542.07456589822
41 34677.921093056284
42 32064.273457323427
43 29675.11952547681
44 27488.07864811567
45 25484.440306537603
46 23646.1282783082
47 21958.27

498 8.78487910249729e-06
499 8.404388693684037e-06


In [1]:
# pytorch: 张量
import torch

dtype = torch.float
device = torch.device("cpu")

# N: batch size; D_in: input dimension;
# H: hidden-layer dimension; D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear gives the prediction.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss as a Python float, printed at every step.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w1 and w2,
    # derived by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 22961780.0
1 15306415.0
2 11052506.0
3 8282089.0
4 6303998.0
5 4830122.5
6 3716358.5
7 2871917.5
8 2234772.5
9 1754305.875
10 1391289.25
11 1116325.75
12 906378.4375
13 744400.75
14 618264.75
15 519005.75
16 439972.21875
17 376289.8125
18 324328.5625
19 281488.28125
20 245848.640625
21 215902.0625
22 190539.59375
23 168883.921875
24 150278.8125
25 134195.65625
26 120217.125
27 108010.265625
28 97299.21875
29 87864.0859375
30 79522.6875
31 72125.1640625
32 65544.1171875
33 59672.0546875
34 54424.265625
35 49724.6015625
36 45502.8359375
37 41701.1640625
38 38269.76953125
39 35166.15625
40 32353.359375
41 29801.677734375
42 27484.69140625
43 25375.1953125
44 23453.705078125
45 21701.34375
46 20099.787109375
47 18633.04296875
48 17288.34375
49 16055.951171875
50 14923.8515625
51 13882.7373046875
52 12924.1669921875
53 12043.3544921875
54 11233.953125
55 10486.3876953125
56 9795.0546875
57 9155.34375
58 8562.93359375
59 8014.0615234375
60 7505.05615234375
61 7032.1171875
62 6592.399414062

479 0.00031177978962659836
480 0.00030545308254659176
481 0.0002991395303979516
482 0.00029336425359360874
483 0.00028688955353572965
484 0.0002818231878336519
485 0.0002764143282547593
486 0.0002712804707698524
487 0.0002659138117451221
488 0.0002612559183035046
489 0.0002559831482358277
490 0.0002515024971216917
491 0.000246684328885749
492 0.00024208218383137137
493 0.00023792429419700056
494 0.0002327126421732828
495 0.00022828621149528772
496 0.00022401954629458487
497 0.0002208909427281469
498 0.00021691185247618705
499 0.00021281900990288705


In [2]:
# 自动求导
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # uncomment to run on the GPU

# N: batch size; D_in: input dimension;
# H: hidden-layer dimension; D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random tensors holding the inputs and targets.
# requires_grad is left at its default (False): no gradients are needed
# with respect to these tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random tensors holding the weights.
# requires_grad=True asks autograd to compute gradients with respect to
# these tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute the prediction with tensor operations.
    # Because w1 and w2 have requires_grad=True, operations involving them
    # are recorded in a computation graph, from which gradients are derived
    # automatically; no hand-written backward pass is needed.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    # Compute and print the loss.
    # loss is a tensor of shape (); loss.item() yields its Python number.
    loss = ((y_pred - y) ** 2).sum()
    print(t, loss.item())

    # Backward pass via autograd: computes the gradient of loss with respect
    # to every tensor with requires_grad=True. Afterwards w1.grad and w2.grad
    # hold the gradients of the loss with respect to w1 and w2.
    loss.backward()

    # Gradient-descent update. The weights are modified in place, and the
    # update itself must not be recorded in the graph, hence torch.no_grad().
    with torch.no_grad():
        for param in (w1, w2):
            param -= learning_rate * param.grad
            # Gradients accumulate across backward() calls, so reset them
            # manually after each update.
            param.grad.zero_()

0 23982104.0
1 18034956.0
2 14810042.0
3 12520715.0
4 10399256.0
5 8406129.0
6 6522384.0
7 4936807.5
8 3649067.75
9 2682286.0
10 1969717.25
11 1463460.25
12 1102766.25
13 847719.4375
14 664637.6875
15 531590.9375
16 432906.875
17 358489.78125
18 301092.5625
19 256016.75
20 219866.921875
21 190399.171875
22 166015.09375
23 145622.59375
24 128361.40625
25 113631.78125
26 100959.828125
27 89981.9140625
28 80419.078125
29 72054.3828125
30 64706.55078125
31 58223.24609375
32 52485.390625
33 47399.6953125
34 42873.140625
35 38840.21484375
36 35240.41796875
37 32020.0
38 29131.701171875
39 26536.05859375
40 24200.99609375
41 22096.255859375
42 20195.021484375
43 18475.81640625
44 16919.537109375
45 15509.0087890625
46 14228.87109375
47 13066.36328125
48 12010.0849609375
49 11048.21875
50 10170.7802734375
51 9370.103515625
52 8638.244140625
53 7969.33349609375
54 7357.10986328125
55 6796.357421875
56 6283.02001953125
57 5812.01171875
58 5379.5625
59 4982.033203125
60 4616.30126953125
61 4279.6

In [None]:
# Define a custom autograd function
import torch

class MyRelu(torch.autograd.Function):
    """ReLU implemented as a custom autograd function.

    Subclassing ``torch.autograd.Function`` and providing static ``forward``
    and ``backward`` methods defines an operation with a hand-written
    gradient. Invoke it through ``MyRelu.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, x):
        """Return ``x`` with negative entries clamped to zero.

        Args:
            ctx: Autograd context object; the input is stashed on it so that
                ``backward`` can tell which entries were negative.
            x: Input tensor.

        Returns:
            A tensor equal to ``x.clamp(min=0)``.
        """
        ctx.save_for_backward(x)
        return x.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        Args:
            ctx: Autograd context object holding the tensor saved in
                ``forward``.
            grad_output: Gradient of the loss with respect to the output.

        Returns:
            A copy of ``grad_output`` with entries zeroed wherever the
            forward input was negative.
        """
        x, = ctx.saved_tensors
        # Clone so the incoming gradient tensor is not modified in place.
        grad_x = grad_output.clone()
        grad_x[x < 0] = 0
        return grad_x
    
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# N: batch size; D_in: input dimension;
# H: hidden-layer dimension; D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10


# Random input and target data.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Random weights; requires_grad=True so autograd tracks gradients for them.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass using the custom ReLU; Function subclasses are invoked
    # through .apply.
    y_pred = MyRelu.apply(x.mm(w1)).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    # item() must be called: printing loss.item would show the bound method
    # object rather than the loss value.
    print(t, loss.item())
    # Populate w1.grad and w2.grad with the gradients of the loss.
    loss.backward()
    
    with torch.no_grad():
        w1 -=  