### PyTorch 的两个核心特征
* n 维张量（Tensor），类似于 NumPy 的 ndarray，但可以在 GPU 上运行
* 搭建和训练神经网络时的自动微分/求导机制

In [8]:
# Two-layer fully connected network (linear -> ReLU -> linear) trained with
# plain NumPy: the forward pass, the backward pass and the gradient-descent
# update are all written out by hand.
import numpy as np

# N: batch size; D_in: input dimension;
# H: hidden dimension; D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute the prediction y_pred.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squared-errors loss, printed every step.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 35338742.456269376
1 31682242.88370516
2 30929635.076885384
3 27832519.069230806
4 21306845.633910295
5 13638041.345683824
6 7784837.840895438
7 4334576.957585258
8 2573843.565336817
9 1689490.8558896622
10 1221781.0964121032
11 947598.2432034721
12 768474.8033334792
13 640264.1512706899
14 542619.2435859893
15 464908.434554489
16 401479.3442465906
17 348862.80363781285
18 304694.6346036018
19 267280.94159779977
20 235400.50256856432
21 208080.20392935182
22 184556.67020074555
23 164188.27947713633
24 146506.63536926807
25 131098.32413571174
26 117603.4624466532
27 105733.72346887043
28 95257.42953217814
29 85996.20334903614
30 77773.67383556406
31 70459.0225699654
32 63943.34585595098
33 58118.07614703007
34 52889.08021889237
35 48195.56713584122
36 43975.65596072651
37 40173.78320838226
38 36743.48766026604
39 33643.2558896401
40 30836.06899408975
41 28289.192630123325
42 25977.806573433874
43 23875.771869950873
44 21963.103116730752
45 20219.721121182687
46 18628.893085331205
47 1

In [12]:
# PyTorch tensors: the same two-layer network, with the forward and backward
# passes still written by hand but operating on torch tensors.
import torch

dtype = torch.float
device = torch.device("cpu")

# N: batch size; D_in: input dimension;
# H: hidden dimension; D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute the prediction y_pred.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss as a Python number, printed every step.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backprop: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative; masked_fill
    # returns a new tensor, so grad_h_relu itself is left untouched.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 31839048.0
1 27758132.0
2 29826130.0
3 32616212.0
4 31655060.0
5 24690406.0
6 15309300.0
7 7952676.0
8 3962849.75
9 2125161.0
10 1311962.0
11 923019.5625
12 710763.875
13 576661.0
14 481514.8125
15 408684.4375
16 350443.375
17 302578.40625
18 262739.25
19 229209.265625
20 200816.078125
21 176634.90625
22 155915.46875
23 138044.03125
24 122600.78125
25 109190.203125
26 97504.1953125
27 87282.4921875
28 78303.546875
29 70397.7265625
30 63418.76953125
31 57238.6015625
32 51754.03125
33 46877.296875
34 42533.82421875
35 38663.9140625
36 35197.80078125
37 32090.6171875
38 29297.810546875
39 26783.37109375
40 24516.353515625
41 22468.591796875
42 20615.521484375
43 18938.6796875
44 17419.3984375
45 16037.720703125
46 14780.2294921875
47 13635.1298828125
48 12590.4033203125
49 11636.5087890625
50 10764.3330078125
51 9966.3798828125
52 9235.18359375
53 8564.845703125
54 7949.55029296875
55 7384.4365234375
56 6864.306640625
57 6385.44140625
58 5944.25
59 5537.18212890625
60 5161.3955078125
61

In [13]:
# Autograd: the same two-layer network, with the backward pass computed by
# PyTorch's automatic differentiation instead of by hand.
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # uncomment to run on the GPU

# N: batch size; D_in: input dimension;
# H: hidden dimension; D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random tensors holding the input and the target output.
# requires_grad is left at its default (False): no gradients are needed
# with respect to these tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random tensors holding the weights.
# requires_grad=True asks autograd to compute gradients with respect to
# these tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute the prediction y_pred with tensor operations.
    # Because w1 and w2 have requires_grad=True, operations involving them
    # make PyTorch build a computation graph that allows gradients to be
    # computed automatically. The backward pass is no longer hand-written,
    # so no references to intermediate values are kept.
    y_pred = torch.clamp(x @ w1, min=0) @ w2

    # Compute and print the loss with tensor operations.
    # loss is a tensor of shape (); loss.item() gives the Python number.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Backward pass via autograd: computes the gradient of loss with respect
    # to every tensor with requires_grad=True. Afterwards w1.grad and w2.grad
    # hold the gradients of loss with respect to w1 and w2.
    loss.backward()

    # Gradient-descent update. The weights are changed in place and the
    # update itself must not be recorded in the graph, hence torch.no_grad().
    with torch.no_grad():
        for param in (w1, w2):
            param -= learning_rate * param.grad
            # Zero the gradient manually after the update so the next
            # backward() call starts from zero.
            param.grad.zero_()

0 36200420.0
1 40492204.0
2 45442304.0
3 41707072.0
4 27992856.0
5 13712198.0
6 5798767.5
7 2723954.0
8 1617499.75
9 1161959.875
10 920666.8125
11 761494.8125
12 642461.5
13 548143.625
14 471510.4375
15 408400.6875
16 355791.75
17 311591.09375
18 274190.59375
19 242361.96875
20 215072.9375
21 191548.78125
22 171182.34375
23 153430.28125
24 137927.9375
25 124335.5390625
26 112420.0
27 101889.2734375
28 92556.0234375
29 84257.34375
30 76860.0078125
31 70234.40625
32 64294.3359375
33 58955.48828125
34 54143.453125
35 49799.55078125
36 45869.40234375
37 42306.8984375
38 39071.3359375
39 36128.1640625
40 33445.70703125
41 30997.708984375
42 28761.076171875
43 26713.08984375
44 24834.8203125
45 23111.224609375
46 21528.361328125
47 20072.25390625
48 18730.75390625
49 17492.484375
50 16349.2080078125
51 15292.1279296875
52 14313.791015625
53 13405.8583984375
54 12564.0439453125
55 11782.69921875
56 11056.9853515625
57 10382.3798828125
58 9754.7392578125
59 9170.3720703125
60 8625.9697265625
6

In [None]:
# 定义新的自动求导函数
