# Numpy Style Back Propagation

In [10]:
import numpy as np

# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Draw the inputs and the regression targets from a standard normal distribution
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [11]:
# Both weight matrices start out as standard-normal noise:
# w1 maps input -> hidden, w2 maps hidden -> output.
w1, w2 = np.random.randn(D_in, H), np.random.randn(H, D_out)

In [12]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear gives the predicted y
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss, reported on every iteration
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backward pass: apply the chain rule layer by layer, from the loss
    # back to w2 and then w1
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step, updating the weight arrays in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 37910929.24391197
1 35429962.72867341
2 35415480.67651908
3 31820253.999504365
4 23269688.30761581
5 13889510.93099086
6 7308186.423817633
7 3869733.394065392
8 2275265.5406091874
9 1529877.5516699231
10 1143085.555103931
11 911574.8319515056
12 753926.2602302269
13 636353.2061543919
14 543813.5169223286
15 468887.4658961933
16 406937.1806846247
17 355089.08332245576
18 311274.3118512123
19 274017.14626452303
20 242138.23766822965
21 214685.2322146718
22 190957.88949489233
23 170357.59169347608
24 152366.31616808317
25 136650.79605893773
26 122852.03544028016
27 110695.01701289916
28 99936.45790054608
29 90387.00439591816
30 81890.54810408
31 74313.93099324037
32 67537.1758935824
33 61469.699314664744
34 56025.90446466135
35 51133.330284400196
36 46724.087152330976
37 42742.69766927225
38 39141.115378192
39 35881.873106999126
40 32927.051680622746
41 30243.06569195138
42 27802.305512825435
43 25582.746971409506
44 23560.23037038093
45 21713.736945611072
46 20026.218673360854
47 18483

421 4.239913237381802e-05
422 4.06142790384702e-05
423 3.8905985783918105e-05
424 3.7270239118371216e-05
425 3.5704279710676966e-05
426 3.420493043268952e-05
427 3.2769109888146906e-05
428 3.139477638063492e-05
429 3.0078389191660987e-05
430 2.881774462998346e-05
431 2.7610996829810436e-05
432 2.6455030427579818e-05
433 2.534793379536889e-05
434 2.4288031876738616e-05
435 2.3272901250496694e-05
436 2.2300633110893396e-05
437 2.1369474037205473e-05
438 2.0478300502070286e-05
439 1.9624847604020466e-05
440 1.88066550508622e-05
441 1.8022768794925783e-05
442 1.7272001753068116e-05
443 1.6553201682179907e-05
444 1.586438220745955e-05
445 1.5204422929633211e-05
446 1.4572334934924129e-05
447 1.3966655953587015e-05
448 1.3386476615884025e-05
449 1.2830714576402076e-05
450 1.229824942481806e-05
451 1.1788048946908792e-05
452 1.1299276461315871e-05
453 1.083101631370331e-05
454 1.0382257066604723e-05
455 9.952275737539175e-06
456 9.540309331435883e-06
457 9.1455910290154e-06
458 8.767280240172

# 关于矩阵中的导数传递
以计算``grad_w2 = h_relu.T.dot(grad_y_pred)``举例，其中：
* h_relu has size of (1, 100)
* w2 has size of (100, 10)
* y_pred has size of (1, 10)

这里有``y_pred = h_relu.dot(w2)``，即：

$$\left[\begin{matrix}h_1 \cr h_2 \cr \vdots \cr h_{100}\end{matrix}\right]^T
\left[\begin{matrix}w_{1,1} & w_{1,2} & \cdots & w_{1,10} \cr
w_{2,1} & w_{2,2} & \cdots & w_{2,10} \cr
\vdots & \vdots & \ddots & \vdots \cr
w_{100,1} & w_{100,2} & \cdots & w_{100,10} \end{matrix}\right] = 
\left[\begin{matrix}\hat{y}_1 \cr \hat{y}_2 \cr \vdots \cr \hat{y}_{10}\end{matrix}\right]^T
$$

y的真实值为：
$$
\left[\begin{matrix}y_1 \cr y_2 \cr \vdots \cr y_{10}\end{matrix}\right]^T
$$

定义损失函数：
$$\cal{loss} = \sum_{i=1}^{10}(\hat{y}_i - y_i)^2$$

可以计算得$\cal{loss}$对于每一个$\hat{y}_i$的导数为：
$$ \frac{\partial \cal{loss}}{\partial \hat{y}_i}=2(\hat{y}_i - y_i)$$

那么如何计算$\cal{loss}$对于w2的导数呢？先以$\hat{y}_1$举例，由于：
$$ \hat{y}_1 = h_1 w_{1,1} + h_2 w_{2,1} + \cdots + h_{100} w_{100,1} $$
根据梯度链式法则，可以计算$\cal{loss}$对于$w_{j,1}$的梯度为：
$$
\frac{\partial \cal{loss}}{\partial w_{1,1}} = \frac{\partial \cal{loss}}{\partial \hat{y}_1} \times \frac{\partial \hat{y}_1}{\partial w_{1,1}} =  \frac{\partial \cal{loss}}{\partial \hat{y}_1} h_1\\
\frac{\partial \cal{loss}}{\partial w_{2,1}} = \frac{\partial \cal{loss}}{\partial \hat{y}_1} \times \frac{\partial \hat{y}_1}{\partial w_{2,1}} = \frac{\partial \cal{loss}}{\partial \hat{y}_1} h_2\\
\ldots \\
\frac{\partial \cal{loss}}{\partial w_{100,1}} = \frac{\partial \cal{loss}}{\partial \hat{y}_1} \times \frac{\partial \hat{y}_1}{\partial w_{100,1}} = \frac{\partial \cal{loss}}{\partial \hat{y}_1} h_{100}
$$

这样$\cal{loss}$对于$w2$第一列的元素的导数就计算完成了，全部计算完成后的导数为：
$$ \frac{\partial \cal{loss}}{\partial w_2} = 
\left[\begin{matrix}\frac{\partial \cal{loss}}{\partial \hat{y}_1}h_1 & \frac{\partial \cal{loss}}{\partial \hat{y}_2}h_1 & \cdots & \frac{\partial \cal{loss}}{\partial \hat{y}_{10}}h_1 \cr 
\frac{\partial \cal{loss}}{\partial \hat{y}_1}h_2 & \frac{\partial \cal{loss}}{\partial \hat{y}_2}h_2 & \cdots & \frac{\partial \cal{loss}}{\partial \hat{y}_{10}}h_2 \cr 
\vdots & \vdots & \ddots & \vdots \cr
\frac{\partial \cal{loss}}{\partial \hat{y}_1}h_{100} & \frac{\partial \cal{loss}}{\partial \hat{y}_2}h_{100} & \cdots & \frac{\partial \cal{loss}}{\partial \hat{y}_{10}}h_{100} \cr \end{matrix}\right] =
\left[\begin{matrix}h_1 \cr h_2 \cr \vdots \cr h_{100}\end{matrix}\right]
\left[\begin{matrix}\frac{\partial \cal{loss}}{\partial \hat{y}_1} & \frac{\partial \cal{loss}}{\partial \hat{y}_2} & \cdots & \frac{\partial \cal{loss}}{\partial \hat{y}_{10}}\end{matrix}\right]
$$
这个矩阵可以用向量化写法表示为：``grad_w2 = h_relu.T.dot(grad_y_pred)``

# PyTorch Tensors
GPU对于现代深度学习的加速有50倍以上，numpy在这时候就显得不够用了，这里对比一下Tensor版本的反向传播的代码：

In [13]:
import torch

# Tensor type that the tensors created in the following cells are cast to
# via .type(dtype).
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

In [14]:
# Network dimensions:
#   N = batch size, D_in = input dimension,
#   H = hidden dimension, D_out = output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

In [15]:
# Random inputs (N x D_in) and targets (N x D_out), cast to the chosen tensor type
x, y = torch.randn(N, D_in).type(dtype), torch.randn(N, D_out).type(dtype)

In [16]:
# Standard-normal initial weights: w1 is (D_in x H), w2 is (H x D_out)
w1, w2 = torch.randn(D_in, H).type(dtype), torch.randn(H, D_out).type(dtype)

In [17]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, printed each iteration
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)

    # Hand-written backward pass: chain rule from the loss back to w2 and w1
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative
    grad_h = grad_h_relu.clone()
    grad_h.masked_fill_(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # In-place gradient-descent update of both weight matrices
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 29196971.2062417
1 22183967.655584734
2 21282298.099785864
3 22387937.88363
4 22835432.205756307
5 21002413.301464587
6 16610181.051117538
7 11411483.13323021
8 6990086.744080739
9 4075350.623690352
10 2383727.7250426225
11 1470497.020130651
12 978281.1147697056
13 705019.4701678543
14 542841.6141827048
15 438651.7649822105
16 365984.4773921918
17 311725.609388832
18 269028.00574869104
19 234249.9549122065
20 205244.4918951888
21 180683.05812179693
22 159662.3305124868
23 141527.64505031193
24 125801.88094288555
25 112090.81609427018
26 100116.99100084836
27 89609.79106426169
28 80363.38380850092
29 72199.7725492598
30 64976.64049015053
31 58571.27221315508
32 52877.405941510224
33 47805.91168814944
34 43280.098828605136
35 39233.223076282884
36 35612.45855884574
37 32370.431429758668
38 29457.999729271905
39 26835.083947511528
40 24471.405179105815
41 22337.56683947082
42 20410.05078727195
43 18666.512221922167
44 17087.469610916225
45 15656.411531814656
46 14357.42209804935
47 1317

429 0.00013486447444350028
430 0.00013210834715777012
431 0.00012967639168659506
432 0.00012706669876462462
433 0.0001242988937670969
434 0.00012212342741067805
435 0.00011992038404752894
436 0.00011789630002519147
437 0.0001157412997697671
438 0.00011348813415677783
439 0.00011094048836914061
440 0.00010902821876761226
441 0.00010721342202284559
442 0.00010521349652481415
443 0.00010337087406406265
444 0.00010137096023940528
445 9.963478345679025e-05
446 9.811250074151859e-05
447 9.647048120639892e-05
448 9.492008916223149e-05
449 9.31180910157571e-05
450 9.14102686931928e-05
451 8.951488698635823e-05
452 8.843639288366734e-05
453 8.727090275012853e-05
454 8.592840482968279e-05
455 8.425177685538604e-05
456 8.302402389132881e-05
457 8.191098829904431e-05
458 8.013308767723659e-05
459 7.869586417160249e-05
460 7.751264147497217e-05
461 7.626522544816605e-05
462 7.499710268383881e-05
463 7.432637672745435e-05
464 7.29116332670171e-05
465 7.182806210440096e-05
466 7.106027218943978e-05
4

# 自动梯度计算Autograd
在上面的例子中手动给出了两层网络的反向传播计算，但对于更大型的网络，手动计算反向梯度就非常的费力了！

好在现在我们有自动微分的库，可以自动计算神经网络中的反向传播，PyTorch中的autograd提供的就是这样的功能。使用autograd时，网络的正向传播会定义一个计算图（computational graph）：图中的节点（node）对应Tensor，边（edge）对应由输入Tensor产生输出Tensor的函数。沿着这个图反向传播，就可以非常方便地计算梯度。

PyTorch的Variables对象中就包含自动计算梯度的API：

In [22]:
import torch
from torch.autograd import Variable

# Tensor type used for the data and weights created below.
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# Network dimensions:
#   N = batch size, D_in = input dimension,
#   H = hidden dimension, D_out = output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

In [23]:
# Inputs and targets are fixed data, so they are wrapped with
# requires_grad=False: no gradient with respect to them is needed
# during the backward pass.
x = Variable(
    torch.randn(N, D_in).type(dtype),
    requires_grad=False,
)
y = Variable(
    torch.randn(N, D_out).type(dtype),
    requires_grad=False,
)

In [24]:
# The weights are what we train, so they are wrapped with
# requires_grad=True: the backward pass computes gradients for them.
w1 = Variable(
    torch.randn(D_in, H).type(dtype),
    requires_grad=True,
)
w2 = Variable(
    torch.randn(H, D_out).type(dtype),
    requires_grad=True,
)

In [25]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: the same operations as the hand-written version, but
    # intermediate values need not be kept because the backward pass is
    # not implemented by hand here.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the sum-of-squares loss. The summed loss holds a
    # single value; .item() extracts it as a Python number. Indexing it with
    # loss.data[0] raises an error on zero-dimensional tensors (PyTorch >= 0.4).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This computes the gradient
    # of loss with respect to everything created with requires_grad=True;
    # afterwards w1.grad and w2.grad hold those gradients.
    loss.backward()

    # Gradient-descent step. Updating through .data changes the weight
    # values directly rather than through the Variable wrapper.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating the weights, so the next
    # backward() call starts from zero.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 38462408.0
1 36566128.0
2 38297944.0
3 35914648.0
4 27350964.0
5 16296908.0
6 8314776.0
7 4146355.5
8 2313533.0
9 1501362.0
10 1101332.5
11 870277.1875
12 715967.5
13 602234.875
14 513229.53125
15 441168.4375
16 381830.46875
17 332336.90625
18 290642.3125
19 255236.15625
20 224983.1875
21 198999.234375
22 176571.75
23 157089.484375
24 140110.0
25 125257.4296875
26 112223.71875
27 100720.59375
28 90562.8984375
29 81577.8046875
30 73604.734375
31 66508.828125
32 60182.05078125
33 54534.12109375
34 49488.91015625
35 44968.671875
36 40916.65234375
37 37269.01171875
38 33981.1796875
39 31014.44140625
40 28334.162109375
41 25909.3984375
42 23712.12109375
43 21719.697265625
44 19911.076171875
45 18268.63671875
46 16774.05859375
47 15413.1474609375
48 14172.2587890625
49 13040.46484375
50 12006.6376953125
51 11063.0234375
52 10199.9130859375
53 9409.939453125
54 8686.5390625
55 8023.1474609375
56 7414.38916015625
57 6855.7392578125
58 6342.90966796875
59 5871.50341796875
60 5437.60546875
61 

412 0.00015808363968972117
413 0.0001544804108561948
414 0.00015152971900533885
415 0.00014865290722809732
416 0.0001453960285289213
417 0.00014247515355236828
418 0.0001398206950398162
419 0.00013649561151396483
420 0.0001342235045740381
421 0.00013177894288673997
422 0.00012881502334494144
423 0.00012659294588956982
424 0.000124275655252859
425 0.00012195979797979817
426 0.00011924205318791792
427 0.00011702519987011328
428 0.00011514416837599128
429 0.00011289775284240022
430 0.00011096081288997084
431 0.00010884884977713227
432 0.00010678324179025367
433 0.00010464089427841827
434 0.00010261405986966565
435 0.00010076677426695824
436 9.902538295136765e-05
437 9.754562779562548e-05
438 9.614571899874136e-05
439 9.405190940015018e-05
440 9.249887807527557e-05
441 9.111936378758401e-05
442 8.951302152127028e-05
443 8.769764826865867e-05
444 8.641422755317762e-05
445 8.473880734527484e-05
446 8.348326082341373e-05
447 8.209120278479531e-05
448 8.112130308290944e-05
449 7.95607920736074

# Define new autograd functions

In [26]:
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """
    ReLU implemented as a custom autograd Function.

    Custom Functions subclass torch.autograd.Function and provide static
    forward and backward methods that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Clamp the input Tensor at zero from below and return the result.

        ctx is the context object shared with backward; the input is stored
        on it with ctx.save_for_backward so the backward pass can see where
        the input was negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given the gradient of the loss with respect to the output, return
        the gradient of the loss with respect to the input: a copy of
        grad_output with zeros wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input.masked_fill_(saved_input < 0, 0)
        return grad_input


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu'; the alias does not change between iterations, so it is created once
# outside the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y; ReLU is computed with our custom
    # autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss. .item() extracts the single loss value as a
    # Python number; loss.data[0] raises an error on zero-dimensional tensors
    # (PyTorch >= 0.4).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()


0 40412908.0
1 43965596.0
2 47238804.0
3 40996040.0
4 25885946.0
5 12200336.0
6 5207806.5
7 2557976.0
8 1585693.875
9 1161974.25
10 925530.5625
11 764858.6875
12 643971.0
13 548390.3125
14 470790.71875
15 406957.9375
16 353832.4375
17 309252.53125
18 271530.875
19 239405.234375
20 211865.21875
21 188124.15625
22 167566.484375
23 149675.78125
24 134036.203125
25 120316.6171875
26 108245.4765625
27 97578.234375
28 88130.734375
29 79741.2578125
30 72270.03125
31 65600.1953125
32 59632.03515625
33 54282.74609375
34 49483.0859375
35 45164.48828125
36 41272.68359375
37 37759.5390625
38 34591.2109375
39 31722.87109375
40 29121.140625
41 26757.9765625
42 24609.25
43 22652.77734375
44 20869.208984375
45 19242.818359375
46 17757.26171875
47 16401.2265625
48 15163.6962890625
49 14029.4951171875
50 12989.087890625
51 12033.6142578125
52 11155.5673828125
53 10347.681640625
54 9604.080078125
55 8919.0693359375
56 8287.9443359375
57 7706.017578125
58 7168.703125
59 6672.47314453125
60 6213.904296875


446 0.00026850291760638356
447 0.00026251794770359993
448 0.00025710766203701496
449 0.0002520344278309494
450 0.0002469835744705051
451 0.00024189974647015333
452 0.0002371302543906495
453 0.00023203484306577593
454 0.0002269671531394124
455 0.0002227927907370031
456 0.00021785483113490045
457 0.00021370421745814383
458 0.00020896870410069823
459 0.00020560456323437393
460 0.00020151949138380587
461 0.00019802694441750646
462 0.00019383776816539466
463 0.00018990147509612143
464 0.00018659830675460398
465 0.0001831686677178368
466 0.00017964821017812937
467 0.00017667579231783748
468 0.0001732379460008815
469 0.00017052993644028902
470 0.00016746729670558125
471 0.0001641957787796855
472 0.0001614072680240497
473 0.00015854007506277412
474 0.00015583807544317096
475 0.000152718901517801
476 0.00015025078027974814
477 0.00014765337982680649
478 0.00014486735744867474
479 0.00014221621677279472
480 0.00014003022806718946
481 0.0001372370752505958
482 0.00013496745668817312
483 0.0001331

# PyTorch nn module
对于大型神经网络，仅仅有autograd还是太底层了，所以nn module就是为了改善这个情况而设计的。

nn模块包含一些不同的网络层和训练网络常用的损失函数。

In [27]:
import torch
from torch.autograd import Variable

# Network dimensions:
#   N = batch size, D_in = input dimension,
#   H = hidden dimension, D_out = output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets, each wrapped in a Variable.
x = Variable(torch.randn(N, D_in))
y = Variable(
    torch.randn(N, D_out),
    requires_grad=False,
)

In [29]:
# The model is a stack of layers applied one after another by nn.Sequential,
# itself a Module that contains other Modules. Each Linear layer computes a
# linear function of its input and owns its weight and bias.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

In [31]:
# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
# reduction='sum' sums the squared errors instead of averaging them; it
# replaces the deprecated size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [33]:
learning_rate = 1e-4
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Compute and print the loss. .item() extracts the single loss value as a
    # Python number; loss.data[0] raises an error on zero-dimensional tensors
    # (PyTorch >= 0.4).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients left over from the previous iteration, then
    # backpropagate to fill in param.grad for every model parameter.
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent step on each parameter.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 644.3743896484375
1 596.6887817382812
2 555.06494140625
3 518.9366455078125
4 486.8756408691406
5 457.9064636230469
6 431.4229736328125
7 407.2027893066406
8 384.8330383300781
9 364.161376953125
10 344.7649230957031
11 326.4465637207031
12 309.1164245605469
13 292.75408935546875
14 277.2121887207031
15 262.4610290527344
16 248.50259399414062
17 235.27886962890625
18 222.6706085205078
19 210.65647888183594
20 199.16464233398438
21 188.21775817871094
22 177.7913360595703
23 167.85643005371094
24 158.4271240234375
25 149.4726104736328
26 140.9709930419922
27 132.90505981445312
28 125.26506042480469
29 118.03561401367188
30 111.18012237548828
31 104.6905746459961
32 98.5439453125
33 92.75444793701172
34 87.28107452392578
35 82.08866882324219
36 77.19229888916016
37 72.5883560180664
38 68.26443481445312
39 64.19420623779297
40 60.3722038269043
41 56.78641128540039
42 53.42007827758789
43 50.25846481323242
44 47.290130615234375
45 44.5081787109375
46 41.89717102050781
47 39.44524002075195


410 6.022784873493947e-05
411 5.859271914232522e-05
412 5.7006396673386917e-05
413 5.546454849536531e-05
414 5.396243432187475e-05
415 5.2503717597573996e-05
416 5.1085709856124595e-05
417 4.9705453420756385e-05
418 4.836222069570795e-05
419 4.7059635107871145e-05
420 4.579186497721821e-05
421 4.455903763300739e-05
422 4.3359807023080066e-05
423 4.219379115966149e-05
424 4.10606553487014e-05
425 3.9954582462087274e-05
426 3.8882884837221354e-05
427 3.7842110032215714e-05
428 3.682763781398535e-05
429 3.583914440241642e-05
430 3.4879827580880374e-05
431 3.394770101294853e-05
432 3.303883204353042e-05
433 3.215604374418035e-05
434 3.1295905500883237e-05
435 3.0461111236945726e-05
436 2.96470843750285e-05
437 2.8858765290351585e-05
438 2.8089274564990774e-05
439 2.7342095563653857e-05
440 2.6611654902808368e-05
441 2.5905137590598315e-05
442 2.5215827918145806e-05
443 2.4545555788790807e-05
444 2.389449036854785e-05
445 2.325905552424956e-05
446 2.2642392650595866e-05
447 2.20397323573706

# PyTorch: optim
PyTorch中的optim包抽象了优化算法，并给出了常用优化算法的实现

In [34]:
import torch
from torch.autograd import Variable

# Network dimensions:
#   N = batch size, D_in = input dimension,
#   H = hidden dimension, D_out = output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets, each wrapped in a Variable.
x = Variable(torch.randn(N, D_in))
y = Variable(
    torch.randn(N, D_out),
    requires_grad=False,
)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# reduction='sum' sums the squared errors instead of averaging them; it
# replaces the deprecated size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which parameters it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss. .item() extracts the single loss value as a
    # Python number; loss.data[0] raises an error on zero-dimensional tensors
    # (PyTorch >= 0.4).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the parameters it will update (the learnable weights of
    # the model). This is needed because, by default, gradients are
    # accumulated in buffers (i.e., not overwritten) whenever .backward()
    # is called. See the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 661.653564453125
1 644.6604614257812
2 628.1556396484375
3 612.114013671875
4 596.5916748046875
5 581.4546508789062
6 566.7763061523438
7 552.4948120117188
8 538.522705078125
9 524.9712524414062
10 511.8604736328125
11 499.173095703125
12 486.8336181640625
13 474.9072265625
14 463.3039245605469
15 452.05462646484375
16 441.0758056640625
17 430.338623046875
18 419.8242492675781
19 409.5502014160156
20 399.5325622558594
21 389.7931213378906
22 380.35186767578125
23 371.1675109863281
24 362.1837158203125
25 353.3905944824219
26 344.8219909667969
27 336.48272705078125
28 328.3474426269531
29 320.420166015625
30 312.6419982910156
31 305.0247802734375
32 297.5829162597656
33 290.307861328125
34 283.2141418457031
35 276.2752685546875
36 269.5047912597656
37 262.884033203125
38 256.396484375
39 250.04393005371094
40 243.8413543701172
41 237.7753448486328
42 231.85797119140625
43 226.07359313964844
44 220.4036407470703
45 214.84146118164062
46 209.41546630859375
47 204.10427856445312
48 198.8

399 9.749483069754206e-07
400 8.9801807234835e-07
401 8.274050173895375e-07
402 7.615948334205314e-07
403 7.011838647486002e-07
404 6.45532963972073e-07
405 5.938724143561558e-07
406 5.466813490784261e-07
407 5.025675591241452e-07
408 4.620280549261224e-07
409 4.2461874727450777e-07
410 3.902275409473077e-07
411 3.586104639907717e-07
412 3.293274630777887e-07
413 3.0235221970542625e-07
414 2.7741702979255933e-07
415 2.5459226549173763e-07
416 2.3361077694517007e-07
417 2.1408732209238224e-07
418 1.9632012993042736e-07
419 1.799457294282547e-07
420 1.6486583831465396e-07
421 1.510650236014044e-07
422 1.3831868272973225e-07
423 1.2661696757731988e-07
424 1.1591770743279994e-07
425 1.0609124956317828e-07
426 9.702264236466362e-08
427 8.869410805800726e-08
428 8.105997295615452e-08
429 7.407257385239063e-08
430 6.758584447652538e-08
431 6.172083999445022e-08
432 5.640625744263161e-08
433 5.145339798673376e-08
434 4.6992902014153515e-08
435 4.287862864771341e-08
436 3.914211177402649e-08
43

# PyTorch: Custom nn Modules

In [35]:
import torch
from torch.autograd import Variable


class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU (clamp at zero) between them."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear layers and store them as attributes:
        linear1 maps D_in -> H and linear2 maps H -> D_out.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map the input x to a prediction: apply linear1, clamp the result at
        zero from below, then apply linear2 and return its output.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# Network dimensions:
#   N = batch size, D_in = input dimension,
#   H = hidden dimension, D_out = output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets, each wrapped in a Variable.
x = Variable(torch.randn(N, D_in))
y = Variable(
    torch.randn(N, D_out),
    requires_grad=False,
)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# reduction='sum' sums the squared errors instead of averaging them; it
# replaces the deprecated size_average=False argument.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss. .item() extracts the single loss value as a
    # Python number; loss.data[0] raises an error on zero-dimensional tensors
    # (PyTorch >= 0.4).
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 625.9271240234375
1 577.7947387695312
2 536.6757202148438
3 500.9743347167969
4 469.1854248046875
5 440.63372802734375
6 414.6476135253906
7 390.8913879394531
8 369.2474670410156
9 349.2066345214844
10 330.5207824707031
11 312.9652099609375
12 296.5143127441406
13 281.052490234375
14 266.43841552734375
15 252.65821838378906
16 239.65750122070312
17 227.29507446289062
18 215.55938720703125
19 204.34666442871094
20 193.69261169433594
21 183.52976989746094
22 173.87014770507812
23 164.70545959472656
24 155.99794006347656
25 147.7128143310547
26 139.85110473632812
27 132.3845977783203
28 125.29862976074219
29 118.56318664550781
30 112.16537475585938
31 106.1036605834961
32 100.36237335205078
33 94.92645263671875
34 89.77664947509766
35 84.88932037353516
36 80.23873138427734
37 75.83035278320312
38 71.65096282958984
39 67.70832824707031
40 63.97940444946289
41 60.45097732543945
42 57.12222671508789
43 53.975032806396484
44 50.999908447265625
45 48.19245910644531
46 45.54536056518555
47 43

392 5.2913539548171684e-05
393 5.118221088196151e-05
394 4.9505782953929156e-05
395 4.788804653799161e-05
396 4.6323380956891924e-05
397 4.480986899579875e-05
398 4.334506229497492e-05
399 4.193071072222665e-05
400 4.0558177715865895e-05
401 3.923684562323615e-05
402 3.795428710873239e-05
403 3.67199863831047e-05
404 3.551920235622674e-05
405 3.4363114536972716e-05
406 3.324333010823466e-05
407 3.2160729460883886e-05
408 3.111040496150963e-05
409 3.009993815794587e-05
410 2.912093623308465e-05
411 2.8172173188067973e-05
412 2.725740341702476e-05
413 2.6371131752966903e-05
414 2.551299439801369e-05
415 2.4684421077836305e-05
416 2.3880194930825382e-05
417 2.310525997017976e-05
418 2.2355157852871343e-05
419 2.162947566830553e-05
420 2.0929615857312456e-05
421 2.0251580281183124e-05
422 1.9593431716202758e-05
423 1.895866444101557e-05
424 1.8345121134188958e-05
425 1.77491001522867e-05
426 1.7175025277538225e-05
427 1.6619149391772225e-05
428 1.608228194527328e-05
429 1.5561734471702948e

# PyTorch: Control Flow + Weight Sharing

In [36]:
import random
import torch
from torch.autograd import Variable


class DynamicNet(torch.nn.Module):
    """Network whose number of hidden layers is chosen at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear layers used by forward: input_linear
        (D_in -> H), middle_linear (H -> H) and output_linear (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Compute a prediction for x with a randomly chosen depth.

        After the input layer, a number between 0 and 3 (inclusive) is drawn
        and middle_linear is applied that many times, each application
        followed by a clamp at zero. The same middle_linear Module, and so
        the same weights, is reused for every repetition.

        Because the computation graph is built afresh on each forward pass,
        ordinary Python control flow such as this loop can be used to define
        the model.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        n_repeats = random.randint(0, 3)
        for _ in range(n_repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)


# Network dimensions:
#   N = batch size, D_in = input dimension,
#   H = hidden dimension, D_out = output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random inputs and targets, each wrapped in a Variable.
x = Variable(torch.randn(N, D_in))
y = Variable(
    torch.randn(N, D_out),
    requires_grad=False,
)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' sums the squared errors instead of averaging them; it
# replaces the deprecated size_average=False argument.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss. .item() extracts the single loss value as a
    # Python number; loss.data[0] raises an error on zero-dimensional tensors
    # (PyTorch >= 0.4).
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 697.3257446289062
1 696.5664672851562
2 699.4256591796875
3 696.0826416015625
4 680.745849609375
5 665.3865966796875
6 689.2520751953125
7 630.2438354492188
8 718.0223999023438
9 685.2333984375
10 629.4927368164062
11 578.1094970703125
12 682.0983276367188
13 555.9555053710938
14 675.062255859375
15 670.6987915039062
16 675.8189086914062
17 656.3865356445312
18 668.556884765625
19 662.9664916992188
20 655.6471557617188
21 451.17120361328125
22 288.5182189941406
23 261.1958312988281
24 222.7823486328125
25 607.8709716796875
26 594.6953735351562
27 129.5166015625
28 505.3349304199219
29 535.3801879882812
30 455.8143310546875
31 426.21258544921875
32 278.093505859375
33 108.6146240234375
34 414.2115783691406
35 230.6929931640625
36 209.2282257080078
37 271.2425842285156
38 298.73919677734375
39 167.6911163330078
40 156.0332489013672
41 136.0129852294922
42 187.52915954589844
43 169.55029296875
44 118.83985900878906
45 108.60987854003906
46 218.6389923095703
47 151.85389709472656
48 279.

423 1.1436654329299927
424 1.2432465553283691
425 1.2285082340240479
426 5.671011924743652
427 3.8108811378479004
428 2.6021957397460938
429 0.9851672649383545
430 0.8447490334510803
431 0.7994694709777832
432 3.0738399028778076
433 0.8766961097717285
434 1.7636715173721313
435 1.663309097290039
436 1.3227471113204956
437 1.4978047609329224
438 1.107701301574707
439 0.7445800304412842
440 0.28730303049087524
441 2.753182888031006
442 0.9105530381202698
443 0.929661214351654
444 0.5532448887825012
445 2.4270732402801514
446 0.4209061861038208
447 0.8345495462417603
448 0.7874844074249268
449 1.0930382013320923
450 0.9401392936706543
451 0.7774931788444519
452 0.45460185408592224
453 0.4044514298439026
454 0.4269065856933594
455 0.6080858111381531
456 3.333808183670044
457 0.3317394554615021
458 0.4338454008102417
459 0.8203753232955933
460 2.2984466552734375
461 0.40443992614746094
462 0.7255659103393555
463 0.16422311961650848
464 0.9823811650276184
465 0.40401431918144226
466 0.316303