# Learning PyTorch with Examples
PyTorch provides two main features:
- An n-dimensional Tensor, similar to a numpy array but able to run on GPUs
- Automatic differentiation for building and training neural networks

We will use a fully-connected **ReLU** network as our running example. The network will have a single hidden layer, and will be trained with gradient descent to fit random data by minimizing the Euclidean distance between the network output and the true output.

## Implement a two-layer network using numpy

In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting weights for the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)                           # [N, H]
    h_relu = np.maximum(h, 0)                   # [N, H]
    y_pred = np.dot(h_relu, w2)                 # [N, D_out]

    # Loss is the sum of squared errors over every element (a scalar).
    residual = y_pred - y                       # [N, D_out]
    loss = (residual ** 2).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, layer by layer.
    grad_y_pred = 2.0 * residual                # [N, D_out]
    grad_w2 = np.dot(h_relu.T, grad_y_pred)     # [H, D_out]
    grad_h_relu = np.dot(grad_y_pred, w2.T)     # [N, H]
    # ReLU gradient: nothing flows back where the pre-activation was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)  # [N, H]
    grad_w1 = np.dot(x.T, grad_h)               # [D_in, H]

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 28290521.075577162
1 21645208.76046092
2 20400589.343689576
3 21158235.74194018
4 21698472.14936565
5 20535400.15936073
6 17119185.131780095
7 12584687.849888708
8 8273483.843313586
9 5135100.678233853
10 3134362.082539637
11 1969491.765213817
12 1301010.7142796733
13 915770.3911180311
14 684224.8375645059
15 537729.9985413661
16 438933.6019306931
17 368370.011655303
18 315191.12228534196
19 273361.10573216766
20 239344.79643779632
21 211051.24137945854
22 187130.7749828172
23 166630.72257391812
24 148879.15406694298
25 133401.86057001495
26 119861.11706730993
27 107931.46049678212
28 97396.4102033535
29 88054.3069260213
30 79744.88586259574
31 72317.89440276241
32 65683.8974720884
33 59743.436404803484
34 54414.680167454615
35 49628.71056325552
36 45315.98626868153
37 41426.74133157681
38 37915.11402212554
39 34733.53979527706
40 31848.647889627806
41 29232.11781091209
42 26855.568928715387
43 24693.68212457878
44 22725.762185111693
45 20928.547335092153
46 19287.94244152186
47 1778

402 3.1699883027552184e-05
403 3.0128359928487568e-05
404 2.863496781459652e-05
405 2.721588348777006e-05
406 2.5868128598076548e-05
407 2.4586867092581053e-05
408 2.336906139346171e-05
409 2.2211859923628664e-05
410 2.11122847210468e-05
411 2.0067304356078385e-05
412 1.907410582534612e-05
413 1.813037866251114e-05
414 1.7233527111771755e-05
415 1.6381060444630605e-05
416 1.5571028289573728e-05
417 1.4801150128930596e-05
418 1.4069511199485202e-05
419 1.3374164116139967e-05
420 1.2713272422758724e-05
421 1.2085135032788354e-05
422 1.1488160547497559e-05
423 1.0920781488197431e-05
424 1.0381501228285423e-05
425 9.868959940669828e-06
426 9.381888716608147e-06
427 8.918851313013702e-06
428 8.478709812811282e-06
429 8.060423445115004e-06
430 7.66279240661699e-06
431 7.284866740210308e-06
432 6.9256422219475774e-06
433 6.584200196372217e-06
434 6.25974487399297e-06
435 5.951204510206043e-06
436 5.657952131910021e-06
437 5.379187203893013e-06
438 5.11416405046982e-06
439 4.862285759453681e-0

## Tensors
Numpy is a great framework, but it cannot utilize GPUs to accelerate its numerical computations. For modern deep neural networks, GPUs often provide speedups of 50x or greater, so unfortunately numpy won’t be enough for modern deep learning.

A Tensor is an n-dimensional array. Tensors can keep track of a computational graph and gradients, but they’re also useful as a generic tool for scientific computing.

## Implement a two-layer network using PyTorch Tensors

In [2]:
# -*- coding: utf-8 -*-
import torch
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random starting weights for the two linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors, pulled out as a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, layer by layer.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU gradient: nothing flows back where the pre-activation was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 45891948.0
1 47375636.0
2 45914408.0
3 33630972.0
4 18118394.0
5 7947697.0
6 3761586.75
7 2221075.75
8 1577524.25
9 1230560.0
10 1000509.875
11 829534.25
12 696164.375
13 589343.5625
14 502461.25
15 431009.21875
16 371763.15625
17 322329.71875
18 280746.4375
19 245577.484375
20 215655.03125
21 190084.84375
22 168095.3125
23 149115.65625
24 132658.15625
25 118324.609375
26 105812.34375
27 94854.3359375
28 85218.8046875
29 76728.421875
30 69229.7890625
31 62589.7109375
32 56684.30078125
33 51441.51953125
34 46763.00390625
35 42574.625
36 38818.53125
37 35449.9609375
38 32418.578125
39 29683.384765625
40 27213.185546875
41 24977.681640625
42 22951.28515625
43 21111.955078125
44 19439.42578125
45 17917.75390625
46 16531.9140625
47 15266.16796875
48 14108.978515625
49 13050.775390625
50 12081.81640625
51 11194.234375
52 10379.345703125
53 9630.9208984375
54 8943.0625
55 8310.046875
56 7726.56982421875
57 7188.38037109375
58 6691.685546875
59 6232.78466796875
60 5808.525390625
61 5415.8916

411 0.0001790947135305032
412 0.00017527700401842594
413 0.00017094584472943097
414 0.00016717531252652407
415 0.0001641115522943437
416 0.00016003781638573855
417 0.0001569445157656446
418 0.00015291098679881543
419 0.00015016715042293072
420 0.00014704064233228564
421 0.00014342876966111362
422 0.00014080698019824922
423 0.00013805419439449906
424 0.00013498372572939843
425 0.00013236627273727208
426 0.0001297184353461489
427 0.00012741282989736646
428 0.00012483372120186687
429 0.0001222981372848153
430 0.00012026851618429646
431 0.0001180406688945368
432 0.00011528061440913007
433 0.00011350826389389113
434 0.0001114951737690717
435 0.00010951989679597318
436 0.00010758257849374786
437 0.00010606545401969925
438 0.0001041053983499296
439 0.0001021354109980166
440 0.00010014708095695823
441 9.816746023716405e-05
442 9.655396570451558e-05
443 9.483290341449901e-05
444 9.280051017412916e-05
445 9.107403457164764e-05
446 8.961969433585182e-05
447 8.807852282188833e-05
448 8.64947069203

## Autograd
For large, complex networks, manually implementing the backward pass quickly becomes tedious and error-prone.
PyTorch provides the autograd package to automate it. When using autograd, the forward pass defines a computational graph; nodes in the graph are Tensors, and edges are functions that produce output Tensors from input Tensors.
Backpropagating through this graph then allows you to easily compute gradients.

## Implement a two-layer network using Tensors and autograd

In [4]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets are plain data: requires_grad is left at its default
# of False, so no gradients are computed with respect to them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# The weights are what we train, so they are created with
# requires_grad=True and backward() will fill in their .grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear. The backward pass is not
    # written by hand here; autograd derives it from these operations.
    hidden = torch.clamp(x.mm(w1), min=0)
    y_pred = hidden.mm(w2)

    # Sum of squared errors. loss is a Tensor holding a single value;
    # loss.item() extracts that value as a Python number for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Backpropagate: populates w1.grad and w2.grad with the gradient of
    # loss with respect to each weight.
    loss.backward()

    # Gradient-descent step. The update itself must not be recorded by
    # autograd, hence torch.no_grad(). Each weight's gradient is zeroed
    # right after its update so the next backward() starts from zero.
    # (torch.optim.SGD can perform this update instead.)
    with torch.no_grad():
        for param in (w1, w2):
            param.sub_(learning_rate * param.grad)
            param.grad.zero_()

0 33119172.0
1 29051494.0
2 27562792.0
3 24624596.0
4 19432528.0
5 13326260.0
6 8256184.0
7 4898848.0
8 2974992.75
9 1926735.25
10 1351635.125
11 1017910.5625
12 808437.0625
13 665443.9375
14 560662.8125
15 479574.4375
16 414372.625
17 360642.6875
18 315621.8125
19 277448.6875
20 244812.03125
21 216811.921875
22 192651.265625
23 171660.53125
24 153365.796875
25 137335.6875
26 123246.203125
27 110838.7421875
28 99864.15625
29 90132.0859375
30 81485.28125
31 73783.75
32 66943.1328125
33 60820.66015625
34 55330.4140625
35 50400.83203125
36 45968.27734375
37 41974.4453125
38 38369.1015625
39 35106.97265625
40 32153.130859375
41 29475.9375
42 27045.1953125
43 24836.970703125
44 22826.96875
45 20995.3359375
46 19323.98828125
47 17797.75
48 16403.287109375
49 15127.86328125
50 13960.162109375
51 12889.7548828125
52 11908.333984375
53 11007.748046875
54 10180.3984375
55 9420.552734375
56 8721.9228515625
57 8078.9453125
58 7487.0888671875
59 6943.048828125
60 6441.46435546875
61 5978.8012695312

408 0.00018835462105926126
409 0.00018454172823112458
410 0.00018031749641522765
411 0.00017639424186199903
412 0.000172391562955454
413 0.00016837011207826436
414 0.00016446338850073516
415 0.00016051280545070767
416 0.00015717501810286194
417 0.00015360189718194306
418 0.0001504159445175901
419 0.00014712686243001372
420 0.00014432374155148864
421 0.0001408695534337312
422 0.00013823056360706687
423 0.00013533097808249295
424 0.00013233814388513565
425 0.000129713662317954
426 0.00012748547305818647
427 0.00012491914094425738
428 0.0001222018472617492
429 0.00011983186413999647
430 0.00011743838695110753
431 0.00011507325689308345
432 0.00011277850717306137
433 0.00011059242388000712
434 0.00010858486348297447
435 0.00010664374713087454
436 0.00010471999121364206
437 0.00010282003495376557
438 0.00010049054981209338
439 9.886652696877718e-05
440 9.716858039610088e-05
441 9.537612640997395e-05
442 9.373088687425479e-05
443 9.182685607811436e-05
444 9.026192128658295e-05
445 8.86202033

## Defining new autograd functions
Each primitive autograd operator is really two functions that operate on Tensors. The **forward** function computes output Tensors from input Tensors. The **backward** function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.

In [5]:
# -*- coding: utf-8 -*-
import torch

class MyReLU(torch.autograd.Function):
    """
    Custom autograd Function implementing ReLU.

    Subclassing torch.autograd.Function and supplying static forward and
    backward methods is how a new differentiable operation is defined.
    Invoke it through MyReLU.apply.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return input with every negative entry replaced by zero.

        ctx is the context object shared with backward; the input Tensor is
        stored on it via ctx.save_for_backward so backward can read it.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the input.

        grad_output is the gradient of the loss with respect to the output.
        It is passed through unchanged except where the saved input was
        negative, where the gradient is zero.
        """
        (saved_input,) = ctx.saved_tensors
        # Out-of-place fill: grad_output itself is left unmodified.
        return grad_output.masked_fill(saved_input < 0, 0)

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True so that
# loss.backward() populates w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu'. The alias does not change between iterations, so it is bound once
# here instead of being re-created on every pass through the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss (sum of squared errors).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent. The update is wrapped in
    # torch.no_grad() so it is not tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, so the next
        # backward() call does not add to the gradients from this step.
        w1.grad.zero_()
        w2.grad.zero_()


0 31911158.0
1 27687814.0
2 32939532.0
3 43625588.0
4 52278224.0
5 46987384.0
6 28486524.0
7 11761985.0
8 4348692.0
9 2020345.875
10 1299362.75
11 1004281.25
12 832285.5625
13 707450.5
14 608125.3125
15 526358.5625
16 458129.75
17 400664.0
18 351800.625
19 310020.71875
20 274162.96875
21 243224.171875
22 216417.09375
23 193028.1875
24 172616.40625
25 154733.21875
26 139029.0625
27 125183.265625
28 112933.1015625
29 102074.8046875
30 92432.328125
31 83848.515625
32 76189.9609375
33 69333.0078125
34 63181.32421875
35 57657.91015625
36 52682.1015625
37 48196.40234375
38 44149.078125
39 40487.6328125
40 37168.390625
41 34157.3046875
42 31419.919921875
43 28928.5078125
44 26659.01953125
45 24593.31640625
46 22709.498046875
47 20985.78515625
48 19408.140625
49 17961.970703125
50 16635.021484375
51 15416.95703125
52 14297.2294921875
53 13267.400390625
54 12319.7177734375
55 11446.4833984375
56 10641.1748046875
57 9899.615234375
58 9215.94140625
59 8584.3564453125
60 8000.400390625
61 7459.874

416 0.0008636455750092864
417 0.0008394715841859579
418 0.0008148981723934412
419 0.0007906724931672215
420 0.000768768135458231
421 0.0007469847914762795
422 0.0007267239270731807
423 0.0007061633514240384
424 0.0006870788056403399
425 0.0006679503130726516
426 0.0006509401137009263
427 0.0006340030231513083
428 0.0006152771529741585
429 0.0006002737791277468
430 0.0005835071788169444
431 0.0005668643862009048
432 0.0005519094993360341
433 0.0005385226104408503
434 0.0005242492188699543
435 0.0005102464929223061
436 0.0004977599019184709
437 0.0004839378234464675
438 0.0004723085439763963
439 0.0004610895412042737
440 0.00044903281377628446
441 0.0004370705282781273
442 0.00042660938925109804
443 0.00041599071118980646
444 0.0004065292014274746
445 0.0003970189718529582
446 0.0003874475951306522
447 0.00037773867370560765
448 0.00036858770181424916
449 0.0003591476706787944
450 0.0003516256110742688
451 0.0003430906217545271
452 0.00033463287400081754
453 0.00032685717451386154
454 0.

## Static Graphs
PyTorch autograd looks a lot like TensorFlow: in both frameworks we define a computational graph, and use automatic differentiation to compute gradients. The biggest difference between the two is that TensorFlow’s computational graphs are **static** and PyTorch uses **dynamic** computational graphs.

In TensorFlow, we define the computational graph once and then execute the same graph over and over again, possibly feeding different input data to the graph. In PyTorch, each forward pass defines a new computational graph.

## Implement a two-layer network using TensorFlow

In [6]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

# Step 1: build the computational graph.

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders for the inputs and targets; they will be filled with real
# data when the graph is executed.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Variables for the weights, initialized with random data. A TensorFlow
# Variable keeps its value from one execution of the graph to the next.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: linear -> ReLU -> linear. Nothing numeric is computed by
# these lines; they only add nodes to the graph that is executed later.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Loss: sum of squared errors.
loss = tf.reduce_sum((y - y_pred) ** 2)

# Gradients of the loss with respect to each weight.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient-descent update. In TensorFlow the act of updating the weights
# is itself part of the computational graph (in PyTorch it happens outside
# the graph), so new_w1 and new_w2 must be evaluated when running the
# graph for the weights to actually change.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Step 2: the graph is complete, so open a session to execute it.
with tf.Session() as sess:
    # Run the initializer once to give the Variables w1 and w2 their values.
    sess.run(tf.global_variables_initializer())

    # Numpy arrays holding the actual inputs and targets.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)

    # The same data is bound to the placeholders on every run, and the
    # same three nodes are evaluated each time: the loss plus both
    # weight updates.
    feed = {x: x_value, y: y_value}
    fetches = [loss, new_w1, new_w2]
    for step in range(500):
        # sess.run returns one value per fetch; only the loss is printed.
        loss_value = sess.run(fetches, feed_dict=feed)[0]
        print(loss_value)


26886272.0
19570292.0
17160014.0
16544433.0
16220745.0
15199123.0
13201639.0
10460588.0
7666015.0
5279841.0
3525283.5
2337395.5
1576406.8
1096792.5
794870.44
600939.25
472333.94
383367.2
319124.5
270702.22
232829.28
202284.44
177064.47
155878.95
137847.56
122325.61
108863.17
97124.234
86843.19
77796.47
69813.23
62751.08
56490.336
50929.266
45981.812
41570.55
37631.625
34111.016
30954.508
28120.781
25574.418
23283.64
21219.195
19356.31
17673.734
16153.131
14776.628
13529.139
12396.707
11367.941
10433.395
9582.956
8809.852
8105.2144
7462.0786
6874.4736
6337.32
5845.838
5396.071
4983.806
4605.9
4258.9844
3940.271
3647.4722
3378.1067
3130.2454
2902.0078
2691.691
2497.7927
2319.0098
2153.903
2001.5131
1860.7299
1730.5303
1610.1108
1498.6621
1395.4591
1299.8297
1211.1908
1128.98
1052.7173
981.92944
916.2127
855.1673
798.41626
745.6631
696.59955
650.9559
608.47485
568.9159
532.0665
497.73758
465.74103
435.9093
408.08008
382.12054
357.88843
335.26816
314.1419
294.41245
275.97665
258.74945
242.