# Learning PyTorch with Examples
PyTorch provides two main features:
- An n-dimensional Tensor, similar to a numpy array but able to run on GPUs
- Automatic differentiation for building and training neural networks

We will use a fully-connected **ReLU** network as our running example. The network will have a single hidden layer, and will be trained with gradient descent to fit random data by minimizing the Euclidean distance between the network output and the true output.

## Implement a two-layer network using numpy

In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets, both drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two layers, also standard-normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)                   # (N, H)
    h_relu = np.maximum(h, 0)           # (N, H), negatives clipped to zero
    y_pred = np.dot(h_relu, w2)         # (N, D_out)

    # Sum of squared errors over every element; this is a scalar.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * (y_pred - y)            # (N, D_out)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)     # (H, D_out)
    grad_h_relu = np.dot(grad_y_pred, w2.T)     # (N, H)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)  # (N, H)
    grad_w1 = np.dot(x.T, grad_h)               # (D_in, H)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 28290521.075577162
1 21645208.76046092
2 20400589.343689576
3 21158235.74194018
4 21698472.14936565
5 20535400.15936073
6 17119185.131780095
7 12584687.849888708
8 8273483.843313586
9 5135100.678233853
10 3134362.082539637
11 1969491.765213817
12 1301010.7142796733
13 915770.3911180311
14 684224.8375645059
15 537729.9985413661
16 438933.6019306931
17 368370.011655303
18 315191.12228534196
19 273361.10573216766
20 239344.79643779632
21 211051.24137945854
22 187130.7749828172
23 166630.72257391812
24 148879.15406694298
25 133401.86057001495
26 119861.11706730993
27 107931.46049678212
28 97396.4102033535
29 88054.3069260213
30 79744.88586259574
31 72317.89440276241
32 65683.8974720884
33 59743.436404803484
34 54414.680167454615
35 49628.71056325552
36 45315.98626868153
37 41426.74133157681
38 37915.11402212554
39 34733.53979527706
40 31848.647889627806
41 29232.11781091209
42 26855.568928715387
43 24693.68212457878
44 22725.762185111693
45 20928.547335092153
46 19287.94244152186
47 1778

402 3.1699883027552184e-05
403 3.0128359928487568e-05
404 2.863496781459652e-05
405 2.721588348777006e-05
406 2.5868128598076548e-05
407 2.4586867092581053e-05
408 2.336906139346171e-05
409 2.2211859923628664e-05
410 2.11122847210468e-05
411 2.0067304356078385e-05
412 1.907410582534612e-05
413 1.813037866251114e-05
414 1.7233527111771755e-05
415 1.6381060444630605e-05
416 1.5571028289573728e-05
417 1.4801150128930596e-05
418 1.4069511199485202e-05
419 1.3374164116139967e-05
420 1.2713272422758724e-05
421 1.2085135032788354e-05
422 1.1488160547497559e-05
423 1.0920781488197431e-05
424 1.0381501228285423e-05
425 9.868959940669828e-06
426 9.381888716608147e-06
427 8.918851313013702e-06
428 8.478709812811282e-06
429 8.060423445115004e-06
430 7.66279240661699e-06
431 7.284866740210308e-06
432 6.9256422219475774e-06
433 6.584200196372217e-06
434 6.25974487399297e-06
435 5.951204510206043e-06
436 5.657952131910021e-06
437 5.379187203893013e-06
438 5.11416405046982e-06
439 4.862285759453681e-0

## Tensors
Numpy is a great framework, but it cannot utilize GPUs to accelerate its numerical computations. For modern deep neural networks, GPUs often provide speedups of 50x or greater, so unfortunately numpy won’t be enough for modern deep learning.

A Tensor is an n-dimensional array. Tensors can keep track of a computational graph and gradients, but they’re also useful as a generic tool for scientific computing.

## Implement a two-layer network using PyTorch Tensors

In [2]:
# -*- coding: utf-8 -*-
import torch
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets, drawn from a standard normal.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weight matrices of the two layers, also standard-normal.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors, pulled out of the Tensor as a Python number.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 45891948.0
1 47375636.0
2 45914408.0
3 33630972.0
4 18118394.0
5 7947697.0
6 3761586.75
7 2221075.75
8 1577524.25
9 1230560.0
10 1000509.875
11 829534.25
12 696164.375
13 589343.5625
14 502461.25
15 431009.21875
16 371763.15625
17 322329.71875
18 280746.4375
19 245577.484375
20 215655.03125
21 190084.84375
22 168095.3125
23 149115.65625
24 132658.15625
25 118324.609375
26 105812.34375
27 94854.3359375
28 85218.8046875
29 76728.421875
30 69229.7890625
31 62589.7109375
32 56684.30078125
33 51441.51953125
34 46763.00390625
35 42574.625
36 38818.53125
37 35449.9609375
38 32418.578125
39 29683.384765625
40 27213.185546875
41 24977.681640625
42 22951.28515625
43 21111.955078125
44 19439.42578125
45 17917.75390625
46 16531.9140625
47 15266.16796875
48 14108.978515625
49 13050.775390625
50 12081.81640625
51 11194.234375
52 10379.345703125
53 9630.9208984375
54 8943.0625
55 8310.046875
56 7726.56982421875
57 7188.38037109375
58 6691.685546875
59 6232.78466796875
60 5808.525390625
61 5415.8916

411 0.0001790947135305032
412 0.00017527700401842594
413 0.00017094584472943097
414 0.00016717531252652407
415 0.0001641115522943437
416 0.00016003781638573855
417 0.0001569445157656446
418 0.00015291098679881543
419 0.00015016715042293072
420 0.00014704064233228564
421 0.00014342876966111362
422 0.00014080698019824922
423 0.00013805419439449906
424 0.00013498372572939843
425 0.00013236627273727208
426 0.0001297184353461489
427 0.00012741282989736646
428 0.00012483372120186687
429 0.0001222981372848153
430 0.00012026851618429646
431 0.0001180406688945368
432 0.00011528061440913007
433 0.00011350826389389113
434 0.0001114951737690717
435 0.00010951989679597318
436 0.00010758257849374786
437 0.00010606545401969925
438 0.0001041053983499296
439 0.0001021354109980166
440 0.00010014708095695823
441 9.816746023716405e-05
442 9.655396570451558e-05
443 9.483290341449901e-05
444 9.280051017412916e-05
445 9.107403457164764e-05
446 8.961969433585182e-05
447 8.807852282188833e-05
448 8.64947069203

## Autograd
For large, complex networks, manually implementing the backward pass quickly becomes tedious and error-prone.
PyTorch provides the autograd package to automate it. When using autograd, the forward pass defines a computational graph; nodes in the graph are Tensors, and edges are functions that produce output Tensors from input Tensors.
Backpropagating through this graph then allows you to easily compute gradients.

## Implement a two-layer network using Tensors and autograd

In [5]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is left at its default of False, so no
# gradients are computed with respect to these Tensors.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to accumulate d(loss)/d(weight)
# into the .grad attribute of each Tensor during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear. The intermediate activations
    # are not named because the backward pass is no longer written by hand.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum of squared errors as a zero-dimensional Tensor; .item() extracts
    # the Python number for printing.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Let autograd fill w1.grad and w2.grad with the gradient of the loss.
    loss.backward()

    # Gradient-descent step. The update runs under torch.no_grad() so that
    # the in-place modification of the weights is not itself recorded by
    # autograd. (Operating on weight.data, or using torch.optim.SGD, are
    # alternative ways to do this.)
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Reset the gradient so the next backward() starts from zero.
            weight.grad.zero_()

0 34631852.0
1 31680750.0
2 33351380.0
3 33346008.0
4 28377408.0
5 19473534.0
6 11141001.0
7 5789729.5
8 3074787.0
9 1807016.0
10 1204239.0
11 889983.8125
12 703959.25
13 579501.4375
14 488177.84375
15 416952.4375
16 359514.03125
17 312055.03125
18 272298.875
19 238695.453125
20 210112.515625
21 185621.359375
22 164521.921875
23 146247.9375
24 130397.2421875
25 116558.5234375
26 104442.2265625
27 93826.296875
28 84453.8359375
29 76169.640625
30 68825.4375
31 62301.16015625
32 56487.1015625
33 51303.65625
34 46663.15625
35 42499.48046875
36 38759.79296875
37 35393.8046875
38 32355.056640625
39 29609.7578125
40 27126.068359375
41 24876.92578125
42 22835.474609375
43 20979.296875
44 19291.5390625
45 17754.734375
46 16352.5517578125
47 15072.19921875
48 13902.251953125
49 12833.619140625
50 11854.6943359375
51 10957.96484375
52 10134.91015625
53 9378.908203125
54 8684.171875
55 8045.25146484375
56 7457.06591796875
57 6915.42138671875
58 6415.7041015625
59 5955.05126953125
60 5529.786621093

415 8.783919474808499e-05
416 8.637808059575036e-05
417 8.440810051979497e-05
418 8.280852489406243e-05
419 8.096560486592352e-05
420 7.934790482977405e-05
421 7.805316272424534e-05
422 7.646423910045996e-05
423 7.512229785788804e-05
424 7.366599311353639e-05
425 7.210968033177778e-05
426 7.09704909240827e-05
427 6.933657277841121e-05
428 6.823812873335555e-05
429 6.660598592134193e-05
430 6.533585110446438e-05
431 6.438088894356042e-05
432 6.308798037935048e-05
433 6.200330244610086e-05
434 6.0882728575961664e-05
435 5.9797323046950623e-05
436 5.890893589821644e-05
437 5.7752375141717494e-05
438 5.6929893617052585e-05
439 5.600742588285357e-05
440 5.507422611117363e-05
441 5.419065564638004e-05
442 5.325615347828716e-05
443 5.240320024313405e-05
444 5.1536986575229093e-05
445 5.071397390565835e-05
446 5.000878809369169e-05
447 4.9135869630845264e-05
448 4.853017162531614e-05
449 4.766682468471117e-05
450 4.678230834542774e-05
451 4.606147558661178e-05
452 4.5276865421328694e-05
453 4.

## Defining new autograd functions
Each primitive autograd operator is really two functions that operate on Tensors. The **forward** function computes output Tensors from input Tensors. The **backward** function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.

In [5]:
# -*- coding: utf-8 -*-
import torch

class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    A custom Function subclasses torch.autograd.Function and supplies static
    forward and backward methods that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return the input with every negative entry replaced by zero.

        ctx is the context object shared with backward; the input Tensor is
        stashed on it with ctx.save_for_backward so that backward can tell
        which entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Map the gradient of the loss with respect to the output onto the
        gradient of the loss with respect to the input: entries whose input
        was negative get zero, all others pass through unchanged.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True makes autograd
# accumulate their gradients into .grad during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through its apply method. The alias does not
# depend on anything inside the training loop, so it is bound once here
# rather than on every iteration.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss (sum of squared errors).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass; this fills w1.grad and
    # w2.grad.
    loss.backward()

    # Update weights using gradient descent. The update runs under
    # torch.no_grad() so that it is not recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, so the next
        # backward() does not add to the previous iteration's gradients.
        w1.grad.zero_()
        w2.grad.zero_()


0 31911158.0
1 27687814.0
2 32939532.0
3 43625588.0
4 52278224.0
5 46987384.0
6 28486524.0
7 11761985.0
8 4348692.0
9 2020345.875
10 1299362.75
11 1004281.25
12 832285.5625
13 707450.5
14 608125.3125
15 526358.5625
16 458129.75
17 400664.0
18 351800.625
19 310020.71875
20 274162.96875
21 243224.171875
22 216417.09375
23 193028.1875
24 172616.40625
25 154733.21875
26 139029.0625
27 125183.265625
28 112933.1015625
29 102074.8046875
30 92432.328125
31 83848.515625
32 76189.9609375
33 69333.0078125
34 63181.32421875
35 57657.91015625
36 52682.1015625
37 48196.40234375
38 44149.078125
39 40487.6328125
40 37168.390625
41 34157.3046875
42 31419.919921875
43 28928.5078125
44 26659.01953125
45 24593.31640625
46 22709.498046875
47 20985.78515625
48 19408.140625
49 17961.970703125
50 16635.021484375
51 15416.95703125
52 14297.2294921875
53 13267.400390625
54 12319.7177734375
55 11446.4833984375
56 10641.1748046875
57 9899.615234375
58 9215.94140625
59 8584.3564453125
60 8000.400390625
61 7459.874

416 0.0008636455750092864
417 0.0008394715841859579
418 0.0008148981723934412
419 0.0007906724931672215
420 0.000768768135458231
421 0.0007469847914762795
422 0.0007267239270731807
423 0.0007061633514240384
424 0.0006870788056403399
425 0.0006679503130726516
426 0.0006509401137009263
427 0.0006340030231513083
428 0.0006152771529741585
429 0.0006002737791277468
430 0.0005835071788169444
431 0.0005668643862009048
432 0.0005519094993360341
433 0.0005385226104408503
434 0.0005242492188699543
435 0.0005102464929223061
436 0.0004977599019184709
437 0.0004839378234464675
438 0.0004723085439763963
439 0.0004610895412042737
440 0.00044903281377628446
441 0.0004370705282781273
442 0.00042660938925109804
443 0.00041599071118980646
444 0.0004065292014274746
445 0.0003970189718529582
446 0.0003874475951306522
447 0.00037773867370560765
448 0.00036858770181424916
449 0.0003591476706787944
450 0.0003516256110742688
451 0.0003430906217545271
452 0.00033463287400081754
453 0.00032685717451386154
454 0.

## Static Graphs
PyTorch autograd looks a lot like TensorFlow: in both frameworks we define a computational graph, and use automatic differentiation to compute gradients. The biggest difference between the two is that TensorFlow’s computational graphs are **static** and PyTorch uses **dynamic** computational graphs.

In TensorFlow, we define the computational graph once and then execute the same graph over and over again, possibly feeding different input data to the graph. In PyTorch, each forward pass defines a new computational graph.

## Implement a two-layer network using TensorFlow

In [6]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

# Two-layer ReLU network in TensorFlow: build the graph once, then run it
# 500 times inside a session.
# NOTE(review): tf.placeholder, tf.random_normal, tf.Session and
# tf.global_variables_initializer look like TensorFlow 1.x graph-mode APIs;
# confirm the installed TensorFlow version still exposes them.

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading dimension is given
# as None rather than N.
x = tf.placeholder(tf.float32,shape=(None,D_in))
y = tf.placeholder(tf.float32,shape=(None,D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x,w1)
h_relu = tf.maximum(h,tf.zeros(1))
y_pred = tf.matmul(h_relu,w2)

# Compute loss (sum of squared errors) using operations on TensorFlow Tensors
loss = tf.reduce_sum((y-y_pred)**2)

# Compute gradient of the loss with respect to w1 and w2
grad_w1,grad_w2 = tf.gradients(loss,[w1,w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2. The two weight values are requested only so that
        # the assign operations run; they are bound to `_` and discarded.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)


26886272.0
19570292.0
17160014.0
16544433.0
16220745.0
15199123.0
13201639.0
10460588.0
7666015.0
5279841.0
3525283.5
2337395.5
1576406.8
1096792.5
794870.44
600939.25
472333.94
383367.2
319124.5
270702.22
232829.28
202284.44
177064.47
155878.95
137847.56
122325.61
108863.17
97124.234
86843.19
77796.47
69813.23
62751.08
56490.336
50929.266
45981.812
41570.55
37631.625
34111.016
30954.508
28120.781
25574.418
23283.64
21219.195
19356.31
17673.734
16153.131
14776.628
13529.139
12396.707
11367.941
10433.395
9582.956
8809.852
8105.2144
7462.0786
6874.4736
6337.32
5845.838
5396.071
4983.806
4605.9
4258.9844
3940.271
3647.4722
3378.1067
3130.2454
2902.0078
2691.691
2497.7927
2319.0098
2153.903
2001.5131
1860.7299
1730.5303
1610.1108
1498.6621
1395.4591
1299.8297
1211.1908
1128.98
1052.7173
981.92944
916.2127
855.1673
798.41626
745.6631
696.59955
650.9559
608.47485
568.9159
532.0665
497.73758
465.74103
435.9093
408.08008
382.12054
357.88843
335.26816
314.1419
294.41245
275.97665
258.74945
242.

## nn module
Computational graphs and autograd are a very powerful paradigm for defining complex operators and automatically taking derivatives; however for large neural networks **raw autograd** can be a bit too low-level.

When building neural networks we frequently think of arranging the computation into **layers**, some of which have learnable parameters which will be optimized during learning.

In TensorFlow, packages like <font color=red>Keras</font>, <font color=red>TensorFlow-Slim</font>, and <font color=red>TFLearn</font> provide higher-level abstractions over raw computational graphs that are useful for building neural networks.

In PyTorch, the <font color=red>nn</font> package serves this same purpose. The <font color=red>nn</font> package defines a set of **Modules**, which are roughly equivalent to neural network layers. A Module receives input Tensors and computes output Tensors, but may also hold internal state such as Tensors containing learnable parameters. The <font color=red>nn</font> package also defines a set of useful loss functions that are commonly used when training neural networks.

## Implement a two-layer network using nn

In [8]:
# -*- coding:utf-8 -*-
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The model is a Sequential container that applies its child Modules in
# order: linear -> ReLU -> linear. Each Linear Module owns its own weight
# and bias Tensors.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Mean Squared Error loss from the nn package; reduction='sum' adds the
# squared errors up instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass. A Module is callable: given a Tensor of inputs it
    # returns a Tensor of outputs.
    y_pred = model(x)

    # The loss function takes predictions and targets and returns the loss
    # as a Tensor; .item() extracts the number for printing.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear gradients left over from the previous iteration.
    model.zero_grad()

    # Backward pass: fills .grad on every learnable parameter of the model
    # (those are Tensors with requires_grad=True).
    loss.backward()

    # Gradient-descent step on each parameter, kept out of autograd's
    # bookkeeping by torch.no_grad().
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


0 709.9724731445312
1 656.5048217773438
2 610.3779296875
3 569.9826049804688
4 534.25048828125
5 502.0862731933594
6 472.671142578125
7 445.751953125
8 421.0672912597656
9 398.0520935058594
10 376.4727783203125
11 356.1627197265625
12 337.0384216308594
13 318.9648742675781
14 301.8168029785156
15 285.671630859375
16 270.2654113769531
17 255.52145385742188
18 241.5064239501953
19 228.11341857910156
20 215.33572387695312
21 203.1604461669922
22 191.56085205078125
23 180.51390075683594
24 170.0073699951172
25 160.0406036376953
26 150.59124755859375
27 141.61888122558594
28 133.09971618652344
29 124.99217224121094
30 117.33737182617188
31 110.13665771484375
32 103.34835052490234
33 96.94886779785156
34 90.92124938964844
35 85.25296020507812
36 79.92301177978516
37 74.90986633300781
38 70.20712280273438
39 65.79519653320312
40 61.652400970458984
41 57.7713737487793
42 54.14228820800781
43 50.741451263427734
44 47.557861328125
45 44.579734802246094
46 41.79837417602539
47 39.20222473144531
4

407 4.7332079702755436e-05
408 4.590882599586621e-05
409 4.45249788754154e-05
410 4.318711580708623e-05
411 4.1885294194798917e-05
412 4.062478183186613e-05
413 3.940357419196516e-05
414 3.821475547738373e-05
415 3.706423012772575e-05
416 3.595264570321888e-05
417 3.487154754111543e-05
418 3.382170689292252e-05
419 3.28049500240013e-05
420 3.181966167176142e-05
421 3.0864397558616474e-05
422 2.993874113599304e-05
423 2.9037841159151867e-05
424 2.816545202222187e-05
425 2.731847962422762e-05
426 2.649947055033408e-05
427 2.5704101062729023e-05
428 2.493554347893223e-05
429 2.4187434974010102e-05
430 2.3460344891645946e-05
431 2.2757041733711958e-05
432 2.2072987121646293e-05
433 2.141096774721518e-05
434 2.0770030459971167e-05
435 2.014582059928216e-05
436 1.954276376636699e-05
437 1.8959559383802116e-05
438 1.839234391809441e-05
439 1.78402588062454e-05
440 1.730768781271763e-05
441 1.6787938875495456e-05
442 1.628625250305049e-05
443 1.5799530956428498e-05
444 1.5325404092436656e-05
4

## optim
Up to this point we have updated the weights of our models by manually mutating the Tensors holding learnable parameters (with **torch.no_grad()** or **.data** to **avoid tracking history in autograd**). This is not a huge burden for simple optimization algorithms like stochastic gradient descent, but in practice we often train neural networks using more sophisticated optimizers like <font color=red>AdaGrad</font>, <font color=red>RMSProp</font>, <font color=red>Adam</font>, etc.

## Implement a two-layer network using nn and optim

In [12]:
# -*- coding: utf-8 -*-
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model (linear -> ReLU -> linear) and summed squared-error loss, both
# taken from the nn package.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# An Optimizer from the optim package performs the weight updates for us.
# Adam is used here; the package offers many other optimization algorithms.
# The first constructor argument names the Tensors the optimizer should
# update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass.
    y_pred = model(x)

    # Loss for this iteration, printed as a plain number.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Gradients accumulate across backward() calls (they are added to, not
    # overwritten), so clear the ones belonging to the optimizer's
    # parameters before computing new ones.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model
    # parameters.
    loss.backward()

    # Let the optimizer apply one update to its parameters.
    optimizer.step()


0 765.7986450195312
1 747.5376586914062
2 729.778564453125
3 712.5255126953125
4 695.8483276367188
5 679.6951293945312
6 664.0742797851562
7 648.9552001953125
8 634.1929321289062
9 619.7285766601562
10 605.6453247070312
11 591.885498046875
12 578.4395751953125
13 565.3589477539062
14 552.61083984375
15 540.2769165039062
16 528.2855224609375
17 516.623291015625
18 505.2320556640625
19 494.12335205078125
20 483.22357177734375
21 472.5820617675781
22 462.2034912109375
23 452.1192626953125
24 442.2723083496094
25 432.648193359375
26 423.25836181640625
27 414.08343505859375
28 405.14013671875
29 396.4158020019531
30 387.87078857421875
31 379.47064208984375
32 371.23321533203125
33 363.167236328125
34 355.2984619140625
35 347.6051940917969
36 340.0998840332031
37 332.7415771484375
38 325.54144287109375
39 318.4938049316406
40 311.5547790527344
41 304.75872802734375
42 298.07598876953125
43 291.521240234375
44 285.08441162109375
45 278.75634765625
46 272.5406188964844
47 266.4162902832031
48 

363 0.00029560652910731733
364 0.0002776604378595948
365 0.00026078737573698163
366 0.00024490876239724457
367 0.0002299654297530651
368 0.00021590766846202314
369 0.00020268223306629807
370 0.00019024165521841496
371 0.0001785519125405699
372 0.0001675552048254758
373 0.0001572129112901166
374 0.00014749125693924725
375 0.00013835460413247347
376 0.00012976930884178728
377 0.0001216999880853109
378 0.00011411790910642594
379 0.00010699986160034314
380 0.0001003027573460713
381 9.402444993611425e-05
382 8.812136366032064e-05
383 8.258066372945905e-05
384 7.737752457614988e-05
385 7.249216287164018e-05
386 6.790582119720057e-05
387 6.360786937875673e-05
388 5.957033499726094e-05
389 5.5784992582630366e-05
390 5.223148036748171e-05
391 4.889287811238319e-05
392 4.576711216941476e-05
393 4.2836269130930305e-05
394 4.008788164355792e-05
395 3.7508620152948424e-05
396 3.509224552544765e-05
397 3.282909528934397e-05
398 3.070721868425608e-05
399 2.8718795874738134e-05
400 2.6854255338548683e

## Custom nn Modules
Sometimes you will want to specify models that are **more complex** than a sequence of existing Modules; for these cases you can define your own Modules by subclassing <font color=red>nn.Module</font> and defining a <font color=red>forward</font> which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.

In [14]:
# -*- coding:utf-8 -*-
import torch

class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a clamp-at-zero (ReLU) non-linearity between them."""

    def __init__(self, D_in: int, H: int, D_out: int) -> None:
        """
        Build the two nn.Linear layers and keep them as attributes.

        Args:
            D_in: number of input features.
            H: number of hidden units.
            D_out: number of output features.
        """
        super().__init__()
        # Assigning the layers as attributes registers their parameters
        # with this Module, so model.parameters() will yield them.
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Map a Tensor of inputs to a Tensor of predictions.

        The input goes through the first linear layer, negative activations
        are clamped to zero, and the result goes through the second layer.
        """
        hidden = self.linear1(x)
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)
    
# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random targets for the network to fit.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = TwoLayerNet(D_in, H, D_out)

# Summed squared-error loss, and plain SGD over every learnable parameter
# of the model (the weights and biases of its two nn.Linear members).
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Clear the gradients left over from the previous iteration so that
    # this iteration's backward pass starts from a clean slate.
    optimizer.zero_grad()

    # Forward pass, then report the loss for this step.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Backward pass followed by the parameter update.
    loss.backward()
    optimizer.step()

0 684.8701782226562
1 641.437744140625
2 602.6382446289062
3 567.9405517578125
4 536.50244140625
5 507.85455322265625
6 481.5986633300781
7 457.2810974121094
8 434.4634094238281
9 413.1439208984375
10 393.0639343261719
11 373.9920959472656
12 355.8914794921875
13 338.58380126953125
14 321.95245361328125
15 306.05731201171875
16 290.7934875488281
17 276.162841796875
18 262.1285095214844
19 248.67166137695312
20 235.73411560058594
21 223.39169311523438
22 211.58529663085938
23 200.30838012695312
24 189.5380859375
25 179.2466278076172
26 169.4149627685547
27 160.02346801757812
28 151.08328247070312
29 142.58432006835938
30 134.5025177001953
31 126.81761932373047
32 119.52880096435547
33 112.63299560546875
34 106.07634735107422
35 99.84922790527344
36 93.97347259521484
37 88.4239730834961
38 83.18458557128906
39 78.24859619140625
40 73.60542297363281
41 69.2305679321289
42 65.10758972167969
43 61.22254943847656
44 57.5510368347168
45 54.10408401489258
46 50.86737823486328
47 47.81668472290

422 1.5301435269066133e-05
423 1.480148875998566e-05
424 1.4319110050564632e-05
425 1.3853618838766124e-05
426 1.3402280274021905e-05
427 1.2966574104211759e-05
428 1.2543792763608508e-05
429 1.2134182725276332e-05
430 1.1739084584405646e-05
431 1.1359396012267098e-05
432 1.0989296242769342e-05
433 1.0630778888298664e-05
434 1.0285148164257407e-05
435 9.952063010132406e-06
436 9.628648513171356e-06
437 9.315850547864102e-06
438 9.014368515636306e-06
439 8.720833648112603e-06
440 8.438310942437965e-06
441 8.163572601915803e-06
442 7.899099728092551e-06
443 7.642092896276154e-06
444 7.395170996460365e-06
445 7.155751973186852e-06
446 6.924418357812101e-06
447 6.700363883282989e-06
448 6.482776370830834e-06
449 6.272870450629853e-06
450 6.06940193392802e-06
451 5.873630470887292e-06
452 5.683173640136374e-06
453 5.499484814208699e-06
454 5.321308890415821e-06
455 5.149362550582737e-06
456 4.982907739758957e-06
457 4.821979928237852e-06
458 4.666374479711521e-06
459 4.515428372542374e-06
4

## Control Flow + Weight Sharing

In [15]:
# -*- coding:utf-8 -*-
import random
import torch

class DynamicNet(torch.nn.Module):
    """Network that applies one shared hidden layer a random number of times."""

    def __init__(self, D_in: int, H: int, D_out: int) -> None:
        """
        Build the three nn.Linear layers used by the forward pass.

        Args:
            D_in: number of input features.
            H: number of hidden units.
            D_out: number of output features.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the input through a randomly chosen number of hidden layers.

        On every call an integer in {0, 1, 2, 3} is drawn, and middle_linear
        is applied that many times, each time followed by clamping at zero.

        Because the computation graph is rebuilt on each forward pass, we can
        use ordinary Python control flow (loops, conditionals) to define it.

        Reusing the same Module several times within one graph is perfectly
        safe; every application shares the same weights. This is a big
        improvement over Lua Torch, where each Module could be used only once.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        # Drawn once per forward pass, after the input layer has been applied.
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)
    
# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random targets for the network to fit.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct the model by instantiating the class defined above.
model = DynamicNet(D_in, H, D_out)

# Summed squared-error loss. Training this strange model with vanilla
# stochastic gradient descent is tough, so the optimizer uses momentum.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Clear the gradients left over from the previous iteration so that
    # this iteration's backward pass starts from a clean slate.
    optimizer.zero_grad()

    # Forward pass, then report the loss for this step.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Backward pass followed by the parameter update.
    loss.backward()
    optimizer.step()

0 621.6454467773438
1 621.7218017578125
2 628.2977294921875
3 617.3031005859375
4 612.5306396484375
5 644.595703125
6 592.6005249023438
7 582.8823852539062
8 600.68505859375
9 563.1325073242188
10 596.2572021484375
11 347.19085693359375
12 531.9234008789062
13 588.4441528320312
14 505.7809143066406
15 604.4092407226562
16 222.50831604003906
17 455.02655029296875
18 171.04696655273438
19 555.9972534179688
20 118.83098602294922
21 97.2779312133789
22 79.04486846923828
23 64.51116943359375
24 582.4295043945312
25 314.6124267578125
26 48.37520980834961
27 273.9366455078125
28 554.2511596679688
29 60.034812927246094
30 61.17797088623047
31 414.2515869140625
32 49.45159912109375
33 176.42892456054688
34 160.27777099609375
35 454.87896728515625
36 34.1974983215332
37 31.65951919555664
38 25.5064754486084
39 254.93113708496094
40 15.455772399902344
41 216.33004760742188
42 193.34747314453125
43 82.87884521484375
44 149.89588928222656
45 127.96312713623047
46 186.4084014892578
47 100.4374847412

372 0.6369782090187073
373 0.46859297156333923
374 0.6976785063743591
375 0.38466984033584595
376 0.23544363677501678
377 0.3877418339252472
378 0.8902551531791687
379 0.21903903782367706
380 0.22556523978710175
381 0.6170798540115356
382 0.5656839609146118
383 0.49619874358177185
384 1.1596823930740356
385 0.89170241355896
386 0.22986067831516266
387 0.5788633823394775
388 1.1383713483810425
389 0.5944957733154297
390 0.18069852888584137
391 0.6571789979934692
392 0.5575183033943176
393 0.5104073882102966
394 0.1836681216955185
395 0.166364386677742
396 1.046825647354126
397 0.4645979702472687
398 0.6364336013793945
399 0.24012765288352966
400 0.30257678031921387
401 1.40249764919281
402 1.1926906108856201
403 0.07629396766424179
404 0.21053647994995117
405 0.3639165461063385
406 0.7058221101760864
407 0.5638779401779175
408 0.2600124776363373
409 0.5355538725852966
410 0.11700178682804108
411 0.10924822837114334
412 0.6325536370277405
413 1.28786039352417
414 0.5820047855377197
415 0