In [2]:
import torch


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 28676482.497397907
1 23965886.513415992
2 22325888.37118727
3 20731011.705596924
4 17847816.391861275
5 13883854.525408119
6 9767455.885037675
7 6414076.239766388
8 4092013.3348313724
9 2641972.8625509217
10 1775440.718993708
11 1260306.2472379066
12 945316.8950593267
13 743279.6286606714
14 606025.4083582858
15 507312.80104036536
16 432588.91402549896
17 373713.2110443465
18 325887.5516153921
19 286245.2655545585
20 252803.9054976724
21 224280.73082003754
22 199765.48642932487
23 178605.7378738626
24 160161.6341203244
25 144013.17119445
26 129820.38772125915
27 117301.48726840303
28 106212.94149942398
29 96366.5003723845
30 87617.34100141125
31 79799.08400147843
32 72800.44314587783
33 66517.18179420824
34 60863.207229801505
35 55765.85595006904
36 51164.157019722625
37 47003.92187125696
38 43230.296387947164
39 39804.59908503918
40 36686.374989989876
41 33845.139319745984
42 31252.292023138405
43 28885.436604435825
44 26724.955145384905
45 24752.215563279417
46 22943.388876296034
4

482 5.5687679856378236e-05
483 5.463943076523814e-05
484 5.4071819940076926e-05
485 5.3337978430892385e-05
486 5.2641257290153565e-05
487 5.1904853742956725e-05
488 5.1125623296746636e-05
489 5.029253701732683e-05
490 4.9477430098934794e-05
491 4.871886813402615e-05
492 4.8090234028533296e-05
493 4.755359102577572e-05
494 4.6599577969430594e-05
495 4.6187124971119875e-05
496 4.5546439445709463e-05
497 4.5053471565748016e-05
498 4.426680337372979e-05
499 4.3737294482398825e-05


In [6]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

#dtype = torch.FloatTensor
dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(3):
    # Forward pass: compute predicted y using operations on Variables; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Variables.
    # Now loss is a Variable of shape (1,) and loss.data is a Tensor of shape
    # (1,); loss.data[0] is a scalar value holding the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Variables with requires_grad=True.
    # After this call w1.grad and w2.grad will be Variables holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent; w1.data and w2.data are Tensors,
    # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
    # Tensors.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 24727718.0
1 17994948.0
2 14547852.0


In [2]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    def forward(self, input):
        """
        In the forward pass we receive a Tensor containing the input and return a
        Tensor containing the output. You can cache arbitrary Tensors for use in the
        backward pass using the save_for_backward method.
        """
        self.save_for_backward(input)
        return input.clamp(min=0)

    def backward(self, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = self.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.FloatTensor
dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Construct an instance of our MyReLU class to use in our network
    relu = MyReLU()

    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()


0 24086628.0
1 20121036.0
2 19003920.0
3 18184696.0
4 16461066.0
5 13484244.0
6 9995575.0
7 6783247.0
8 4389451.5
9 2798093.75
10 1822152.5
11 1235690.875
12 882750.375
13 663415.6875
14 521005.46875
15 423498.1875
16 353219.4375
17 300080.8125
18 258331.0625
19 224521.984375
20 196562.875
21 173042.171875
22 153017.46875
23 135830.796875
24 120971.4765625
25 108059.890625
26 96787.0859375
27 86903.8984375
28 78217.71875
29 70563.859375
30 63783.72265625
31 57762.484375
32 52396.12109375
33 47603.97265625
34 43320.20703125
35 39481.62890625
36 36033.48828125
37 32928.65625
38 30127.060546875
39 27594.841796875
40 25302.501953125
41 23223.845703125
42 21337.06640625
43 19621.001953125
44 18058.3359375
45 16634.361328125
46 15334.2138671875
47 14146.0654296875
48 13059.0537109375
49 12063.9990234375
50 11152.076171875
51 10315.2431640625
52 9546.7529296875
53 8840.59765625
54 8190.84033203125
55 7592.71142578125
56 7041.8154296875
57 6533.86865234375
58 6065.0126953125
59 5632.2177734375

482 3.036273665202316e-05
483 2.9804959922330454e-05
484 2.9563147109001875e-05
485 2.9158076358726248e-05
486 2.869852505682502e-05
487 2.842980575223919e-05
488 2.8036571166012436e-05
489 2.7760195735027082e-05
490 2.7446405511000194e-05
491 2.729773405008018e-05
492 2.6985622753272764e-05
493 2.6684308977564797e-05
494 2.6404573873151094e-05
495 2.6144376533920877e-05
496 2.6003841412602924e-05
497 2.5721272322698496e-05
498 2.5351750082336366e-05
499 2.513606523280032e-05


In [1]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)

  return f(*args, **kwds)


3.25145e+07
2.76097e+07
2.77802e+07
2.80369e+07
2.54698e+07
1.96257e+07
1.28596e+07
7.49538e+06
4.21512e+06
2.46301e+06
1.56808e+06
1.09849e+06
833224.0
667714.0
554058.0
469672.0
403490.0
349734.0
305081.0
267522.0
235503.0
208000.0
184257.0
163689.0
145787.0
130132.0
116402.0
104329.0
93677.6
84262.9
75919.4
68501.6
61898.7
56009.4
50746.3
46032.7
41804.2
38004.7
34585.6
31508.4
28731.1
26222.5
23953.3
21898.1
20035.3
18351.5
16823.2
15434.2
14171.9
13028.3
11985.1
11033.6
10163.8
9368.74
8640.75
7973.82
7362.27
6801.27
6286.45
5813.58
5378.96
4979.35
4611.63
4272.94
3960.84
3673.42
3408.5
3163.92
2938.06
2728.98
2535.78
2357.25
2192.13
2039.29
1897.77
1766.62
1645.09
1532.41
1427.9
1330.9
1240.86
1157.26
1079.6
1007.42
940.327
877.938
819.913
765.911
715.635
668.816
625.235
584.624
546.782
511.483
478.586
447.892
419.256
392.54
367.588
344.3
322.577
302.281
283.321
265.593
249.022
233.523
219.024
205.464
192.777
180.901
169.786
159.377
149.631
140.504
131.954
123.941
116.429
109.389

In [9]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype))
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Variables for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H).type(dtype),
    torch.nn.ReLU().type(dtype),
    torch.nn.Linear(H, D_out).type(dtype),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Variable of input data to the Module and it produces
    # a Variable of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Variables containing the predicted and true
    # values of y, and the loss function returns a Variable containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Variables with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Variable, so
    # we can access its data and gradients like we did before.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 688.768310546875
1 634.3409423828125
2 588.1471557617188
3 547.951171875
4 512.3643798828125
5 480.7498474121094
6 451.8994140625
7 425.6877136230469
8 401.72412109375
9 379.54473876953125
10 358.8777770996094
11 339.56378173828125
12 321.5136413574219
13 304.54400634765625
14 288.6012878417969
15 273.59112548828125
16 259.44317626953125
17 246.0689697265625
18 233.38198852539062
19 221.22227478027344
20 209.622802734375
21 198.58499145507812
22 188.07989501953125
23 178.08340454101562
24 168.60147094726562
25 159.53439331054688
26 150.9268798828125
27 142.74002075195312
28 134.9234161376953
29 127.50122833251953
30 120.46525573730469
31 113.80619812011719
32 107.51043701171875
33 101.54353332519531
34 95.87861633300781
35 90.52456665039062
36 85.46832275390625
37 80.68843078613281
38 76.17933654785156
39 71.929443359375
40 67.91213989257812
41 64.12391662597656
42 60.553321838378906
43 57.185333251953125
44 54.00910949707031
45 51.0123291015625
46 48.1849479675293
47 45.519996643066

436 5.6377244618488476e-05
437 5.484482244355604e-05
438 5.335819878382608e-05
439 5.1910086767748e-05
440 5.050359322922304e-05
441 4.913658631267026e-05
442 4.7803270717849955e-05
443 4.650843038689345e-05
444 4.525059921434149e-05
445 4.402386548463255e-05
446 4.283379530534148e-05
447 4.167429869994521e-05
448 4.0546670788899064e-05
449 3.9453618228435516e-05
450 3.838668635580689e-05
451 3.734987330972217e-05
452 3.6342080420581624e-05
453 3.536090662237257e-05
454 3.440622458583675e-05
455 3.3478754630777985e-05
456 3.257756179664284e-05
457 3.16983278025873e-05
458 3.0844581488054246e-05
459 3.0013594368938357e-05
460 2.9204755264800042e-05
461 2.8419835871318355e-05
462 2.7655592930386774e-05
463 2.691131885512732e-05
464 2.6187557523371652e-05
465 2.548459087847732e-05
466 2.4799081074888818e-05
467 2.4132838007062674e-05
468 2.3485761630581692e-05
469 2.2855740098748356e-05
470 2.224159106845036e-05
471 2.164450052077882e-05
472 2.1063957319711335e-05
473 2.0500036043813452e-

In [None]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 100000, 1000, 10
dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype))
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).type(dtype)
loss_fn = torch.nn.MSELoss(size_average=False).type(dtype)

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Variables it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable weights
    # of the model)
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 700.1295166015625
1 1462.2896728515625
2 430.818359375
3 184.82394409179688
4 417.3115539550781
5 355.7149658203125
6 166.05194091796875
7 80.34666442871094
8 111.40391540527344
9 156.47450256347656
10 149.40567016601562
11 106.1256103515625
12 69.86040496826172
13 59.00864028930664
14 63.316280364990234
15 64.71348571777344
16 56.688751220703125
17 45.67634963989258
18 38.86406326293945
19 37.21964645385742
20 36.668907165527344
21 33.295650482177734
22 27.216564178466797
23 21.251148223876953
24 18.17379379272461
25 18.404067993164062
26 19.738988876342773
27 19.45221519470215
28 16.45364761352539
29 12.150310516357422
30 9.208830833435059
31 8.907902717590332
32 9.873128890991211
33 10.034008979797363
34 8.746550559997559
35 6.918580055236816
36 5.794905185699463
37 5.5052289962768555
38 5.342265605926514
39 4.899595260620117
40 4.282815933227539
41 3.7543773651123047
42 3.447333812713623
43 3.224381923675537
44 2.940765857696533
45 2.5973777770996094
46 2.270620107650757
47 2.039

In [14]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        In the forward function we accept a Variable of input data and we must return
        a Variable of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Variables.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 100000, 100, 10

dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype))
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out).type(dtype)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion = torch.nn.MSELoss(size_average=False).type(dtype)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.data[0])

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 680.6207275390625
1 797.928466796875
2 633.4841918945312
3 291.8186950683594
4 235.925048828125
5 184.9457550048828
6 139.656005859375
7 116.38621520996094
8 98.16300964355469
9 87.5408935546875
10 80.47701263427734
11 73.89134216308594
12 65.64324951171875
13 59.18330001831055
14 55.32588195800781
15 52.20457458496094
16 48.065879821777344
17 45.50067138671875
18 42.52301025390625
19 39.42010498046875
20 36.38455581665039
21 34.0142936706543
22 32.061744689941406
23 30.360309600830078
24 27.207584381103516
25 24.918292999267578
26 23.187267303466797
27 21.560400009155273
28 20.271190643310547
29 19.18348503112793
30 18.25925064086914
31 17.456737518310547
32 16.75872039794922
33 16.1453914642334
34 15.608312606811523
35 15.1362943649292
36 14.725401878356934
37 14.369331359863281
38 14.066527366638184
39 13.80737018585205
40 13.546483993530273
41 13.327537536621094
42 13.144830703735352
43 12.995343208312988
44 12.882376670837402
45 12.799389839172363
46 12.756230354309082
47 12.737

383 7.573263168334961
384 7.570318698883057
385 7.567375183105469
386 7.564432144165039
387 7.561490058898926
388 7.558548927307129
389 7.555610656738281
390 7.552671909332275
391 7.549734592437744
392 7.5467987060546875
393 7.543864727020264
394 7.54093074798584
395 7.537998199462891
396 7.535067081451416
397 7.532136917114258
398 7.529209136962891
399 7.526281356811523
400 7.523355007171631
401 7.520429611206055
402 7.517505645751953
403 7.514582633972168
404 7.511661529541016
405 7.508740425109863
406 7.505821228027344
407 7.502902984619141
408 7.499986171722412
409 7.497070789337158
410 7.494155406951904
411 7.491241455078125
412 7.48832893371582
413 7.48541784286499
414 7.482508182525635
415 7.479598522186279
416 7.476691246032715
417 7.47378396987915
418 7.470878601074219
419 7.4679741859436035
420 7.465070724487305
421 7.4621686935424805
422 7.459268093109131
423 7.456368446350098
424 7.453469276428223
425 7.450571537017822
426 7.447675704956055
427 7.444780349731445
428 7.44188

In [18]:
# -*- coding: utf-8 -*-
import random
import torch
from torch.autograd import Variable


class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.
        """
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from Lua
        Torch, where each Module could be used only once.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype))
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out).type(dtype)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(size_average=False).type(dtype)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.data[0])

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 645.0169677734375
1 641.960693359375
2 644.8233032226562
3 571.0902709960938
4 85.2299575805664
5 601.1029663085938
6 297.89727783203125
7 159.08444213867188
8 424.093505859375
9 297.1700439453125
10 152.26446533203125
11 279.21478271484375
12 6575.2646484375
13 4024.62255859375
14 752.6444091796875
15 59723.7421875
16 1210406.75
17 43081984.0
18 6.267762120388956e+35
19 nan
20 nan
21 nan
22 nan
23 nan
24 nan
25 nan
26 nan
27 nan
28 nan
29 nan
30 nan
31 nan
32 nan
33 nan
34 nan
35 nan
36 nan
37 nan
38 nan
39 nan
40 nan
41 nan
42 nan
43 nan
44 nan
45 nan
46 nan
47 nan
48 nan
49 nan
50 nan
51 nan
52 nan
53 nan
54 nan
55 nan
56 nan
57 nan
58 nan
59 nan
60 nan
61 nan
62 nan
63 nan
64 nan
65 nan
66 nan
67 nan
68 nan
69 nan
70 nan
71 nan
72 nan
73 nan
74 nan
75 nan
76 nan
77 nan
78 nan
79 nan
80 nan
81 nan
82 nan
83 nan
84 nan
85 nan
86 nan
87 nan
88 nan
89 nan
90 nan
91 nan
92 nan
93 nan
94 nan
95 nan
96 nan
97 nan
98 nan
99 nan
100 nan
101 nan
102 nan
103 nan
104 nan
105 nan
106 nan
107 