## Pytorch Numpy

In [None]:
import numpy as np

In [2]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [3]:
x = np.random.rand(N, D_in)
y = np.random.randn(N, D_out)

In [4]:
w1 = np.random.rand(D_in,H)
w2 = np.random.rand(H, D_out)

In [5]:
learning_rate = 1e-6

In [6]:
for t in range(500):
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    loss = np.square(y_pred-y).sum()
    print(t, loss)
    
    grad_y_pred = 2.0 * (y_pred-y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h<0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 96713599800.22156
1 646.0260604859881
2 646.0260604859881
3 646.0260604859881
4 646.0260604859881
5 646.0260604859881
6 646.0260604859881
7 646.0260604859881
8 646.0260604859881
9 646.0260604859881
10 646.0260604859881
11 646.0260604859881
12 646.0260604859881
13 646.0260604859881
14 646.0260604859881
15 646.0260604859881
16 646.0260604859881
17 646.0260604859881
18 646.0260604859881
19 646.0260604859881
20 646.0260604859881
21 646.0260604859881
22 646.0260604859881
23 646.0260604859881
24 646.0260604859881
25 646.0260604859881
26 646.0260604859881
27 646.0260604859881
28 646.0260604859881
29 646.0260604859881
30 646.0260604859881
31 646.0260604859881
32 646.0260604859881
33 646.0260604859881
34 646.0260604859881
35 646.0260604859881
36 646.0260604859881
37 646.0260604859881
38 646.0260604859881
39 646.0260604859881
40 646.0260604859881
41 646.0260604859881
42 646.0260604859881
43 646.0260604859881
44 646.0260604859881
45 646.0260604859881
46 646.0260604859881
47 646.0260604859881
48

## Pytorch Tensor

In [7]:
import torch

In [8]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [9]:
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

In [10]:
w1 = torch.rand(D_in,H)
w2 = torch.rand(H, D_out)

In [11]:
learning_rate = 1e-6

In [13]:
for t in range(500):
    h = x.mm(w1)
    h_relu = h.clamp(min = 0)
    y_pred = h_relu.mm(w2)
    
    loss = (y_pred-y).pow(2).sum()
    print(t, loss)
    
    grad_y_pred = 2.0 * (y_pred-y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h<0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 209027098.4840588
1 23112308.173531678
2 4971687.66538012
3 2208469.6077302145
4 1144136.945651486
5 643862.0931060901
6 386685.8231381308
7 247893.23062041774
8 170631.77883708273
9 126572.77800740952
10 100808.95974993383
11 85276.04357961938
12 75526.14690308028
13 69078.3914776335
14 64538.09097917084
15 61117.50228737638
16 58370.12929536184
17 56040.7325011128
18 53982.5944091775
19 52110.7147033963
20 50374.73980649136
21 48744.943045190936
22 47202.89236417138
23 45738.53933169652
24 44342.10848170931
25 43008.27380998126
26 41731.35879444548
27 40507.42693508258
28 39333.36016583082
29 38207.11849841467
30 37126.18022852376
31 36087.45970442626
32 35089.298403303175
33 34129.359370354505
34 33205.78933993702
35 32316.794814820278
36 31460.916493624893
37 30637.468992405164
38 29844.052823305057
39 29079.214555170583
40 28341.67707950231
41 27630.552793260773
42 26944.883278923688
43 26283.167213959474
44 25644.541429774894
45 25027.95835934044
46 24431.98848131099
47 23855.6

431 798.4665290853854
432 795.600845328079
433 792.7538650430251
434 789.92423005623
435 787.1138582952079
436 784.3212742937694
437 781.5465326665014
438 778.7890046558683
439 776.0496423079417
440 773.327365385659
441 770.6220149429416
442 767.9344132107444
443 765.2631098439225
444 762.6090962736944
445 759.9718393704995
446 757.3497048444378
447 754.7439884287533
448 752.1544255560284
449 749.5807714364626
450 747.0233859388919
451 744.4809647024156
452 741.9553642039698
453 739.4437163265768
454 736.9483551614841
455 734.4682125813974
456 732.0034233830702
457 729.5538683803293
458 727.1204947986585
459 724.7006971982298
460 722.295110556196
461 719.9039384086366
462 717.526919537156
463 715.1657602902808
464 712.8174399439647
465 710.4830551400681
466 708.1631232250818
467 705.8586111689409
468 703.5686812713717
469 701.2922014477667
470 699.029542244157
471 696.7801215958983
472 694.5434670166044
473 692.3201486768187
474 690.1091966544732
475 687.9126194655264
476 685.727930371

## Autograd

In [15]:
import torch
from torch.autograd import Variable

In [16]:
dtype = torch.cuda.FloatTensor

In [17]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [18]:
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad = False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad = False)

In [19]:
w1 = Variable(torch.rand(D_in,H).type(dtype), requires_grad = True)
w2 = Variable(torch.rand(H, D_out).type(dtype), requires_grad = True)

In [20]:
learning_rate = 1e-6

In [22]:
for t in range(500):
    #h = x.mm(w1)
    #h_relu = h.clamp(min = 0)
    y_pred =x.mm(w1).clamp(min=0).mm(w2)
    
    loss = (y_pred-y).pow(2).sum()
    print(t, loss.data[0])
    
    #grad_y_pred = 2.0 * (y_pred-y)
    #grad_w2 = h_relu.t().mm(grad_y_pred)
    #grad_h_relu = grad_y_pred.mm(w2.t())
    #grad_h = grad_h_relu.clone()
    #grad_h[h<0] = 0
    #grad_w1 = x.t().mm(grad_h)
    loss.backward()
    
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    
    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 138229008.0
1 18959588.0
2 6088511.0
3 3347661.75
4 2187226.5
5 1561386.375
6 1173487.375
7 920606.875
8 743822.1875
9 614902.9375
10 516360.9375
11 440241.03125
12 380107.1875
13 331710.78125
14 291847.84375
15 258810.375
16 231335.875
17 207901.03125
18 188091.578125
19 171026.8125
20 156181.328125
21 143207.546875
22 131843.96875
23 121784.1484375
24 112894.015625
25 104995.8984375
26 97945.390625
27 91598.8125
28 85820.5078125
29 80565.3125
30 75805.125
31 71437.9453125
32 67427.2890625
33 63773.734375
34 60454.78515625
35 57388.36328125
36 54584.78125
37 51988.11328125
38 49574.31640625
39 47342.234375
40 45230.60546875
41 43250.8125
42 41408.62890625
43 39675.578125
44 38045.4609375
45 36525.34765625
46 35130.96484375
47 33831.98828125
48 32602.87890625
49 31435.197265625
50 30339.912109375
51 29299.771484375
52 28319.4921875
53 27386.875
54 26499.771484375
55 25649.05859375
56 24833.111328125
57 24050.45703125
58 23307.8125
59 22598.8125
60 21917.92578125
61 21267.45703125
62 

408 983.3242797851562
409 981.0186767578125
410 978.7302856445312
411 976.4603271484375
412 974.2091674804688
413 971.9691772460938
414 969.751953125
415 967.5548095703125
416 965.36279296875
417 963.1937255859375
418 961.0358276367188
419 958.8963012695312
420 956.7901611328125
421 954.732177734375
422 952.699951171875
423 950.688232421875
424 948.6846923828125
425 946.6948852539062
426 944.7163696289062
427 942.7535400390625
428 940.81298828125
429 938.892822265625
430 936.974853515625
431 935.0711669921875
432 933.1737670898438
433 931.293701171875
434 929.431640625
435 927.5800170898438
436 925.7391967773438
437 923.9193115234375
438 922.1178588867188
439 920.3189086914062
440 918.535888671875
441 916.7634887695312
442 915.0069580078125
443 913.2615356445312
444 911.5264892578125
445 909.809814453125
446 908.1019897460938
447 906.4059448242188
448 904.7193603515625
449 903.0391235351562
450 901.3731689453125
451 899.7184448242188
452 898.0757446289062
453 896.4558715820312
454 894.

## Defining new Autograd function

In [23]:
import torch
from torch.autograd import Variable

In [25]:
class MyRelu(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input.clamp(min=0)
    
    @staticmethod
    def backward(ctx, grad_output):
        input = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input
        
dtype = torch.FloatTensor
N, D_in, H, D_out = 64, 1000, 100, 10
        
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad = False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad = False)

w1 = Variable(torch.rand(D_in,H).type(dtype), requires_grad = True)
w2 = Variable(torch.rand(H, D_out).type(dtype), requires_grad = True)
        
learning_rate = 1e-6

for t in range(500):
    relu = MyRelu.apply
    y_pred = relu(x.mm(w1)).mm(w2)
    
    loss = (y_pred-y).pow(2).sum()
    print(t, loss.data[0])
    
    loss.backward()
    
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    
    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()
    

0 331867840.0


TypeError: '<' not supported between instances of 'tuple' and 'int'

In [26]:
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 27462152.0
1 23478858.0
2 24747640.0
3 27786970.0
4 29015864.0
5 26066778.0
6 19075198.0
7 11634278.0
8 6252321.0
9 3278685.0
10 1815284.0
11 1119133.5
12 772064.375
13 582717.625
14 466783.78125
15 387850.53125
16 329140.6875
17 282921.84375
18 245295.203125
19 213984.21875
20 187599.84375
21 165117.6875
22 145894.546875
23 129349.9453125
24 115034.78125
25 102597.125
26 91757.078125
27 82284.75
28 73966.28125
29 66639.46875
30 60171.06640625
31 54447.4296875
32 49362.60546875
33 44832.3984375
34 40788.51171875
35 37171.68359375
36 33925.96484375
37 31008.080078125
38 28382.083984375
39 26013.42578125
40 23871.759765625
41 21934.619140625
42 20178.353515625
43 18583.33984375
44 17131.43359375
45 15808.8115234375
46 14602.873046875
47 13504.4423828125
48 12499.56640625
49 11579.7744140625
50 10736.8564453125
51 9962.330078125
52 9250.318359375
53 8595.1396484375
54 7991.66552734375
55 7435.31982421875
56 6922.36328125
57 6448.62451171875
58 6010.85498046875
59 5604.61279296875
60 522

In [None]:
import tensorflow as tf
import numpy as np

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)


## nn module

In [28]:
import torch
from torch.autograd import Variable

In [29]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [36]:
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad = False)

In [37]:
model = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out)
        )

In [38]:
loss_fn = torch.nn.MSELoss(size_average=False)

In [39]:
learning_rate = 1e-4

In [41]:
for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])
    model.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 666.9098510742188
1 620.7498168945312
2 580.2816772460938
3 544.255615234375
4 512.069580078125
5 482.9142150878906
6 456.0102844238281
7 431.1168212890625
8 408.02056884765625
9 386.3371887207031
10 365.8304748535156
11 346.5098571777344
12 328.3076477050781
13 310.978759765625
14 294.3918762207031
15 278.5793762207031
16 263.5121765136719
17 249.07247924804688
18 235.29153442382812
19 222.13368225097656
20 209.58885192871094
21 197.6224822998047
22 186.25335693359375
23 175.49525451660156
24 165.2433319091797
25 155.55125427246094
26 146.3809356689453
27 137.70245361328125
28 129.51878356933594
29 121.80162048339844
30 114.53720092773438
31 107.70320129394531
32 101.2767105102539
33 95.24776458740234
34 89.58514404296875
35 84.26322937011719
36 79.27059936523438
37 74.59700775146484
38 70.21434020996094
39 66.10773468017578
40 62.258853912353516
41 58.655540466308594
42 55.28749084472656
43 52.137149810791016
44 49.188499450683594
45 46.429203033447266
46 43.837284088134766
47 41.4

482 3.4164479529863456e-06
483 3.308492523501627e-06
484 3.203575261068181e-06
485 3.1015401873446535e-06
486 3.003253596034483e-06
487 2.9082166292937472e-06
488 2.8164022296550684e-06
489 2.727439778027474e-06
490 2.641500032041222e-06
491 2.5575766358088003e-06
492 2.4778832994343247e-06
493 2.3989878172869794e-06
494 2.32371962738398e-06
495 2.2506151253764983e-06
496 2.179432385673863e-06
497 2.1108689907123335e-06
498 2.0444579149625497e-06
499 1.9799610981863225e-06


## pytorch optim

In [42]:
import torch
from torch.autograd import Variable

In [43]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [45]:
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad = False)

In [46]:
model = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out)
        )

In [47]:
loss_fn = torch.nn.MSELoss(size_average=False)

In [48]:
learning_rate = 1e-4

In [49]:
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

In [50]:
for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])
    optimizer.zero_grad()
    loss.backward()
    #for param in model.parameters():
     #   param.data -= learning_rate * param.grad.data
        
    optimizer.step()

0 670.04345703125
1 653.4566040039062
2 637.4476318359375
3 621.9124145507812
4 606.8099365234375
5 592.1600952148438
6 577.94384765625
7 564.1614379882812
8 550.8070678710938
9 537.868408203125
10 525.31689453125
11 513.1078491210938
12 501.20428466796875
13 489.6231994628906
14 478.3597106933594
15 467.5124816894531
16 456.9503173828125
17 446.6565246582031
18 436.6394348144531
19 426.8892822265625
20 417.3900146484375
21 408.1286926269531
22 399.1036376953125
23 390.341796875
24 381.8251647949219
25 373.47271728515625
26 365.2792663574219
27 357.2889099121094
28 349.44891357421875
29 341.7408447265625
30 334.20892333984375
31 326.8327941894531
32 319.6305236816406
33 312.5897216796875
34 305.7109375
35 298.9917907714844
36 292.4411315917969
37 286.0313415527344
38 279.7640380859375
39 273.61834716796875
40 267.5823669433594
41 261.67034912109375
42 255.8828887939453
43 250.21604919433594
44 244.6765899658203
45 239.27528381347656
46 233.9867401123047
47 228.80934143066406
48 223.744

402 1.1398213246138766e-05
403 1.0816183021233883e-05
404 1.0262529031024314e-05
405 9.739730558067095e-06
406 9.244211469194852e-06
407 8.7753114712541e-06
408 8.33023659652099e-06
409 7.90867670730222e-06
410 7.5087764344061725e-06
411 7.130323410819983e-06
412 6.772004780941643e-06
413 6.431017936847638e-06
414 6.108611614763504e-06
415 5.802236046292819e-06
416 5.511430117621785e-06
417 5.2355458137753885e-06
418 4.9727727855497506e-06
419 4.724176051240647e-06
420 4.487649675866123e-06
421 4.2652495721995365e-06
422 4.051836640428519e-06
423 3.849786935461452e-06
424 3.657859224404092e-06
425 3.475136736597051e-06
426 3.3018627618730534e-06
427 3.1372628654935397e-06
428 2.9805428312101867e-06
429 2.8323665901552886e-06
430 2.6910665837931447e-06
431 2.5570723209966673e-06
432 2.430102540529333e-06
433 2.308217517565936e-06
434 2.1930350158072542e-06
435 2.0836557723669102e-06
436 1.9801730104518356e-06
437 1.8807415926858084e-06
438 1.786648681445513e-06
439 1.6973272067843936e-0

## Custom nn Models

In [51]:
import torch
from torch.autograd import Variable

In [52]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.Linear1 = torch.nn.Linear(D_in, H)
        self.Linear2 = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        h_relu = self.Linear1(x).clamp(min=0)
        y_pred = self.Linear2(h_relu)
        return y_pred   

In [53]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [54]:
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad = False)

In [55]:
model = TwoLayerNet(D_in, H, D_out)

In [59]:
criterion = torch.nn.MSELoss(size_average=False)

In [60]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [61]:
for t in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.data[0])
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

0 641.501708984375
1 595.1256103515625
2 554.5515747070312
3 518.7487182617188
4 486.891357421875
5 458.0539855957031
6 431.74078369140625
7 407.6297607421875
8 385.42938232421875
9 364.9048767089844
10 345.593505859375
11 327.4270324707031
12 310.34844970703125
13 294.13275146484375
14 278.8386535644531
15 264.4131164550781
16 250.72265625
17 237.6994171142578
18 225.33294677734375
19 213.513916015625
20 202.23150634765625
21 191.4616241455078
22 181.21163940429688
23 171.42779541015625
24 162.12234497070312
25 153.27700805664062
26 144.85995483398438
27 136.84609985351562
28 129.2505340576172
29 122.05066680908203
30 115.20227813720703
31 108.6978988647461
32 102.53335571289062
33 96.69113159179688
34 91.16004943847656
35 85.92416381835938
36 80.96751403808594
37 76.28199005126953
38 71.85847473144531
39 67.68461608886719
40 63.75354766845703
41 60.05132293701172
42 56.56631088256836
43 53.28520202636719
44 50.16397476196289
45 47.22928237915039
46 44.47412109375
47 41.88076400756836

415 5.883174162590876e-05
416 5.7142297009704635e-05
417 5.550347123062238e-05
418 5.3911458962829784e-05
419 5.236906872596592e-05
420 5.08668708789628e-05
421 4.941001316183247e-05
422 4.7996232751756907e-05
423 4.662127321353182e-05
424 4.52862550446298e-05
425 4.3991676648147404e-05
426 4.273386366548948e-05
427 4.15095710195601e-05
428 4.0324361179955304e-05
429 3.916922287316993e-05
430 3.8050347939133644e-05
431 3.696372004924342e-05
432 3.5906879929825664e-05
433 3.4881366445915774e-05
434 3.3886110031744465e-05
435 3.2918676879489794e-05
436 3.19763166771736e-05
437 3.106449730694294e-05
438 3.0178754968801513e-05
439 2.9317929147509858e-05
440 2.8481003027991392e-05
441 2.766920624708291e-05
442 2.6881465601036325e-05
443 2.6113166313734837e-05
444 2.536919964768458e-05
445 2.46472773142159e-05
446 2.3944663553265855e-05
447 2.3262900867848657e-05
448 2.260091423522681e-05
449 2.1956939235678874e-05
450 2.1333327822503634e-05
451 2.07262382900808e-05
452 2.013510857068468e-05

## control flow with weight sharing

In [62]:
import random
import torch
from torch.autograd import Variable

In [63]:
class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0,3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
            
        y_pred = self.output_linear(h_relu)
        return y_pred      

In [64]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [65]:
model = DynamicNet(D_in, H, D_out)

In [66]:
criterion = torch.nn.MSELoss(size_average=False)

In [68]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

In [69]:
for t in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.data[0])
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

0 626.1519775390625
1 624.6248779296875
2 637.2493286132812
3 621.3856811523438
4 553.5235595703125
5 617.0841674804688
6 442.6560974121094
7 612.9600830078125
8 611.0068969726562
9 305.7362060546875
10 577.8864135742188
11 570.45166015625
12 556.6865844726562
13 536.8980102539062
14 184.20083618164062
15 489.2149963378906
16 599.8047485351562
17 583.4771118164062
18 410.2562561035156
19 562.595947265625
20 350.74652099609375
21 100.7182388305664
22 566.5537109375
23 554.0523071289062
24 536.2271118164062
25 512.6720581054688
26 211.88446044921875
27 110.79701232910156
28 372.9493408203125
29 102.18620300292969
30 327.51287841796875
31 149.06459045410156
32 275.6277770996094
33 120.65145111083984
34 105.41741943359375
35 89.84420776367188
36 271.4545593261719
37 162.48080444335938
38 147.7037811279297
39 69.99215698242188
40 107.43968200683594
41 143.2880096435547
42 87.23578643798828
43 86.37916564941406
44 81.53814697265625
45 70.9858627319336
46 70.9312744140625
47 79.53397369384766

498 0.2755409777164459
499 0.5363263487815857
