In [1]:
import numpy as np

# Problem dimensions: N = batch size, D_in = input features,
# H = hidden units, D_out = output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, sampled from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weights of the two linear layers, also sampled from a standard normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss over every element of the batch; report it per step.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes the gradient through unchanged except where its input was
    # negative, where the gradient is zeroed.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place to both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 36721844.09411214
1 33247775.32369309
2 33641974.5225356
3 31529255.45016572
4 24782965.332396485
5 15740617.446295135
6 8621978.726132065
7 4510656.975375136
8 2535688.4533593133
9 1609682.5372632323
10 1148652.4908079826
11 888250.8124552553
12 720530.7694925008
13 600576.0611118944
14 508464.1612988362
15 434896.2260063306
16 374840.9741578
17 324965.1720892604
18 283110.59103999427
19 247681.4188356718
20 217496.85821792937
21 191648.6805259367
22 169432.0373082551
23 150241.33602532424
24 133605.69497804667
25 119090.45596134076
26 106396.6027521269
27 95262.69873952217
28 85472.0153418534
29 76832.80967284087
30 69195.07024757659
31 62417.45576525694
32 56388.97796860018
33 51023.7033736851
34 46231.54447063639
35 41942.656641568494
36 38096.80954853499
37 34644.39758383845
38 31537.925991059423
39 28739.81473254522
40 26216.209133354118
41 23937.08107253159
42 21876.890728931943
43 20012.43844596178
44 18321.62568499535
45 16787.580172337104
46 15394.417710864844
47 14127.9168

376 0.00019529230307017672
377 0.00018639618056872913
378 0.00017790799658710577
379 0.0001698102418937805
380 0.00016208388194172398
381 0.00015471489028013224
382 0.00014767976384521373
383 0.00014096815810582696
384 0.00013456267191411472
385 0.00012845251071110457
386 0.0001226249027256142
387 0.00011705983077993553
388 0.00011174915437041751
389 0.00010668123172550829
390 0.00010184464949099656
391 9.723237485839893e-05
392 9.282771862084004e-05
393 8.862384987992472e-05
394 8.461182968626427e-05
395 8.078262167096014e-05
396 7.713026671669013e-05
397 7.364230553126951e-05
398 7.031338217231052e-05
399 6.713580094054346e-05
400 6.410346119469826e-05
401 6.120969075456369e-05
402 5.844710295252302e-05
403 5.580933057604234e-05
404 5.3291942026443344e-05
405 5.0889460285925565e-05
406 4.859632991614541e-05
407 4.640661904666702e-05
408 4.4316181545263564e-05
409 4.232032781529809e-05
410 4.041540740982554e-05
411 3.859701046065457e-05
412 3.686165332894922e-05
413 3.5203993012446774

In [2]:
import tensorflow as tf
import numpy as np

# Stage 1: describe the computation as a graph. Nothing below performs any
# arithmetic until the graph is executed inside a session.
# NOTE(review): tf.placeholder / tf.Session look like the TF 1.x graph-mode
# API -- confirm the TensorFlow version this notebook targets.

# Problem dimensions: N = batch size, D_in = input features,
# H = hidden units, D_out = output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders stand in for the inputs and targets; concrete arrays are bound
# to them each time the graph is run. The leading dimension is left open.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Weights live in Variables, which keep their value between graph executions.
# They are initialised from random-normal tensors.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass expressed as graph operations: linear -> ReLU -> linear.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss, still symbolic at this point.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Ask TensorFlow for the gradients of the loss w.r.t. both weight tensors.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# The gradient-descent update is itself part of the graph: the weights only
# change when new_w1 / new_w2 are evaluated during a run. (In PyTorch the
# update happens outside the computational graph.)
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Stage 2: open a session and actually execute the graph.
with tf.Session() as sess:
    # One run to initialise the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Concrete numpy data for the inputs and targets.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)

    # The same tensors are fetched and the same data is fed on every step, so
    # build both once. Fetching new_w1 / new_w2 is what triggers the update.
    fetches = [loss, new_w1, new_w2]
    feed = {x: x_value, y: y_value}
    for _ in range(500):
        # Each run returns the fetched values as numpy arrays; only the loss
        # is kept for reporting.
        loss_value, _, _ = sess.run(fetches, feed_dict=feed)
        print(loss_value)

29804772.0
24601430.0
22828884.0
21302954.0
19316606.0
18268800.0
15491646.0
11765580.0
8056658.0
5194257.5
3267892.0
2095733.9
1403111.9
995769.25
748806.1
591700.3
485241.5
408846.3
350979.9
305272.66
267904.16
236726.55
210293.12
187592.78
167928.98
150792.2
135772.5
122543.02
110855.016
100480.97
91257.33
83022.914
75664.4
69066.99
63131.08
57790.258
52975.0
48626.797
44693.484
41124.613
37881.688
34933.14
32245.596
29796.812
27561.648
25517.848
23646.303
21929.57
20354.254
18905.55
17573.848
16346.375
15214.06
14170.404
13206.654
12315.184
11490.6875
10727.374
10019.745
9363.799
8756.121
8191.991
7667.7793
7180.2866
6726.5776
6304.001
5910.123
5542.9385
5200.5933
4881.1953
4582.718
4304.0156
4043.612
3800.0757
3572.4258
3359.3687
3159.828
2972.96
2797.9888
2634.0435
2480.4207
2336.279
2200.938
2074.3591
1955.447
1843.7981
1738.8723
1640.2205
1547.5259
1460.427
1378.5405
1301.4805
1228.9624
1160.6572
1096.3435
1035.7764
978.69366
924.9007
874.21344
826.4458
781.4153
738.9276
698.83