In [5]:
import numpy as np

# Problem dimensions:
#   N     - number of samples in the batch
#   D_in  - length of each input vector
#   H     - width of the single hidden layer
#   D_out - length of each output vector
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic regression data: standard-normal inputs and targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Both weight matrices start as standard-normal noise (there are no biases).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss over every element of the prediction.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to the
    # two weight matrices.
    # loss = sum((y_pred - y) ** 2)  =>  d(loss)/d(y_pred) = 2 * (y_pred - y)
    grad_y_pred = 2.0 * residual
    # y_pred = h_relu . w2
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    # h = x . w1
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 34400557.64175877
1 33970793.19337149
2 35559390.42790197
3 32723784.485385
4 24339241.943064075
5 14278707.40568195
6 7280823.463601988
7 3686888.4558294034
8 2099483.1497975895
9 1389727.8697143104
10 1034989.7030537094
11 826990.1176574359
12 685991.4008491402
13 580995.7936433746
14 497880.8549639818
15 430082.7936726501
16 373723.0717952729
17 326416.70993882406
18 286339.1228288908
19 252201.89179908042
20 222967.40877782332
21 197761.8400830562
22 175941.47467246826
23 156950.20815237577
24 140393.53751024377
25 125902.4238969283
26 113163.93372942501
27 101942.47224208551
28 92025.01192110308
29 83242.73597512592
30 75438.88252001912
31 68485.89314355853
32 62277.54726870303
33 56722.42774338465
34 51730.163798040296
35 47245.041885612416
36 43212.067797261014
37 39575.19974590317
38 36291.12219494038
39 33316.88298120288
40 30620.352780761397
41 28171.0085620451
42 25941.92240713961
43 23912.364168010157
44 22061.583858555612
45 20371.910918014917
46 18826.90214523873
47 174

406 4.520292157397468e-05
407 4.299745199930437e-05
408 4.09002505651637e-05
409 3.890497536624874e-05
410 3.7008234285061006e-05
411 3.520352934904746e-05
412 3.348740374571653e-05
413 3.18557508941193e-05
414 3.030333980984067e-05
415 2.88268283037373e-05
416 2.7421979172324935e-05
417 2.6086405534844733e-05
418 2.4815537638895398e-05
419 2.3607082540412862e-05
420 2.245743458903804e-05
421 2.1364364169593096e-05
422 2.032442171541778e-05
423 1.9334948957490653e-05
424 1.8394097868989667e-05
425 1.749888205011884e-05
426 1.6647539859077813e-05
427 1.5837546712187783e-05
428 1.5067318473787798e-05
429 1.4334785198654757e-05
430 1.363762918689523e-05
431 1.2974574649550888e-05
432 1.2343784103165731e-05
433 1.1743847900018849e-05
434 1.1172940242241339e-05
435 1.0630003312628871e-05
436 1.0113433985923224e-05
437 9.62225390852728e-06
438 9.154772917587023e-06
439 8.710083376966269e-06
440 8.287067827901148e-06
441 7.88456179140715e-06
442 7.501786229952337e-06
443 7.137529625438781e-06

In [17]:
import torch


# Every tensor below is created with this dtype on this device.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # swap in this line to run on a GPU

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic regression data: standard-normal inputs and targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Both weight matrices start as standard-normal noise (there are no biases).
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a plain Python number for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: chain rule written out by hand, from the loss back to
    # the two weight matrices.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # The clamp blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 23553206.0
1 17027140.0
2 13470802.0
3 10951111.0
4 8862906.0
5 7004478.0
6 5413843.5
7 4105340.75
8 3086790.5
9 2319616.25
10 1757341.625
11 1349188.75
12 1053775.375
13 838243.8125
14 678961.4375
15 559270.125
16 467602.125
17 396077.46875
18 339342.40625
19 293500.15625
20 255787.875
21 224346.140625
22 197834.734375
23 175267.703125
24 155882.171875
25 139134.40625
26 124597.875
27 111899.453125
28 100737.8515625
29 90891.2890625
30 82188.1953125
31 74456.109375
32 67570.59375
33 61399.41796875
34 55881.7734375
35 50936.046875
36 46494.734375
37 42498.546875
38 38897.0
39 35643.40625
40 32699.4921875
41 30033.453125
42 27616.06640625
43 25420.1171875
44 23421.2578125
45 21601.17578125
46 19941.31640625
47 18425.634765625
48 17039.70703125
49 15771.41015625
50 14609.42578125
51 13543.5849609375
52 12565.130859375
53 11665.3642578125
54 10837.72265625
55 10075.7548828125
56 9375.63671875
57 8729.4970703125
58 8132.76611328125
59 7581.3466796875
60 7071.50048828125
61 6599.797851562

In [18]:
import torch

# Every tensor below is created with this dtype on this device.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # swap in this line to run on a GPU

# N is the batch size; D_in is the input dimension;
# H is the hidden dimension; D_out is the output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random Tensors holding the inputs and targets.
# requires_grad is not passed here, so it stays at its default of False: no
# gradient is computed for these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random Tensors holding the weights.
# requires_grad=True asks autograd to compute gradients with respect to these
# Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: the same operations as in a hand-written network. The
    # backward pass is not implemented by hand here, so the intermediate
    # value is only kept for readability.
    hidden_relu = torch.mm(x, w1).clamp(min=0)
    y_pred = torch.mm(hidden_relu, w2)

    # The loss is a Tensor holding a single value; loss.item() returns that
    # value as a plain Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass. For every Tensor created with
    # requires_grad=True it computes the gradient of the loss, so afterwards
    # w1.grad and w2.grad hold the gradients of the loss with respect to w1
    # and w2.
    loss.backward()

    # Gradient descent by hand. The update is wrapped in torch.no_grad()
    # because w1 and w2 have requires_grad=True, but the update itself should
    # not be tracked by autograd.
    # An alternative is to operate on weight.data and weight.grad.data:
    # tensor.data gives a Tensor that shares storage with the original but
    # does not record history.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Clear the gradient after the update so the next backward() does
            # not add to it.
            weight.grad.zero_()

0 38113868.0
1 40021788.0
2 45339328.0
3 43610492.0
4 31461410.0
5 16192508.0
6 6970197.5
7 3197893.0
8 1855906.5
9 1316843.0
10 1038423.1875
11 857265.6875
12 722431.5
13 616269.5625
14 529894.5
15 458410.75
16 398638.5
17 348277.34375
18 305663.53125
19 269349.5
20 238235.984375
21 211387.59375
22 188126.9375
23 167886.109375
24 150225.625
25 134737.8125
26 121103.5234375
27 109074.625
28 98429.59375
29 88983.578125
30 80574.703125
31 73073.3984375
32 66362.46875
33 60344.7734375
34 54942.64453125
35 50083.65625
36 45706.01953125
37 41754.94140625
38 38183.54296875
39 34951.1484375
40 32019.96484375
41 29373.21875
42 26971.138671875
43 24786.07421875
44 22797.80859375
45 20983.88671875
46 19328.162109375
47 17815.013671875
48 16430.62109375
49 15163.333984375
50 14001.6201171875
51 12935.9365234375
52 11958.24609375
53 11060.33984375
54 10234.9111328125
55 9475.072265625
56 8775.6806640625
57 8131.34375
58 7537.57763671875
59 6989.998046875
60 6484.736328125
61 6018.10498046875
62 55

In [1]:
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random Tensors holding the inputs and the targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model and loss both come from the nn package: two linear layers with a ReLU
# in between, and a squared-error loss that is summed rather than averaged.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# The optim package provides Optimizer objects that update the model's weights
# for us. Adam is used here; the package contains many other optimization
# algorithms. The Tensors the optimizer is allowed to update are given to the
# constructor as `params`.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: run the whole batch through the model.
    y_pred = model(x)

    # Loss between prediction and target, printed once per step.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Gradients are accumulated into the parameters' .grad buffers on every
    # backward() call instead of being overwritten, so the optimizer must
    # clear them before each backward pass. See the documentation of
    # torch.autograd.backward for details.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to every model
    # parameter.
    loss.backward()

    # One optimizer step: update the parameters from their gradients.
    optimizer.step()

0 608.0863037109375
1 591.734375
2 575.8631591796875
3 560.3976440429688
4 545.3590087890625
5 530.7474975585938
6 516.5748291015625
7 502.92572021484375
8 489.63031005859375
9 476.677978515625
10 464.16119384765625
11 452.0721435546875
12 440.4025573730469
13 429.15008544921875
14 418.2108154296875
15 407.5574645996094
16 397.1946716308594
17 387.10791015625
18 377.3226013183594
19 367.7891540527344
20 358.5101318359375
21 349.451904296875
22 340.5810546875
23 331.94049072265625
24 323.4936218261719
25 315.2869873046875
26 307.30718994140625
27 299.5703430175781
28 292.0189514160156
29 284.6976623535156
30 277.5763854980469
31 270.6184387207031
32 263.8484191894531
33 257.24163818359375
34 250.78318786621094
35 244.46849060058594
36 238.29818725585938
37 232.25204467773438
38 226.32374572753906
39 220.50241088867188
40 214.80873107910156
41 209.24227905273438
42 203.80258178710938
43 198.47621154785156
44 193.2459716796875
45 188.1328582763672
46 183.13458251953125
47 178.257202148437

356 0.0005086160381324589
357 0.00048322381917387247
358 0.0004590466560330242
359 0.00043605687096714973
360 0.000414172129239887
361 0.0003933647822123021
362 0.0003735676291398704
363 0.0003547340747900307
364 0.00033681243075989187
365 0.0003197875339537859
366 0.0003035841800738126
367 0.0002881817054003477
368 0.00027354000485502183
369 0.000259620719589293
370 0.0002463853161316365
371 0.00023380568018183112
372 0.00022184834233485162
373 0.0002104822196997702
374 0.0001996952050831169
375 0.00018943572649732232
376 0.0001796803844626993
377 0.00017042136460077018
378 0.0001616231311345473
379 0.00015326309949159622
380 0.00014532993372995406
381 0.000137789873406291
382 0.00013063472579233348
383 0.00012383013381622732
384 0.00011738477769540623
385 0.00011125719174742699
386 0.00010544172982918099
387 9.991445404011756e-05
388 9.467960626352578e-05
389 8.970400085672736e-05
390 8.498464740114287e-05
391 8.050315227592364e-05
392 7.625518628628924e-05
393 7.222179556265473e-05


In [2]:
import torch


class TwoLayerNet(torch.nn.Module):
    """Two nn.Linear layers with a clamp-at-zero (ReLU) non-linearity between them."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear sub-modules and store them as attributes.

        D_in is the input width, H the hidden width and D_out the output
        width: linear1 maps D_in -> H and linear2 maps H -> D_out.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of input data to a Tensor of output data.

        The input goes through linear1, negative activations are clamped to
        zero, and the result goes through linear2.
        """
        hidden = self.linear1(x)
        return self.linear2(hidden.clamp(min=0))


# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random Tensors holding the inputs and the targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom module defined earlier in this cell.
model = TwoLayerNet(D_in=D_in, H=H, D_out=D_out)

# Summed squared-error loss and a plain SGD optimizer. model.parameters()
# hands the optimizer the learnable parameters of the two nn.Linear modules
# that the model holds as attributes.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: run the whole batch through the model.
    y_pred = model(x)

    # Loss between prediction and target, printed once per step.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear the old gradients, backpropagate the new loss, then let the
    # optimizer update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 698.4027099609375
1 647.0601806640625
2 602.9810791015625
3 564.5834350585938
4 530.3843994140625
5 499.97161865234375
6 472.4979553222656
7 447.16741943359375
8 423.7510681152344
9 402.1412353515625
10 381.87530517578125
11 362.88104248046875
12 344.8948059082031
13 327.8245849609375
14 311.5321350097656
15 295.9076232910156
16 281.0365905761719
17 266.76513671875
18 253.05026245117188
19 239.85040283203125
20 227.20947265625
21 215.14422607421875
22 203.56845092773438
23 192.48190307617188
24 181.8638458251953
25 171.71377563476562
26 162.03981018066406
27 152.80099487304688
28 144.01266479492188
29 135.6654815673828
30 127.7291030883789
31 120.20602416992188
32 113.0589828491211
33 106.30328369140625
34 99.91429138183594
35 93.89823913574219
36 88.21248626708984
37 82.84503936767578
38 77.78511810302734
39 73.02212524414062
40 68.5506362915039
41 64.34414672851562
42 60.39891052246094
43 56.6947135925293
44 53.21841812133789
45 49.96593475341797
46 46.92340087890625
47 44.07189941

350 0.0015159437898546457
351 0.0014786592219024897
352 0.0014422980602830648
353 0.0014068670570850372
354 0.0013723000884056091
355 0.0013385944766923785
356 0.0013057265896350145
357 0.0012736832723021507
358 0.0012424285523593426
359 0.0012119465973228216
360 0.0011822384549304843
361 0.0011532686185091734
362 0.001125017530284822
363 0.0010974996257573366
364 0.0010706257307901978
365 0.0010444286745041609
366 0.001018871203996241
367 0.0009939472656697035
368 0.0009696390479803085
369 0.0009459475986659527
370 0.000922834500670433
371 0.000900284037925303
372 0.0008783009834587574
373 0.0008568497141823173
374 0.0008359445491805673
375 0.0008155495161190629
376 0.0007956468034535646
377 0.0007762546883895993
378 0.0007573314942419529
379 0.0007388722151517868
380 0.0007208694005385041
381 0.000703318219166249
382 0.0006861849687993526
383 0.0006694794283248484
384 0.0006531830294989049
385 0.0006372879142872989
386 0.0006217824993655086
387 0.0006066716741770506
388 0.00059192115