In [15]:
import numpy as np

In [22]:
# Data Generation
np.random.seed(42)
x = np.random.rand(1000, 1)
y = 1 + 2 * x + .1 * np.random.randn(1000, 1)

# Shuffles the indices
idx = np.arange(1000)
np.random.shuffle(idx)

# Uses first 80 random indices for train
train_idx = idx[:800]
# Uses the remaining indices for validation
val_idx = idx[800:]

# Generates train and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]

In [23]:
np.random.seed(42)
a = np.random.randn(1)
b = np.random.randn(1)

print(a, b)

# Sets learning rate
lr = 1e-2
# Defines number of epochs
n_epochs = 100000

for epoch in range(n_epochs):
    # Computes our model's predicted output
    yhat = a + b * x_train
    
    # How wrong is our model? That's the error! 
    error = (y_train - yhat)
    # It is a regression, so it computes mean squared error (MSE)
    loss = (error ** 2).mean()
    
    # Computes gradients for both "a" and "b" parameters
    a_grad = -2 * error.mean()
    b_grad = -2 * (x_train * error).mean()
    
    # Updates parameters using gradients and the learning rate
    a = a - lr * a_grad
    b = b - lr * b_grad
    
print(a, b)

[0.49671415] [-0.1382643]
[1.017337] [1.98435647]


## PyTorch

#### Import

In [25]:
import torch
import torch.optim as optim
import torch.nn as nn
from torchviz import makedot
from tqdm import tqdm_notebook, tnrange

ModuleNotFoundError: No module named 'torchviz'

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [30]:
device

'cpu'

In [27]:
x_train_tensor = torch.from_numpy(x_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)

In [29]:
print(type(x_train_tensor),x_train_tensor.type() )

<class 'torch.Tensor'> torch.FloatTensor


### Torch Version

In [35]:
a = torch.randn(1, requires_grad = True, dtype=torch.float)
b = torch.randn(1, requires_grad = True, dtype=torch.float)
print(a,b)

tensor([0.3855], requires_grad=True) tensor([0.6419], requires_grad=True)


In [36]:
a = torch.randn(1, requires_grad=True, dtype=torch.float).to(device)
b = torch.randn(1, requires_grad=True, dtype=torch.float).to(device)
print(a,b)

tensor([0.5720], requires_grad=True) tensor([-0.9221], requires_grad=True)


In [37]:
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

tensor([0.4023], requires_grad=True) tensor([-0.1276], requires_grad=True)


In [47]:
n_epochs = 1000

a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a,b)
for epoch in range(n_epochs):
    print(epoch)
    yhat = a + x_train_tensor*b
    error = y_train_tensor-yhat
    loss = (error**2).mean()
    loss.backward()
    print(a.grad, b.grad)
    a = a - lr*a.grad
    b = b - lr*b.grad
    a.grad.zero_()
    b.grad.zero_()
    

tensor([-0.2158], requires_grad=True) tensor([0.0920], requires_grad=True)
0
tensor([-4.3719]) tensor([-2.5256])


AttributeError: 'NoneType' object has no attribute 'zero_'

In [51]:
n_epochs = 1000

a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a,b)
for epoch in range(n_epochs):
    print(epoch)
    yhat = a + x_train_tensor*b
    error = y_train_tensor-yhat
    loss = (error**2).mean()
    loss.backward()
    print(a.grad, b.grad)

    with torch.no_grad():
        a -= lr*a.grad
        b -=lr*b.grad
    
    a.grad.zero_()
    b.grad.zero_()
    

tensor([0.1787], requires_grad=True) tensor([0.3139], requires_grad=True)
0
tensor([-3.3594]) tensor([-1.9778])
1
tensor([-3.2723]) tensor([-1.9305])
2
tensor([-3.1874]) tensor([-1.8845])
3
tensor([-3.1047]) tensor([-1.8396])
4
tensor([-3.0241]) tensor([-1.7958])
5
tensor([-2.9455]) tensor([-1.7532])
6
tensor([-2.8689]) tensor([-1.7117])
7
tensor([-2.7943]) tensor([-1.6712])
8
tensor([-2.7216]) tensor([-1.6317])
9
tensor([-2.6507]) tensor([-1.5932])
10
tensor([-2.5817]) tensor([-1.5557])
11
tensor([-2.5144]) tensor([-1.5191])
12
tensor([-2.4488]) tensor([-1.4835])
13
tensor([-2.3849]) tensor([-1.4488])
14
tensor([-2.3226]) tensor([-1.4149])
15
tensor([-2.2619]) tensor([-1.3820])
16
tensor([-2.2027]) tensor([-1.3498])
17
tensor([-2.1451]) tensor([-1.3185])
18
tensor([-2.0889]) tensor([-1.2879])
19
tensor([-2.0342]) tensor([-1.2581])
20
tensor([-1.9808]) tensor([-1.2291])
21
tensor([-1.9288]) tensor([-1.2008])
22
tensor([-1.8782]) tensor([-1.1733])
23
tensor([-1.8288]) tensor([-1.1464])


tensor([0.0438]) tensor([-0.0834])
318
tensor([0.0437]) tensor([-0.0833])
319
tensor([0.0437]) tensor([-0.0831])
320
tensor([0.0437]) tensor([-0.0830])
321
tensor([0.0436]) tensor([-0.0829])
322
tensor([0.0436]) tensor([-0.0828])
323
tensor([0.0435]) tensor([-0.0826])
324
tensor([0.0435]) tensor([-0.0825])
325
tensor([0.0435]) tensor([-0.0824])
326
tensor([0.0434]) tensor([-0.0823])
327
tensor([0.0434]) tensor([-0.0822])
328
tensor([0.0433]) tensor([-0.0820])
329
tensor([0.0433]) tensor([-0.0819])
330
tensor([0.0433]) tensor([-0.0818])
331
tensor([0.0432]) tensor([-0.0817])
332
tensor([0.0432]) tensor([-0.0816])
333
tensor([0.0431]) tensor([-0.0814])
334
tensor([0.0431]) tensor([-0.0813])
335
tensor([0.0431]) tensor([-0.0812])
336
tensor([0.0430]) tensor([-0.0811])
337
tensor([0.0430]) tensor([-0.0810])
338
tensor([0.0429]) tensor([-0.0809])
339
tensor([0.0429]) tensor([-0.0807])
340
tensor([0.0428]) tensor([-0.0806])
341
tensor([0.0428]) tensor([-0.0805])
342
tensor([0.0427]) tensor([

tensor([0.0292]) tensor([-0.0540])
635
tensor([0.0291]) tensor([-0.0540])
636
tensor([0.0291]) tensor([-0.0539])
637
tensor([0.0291]) tensor([-0.0538])
638
tensor([0.0290]) tensor([-0.0537])
639
tensor([0.0290]) tensor([-0.0537])
640
tensor([0.0289]) tensor([-0.0536])
641
tensor([0.0289]) tensor([-0.0535])
642
tensor([0.0289]) tensor([-0.0535])
643
tensor([0.0288]) tensor([-0.0534])
644
tensor([0.0288]) tensor([-0.0533])
645
tensor([0.0287]) tensor([-0.0532])
646
tensor([0.0287]) tensor([-0.0532])
647
tensor([0.0287]) tensor([-0.0531])
648
tensor([0.0286]) tensor([-0.0530])
649
tensor([0.0286]) tensor([-0.0530])
650
tensor([0.0286]) tensor([-0.0529])
651
tensor([0.0285]) tensor([-0.0528])
652
tensor([0.0285]) tensor([-0.0527])
653
tensor([0.0284]) tensor([-0.0527])
654
tensor([0.0284]) tensor([-0.0526])
655
tensor([0.0284]) tensor([-0.0525])
656
tensor([0.0283]) tensor([-0.0525])
657
tensor([0.0283]) tensor([-0.0524])
658
tensor([0.0282]) tensor([-0.0523])
659
tensor([0.0282]) tensor([

tensor([0.0198]) tensor([-0.0366])
923
tensor([0.0198]) tensor([-0.0366])
924
tensor([0.0197]) tensor([-0.0366])
925
tensor([0.0197]) tensor([-0.0365])
926
tensor([0.0197]) tensor([-0.0365])
927
tensor([0.0197]) tensor([-0.0364])
928
tensor([0.0196]) tensor([-0.0364])
929
tensor([0.0196]) tensor([-0.0363])
930
tensor([0.0196]) tensor([-0.0363])
931
tensor([0.0195]) tensor([-0.0362])
932
tensor([0.0195]) tensor([-0.0362])
933
tensor([0.0195]) tensor([-0.0361])
934
tensor([0.0195]) tensor([-0.0361])
935
tensor([0.0194]) tensor([-0.0360])
936
tensor([0.0194]) tensor([-0.0360])
937
tensor([0.0194]) tensor([-0.0359])
938
tensor([0.0194]) tensor([-0.0359])
939
tensor([0.0193]) tensor([-0.0358])
940
tensor([0.0193]) tensor([-0.0358])
941
tensor([0.0193]) tensor([-0.0357])
942
tensor([0.0193]) tensor([-0.0357])
943
tensor([0.0192]) tensor([-0.0356])
944
tensor([0.0192]) tensor([-0.0356])
945
tensor([0.0192]) tensor([-0.0355])
946
tensor([0.0192]) tensor([-0.0355])
947
tensor([0.0191]) tensor([

In [54]:
n_epochs = 1000

a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
# print(a,b)
optimizer = optim.SGD([a,b], lr=lr)
for epoch in range(n_epochs):
    print(epoch)
    yhat = a + x_train_tensor*b
    error = y_train_tensor-yhat
    loss = (error**2).mean()
    loss.backward()
    print(a.grad, b.grad)

    optimizer.step()
    optimizer.zero_grad()


0
tensor([-3.5853]) tensor([-2.0296])
1
tensor([-3.4931]) tensor([-1.9797])
2
tensor([-3.4033]) tensor([-1.9311])
3
tensor([-3.3158]) tensor([-1.8837])
4
tensor([-3.2305]) tensor([-1.8375])
5
tensor([-3.1474]) tensor([-1.7925])
6
tensor([-3.0664]) tensor([-1.7487])
7
tensor([-2.9875]) tensor([-1.7059])
8
tensor([-2.9106]) tensor([-1.6643])
9
tensor([-2.8356]) tensor([-1.6237])
10
tensor([-2.7625]) tensor([-1.5841])
11
tensor([-2.6913]) tensor([-1.5455])
12
tensor([-2.6219]) tensor([-1.5080])
13
tensor([-2.5543]) tensor([-1.4713])
14
tensor([-2.4884]) tensor([-1.4356])
15
tensor([-2.4242]) tensor([-1.4008])
16
tensor([-2.3616]) tensor([-1.3669])
17
tensor([-2.3006]) tensor([-1.3338])
18
tensor([-2.2411]) tensor([-1.3016])
19
tensor([-2.1832]) tensor([-1.2702])
20
tensor([-2.1268]) tensor([-1.2396])
21
tensor([-2.0717]) tensor([-1.2098])
22
tensor([-2.0181]) tensor([-1.1807])
23
tensor([-1.9659]) tensor([-1.1524])
24
tensor([-1.9149]) tensor([-1.1248])
25
tensor([-1.8653]) tensor([-1.097

tensor([0.0242]) tensor([-0.0513])
281
tensor([0.0243]) tensor([-0.0512])
282
tensor([0.0243]) tensor([-0.0511])
283
tensor([0.0243]) tensor([-0.0510])
284
tensor([0.0244]) tensor([-0.0509])
285
tensor([0.0244]) tensor([-0.0508])
286
tensor([0.0244]) tensor([-0.0507])
287
tensor([0.0244]) tensor([-0.0506])
288
tensor([0.0245]) tensor([-0.0505])
289
tensor([0.0245]) tensor([-0.0504])
290
tensor([0.0245]) tensor([-0.0503])
291
tensor([0.0245]) tensor([-0.0502])
292
tensor([0.0245]) tensor([-0.0501])
293
tensor([0.0245]) tensor([-0.0500])
294
tensor([0.0246]) tensor([-0.0499])
295
tensor([0.0246]) tensor([-0.0498])
296
tensor([0.0246]) tensor([-0.0497])
297
tensor([0.0246]) tensor([-0.0497])
298
tensor([0.0246]) tensor([-0.0496])
299
tensor([0.0246]) tensor([-0.0495])
300
tensor([0.0246]) tensor([-0.0494])
301
tensor([0.0246]) tensor([-0.0493])
302
tensor([0.0246]) tensor([-0.0492])
303
tensor([0.0246]) tensor([-0.0491])
304
tensor([0.0246]) tensor([-0.0490])
305
tensor([0.0246]) tensor([

tensor([0.0167]) tensor([-0.0310])
634
tensor([0.0167]) tensor([-0.0309])
635
tensor([0.0167]) tensor([-0.0309])
636
tensor([0.0167]) tensor([-0.0309])
637
tensor([0.0166]) tensor([-0.0308])
638
tensor([0.0166]) tensor([-0.0308])
639
tensor([0.0166]) tensor([-0.0307])
640
tensor([0.0166]) tensor([-0.0307])
641
tensor([0.0165]) tensor([-0.0306])
642
tensor([0.0165]) tensor([-0.0306])
643
tensor([0.0165]) tensor([-0.0306])
644
tensor([0.0165]) tensor([-0.0305])
645
tensor([0.0165]) tensor([-0.0305])
646
tensor([0.0164]) tensor([-0.0304])
647
tensor([0.0164]) tensor([-0.0304])
648
tensor([0.0164]) tensor([-0.0304])
649
tensor([0.0164]) tensor([-0.0303])
650
tensor([0.0163]) tensor([-0.0303])
651
tensor([0.0163]) tensor([-0.0302])
652
tensor([0.0163]) tensor([-0.0302])
653
tensor([0.0163]) tensor([-0.0302])
654
tensor([0.0163]) tensor([-0.0301])
655
tensor([0.0162]) tensor([-0.0301])
656
tensor([0.0162]) tensor([-0.0300])
657
tensor([0.0162]) tensor([-0.0300])
658
tensor([0.0162]) tensor([

968
tensor([0.0106]) tensor([-0.0197])
969
tensor([0.0106]) tensor([-0.0197])
970
tensor([0.0106]) tensor([-0.0197])
971
tensor([0.0106]) tensor([-0.0196])
972
tensor([0.0106]) tensor([-0.0196])
973
tensor([0.0106]) tensor([-0.0196])
974
tensor([0.0106]) tensor([-0.0196])
975
tensor([0.0105]) tensor([-0.0195])
976
tensor([0.0105]) tensor([-0.0195])
977
tensor([0.0105]) tensor([-0.0195])
978
tensor([0.0105]) tensor([-0.0195])
979
tensor([0.0105]) tensor([-0.0194])
980
tensor([0.0105]) tensor([-0.0194])
981
tensor([0.0105]) tensor([-0.0194])
982
tensor([0.0104]) tensor([-0.0193])
983
tensor([0.0104]) tensor([-0.0193])
984
tensor([0.0104]) tensor([-0.0193])
985
tensor([0.0104]) tensor([-0.0193])
986
tensor([0.0104]) tensor([-0.0192])
987
tensor([0.0104]) tensor([-0.0192])
988
tensor([0.0104]) tensor([-0.0192])
989
tensor([0.0103]) tensor([-0.0192])
990
tensor([0.0103]) tensor([-0.0191])
991
tensor([0.0103]) tensor([-0.0191])
992
tensor([0.0103]) tensor([-0.0191])
993
tensor([0.0103]) tens

In [55]:
n_epochs = 1000

lossfn = nn.MSELoss(reduction='mean')

a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)

optimizer = optim.SGD([a,b], lr=lr)
for epoch in range(n_epochs):
    print(epoch)
    yhat = a + x_train_tensor*b

    
    loss = lossfn(y_train_tensor, yhat)
    
    loss.backward()
    print(a.grad, b.grad)

    optimizer.step()
    optimizer.zero_grad()


0
tensor([-7.7935]) tensor([-4.3544])
1
tensor([-7.5938]) tensor([-4.2464])
2
tensor([-7.3992]) tensor([-4.1411])
3
tensor([-7.2095]) tensor([-4.0385])
4
tensor([-7.0246]) tensor([-3.9385])
5
tensor([-6.8445]) tensor([-3.8411])
6
tensor([-6.6689]) tensor([-3.7461])
7
tensor([-6.4978]) tensor([-3.6535])
8
tensor([-6.3310]) tensor([-3.5633])
9
tensor([-6.1685]) tensor([-3.4754])
10
tensor([-6.0102]) tensor([-3.3897])
11
tensor([-5.8558]) tensor([-3.3062])
12
tensor([-5.7054]) tensor([-3.2248])
13
tensor([-5.5588]) tensor([-3.1454])
14
tensor([-5.4160]) tensor([-3.0681])
15
tensor([-5.2768]) tensor([-2.9928])
16
tensor([-5.1411]) tensor([-2.9193])
17
tensor([-5.0089]) tensor([-2.8477])
18
tensor([-4.8800]) tensor([-2.7780])
19
tensor([-4.7544]) tensor([-2.7100])
20
tensor([-4.6321]) tensor([-2.6437])
21
tensor([-4.5128]) tensor([-2.5791])
22
tensor([-4.3966]) tensor([-2.5162])
23
tensor([-4.2833]) tensor([-2.4549])
24
tensor([-4.1729]) tensor([-2.3951])
25
tensor([-4.0653]) tensor([-2.336

tensor([0.0367]) tensor([-0.0802])
286
tensor([0.0368]) tensor([-0.0800])
287
tensor([0.0369]) tensor([-0.0798])
288
tensor([0.0369]) tensor([-0.0797])
289
tensor([0.0370]) tensor([-0.0795])
290
tensor([0.0371]) tensor([-0.0793])
291
tensor([0.0371]) tensor([-0.0792])
292
tensor([0.0372]) tensor([-0.0790])
293
tensor([0.0372]) tensor([-0.0788])
294
tensor([0.0373]) tensor([-0.0787])
295
tensor([0.0373]) tensor([-0.0785])
296
tensor([0.0374]) tensor([-0.0784])
297
tensor([0.0374]) tensor([-0.0782])
298
tensor([0.0374]) tensor([-0.0780])
299
tensor([0.0375]) tensor([-0.0779])
300
tensor([0.0375]) tensor([-0.0777])
301
tensor([0.0375]) tensor([-0.0776])
302
tensor([0.0376]) tensor([-0.0774])
303
tensor([0.0376]) tensor([-0.0773])
304
tensor([0.0376]) tensor([-0.0771])
305
tensor([0.0377]) tensor([-0.0770])
306
tensor([0.0377]) tensor([-0.0769])
307
tensor([0.0377]) tensor([-0.0767])
308
tensor([0.0377]) tensor([-0.0766])
309
tensor([0.0377]) tensor([-0.0764])
310
tensor([0.0377]) tensor([

tensor([0.0290]) tensor([-0.0537])
558
tensor([0.0289]) tensor([-0.0536])
559
tensor([0.0289]) tensor([-0.0535])
560
tensor([0.0288]) tensor([-0.0534])
561
tensor([0.0288]) tensor([-0.0534])
562
tensor([0.0288]) tensor([-0.0533])
563
tensor([0.0287]) tensor([-0.0532])
564
tensor([0.0287]) tensor([-0.0532])
565
tensor([0.0287]) tensor([-0.0531])
566
tensor([0.0286]) tensor([-0.0530])
567
tensor([0.0286]) tensor([-0.0529])
568
tensor([0.0285]) tensor([-0.0529])
569
tensor([0.0285]) tensor([-0.0528])
570
tensor([0.0285]) tensor([-0.0527])
571
tensor([0.0284]) tensor([-0.0527])
572
tensor([0.0284]) tensor([-0.0526])
573
tensor([0.0283]) tensor([-0.0525])
574
tensor([0.0283]) tensor([-0.0524])
575
tensor([0.0283]) tensor([-0.0524])
576
tensor([0.0282]) tensor([-0.0523])
577
tensor([0.0282]) tensor([-0.0522])
578
tensor([0.0282]) tensor([-0.0522])
579
tensor([0.0281]) tensor([-0.0521])
580
tensor([0.0281]) tensor([-0.0520])
581
tensor([0.0280]) tensor([-0.0519])
582
tensor([0.0280]) tensor([

tensor([0.0188]) tensor([-0.0348])
879
tensor([0.0188]) tensor([-0.0348])
880
tensor([0.0187]) tensor([-0.0347])
881
tensor([0.0187]) tensor([-0.0347])
882
tensor([0.0187]) tensor([-0.0346])
883
tensor([0.0187]) tensor([-0.0346])
884
tensor([0.0186]) tensor([-0.0345])
885
tensor([0.0186]) tensor([-0.0345])
886
tensor([0.0186]) tensor([-0.0344])
887
tensor([0.0186]) tensor([-0.0344])
888
tensor([0.0185]) tensor([-0.0343])
889
tensor([0.0185]) tensor([-0.0343])
890
tensor([0.0185]) tensor([-0.0342])
891
tensor([0.0185]) tensor([-0.0342])
892
tensor([0.0184]) tensor([-0.0342])
893
tensor([0.0184]) tensor([-0.0341])
894
tensor([0.0184]) tensor([-0.0341])
895
tensor([0.0184]) tensor([-0.0340])
896
tensor([0.0183]) tensor([-0.0340])
897
tensor([0.0183]) tensor([-0.0339])
898
tensor([0.0183]) tensor([-0.0339])
899
tensor([0.0183]) tensor([-0.0338])
900
tensor([0.0182]) tensor([-0.0338])
901
tensor([0.0182]) tensor([-0.0337])
902
tensor([0.0182]) tensor([-0.0337])
903
tensor([0.0182]) tensor([