# LEARNING PYTORCH WITH EXAMPLES 

### Warm-up : numpy

In [2]:
import numpy as np

# Two-layer network (linear -> ReLU -> linear) trained with hand-written
# forward and backward passes in plain numpy.
# N rows per batch, D_in input features, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass.
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print the sum-of-squared-errors loss.
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.T.dot(grad_h)

    # Gradient-descent update of the weights.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    

0 31720400.43055302
1 26945348.074650496
2 27681328.376202393
3 28988640.33760403
4 27313765.169234376
5 21664845.072282486
6 14263754.306349156
7 8190118.250295306
8 4461892.562054984
9 2523337.6439016275
10 1569889.6565635707
11 1089877.5980023644
12 828030.9886252608
13 668086.5139946555
14 558675.84608529
15 477234.8952255328
16 412820.58888986404
17 360259.3100781108
18 316347.08529589407
19 279083.87525355094
20 247176.05939289922
21 219642.15474885114
22 195824.75548953324
23 175081.90481734351
24 156938.65137642357
25 141002.05457348668
26 126967.6555741465
27 114595.43627478118
28 103656.786420647
29 93934.24415913642
30 85276.90296857806
31 77542.82345829479
32 70614.46668241662
33 64397.07945380926
34 58809.99601504862
35 53778.39033304302
36 49238.77744520557
37 45134.04530222392
38 41418.86932987158
39 38057.896531675804
40 35005.66336491848
41 32232.99552800735
42 29709.21068294842
43 27408.53715925769
44 25309.982705468203
45 23391.46836199837
46 21636.42135970713
47 200

# PyTorch Tensors

+ __torch.tensor.mm()__ does the matrix multiplication of tensor
+ __torch.tensor.clamp()__ does the clamping which can be used to produce relu function
+ __torch.tensor.pow()__ is used to implement power function and __.sum()__ can be used to implement the sum of the elements of the tensor
+ __.clone()__ returns a copy of the tensor in new memory; unlike NumPy's __.copy()__, the copy stays in the computation graph, so gradients flow back to the original tensor
+ __.detach()__ returns a tensor that never requires gradient and shares the same storage as the original tensor

In [3]:
import torch

# Same two-layer network as the numpy warm-up, written with torch Tensors
# and a hand-written backward pass (no autograd involved).
dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for t in range(500):
    # Forward pass on the network.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Compute and print loss (converted to a Python float).
    loss = torch.sum(torch.pow(y_pred - y, 2)).item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss.
    grad_y_pred = (y_pred - y) * 2.0
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # Copy of grad_h_relu with zeros wherever the pre-activation was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Update weights using gradient descent.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)
    

0 25241058.0
1 19537790.0
2 17152040.0
3 15696025.0
4 14021073.0
5 11831406.0
6 9295555.0
7 6863087.5
8 4836865.0
9 3330731.0
10 2287734.0
11 1596457.5
12 1144143.75
13 847714.25
14 649714.1875
15 513911.96875
16 417789.25
17 347362.34375
18 293985.875
19 252359.546875
20 219050.875
21 191726.875
22 168922.765625
23 149605.96875
24 133085.1875
25 118834.125
26 106454.7109375
27 95638.8515625
28 86140.0390625
29 77768.71875
30 70352.3046875
31 63757.9140625
32 57887.03515625
33 52645.265625
34 47953.80859375
35 43743.5234375
36 39957.125
37 36545.2734375
38 33472.4453125
39 30698.587890625
40 28185.689453125
41 25905.9296875
42 23834.140625
43 21948.765625
44 20230.453125
45 18663.123046875
46 17231.890625
47 15923.353515625
48 14726.134765625
49 13629.4453125
50 12624.0966796875
51 11700.69921875
52 10851.8662109375
53 10070.73828125
54 9351.44140625
55 8688.7822265625
56 8077.7685546875
57 7513.73193359375
58 6992.54541015625
59 6510.77978515625
60 6065.11962890625
61 5652.65869140625

### Pytorch Autograd and Tensors

+ Manually update weights using gradient descent. Wrap in __torch.no_grad()__ because __weights have requires_grad=True, but we don't need to track this in autograd__.
+ An alternative way is to operate on __weight.data__ and __weight.grad.data__.
+ Recall that __tensor.data__ gives __a tensor that shares the storage with tensor, but doesn't track history__.
+ You can also use torch.optim.SGD to achieve this.
+ Manually zero the gradients after updating weights with __tensor.grad.zero\_()__

In [None]:
import torch

# Two-layer network trained with autograd: only the forward pass is written
# by hand, and loss.backward() fills in w1.grad and w2.grad.
dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

# Input and target data. These are not trained, so they do not need
# requires_grad.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are the tensors we want gradients for.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Scalar sum-of-squared-errors loss; backward() requires a scalar here.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    loss.backward()

    # The update itself must not be recorded in the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls, so reset them.
        w1.grad.zero_()
        w2.grad.zero_()
    

### PyTorch: Defining new autograd functions
+ Under the hood, each primitive autograd operator is really two functions that operate on Tensors.
    + The __forward__ function computes output Tensors from input Tensors.
    + The backward function receives the gradient of the output Tensors with respect to some scalar value and computes the gradient of the input Tensors with respect to that same scalar value.
+ In Pytorch, new autograd operator is defined by defining a subclass of __torch.autograd.Function__ and implementing the __forward__ and __backward__ functions.
    + We can use our new autograd operator by __constructing an instance__ and __calling it like a function__, passing Tensors containing input data.
    + __forward__
        + In forward pass we receive a Tensor containing the input and return a Tensor containing the output. __ctx__ is a context object that can be used to stash information for backward computation.
        + To cache arbitrary objects for use in the backward pass, use the __ctx.save_for_backward method__.
    + __backward__
         + In the backward pass we receive a Tensor containing the gradient of the loss with respect to the output, and we need to compute the gradient of the loss with respect to the input.
         + __ctx.saved_tensors__ can be used to obtain the tensors in __backward__ method stored in the __forward__ method
    + To apply __Function__, we use __Function.apply__ method i.e. __class\_name.apply__

#### Difference between __@staticmethod__ and __@classmethod__
+ In a regular method of a class, the __object instance (self)__ is implicitly passed as the first argument.
+ With __classmethod__, the __class__ of the object instance is implicitly passed as first argument instead of __self__.
+ With __staticmethod__, neither __self__ nor __cls__ is passed as the first argument. Static methods behave like plain functions, except that they can be called from an instance or from the class.
    + Staticmethods are used to group functions which have some logical connection with a class to the class.

In [5]:
import torch

class MyRelU(torch.autograd.Function):
    """Custom autograd ReLU with hand-written forward and backward passes.

    Use it through ``MyRelU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped below at 0 and stash it for backward."""
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``grad_output`` with zeros where the saved input was negative."""
        (saved_input,) = ctx.saved_tensors
        # masked_fill builds a new tensor, so grad_output itself is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)

# Train the two-layer network using the custom MyRelU autograd function
# in place of clamp(min=0).
dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights tracked by autograd.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

# Function.apply is how a custom autograd Function is invoked. It does not
# change between iterations, so bind the alias once outside the loop.
relu = MyRelU.apply

for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Scalar sum-of-squared-errors loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Autograd calls MyRelU.backward as part of this.
    loss.backward()

    # The update itself must not be recorded in the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls, so reset them.
        w1.grad.zero_()
        w2.grad.zero_()
    

        
        

0 42929508.0
1 45462784.0
2 46581600.0
3 37495368.0
4 21923342.0
5 9956093.0
6 4442196.0
7 2395969.0
8 1604592.0
9 1224214.75
10 992305.0625
11 826688.625
12 698724.5
13 596142.625
14 512412.5
15 443294.34375
16 385629.71875
17 337162.65625
18 296100.34375
19 261122.5625
20 231115.40625
21 205240.53125
22 182819.609375
23 163309.46875
24 146298.578125
25 131370.25
26 118231.140625
27 106635.859375
28 96368.859375
29 87249.328125
30 79134.171875
31 71884.234375
32 65390.4375
33 59567.0
34 54333.6875
35 49621.2421875
36 45372.52734375
37 41543.890625
38 38078.25390625
39 34936.265625
40 32085.8671875
41 29493.705078125
42 27133.5
43 24982.0
44 23019.15625
45 21225.72265625
46 19585.6484375
47 18078.36328125
48 16697.318359375
49 15432.00390625
50 14271.31640625
51 13205.7900390625
52 12226.236328125
53 11325.474609375
54 10496.4716796875
55 9733.7197265625
56 9030.154296875
57 8381.462890625
58 7782.75927734375
59 7230.01904296875
60 6719.345703125
61 6247.47412109375
62 5811.12060546875

### Pytorch:nn
+ The __nn__ package provides a set of __Modules__, which are roughly equivalent to neural network layers.
+ A __Module__ receives input Tensors and computes output Tensors, but may also hold internal state such as Tensors containing learnable parameters.
+ The __nn__ package also defines a set of useful __loss functions__ that are commonly used when training neural networks
+ when accessing the __gradients__ of the model, iterate over __model.parameters()__ and read each parameter's __.grad__
+ when accessing the __child modules__ of the model, use __model.children()__

In [8]:
import torch

# Two-layer network built from nn Modules, updated by hand-written
# gradient descent over model.parameters().
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Sequential applies its modules in the order given.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

for t in range(500):
    # Clear gradients left over from the previous iteration.
    model.zero_grad()

    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Backward pass fills .grad on every model parameter.
    loss.backward()

    # Manual gradient-descent step, kept out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 686.77001953125
1 636.7199096679688
2 593.7136840820312
3 555.7003784179688
4 521.685546875
5 490.9718017578125
6 462.9117126464844
7 437.169921875
8 413.18865966796875
9 390.7152099609375
10 369.6873474121094
11 349.8592224121094
12 331.1629638671875
13 313.5677490234375
14 296.9898376464844
15 281.2752990722656
16 266.3110046386719
17 251.97779846191406
18 238.3396453857422
19 225.3903350830078
20 213.05789184570312
21 201.29515075683594
22 190.1243133544922
23 179.5125274658203
24 169.43206787109375
25 159.8672637939453
26 150.76783752441406
27 142.15342712402344
28 133.99636840820312
29 126.31324005126953
30 119.05182647705078
31 112.18504333496094
32 105.66714477539062
33 99.51497650146484
34 93.70829010009766
35 88.23571014404297
36 83.08202362060547
37 78.23383331298828
38 73.65687561035156
39 69.33566284179688
40 65.26062774658203
41 61.432621002197266
42 57.81705856323242
43 54.41640853881836
44 51.21603012084961
45 48.20186233520508
46 45.36922836303711
47 42.70772171020508

### Pytorch:optim
+ The __optim__ package in PyTorch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms
+ If optimizer is used, then use __optimizer.zero_grad()__, if not:
    + If model is created from __nn__ package, then use __model.zero_grad()__, if not
        + Use __torch.tensor.grad.zero\_()__

In [10]:
import torch

# Two-layer nn.Sequential model trained with the Adam optimizer.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The criterion gets its own name: the loop below calls `loss_fn` and stores
# the resulting tensor in `loss`, so binding the criterion to `loss` would
# leave `loss_fn` undefined on a fresh kernel.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass.
    y_pred = model(x)

    # Summed squared-error loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer update
    # the parameters it was constructed with.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 657.4193115234375
1 640.6678466796875
2 624.44775390625
3 608.7592163085938
4 593.5210571289062
5 578.6680297851562
6 564.2687377929688
7 550.3374633789062
8 536.7631225585938
9 523.5759887695312
10 510.75823974609375
11 498.34600830078125
12 486.3213806152344
13 474.7369079589844
14 463.5292053222656
15 452.63037109375
16 442.07293701171875
17 431.7386474609375
18 421.6609191894531
19 411.85565185546875
20 402.29766845703125
21 393.0248107910156
22 383.9787292480469
23 375.1568908691406
24 366.6621398925781
25 358.4018249511719
26 350.3008117675781
27 342.3863220214844
28 334.7334289550781
29 327.2553405761719
30 319.9289855957031
31 312.74884033203125
32 305.73626708984375
33 298.8684997558594
34 292.1649169921875
35 285.5826721191406
36 279.1371765136719
37 272.8820495605469
38 266.7734069824219
39 260.7915344238281
40 254.9500732421875
41 249.22500610351562
42 243.62376403808594
43 238.13748168945312
44 232.7581329345703
45 227.4815673828125
46 222.32666015625
47 217.274429321289

### PyTorch: Custom nn Modules
+ We can define our own Modules by subclassing __nn.Module__ and defining a __forward__ which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors

In [11]:
import torch
class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU (clamp at 0) between them."""

    def __init__(self, D_in, H, D_out):
        """Create the D_in -> H and H -> D_out linear layers."""
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return linear2(relu(linear1(x)))."""
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

# Train the custom TwoLayerNet module with plain SGD.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data (x is drawn before y).
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

# Summed squared-error criterion and SGD over all model parameters.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    # Clear gradients from the previous step before computing new ones.
    optimizer.zero_grad()

    # Forward pass and loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

0 632.3840942382812
1 588.8019409179688
2 550.829345703125
3 517.3412475585938
4 487.2964782714844
5 459.9784240722656
6 435.0256042480469
7 411.9400939941406
8 390.4691162109375
9 370.6712951660156
10 351.9881896972656
11 334.4684753417969
12 318.0120849609375
13 302.45318603515625
14 287.66619873046875
15 273.6778564453125
16 260.3806457519531
17 247.54098510742188
18 235.28622436523438
19 223.55113220214844
20 212.28363037109375
21 201.54605102539062
22 191.25491333007812
23 181.38938903808594
24 171.98019409179688
25 162.98716735839844
26 154.3724365234375
27 146.13059997558594
28 138.2767791748047
29 130.78842163085938
30 123.66007995605469
31 116.8766860961914
32 110.43254089355469
33 104.30363464355469
34 98.4907455444336
35 92.9811019897461
36 87.76337432861328
37 82.82030487060547
38 78.13220977783203
39 73.69738006591797
40 69.50666809082031
41 65.54931640625
42 61.8089714050293
43 58.2784538269043
44 54.94819641113281
45 51.81129455566406
46 48.8522834777832
47 46.0642356872

### PyTorch: Control Flow + Weight Sharing
+ We can use __normal Python flow control__ to implement __the loop__
+ We can implement __weight sharing__ among the innermost layers by simply __reusing the same Module multiple times__ when defining the __forward pass__.

In [12]:
import random
import torch

class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass.

    The same ``middle_linear`` module is reused for each hidden repetition,
    so those repetitions share one set of weights.
    """

    def __init__(self, D_in, H, D_out):
        """Create the input (D_in -> H), middle (H -> H) and output (H -> D_out) layers."""
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Apply the input layer, 0 to 3 shared middle layers, then the output layer."""
        hidden = torch.clamp(self.input_linear(x), min=0)
        # random.randint is inclusive on both ends: 0, 1, 2 or 3 repetitions.
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)

# Train DynamicNet with SGD plus momentum.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data (x is drawn before y).
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)

# Summed squared-error criterion and momentum SGD over all model parameters.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for t in range(500):
    # Clear gradients from the previous step before computing new ones.
    optimizer.zero_grad()

    # Forward pass and loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

0 637.8126220703125
1 639.7418212890625
2 635.7413330078125
3 633.8248901367188
4 629.22607421875
5 673.8435668945312
6 628.5025024414062
7 626.6781005859375
8 542.7759399414062
9 482.1644592285156
10 633.5259399414062
11 625.4839477539062
12 609.3925170898438
13 292.12664794921875
14 620.8475341796875
15 604.4138793945312
16 567.7781372070312
17 186.27174377441406
18 160.37791442871094
19 513.4446411132812
20 109.00994110107422
21 610.83251953125
22 607.1693115234375
23 553.8339233398438
24 77.18527221679688
25 379.35516357421875
26 72.11760711669922
27 324.6487731933594
28 61.13218307495117
29 543.0363159179688
30 526.1060791015625
31 234.15106201171875
32 389.2287292480469
33 354.8470764160156
34 406.0080261230469
35 159.3876190185547
36 150.6321563720703
37 234.65814208984375
38 141.52122497558594
39 248.72360229492188
40 216.34078979492188
41 146.87339782714844
42 115.48464965820312
43 133.52316284179688
44 105.90206146240234
45 94.11792755126953
46 60.37895202636719
47 67.9872665