## Learning pytorch with examples 
- this notebook introduces the fundamental concepts of PyTorch through self-contained examples
<p>
    
including ...
- Tensors
- Autograd
- Defining new autograd functions
- nn module
- optim
- custom nn module
- Control Flow + Weight Sharing

### Tensors

In [11]:
# numpy
import numpy as np

# N: batch_size, H: hidden dims
N, D_in, H, D_out = 64, 1000, 100, 10

x = np.random.randn(N, D_in) # (64, 1000)
y = np.random.randn(N, D_out) # (64, 10)

w1 = np.random.randn(D_in, H) # (1000, 100)
w2 = np.random.randn(H, D_out) # (100, 10)

lr = 1e-6 
for t in range(500):
    # Forward pass: compute predicted y 
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    # Compute and print loss 
    loss = np.square(y_pred - y).sum()
    print(t, loss)
    
    # Backprop to compute gradients of w1 and w2 with respect to loss 
    grad_y_pred = 2.0 * (y_pred - y) # (64, 10)
    grad_w2 = h_relu.T.dot(grad_y_pred) # (100, 10)
    grad_h_relu = grad_y_pred.dot(w2.T) # (64, 100)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h) # (1000, 100)
    
    # Update weigths 
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2

0 30550786.678252764
1 25110953.947662655
2 21661292.91406838
3 17791799.747742258
4 13489025.448792547
5 9392729.094840549
6 6220181.915184816
7 4052364.2697424153
8 2698973.3105222657
9 1872115.8524401088
10 1364174.8259402183
11 1040819.2371894151
12 824925.8035429991
13 672437.2805064964
14 559520.7158192876
15 472486.4588767201
16 403249.1516163831
17 346887.9587069084
18 300263.7692678326
19 261202.6386545786
20 228194.39681613835
21 200095.44355836613
22 176059.88017745316
23 155369.21807537504
24 137490.5364258464
25 121982.82436266367
26 108486.96103980645
27 96713.69829501865
28 86401.66662157446
29 77356.74874203437
30 69393.73174146927
31 62358.41654543238
32 56141.94571395784
33 50630.32319654829
34 45732.521186704806
35 41374.81463894256
36 37489.2288511774
37 34014.95471840665
38 30903.133608181823
39 28113.521784802448
40 25611.089416310882
41 23359.500844502116
42 21329.90481229312
43 19496.829765893017
44 17840.247736029087
45 16341.184948237747
46 14982.297757163127


488 3.3429528421257415e-06
489 3.196852382234835e-06
490 3.057172409724853e-06
491 2.923652152715381e-06
492 2.7959777482993336e-06
493 2.6738649951215243e-06
494 2.5570758983416254e-06
495 2.445401363449915e-06
496 2.33862960731558e-06
497 2.2365362572224243e-06
498 2.1388982867884046e-06
499 2.0455437177485622e-06


In [15]:
# PyTorch
# numpy is not utilize GPUs to accelerate its numerical computations 
# pytorch Tensors can keep track of a computational graph and gradients, utilize GPU

import torch 

dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

lr = 1e-6

for t in range(500):
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 50:
        print(t, loss)
    
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2 
    

50 10363.41796875
150 17.94891929626465
250 0.07549849152565002
350 0.0006497364956885576
450 6.203429074957967e-05


### Autograd
- When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

In [18]:
import torch

dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64 ,1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

lr = 1e-6
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())
    
    loss.backward() # backpropagating
    
    # we don't track weight.data. So, wrap in torch.no_grad()
    with torch.no_grad():
        w1 -= lr * w1.grad
        w2 -= lr * w2.grad
        
        # manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

99 650.3226318359375
199 3.5059964656829834
299 0.02962470054626465
399 0.0005231481627561152
499 6.611215940210968e-05


### PyTorch: Defining new autograd functions
- In PyTorch we can easily define our own autograd operator by defining a subclass of torch.autograd.Function and implementing the forward and backward functions. We can then use our new autograd operator by constructing an instance and calling it like a function, passing Tensors containing input data.

In [19]:
import torch

class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """
    
    @staticmethod
    def forward(ctx, input):
        """
        In the forwad pass we receive a Tensor containing the input and return 
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary 
        onjects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)
    
    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss 
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

In [20]:
dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64 ,1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

lr = 1e-6
for epoch in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply
    
    y_pred = relu(x.mm(w1)).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    if epoch % 100 == 99:
        print(t, loss.item())
        
    # Use autograd to compute the backward pass.        
    loss.backward()
    
    # Update weights using gradient descent
    with torch.no_grad():
        w1 -= lr * w1.grad
        w2 -= lr * w2.grad
        
        w1.grad.zero_()
        w2.grad.zero_()

499 841.0775146484375
499 8.00600814819336
499 0.1287042498588562
499 0.0028903367929160595
499 0.00021001743152737617


### PyTorch: nn module


In [29]:
import torch

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

loss_fn = torch.nn.MSELoss(reduction='sum')

lr = 1e-4
for epoch in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if epoch % 100 == 99:
        print(epoch, loss.item())
        
    # Zero the gradients before running the backward pass.
    model.zero_grad()
    
    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model
    loss.backward()
    
    with torch.no_grad():
        for param in model.parameters():
            param -= lr* param.grad

99 2.4625134468078613
199 0.04482162371277809
299 0.0016880746698006988
399 8.578517008572817e-05
499 4.802728653885424e-06


### PyTorch: optim
- Up to this point we have updated the weights of our models by manually mutating the Tensors holding learnable parameters (with torch.no_grad() or .data to avoid tracking history in autograd). This is not a huge burden for simple optimization algorithms like stochastic gradient descent, but in practice we often train neural networks using more sophisticated optimizers like AdaGrad, RMSProp, Adam, etc.
- The optim package in PyTorch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms.

In [31]:
import torch

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction="sum")

# Use the optim package to define an Optimizer that will update the weights of the model for us
lr = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(500):
    y_pred = model(x)
    
    loss = loss_fn(y_pred, y)
    if epoch % 100 == 99:
        print(epoch, loss.item())
        
    # Before the backward pass, use the optimizer object to zero all of the gradients
    # for the variables it will update.
    # this is because by default, gradients are accumulated in bufffers(i.e, not overwritten)
    # whenever .backward() is called. 
    optimizer.zero_grad()
    
    loss.backward()
    
    # Calling the step function, an Optimizer makes an update to its parameters
    optimizer.step()

99 64.52503967285156
199 2.1026833057403564
299 0.03910357132554054
399 0.00018847813771571964
499 1.8031380477623316e-07


### PyTorch: Custom nn Modules


In [34]:
import torch

class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as 
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return 
        a Tensor of output data. we can use Modules defined in the constructor as 
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

In [35]:
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for epoch in range(500):
    y_pred = model(x)
    
    loss = criterion(y_pred, y)
    if epoch % 100 == 99:
        print(epoch, loss.item()) # Returns the value of this tensor as a standard Python number. 
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 2.5754308700561523
199 0.044577814638614655
299 0.0019828921649605036
399 0.0001189532849821262
499 7.833315976313315e-06


### PyTorch: Control Flow + Weight Sharing
- As an example of dynamic graphs and weight sharing, we implement a very strange model: a fully-connected ReLU network that on each forward pass chooses a random number between 1 and 4 and uses that many hidden layers, reusing the same weights multiple times to compute the innermost hidden layers.

- For this model we can use normal Python flow control to implement the loop, and we can implement weight sharing among the innermost layers by simply reusing the same Module multiple times when defining the forward pass.

In [36]:
import random
import torch

class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.
        
        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from Lua
        Torch, where each Module could be used only once.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred

In [38]:
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for epoch in range(500):
    y_pred = model(x)
    
    loss = criterion(y_pred, y)
    if epoch % 100 == 99:
        print(epoch, loss.item())
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 10.051538467407227
199 3.7825474739074707
299 0.7820899486541748
399 3.851109266281128
499 17.0799617767334
