In [1]:
# import stuff
import torch
from torch import nn
from torch.nn import functional as F

import numpy as np
import matplotlib.pyplot as plt

# Seed PyTorch's and NumPy's global random number generators so the random
# values generated later in this notebook are repeatable from run to run.
torch.manual_seed(446)
np.random.seed(446)

# Tensors: The analog to numpy arrays  
Let's go over the basics of how to use tensors. If you're familiar with numpy, these methods are very similar to those.

In [2]:
# Tensors are built from plain Python sequences, just like NumPy ndarrays
values = [0.1, 0.2, 0.3]
x_numpy = np.array(values)
x_torch = torch.tensor(values)
print("x_numpy =", x_numpy, "x_torch =", x_torch)

x_numpy = [0.1 0.2 0.3] x_torch = tensor([0.1000, 0.2000, 0.3000])


In [3]:
# Conversion works in both directions: ndarray -> tensor and tensor -> ndarray
numpy_as_tensor = torch.from_numpy(x_numpy)
tensor_as_numpy = x_torch.numpy()
print(numpy_as_tensor, tensor_as_numpy)

tensor([0.1000, 0.2000, 0.3000], dtype=torch.float64) [0.1 0.2 0.3]


In [4]:
# Elementwise arithmetic behaves the same way for tensors as for ndarrays
y_numpy, y_torch = np.array([3, 4, 5.]), torch.tensor([3, 4, 5.])
sum_numpy = x_numpy + y_numpy
sum_torch = x_torch + y_torch
print("x + y")
print(sum_numpy, sum_torch)

x + y
[3.1 4.2 5.3] tensor([3.1000, 4.2000, 5.3000])


In [5]:
# Most NumPy functions have a PyTorch counterpart with the same name
norm_numpy = np.linalg.norm(x_numpy)
norm_torch = torch.norm(x_torch)
print("norm")
print(norm_numpy, norm_torch)
print()

# Reductions along one dimension take `dim=` in PyTorch where NumPy takes `axis=`
rows = [[1, 2], [3, 4.]]
x_numpy = np.array(rows)
x_torch = torch.tensor(rows)
print("mean along the 0th dimension")
print(np.mean(x_numpy, axis=0), torch.mean(x_torch, dim=0))

norm
0.37416573867739417 tensor(0.3742)

mean along the 0th dimension
[2. 3.] tensor([2., 3.])


## Tensor.view()  
This is analogous to numpy.reshape(). Let's use a toy example

In [6]:
# Toy batch: N samples, C channels, each W x H values
N, C, W, H = 10000, 3, 28, 28
X = torch.randn(N, C, W, H)

print(X.shape)  # should return [10000, 3, 28, 28]

# Collapse the last two dimensions into one (28 x 28 = 784)
flattened = X.view(N, C, W * H)
print(flattened.shape)

# A -1 asks PyTorch to infer that dimension from the remaining ones
inferred = X.view(-1, C, W * H)
print(inferred.shape)

torch.Size([10000, 3, 28, 28])
torch.Size([10000, 3, 784])
torch.Size([10000, 3, 784])


## The requires_grad keyword  
requires_grad tells PyTorch to keep track of the [computation graph](https://colah.github.io/posts/2015-08-Backprop/) when using these tensors. In a broad sense, computation graphs keep track of the operations done on a set of variables and how the results from those operations rely on their inputs. Setting up these graphs lets you run through backpropagation using the method that we talked about last lecture. So basically, setting requires_grad to True will allow you to automatically calculate the gradient through PyTorch!

In [7]:
# Leaf tensors: requires_grad=True asks PyTorch to record every operation applied to them
a = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)

# Each result below remembers the operation that produced it (its grad_fn)
c = a + b
d = b + 1
e = c * d

for label, node in (('c', c), ('d', d), ('e', e)):
    print(label, node)

c tensor(3., grad_fn=<AddBackward0>)
d tensor(2., grad_fn=<AddBackward0>)
e tensor(6., grad_fn=<MulBackward0>)


As an example of how we can use this to auto-differentiate functions, let's consider $y = (x-2)^2$.  
We could just find the derivative of this function analytically, or we could let PyTorch do it automatically

In [8]:
# define the function
def f(x):
    """Return the quadratic (x - 2)^2, whose minimum is at x = 2."""
    shifted = x - 2
    return shifted ** 2
# define the derivative (we can do so pretty easily)
def fp(x):
    """Return the hand-derived derivative of (x - 2)^2, i.e. 2 * (x - 2)."""
    shifted = x - 2
    return 2 * shifted

# Evaluate f at x = 1 with graph recording switched on
x = torch.tensor([1.0], requires_grad=True)
y = f(x)

# Walk the recorded graph backwards; this fills x.grad with dy/dx
y.backward()

analytic_grad = fp(x)
autograd_grad = x.grad
print("Analytical f'(x):", analytic_grad)
print("PyTorch's f'(x):", autograd_grad)

Analytical f'(x): tensor([-2.], grad_fn=<MulBackward0>)
PyTorch's f'(x): tensor([-2.])


You get the same answer! So the schematic for **auto differentiation** goes like this:
* Define your data as a tensor x  
* Do operations on your tensor to get an output o (forward propagation)
* Call o.backward()
* Get the gradient you want from x.grad (an attribute, not a method call)  
  
To further show how you would do this in practice, let's use this auto-differentiation to implement a basic gradient descent algorithm using the same function as above

In [9]:
x = torch.tensor([5.0], requires_grad=True) # starting guess
step_size = 0.25

print('iter,\tx,\tf(x),\tf\'(x),\tf\'(x) pytorch')
for i in range(15):
    y = f(x)     # forward pass: this operation gets saved internally
    y.backward() # backward pass: fills x.grad with df/dx at the current x

    # Logging and the parameter update are not part of the function being
    # differentiated, so run them with graph recording switched off.
    with torch.no_grad():
        print('{},\t{:.3f},\t{:.3f},\t{:.3f},\t{:.3f}'.format(i, x.item(), y.item(), fp(x).item(), x.grad.item()))
        # Gradient descent step. Updating in place under no_grad() is the
        # supported replacement for assigning to `x.data`, which silently
        # bypasses autograd's correctness checks.
        x -= step_size * x.grad

    # We need to zero the grad variable since the backward()
    # call accumulates the gradients in .grad instead of overwriting.
    x.grad.zero_()

iter,	x,	f(x),	f'(x),	f'(x) pytorch
0,	5.000,	9.000,	6.000,	6.000
1,	3.500,	2.250,	3.000,	3.000
2,	2.750,	0.562,	1.500,	1.500
3,	2.375,	0.141,	0.750,	0.750
4,	2.188,	0.035,	0.375,	0.375
5,	2.094,	0.009,	0.188,	0.188
6,	2.047,	0.002,	0.094,	0.094
7,	2.023,	0.001,	0.047,	0.047
8,	2.012,	0.000,	0.023,	0.023
9,	2.006,	0.000,	0.012,	0.012
10,	2.003,	0.000,	0.006,	0.006
11,	2.001,	0.000,	0.003,	0.003
12,	2.001,	0.000,	0.001,	0.001
13,	2.000,	0.000,	0.001,	0.001
14,	2.000,	0.000,	0.000,	0.000


# Now to Chapter 5 Stuff  
This section covers the same stuff as in the slides, but in a more coherent fashion. Much of this code is taken from the book directly. Feel free to play around with things!  
## Defining blocks  
For more complex networks we want an easier way of defining all the layers than doing so one at a time. To do that, we can use blocks!

In [10]:
# A plain network assembled layer by layer, without any custom block
layers = [nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10)]
net = nn.Sequential(*layers)

# Random input: 2 rows of 20 features
X = torch.rand(2, 20)
net(X)

tensor([[-0.1457, -0.2313,  0.2607,  0.1052, -0.1396, -0.1844,  0.0716, -0.0148,
         -0.1052, -0.0598],
        [-0.0518, -0.2350,  0.3061, -0.0201, -0.1069, -0.1128,  0.0217,  0.0109,
         -0.1015, -0.0378]], grad_fn=<AddmmBackward0>)

The way you define a block in PyTorch is by defining a class that inherits from nn.Module. Inheriting allows PyTorch to handle a lot of the nitty-gritty details automatically. For example, as long as you aren't defining a new type of operator, PyTorch will automatically generate a backpropagation function, so you don't have to! All you have to define for a valid block is a constructor that initializes your block structure, and a forward propagation function. The below cell defines 2 different blocks, and then stitches them together to form a more complex network

In [11]:
# first, a standard block with 1 hidden layer and 1 output layer
class MLP(nn.Module):
    """Multilayer perceptron block: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self):
        """Create the two fully connected layers that hold the block's parameters."""
        # nn.Module's constructor has to run first so that the layers
        # assigned below are registered as sub-modules of this block.
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # input -> hidden
        self.out = nn.Linear(256, 10)     # hidden -> output

    def forward(self, X):
        """Compute the block's output for the input batch `X`."""
        # ReLU has no parameters, so the functional form from
        # nn.functional is used instead of an nn.ReLU module.
        hidden_activations = F.relu(self.hidden(X))
        return self.out(hidden_activations)

# Now let's define a more wacky block
class FixedHiddenMLP(nn.Module):
    """Block that mixes a trainable linear layer with a fixed random matrix.

    The same `nn.Linear(20, 20)` layer is applied twice (parameter sharing),
    with a constant random projection and a ReLU in between. The forward
    pass ends with data-dependent control flow and returns a scalar.
    """

    def __init__(self):
        super().__init__()
        # Constant random weights: never trained, but still part of the
        # model. Registering them as a buffer (instead of storing a plain
        # tensor attribute) makes them follow `.to(device)` / `.double()`
        # and be saved in `state_dict()`. Buffers never require gradients.
        self.register_buffer('rand_weight', torch.rand((20, 20)))
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar tensor computed from the input batch `X` (last dim 20)."""
        X = self.linear(X)
        # Use the created constant parameters, as well as the `relu` and `mm`
        # functions
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the fully-connected layer. This is equivalent to sharing
        # parameters with two fully-connected layers
        X = self.linear(X)
        # Control flow (this code is arbitrary and just shows that ordinary
        # Python control flow is allowed inside forward)
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

# Blocks nest: Sequential accepts whole blocks as arguments, not just single layers.
# Data flow: MLP (20 -> 256 -> 10), then Linear (10 -> 20), then FixedHiddenMLP (20 -> 20 -> 20 -> scalar sum)
net = nn.Sequential(
    MLP(),
    nn.Linear(10, 20),
    FixedHiddenMLP(),
)
net(X)

tensor(-0.0494, grad_fn=<SumBackward0>)

## Custom Layers  
We can use this method of making blocks to make custom individual layers as well

In [12]:
# Let's define a layer that doesn't have any tunable parameters, and just subtracts the mean of the input
class CenteredLayer(nn.Module):
    """Parameter-free layer that shifts its input so that its overall mean is zero."""

    def __init__(self):
        # Nothing to set up beyond nn.Module's own bookkeeping
        super().__init__()

    def forward(self, X):
        """Return `X` minus the mean taken over all of its elements."""
        overall_mean = X.mean()
        return X - overall_mean
    
layer = CenteredLayer()
# Float32 sample whose mean is 3, so the centered result is symmetric around 0
sample = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
layer(sample)

tensor([-2., -1.,  0.,  1.,  2.])

In [13]:
# Let's try making a layer with parameters this time. This is basically a normal fully connected layer with a ReLU activation built in
class MyLinear(nn.Module):
    """Fully connected layer with its own weight and bias, followed by ReLU.

    Args:
        in_units: size of the last dimension of the input.
        out_units: size of the last dimension of the output.
    """

    def __init__(self, in_units, out_units):
        super().__init__()
        # Wrapping the tensors in nn.Parameter registers them with the
        # module, so they show up in parameters() and state_dict().
        self.weight = nn.Parameter(torch.randn(in_units, out_units))
        self.bias = nn.Parameter(torch.randn(out_units,))

    def forward(self, X):
        """Return relu(X @ weight + bias)."""
        # Use the parameters themselves, not their `.data`: `.data` is
        # detached from the computation graph, so no gradient would ever
        # reach weight/bias and the layer could not be trained.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

# The constructor needs the input and output sizes of the layer
linear = MyLinear(5, 3)
print(linear.weight)
print()

# Just like blocks before, custom layers can be combined into larger networks
net = nn.Sequential(
    MyLinear(64, 8),
    MyLinear(8, 1),
)
batch = torch.rand(2, 64)
print(net(batch))

# EXERCISE: Try making your own custom layer!

Parameter containing:
tensor([[ 1.2583, -1.0947, -0.2238],
        [ 1.7347, -0.1966,  0.9766],
        [ 0.0026,  0.7633, -1.1057],
        [-0.4654, -1.4138,  1.2072],
        [-1.0673, -0.7208,  0.1859]], requires_grad=True)

tensor([[0.],
        [0.]])


## Accessing Parameters  
Sometimes you want to be able to access your parameters, like for debugging or for saving your network. There are a few ways you can do that

In [14]:
# A small network whose parameters we will inspect and modify below
params_net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(),
    nn.Linear(8, 4), nn.ReLU(),
    nn.Linear(4, 2),
)

# Printing a module lists its sub-modules together with their indices
print(params_net)

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=2, bias=True)
)


In [15]:
# Goal: read the bias of the last layer (index 4 in the Sequential)
last_layer = params_net[4]

# Option 1: take the layer's state_dict() and look up the "bias" entry
last_state = last_layer.state_dict()
print(last_state)
print(last_state["bias"])
print()

# Option 2: read the attribute directly; it is an nn.Parameter
print(type(last_layer.bias))
print(last_layer.bias)
print(last_layer.bias.data) # .data exposes the underlying tensor of values
print()

# Option 3: query the whole network; keys are "<module index>.<parameter name>"
print(params_net.state_dict()['4.bias'].data)

OrderedDict([('weight', tensor([[-0.0866, -0.0627, -0.0664, -0.1503],
        [-0.2659, -0.0934,  0.1608,  0.3442]])), ('bias', tensor([0.0628, 0.0727]))])
tensor([0.0628, 0.0727])

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.0628, 0.0727], requires_grad=True)
tensor([0.0628, 0.0727])

tensor([0.0628, 0.0727])


## Initializing Parameters  
We've learned that the way you initialize your network is very important, but how do you actually do that? You may have noticed I haven't bothered specifying how parameters are initialized in the above examples, and that's because PyTorch automatically does so for us: `nn.Linear` draws its weights and biases from a uniform distribution $\mathcal{U}(-1/\sqrt{N_{in}},\ 1/\sqrt{N_{in}})$, where $N_{in}$ is the number of input features of the layer. If we want more control over initialization, we can do it ourselves by using some common built-in initialization functions, or by creating our own function

In [16]:
# Normal (Gaussian) distribution with mean 0 and a small standard deviation (0.01)
def init_normal(m):
    """Initialize an `nn.Linear` module in place; leave every other module untouched.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 0.01, and the bias (if the layer has one) is set to zero.
    Intended to be passed to `Module.apply`.
    """
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # Layers built with bias=False have m.bias set to None, and
        # nn.init.zeros_ fails on None
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Module.apply calls init_normal on every sub-module of the network from above
params_net.apply(init_normal)

# Show the first row of weights and the first bias entry of the first layer
first_layer = params_net[0]
first_layer.weight.data[0], first_layer.bias.data[0]

(tensor([-0.0113,  0.0058, -0.0023,  0.0110]), tensor(0.))

In [17]:
# You can use an xavier distribution too
def xavier(m):
    """Re-draw the weights of an `nn.Linear` module with Xavier-uniform init.

    Modules of any other type are ignored; the bias is not modified.
    """
    if type(m) != nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
# Or, you can even just make all the weights constant
def init_constant(m):
    """Set every weight of an `nn.Linear` module to 1 and its bias (if any) to 0.

    Modules of any other type are left untouched. Intended to be passed to
    `Module.apply`.
    """
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        # Layers built with bias=False have m.bias set to None, and
        # nn.init.zeros_ fails on None
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Different layers of the same network can get different initializers
first_linear, second_linear = params_net[0], params_net[2]
first_linear.apply(xavier)
second_linear.apply(init_constant)

print(first_linear.weight.data[0])
print(second_linear.weight.data)

# EXERCISE: Try making your own parameter initialization function!

tensor([-0.0128, -0.0364, -0.0721,  0.4716])
tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])


## Sharing Parameters  
Sometimes we may want multiple layers to share the same weights. We can make that happen pretty easily

In [18]:
# Keep a handle on the layer so the very same object can be inserted twice
shared = nn.Linear(8, 8)
net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1),
)
net(torch.rand(size=(2, 4)))

first_use, second_use = net[2], net[4]

# The two positions hold equal weights...
print(first_use.weight.data[0] == second_use.weight.data[0])

# ...and a write through one position is visible through the other, which
# shows they are the same object rather than two copies with equal values
first_use.weight.data[0, 0] = 100
print(first_use.weight.data[0] == second_use.weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


## Saving and Loading  
PyTorch lets you save both individual tensors, and whole networks, to file for checkpointing / saving. The major caveat for saving networks is that it only saves the parameters, and not the structure. So, you'll need to rebuild or copy the structure over in code before loading in the parameters again

In [19]:
# Round-trip a tensor through a file, much like saving an array with NumPy
tensor_path = 'x-file'

x = torch.arange(4)
print(x)
torch.save(x, tensor_path)

y = torch.load(tensor_path)
print(y)

tensor([0, 1, 2, 3])
tensor([0, 1, 2, 3])


In [20]:
# Write the parameters (state_dict) of the network used above to disk
checkpoint_path = 'p_net.params'
torch.save(params_net.state_dict(), checkpoint_path)

# To test loading, build a second network with the same layer layout and
# load the saved parameters into it
clone_net = nn.Sequential(
    nn.Linear(4, 8), nn.ReLU(),
    nn.Linear(8, 4), nn.ReLU(),
    nn.Linear(4, 2),
)
saved_state = torch.load(checkpoint_path)
clone_net.load_state_dict(saved_state)
clone_net.eval()

# If loading worked, both printouts are the same. Feel free to verify this for different layers!
for network in (params_net, clone_net):
    print(network[4].state_dict())

OrderedDict([('weight', tensor([[ 0.0113, -0.0101,  0.0112,  0.0005],
        [-0.0008, -0.0005, -0.0174, -0.0270]])), ('bias', tensor([0., 0.]))])
OrderedDict([('weight', tensor([[ 0.0113, -0.0101,  0.0112,  0.0005],
        [-0.0008, -0.0005, -0.0174, -0.0270]])), ('bias', tensor([0., 0.]))])


## Running on a GPU  
This last section requires an NVIDIA GPU and CUDA to run properly. Even if your machine doesn't have a GPU, it might still be useful to take a look at the code 

In [21]:
# Construct the two kinds of device handle used in this section (CPU and CUDA).
# The resulting tuple is not assigned to anything, so it is simply discarded.
torch.device('cpu'), torch.device('cuda')
# Here's a helper function that specifies the device as gpu if one exists. If there isn't a GPU, it'll return the cpu device
def try_gpu(i=0):  #@save
    """Return the `i`-th CUDA device if at least `i + 1` are available, else the CPU device."""
    has_device_i = torch.cuda.device_count() >= i + 1
    return torch.device(f'cuda:{i}') if has_device_i else torch.device('cpu')
try_gpu() # With a working CUDA setup this returns device(type='cuda', index=0); otherwise device(type='cpu')

device(type='cpu')

In [22]:
# Create tensors directly on the chosen device (falls back to the CPU when no GPU is set up)
device = try_gpu()
X = torch.ones(2, 3, device=device)
Y = torch.rand(2, 3, device=device)
print(X)  # on a GPU the printout includes the device, e.g. device='cuda:0'

# Both operands live on the same device, so the addition runs there too
Z = X + Y

tensor([[1., 1., 1.],
        [1., 1., 1.]])


In [23]:
# Finally, you can transfer networks to GPU if you want.
# Module.to() moves the parameters in place and returns the same module, so
# GPU_net and params_net refer to one and the same network afterwards.
device = try_gpu()
GPU_net = params_net.to(device=device)
# The input has to live on the same device as the network's parameters;
# a CPU input fed to a GPU network raises a device-mismatch error.
GPU_net(torch.rand(4, device=device)) # This operation will happen on the GPU (if one is available)

tensor([ 0.0261, -0.0924], grad_fn=<AddBackward0>)