# Introduction to PyTorch

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
print(torch.__version__)
np.random.seed(42)

1.12.0+cpu


## Simple Regression Problem

### Data Generation

In [4]:
x = np.random.rand(1000, 1)
y = 1 + 2 * x + .1 * np.random.randn(1000, 1)

# Shuffles the indices
idx = np.arange(1000)
np.random.shuffle(idx)

# Uses first 80 random indices for train
train_idx = idx[:800]
# Uses the remaining indices for validation
val_idx = idx[800:]

# Generates train and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]

### Network Parameters

In [5]:
a = np.random.randn(1)
b = np.random.randn(1)
lr = 1e-1
num_epochs = 1000

## Simple Regression in PyTorch

### Data Preprocessing

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

x_train_tensor = torch.from_numpy(x_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)

print(type(x_train), type(x_train_tensor), x_train_tensor.type())

<class 'numpy.ndarray'> <class 'torch.Tensor'> torch.FloatTensor


### Network Parameters

If we want PyTorch to handle backprop for us, we need to make sure `requires_grad` evaluated to `True`.

In [10]:
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

tensor([1.9470], requires_grad=True) tensor([-0.7928], requires_grad=True)


### More about computational graphs

In [33]:
x = torch.tensor(-2., requires_grad=True)
y = torch.tensor( 5., requires_grad=True)
z = torch.tensor(-4., requires_grad=True)
q = x + y
f = q * z
print(x)
print(q)
print(f)

tensor(-2., requires_grad=True)
tensor(3., grad_fn=<AddBackward0>)
tensor(-12., grad_fn=<MulBackward0>)


In [34]:
print('q:', q)
print('q.grad:', q.retain_grad())
print('f:', f)
print('f.grad:', f.retain_grad())

q: tensor(3., grad_fn=<AddBackward0>)
q.grad: None
f: tensor(-12., grad_fn=<MulBackward0>)
f.grad: None


In [35]:
f.backward()
print('x.grad', x.grad)
print('y.grad', y.grad)
print('z.grad', z.grad)

x.grad tensor(-4.)
y.grad tensor(-4.)
z.grad tensor(3.)


### Optimizer

In [17]:
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
lr = 1e-1
num_epochs = 1000

optimizer = torch.optim.SGD([a,b], lr=lr)

for epoch in range(num_epochs):
    
    # forward prop
    yhat = a + b * x_train_tensor
    error = y_train_tensor - yhat
    loss = (error ** 2).mean()

    # computing gradients
    loss.backward()

    # updating parameters
    optimizer.step()
    optimizer.zero_grad()

print(a, b)

tensor([1.0093], requires_grad=True) tensor([1.9865], requires_grad=True)


### Built-in losses

In [27]:
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
lr = 1e-1
num_epochs = 1000

optimizer = torch.optim.SGD([a,b], lr=lr)
criterion = torch.nn.MSELoss()

for epoch in range(num_epochs):
    
    # forward prop
    yhat = a + b * x_train_tensor
    loss = criterion(y_train_tensor, yhat)
    # computing gradients
    loss.backward()

    # updating parameters
    optimizer.step()
    optimizer.zero_grad()

print(a, b)

tensor([1.0173], device='cuda:0', requires_grad=True) tensor([1.9844], device='cuda:0', requires_grad=True)


### Model

A model is represented by a python class that inherits from the [Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) class from PyTorch. This should feel similar to the homework exercises.

Every model needs to implement two functions:
1. `__init__(self)`: defines the parameters and layers of the model.
2. `forward(self,x)`: defines the architecture of the networks and outputs the prediction, given the input.

In [18]:
import torch.nn as nn

class ManualLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        self.a = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float))
        self.b = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float))
        
    def forward(self, x):
        return self.a + self.b * x

Notice that the parameters are wraped with `nn.Parameter` which tells PyTorch they are the parameters of our model. This is important since the optimizer should know which variables to treat as parameters.

In [19]:
net = ManualLinearRegression()
for param in net.parameters():
    print(param)

Parameter containing:
tensor([0.2345], requires_grad=True)
Parameter containing:
tensor([0.2303], requires_grad=True)


In [20]:
net.state_dict()

OrderedDict([('a', tensor([0.2345])), ('b', tensor([0.2303]))])

We need to send the model and the data to the same device. 

In [21]:
net = ManualLinearRegression()
print(net.state_dict())
lr = 1e-1
num_epochs = 1000

optimizer = torch.optim.SGD([a,b], lr=lr)
criterion = torch.nn.MSELoss()

for epoch in range(num_epochs):
    
    # forward prop
    yhat = net(x_train_tensor)
    loss = criterion(y_train_tensor, yhat)
    # computing gradients
    loss.backward()

    # updating parameters
    optimizer.step()
    optimizer.zero_grad()

print(net.state_dict())

OrderedDict([('a', tensor([-1.1229])), ('b', tensor([-0.1863]))])
OrderedDict([('a', tensor([-1.1229])), ('b', tensor([-0.1863]))])


In [34]:
net = ManualLinearRegression().to(device)
print(net.state_dict())
lr = 1e-1
num_epochs = 1000

optimizer = torch.optim.SGD(net.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

for epoch in range(num_epochs):
    net.train()
    # forward prop
    yhat = net(x_train_tensor)
    loss = criterion(y_train_tensor, yhat)
    # computing gradients
    loss.backward()

    # updating parameters
    optimizer.step()
    optimizer.zero_grad()

print(net.state_dict())

OrderedDict([('a', tensor([-1.1229], device='cuda:0')), ('b', tensor([-0.1863], device='cuda:0'))])
OrderedDict([('a', tensor([1.0173], device='cuda:0')), ('b', tensor([1.9844], device='cuda:0'))])


`model.train()` sets the model to training mode. Recall some operations, such as Dropout, have distinct behaviors in training and evaluation.

### Nested models

PyTorch has built-in models, such as a fully connected layer we can use in our model.

In [22]:
class LayerLinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)
                
    def forward(self, x):
        return self.linear(x)
    
net = LayerLinearRegression().to(device)
print(net.state_dict())
lr = 1e-1
num_epochs = 1000

optimizer = torch.optim.SGD(net.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

for epoch in range(num_epochs):
    net.train()
    # forward prop
    yhat = net(x_train_tensor)
    loss = criterion(y_train_tensor, yhat)
    # computing gradients
    loss.backward()

    # updating parameters
    optimizer.step()
    optimizer.zero_grad()

print(net.state_dict())

OrderedDict([('linear.weight', tensor([[0.7388]])), ('linear.bias', tensor([0.1354]))])
OrderedDict([('linear.weight', tensor([[1.9865]])), ('linear.bias', tensor([1.0093]))])


### Sequential models

Instead of building a class, we can use a sequential model. Sequential models in PyTorch are reserved for simple networks. The sequential model can contain other models and applies them in sequence to the input to produce its output.

Sequential models do not support multiple inputs or outputs. 

In [23]:
net = nn.Sequential(nn.Linear(1, 1)).to(device)
print(net.state_dict())
lr = 1e-1
num_epochs = 1000

optimizer = torch.optim.SGD(net.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

for epoch in range(num_epochs):
    net.train()
    # forward prop
    yhat = net(x_train_tensor)
    loss = criterion(y_train_tensor, yhat)
    # computing gradients
    loss.backward()

    # updating parameters
    optimizer.step()
    optimizer.zero_grad()

print(net.state_dict())

OrderedDict([('0.weight', tensor([[0.4822]])), ('0.bias', tensor([-0.1412]))])
OrderedDict([('0.weight', tensor([[1.9865]])), ('0.bias', tensor([1.0093]))])


In [24]:
def make_train_step(net, loss_fn, optimizer):
    # Builds function that performs a step in the train loop
    def train_step(x, y):
        # Sets net to TRAIN mode
        net.train()
        # Makes predictions
        yhat = net(x)
        # Computes loss
        loss = loss_fn(y, yhat)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()
    
    # Returns the function that will be called inside the train loop
    return train_step

# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(net, criterion, optimizer)
losses = []

for epoch in range(num_epochs):
    # Performs one train step and returns the corresponding loss
    loss = train_step(x_train_tensor, y_train_tensor)
    losses.append(loss)
    
# Checks model's parameters
print(net.state_dict())

OrderedDict([('0.weight', tensor([[1.9865]])), ('0.bias', tensor([1.0093]))])


## Datasets 

A dataset is represented by a python class that inherits from the Dataset class. It outputs a tuple of `(inputs, labels)`. 

1. The `__init__(self)` method takes any argument needed to build a list of tuples (CSV file, image folder, tensors, numpy arrays, etc.). You don't need to load all the dataset in the `__init__` method if the dataset is large. 
2. `__get_item__(self, index)` allows the dataset to be indexed so it behaves like a list. It can return the corresponding slices of the pre-loaded dataset or load them on demand.
3. `__len__(self)` returns the number of instances in the dataset.

In [25]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, x_tensor, y_tensor):
        self.x = x_tensor
        self.y = y_tensor
        
    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)

# Wait, is this a CPU tensor now? Why? Where is .to(device)?
x_train_tensor = torch.from_numpy(x_train).float()
y_train_tensor = torch.from_numpy(y_train).float()

train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])
print(len(train_data))

(tensor([0.0625]), tensor([1.0950]))
800


The dataset does not send the data to a device since it does not know in advanced where it needs to go.

## Dataloaders

The dataloader takes the dataset as input and outputs an iterator over the data.

In [26]:
from torch.utils.data import DataLoader

train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)

In [27]:
next(iter(train_loader))

[tensor([[0.8956],
         [0.6617],
         [0.8349],
         [0.0926],
         [0.6642],
         [0.8810],
         [0.9213],
         [0.2892],
         [0.2053],
         [0.4276],
         [0.1412],
         [0.8010],
         [0.6100],
         [0.0705],
         [0.2091],
         [0.0300]]),
 tensor([[2.9005],
         [2.3652],
         [2.7717],
         [1.2005],
         [2.4571],
         [2.8043],
         [2.8403],
         [1.6382],
         [1.3711],
         [1.9297],
         [1.2852],
         [2.6216],
         [2.1991],
         [1.1649],
         [1.4561],
         [1.0472]])]

In [28]:
losses = []
net = LayerLinearRegression().to(device)
train_step = make_train_step(net, criterion, optimizer)

for epoch in range(num_epochs):
    for x_batch, y_batch in train_loader:
        # the dataset "lives" in the CPU, so do our mini-batches
        # therefore, we need to send those mini-batches to the
        # device where the model "lives"
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
        
print(net.state_dict())

OrderedDict([('linear.weight', tensor([[-0.4607]])), ('linear.bias', tensor([-0.1173]))])


1. The inner loop gets the mini-batch from the dataloader and sends it to the correct device.
2. In order to make use of the memory, it is better to keep the dataset as agnostic as possible.
3. Try to keep the GPU busy with calculations and not by passing data and parameters back and forth.

## Evaluation

When evaluating the model, there are two things to consider:

1. `torch.no_grad()`: it's a good practice to wrap the validation inner loop with this context manager to disable any gradient calculations.
2. `.eval()`: setting the model to evaluation mode will adjust behavior of functions such as dropout.

In [29]:
from torch.utils.data import TensorDataset
from torch.utils.data.dataset import random_split

x = np.random.rand(1000, 1)
y = 1 + 2 * x + .1 * np.random.randn(1000, 1)

x_tensor = torch.from_numpy(x).float()
y_tensor = torch.from_numpy(y).float()

dataset = TensorDataset(x_tensor, y_tensor)

train_dataset, val_dataset = random_split(dataset, [800, 200])

train_loader = DataLoader(dataset=train_dataset, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, batch_size=20)

In [30]:
losses = []
val_losses = []
net = LayerLinearRegression().to(device)
lr = 1e-1
num_epochs = 1000

optimizer = torch.optim.SGD(net.parameters(), lr=lr)
criterion = torch.nn.MSELoss()
train_step = make_train_step(net, criterion, optimizer)

for epoch in range(num_epochs):
    net.train()
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
        
    with torch.no_grad():
        net.eval()
        for x_val, y_val in val_loader:
            x_val = x_val.to(device)
            y_val = y_val.to(device)
            yhat = net(x_val)
            val_loss = criterion(y_val, yhat)
            val_losses.append(val_loss.item())

print(net.state_dict())

OrderedDict([('linear.weight', tensor([[2.0222]])), ('linear.bias', tensor([0.9963]))])
