In [1]:
import torch

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
x = torch.tensor(3.)
w = torch.tensor(4., requires_grad=True)
b = torch.tensor(5., requires_grad=True)
x,w,b

(tensor(3.), tensor(4., requires_grad=True), tensor(5., requires_grad=True))

In [4]:
y = w * x + b
y

tensor(17., grad_fn=<AddBackward0>)

In [5]:
y.backward()

In [6]:
# Display gradients
print('dy/dx:', x.grad)
print('dy/dw:', w.grad)
print('dy/db:', b.grad)

dy/dx: None
dy/dw: tensor(3.)
dy/db: tensor(1.)


In [7]:
import numpy as np

In [8]:
# Input (temp, rainfall, humidity)
inputs = np.array([[73, 67, 43], 
                   [91, 88, 64], 
                   [87, 134, 58], 
                   [102, 43, 37], 
                   [69, 96, 70]], dtype='float32')
inputs

array([[ 73.,  67.,  43.],
       [ 91.,  88.,  64.],
       [ 87., 134.,  58.],
       [102.,  43.,  37.],
       [ 69.,  96.,  70.]], dtype=float32)

In [9]:
# Targets (apples, oranges)
targets = np.array([[56, 70], 
                    [81, 101], 
                    [119, 133], 
                    [22, 37], 
                    [103, 119]], dtype='float32')
targets

array([[ 56.,  70.],
       [ 81., 101.],
       [119., 133.],
       [ 22.,  37.],
       [103., 119.]], dtype=float32)

In [10]:
# Convert inputs and targets to tensors
inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


In [11]:
# Weights and biases
w = torch.randn(2, 3, requires_grad=True)
b = torch.randn(2, requires_grad=True)
print(w)
print(b)

tensor([[-0.4684,  0.3947,  0.2923],
        [ 1.8795,  0.3470,  0.6187]], requires_grad=True)
tensor([0.5245, 0.0905], requires_grad=True)


In [12]:
def model(x):
    return x @ w.t() + b

In [13]:
preds = model(inputs)
print(preds)

tensor([[  5.3462, 187.1460],
        [ 11.3424, 241.2563],
        [ 29.6175, 245.9895],
        [-19.4632, 229.6103],
        [ 26.5582, 206.3960]], grad_fn=<AddBackward0>)


In [14]:
# Compare with targets
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


In [15]:
def mse(t1, t2):
    diff = t1 - t2
    return torch.sum(diff * diff) / diff.numel()

In [16]:
loss = mse(preds, targets)
print(loss)

tensor(11386.8154, grad_fn=<DivBackward0>)


In [17]:
# Compute gradients
loss.backward()

In [18]:
# Gradients for weights
print(w)
print(w.grad)

tensor([[-0.4684,  0.3947,  0.2923],
        [ 1.8795,  0.3470,  0.6187]], requires_grad=True)
tensor([[-5463.3149, -6124.4507, -3741.0891],
        [11364.3271, 10400.8359,  6762.2744]])


In [19]:
w
w.grad

tensor([[-5463.3149, -6124.4507, -3741.0891],
        [11364.3271, 10400.8359,  6762.2744]])

In [20]:
with torch.no_grad():
    w -= w.grad * 1e-5
    b -= b.grad * 1e-5

In [21]:
# Let's verify that the loss is actually lower
loss = mse(preds, targets)
print(loss)

tensor(11386.8154, grad_fn=<DivBackward0>)


In [22]:
w.grad.zero_()
b.grad.zero_()
print(w.grad)
print(b.grad)

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([0., 0.])


Train the model using gradient descent

As seen above, we reduce the loss and improve our model using the gradient descent optimization algorithm. Thus, we can train the model using the following steps:

    Generate predictions

    Calculate the loss

    Compute gradients w.r.t the weights and biases

    Adjust the weights by subtracting a small quantity proportional to the gradient

    Reset the gradients to zero

Let's implement the above step by step.

In [23]:
# Generate predictions
preds = model(inputs)
print(preds)

tensor([[ 15.0471, 168.9724],
        [ 24.0985, 217.4329],
        [ 44.7479, 218.2420],
        [ -9.8723, 211.0430],
        [ 38.8268, 183.8349]], grad_fn=<AddBackward0>)


In [24]:
# Calculate the loss
loss = mse(preds, targets)
print(loss)

tensor(8067.5205, grad_fn=<DivBackward0>)


In [25]:
# Compute gradients
loss.backward()
print(w.grad)
print(b.grad)

tensor([[-4461.2910, -5046.4194, -3076.1382],
        [ 9492.4834,  8401.5322,  5525.9165]])
tensor([-53.6304, 107.9050])


In [26]:
# Adjust weights & reset gradients
with torch.no_grad():
    w -= w.grad * 1e-5
    b -= b.grad * 1e-5
    w.grad.zero_()
    b.grad.zero_()

In [27]:
print(w)
print(b)

tensor([[-0.3691,  0.5064,  0.3605],
        [ 1.6709,  0.1590,  0.4958]], requires_grad=True)
tensor([0.5257, 0.0881], requires_grad=True)


In [28]:
# Calculate loss
preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(5825.7798, grad_fn=<DivBackward0>)


## Train for multiple epochs

To reduce the loss further, we can repeat the process of adjusting the weights and biases using the gradients multiple times. Each iteration is called an _epoch_. Let's train the model for 100 epochs.

In [29]:
epoch = 100
for i in range(epoch):
    preds = model(inputs)
    loss = mse(preds, targets)
    loss.backward()
    with torch.no_grad():
        w -= w.grad * 1e-5
        b -= b.grad * 1e-5
        w.grad.zero_()
        b.grad.zero_()

In [30]:
# Calculate loss
preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(353.6259, grad_fn=<DivBackward0>)


In [31]:
# Predictions
preds

tensor([[ 58.5323,  78.8183],
        [ 81.9244, 103.3066],
        [117.1392, 113.2827],
        [ 28.3936,  86.0818],
        [ 97.2659,  95.1066]], grad_fn=<AddBackward0>)

In [32]:
# Targets
targets

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])

## Linear regression using PyTorch built-ins

We've implemented linear regression & gradient descent model using some basic tensor operations. However, since this is a common pattern in deep learning, PyTorch provides several built-in functions and classes to make it easy to create and train models with just a few lines of code.

Let's begin by importing the `torch.nn` package from PyTorch, which contains utility classes for building neural networks.

In [33]:
import torch.nn as nn

In [34]:
# Input (temp, rainfall, humidity)
inputs = np.array([[73, 67, 43], 
                   [91, 88, 64], 
                   [87, 134, 58], 
                   [102, 43, 37], 
                   [69, 96, 70], 
                   [74, 66, 43], 
                   [91, 87, 65], 
                   [88, 134, 59], 
                   [101, 44, 37], 
                   [68, 96, 71], 
                   [73, 66, 44], 
                   [92, 87, 64], 
                   [87, 135, 57], 
                   [103, 43, 36], 
                   [68, 97, 70]], 
                  dtype='float32')

# Targets (apples, oranges)
targets = np.array([[56, 70], 
                    [81, 101], 
                    [119, 133], 
                    [22, 37], 
                    [103, 119],
                    [57, 69], 
                    [80, 102], 
                    [118, 132], 
                    [21, 38], 
                    [104, 118], 
                    [57, 69], 
                    [82, 100], 
                    [118, 134], 
                    [20, 38], 
                    [102, 120]], 
                   dtype='float32')

inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)

In [35]:
inputs

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.],
        [ 74.,  66.,  43.],
        [ 91.,  87.,  65.],
        [ 88., 134.,  59.],
        [101.,  44.,  37.],
        [ 68.,  96.,  71.],
        [ 73.,  66.,  44.],
        [ 92.,  87.,  64.],
        [ 87., 135.,  57.],
        [103.,  43.,  36.],
        [ 68.,  97.,  70.]])

In [36]:
from torch.utils.data import TensorDataset

In [37]:
# Define dataset
train_ds = TensorDataset(inputs, targets)
train_ds[0:3]

(tensor([[ 73.,  67.,  43.],
         [ 91.,  88.,  64.],
         [ 87., 134.,  58.]]),
 tensor([[ 56.,  70.],
         [ 81., 101.],
         [119., 133.]]))

In [38]:
from torch.utils.data import DataLoader

In [39]:
# Define data loader
batch_size = 5
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

In [40]:
for xb, yb in train_dl:
    print(xb)
    print(yb)
    break

tensor([[ 68.,  96.,  71.],
        [ 73.,  66.,  44.],
        [ 69.,  96.,  70.],
        [ 91.,  87.,  65.],
        [ 87., 135.,  57.]])
tensor([[104., 118.],
        [ 57.,  69.],
        [103., 119.],
        [ 80., 102.],
        [118., 134.]])


In [41]:
# Define model
model = nn.Linear(3, 2)
print(model.weight)
print(model.bias)

Parameter containing:
tensor([[-0.4712, -0.5124,  0.0864],
        [-0.3313, -0.4075,  0.5128]], requires_grad=True)
Parameter containing:
tensor([-0.2415,  0.1593], requires_grad=True)


In [42]:
# Parameters
list(model.parameters())

[Parameter containing:
 tensor([[-0.4712, -0.5124,  0.0864],
         [-0.3313, -0.4075,  0.5128]], requires_grad=True),
 Parameter containing:
 tensor([-0.2415,  0.1593], requires_grad=True)]

In [43]:
# Generate predictions
preds = model(inputs)
preds

tensor([[ -65.2532,  -29.2831],
        [ -82.6800,  -33.0371],
        [-104.8852,  -53.5346],
        [ -67.1384,  -32.1874],
        [ -75.8945,  -25.9316],
        [ -65.2120,  -29.2069],
        [ -82.0811,  -32.1168],
        [-105.2700,  -53.3531],
        [ -67.1796,  -32.2636],
        [ -75.3368,  -25.0875],
        [ -64.6544,  -28.3628],
        [ -82.6388,  -32.9609],
        [-105.4841,  -54.4549],
        [ -67.6960,  -33.0315],
        [ -75.9357,  -26.0078]], grad_fn=<AddmmBackward0>)

## Loss Function

Instead of defining a loss function manually, we can use the built-in loss function `mse_loss`.

In [44]:
# Import nn.functional
import torch.nn.functional as F

In [45]:
# Define loss function
loss_fn = F.mse_loss

In [46]:
loss = loss_fn(model(inputs), targets)
print(loss)

tensor(21953.6211, grad_fn=<MseLossBackward0>)


In [47]:
# Define optimizer
opt = torch.optim.SGD(model.parameters(), lr=1e-5)

In [48]:
opt

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 1e-05
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

## Train the model

We are now ready to train the model. We'll follow the same process to implement gradient descent:

1. Generate predictions

2. Calculate the loss

3. Compute gradients w.r.t the weights and biases

4. Adjust the weights by subtracting a small quantity proportional to the gradient

5. Reset the gradients to zero

The only change is that we'll work batches of data instead of processing the entire training data in every iteration. Let's define a utility function `fit` that trains the model for a given number of epochs.

In [49]:
# Utility function to train the model
def fit(num_epochs, model, loss_fn, opt, train_dl):
    
    # Repeat for given number of epochs
    for epoch in range(num_epochs):
        
        # Train with batches of data
        for xb,yb in train_dl:
            
            # 1. Generate predictions
            pred = model(xb)
            
            # 2. Calculate loss
            loss = loss_fn(pred, yb)
            
            # 3. Compute gradients
            loss.backward()
            
            # 4. Update parameters using gradients
            opt.step()
            
            # 5. Reset the gradients to zero
            opt.zero_grad()
        
        # Print the progress
        if (epoch+1) % 10 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

In [50]:
fit(100, model, loss_fn, opt, train_dl)

Epoch [10/100], Loss: 374.6248
Epoch [20/100], Loss: 199.9262
Epoch [30/100], Loss: 98.7275
Epoch [40/100], Loss: 96.3187
Epoch [50/100], Loss: 101.7623
Epoch [60/100], Loss: 34.4488
Epoch [70/100], Loss: 22.8939
Epoch [80/100], Loss: 20.0171
Epoch [90/100], Loss: 25.4693
Epoch [100/100], Loss: 12.5075


In [51]:
# Generate predictions
preds = model(inputs)
preds

tensor([[ 57.9545,  71.2314],
        [ 82.5896, 101.7158],
        [115.4518, 128.7286],
        [ 26.7586,  42.5541],
        [ 99.3062, 117.7835],
        [ 56.8810,  70.3195],
        [ 82.4716, 101.9725],
        [115.8082, 129.4570],
        [ 27.8320,  43.4660],
        [100.2617, 118.9520],
        [ 57.8365,  71.4881],
        [ 81.5162, 100.8039],
        [115.5698, 128.4720],
        [ 25.8031,  41.3855],
        [100.3797, 118.6954]], grad_fn=<AddmmBackward0>)

In [52]:
# Compare with targets
targets

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 57.,  69.],
        [ 80., 102.],
        [118., 132.],
        [ 21.,  38.],
        [104., 118.],
        [ 57.,  69.],
        [ 82., 100.],
        [118., 134.],
        [ 20.,  38.],
        [102., 120.]])

In [53]:
model(torch.tensor([[75, 63, 44.]]))

tensor([[54.9157, 68.9725]], grad_fn=<AddmmBackward0>)