In [46]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

from IPython.display import display, Markdown

import numpy as np
import torch

# Using Tensors and Autograd

## Data

In [2]:
housing = fetch_california_housing()

# Two-stage split: hold out the test set first, then carve the validation set
# (10%) out of what remains for training. Both splits use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    housing.data,
    housing.target,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

In [3]:
# Report the size of every split: feature matrices first, then the targets,
# separated by one blank line.
print(
    f'{X_train.shape=}',
    f'{X_test.shape=}',
    f'{X_val.shape=}',
    '',
    f'{y_train.shape=}',
    f'{y_test.shape=}',
    f'{y_val.shape=}',
    sep='\n',
)

X_train.shape=(13932, 8)
X_test.shape=(5160, 8)
X_val.shape=(1548, 8)

y_train.shape=(13932,)
y_test.shape=(5160,)
y_val.shape=(1548,)


In [4]:
# Convert each split to a float32 tensor. torch.as_tensor with an explicit
# dtype replaces the legacy torch.FloatTensor constructor and also accepts an
# input that is already a tensor.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_val = torch.as_tensor(X_val, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)

# Per-feature statistics are computed on the training split only and then
# reused for validation and test, so no information leaks from held-out data.
means = X_train.mean(dim=0, keepdim=True)
stds = X_train.std(dim=0, keepdim=True)

# Standardize every split with the training statistics (broadcast over rows).
X_train = (X_train - means) / stds
X_val = (X_val - means) / stds
X_test = (X_test - means) / stds

In [5]:
# Targets become float32 column vectors of shape (n_samples, 1) so they line
# up with the (n_samples, 1) predictions. torch.as_tensor replaces the legacy
# torch.FloatTensor constructor and makes this cell safe to run again.
y_train = torch.as_tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_val = torch.as_tensor(y_val, dtype=torch.float32).reshape(-1, 1)
y_test = torch.as_tensor(y_test, dtype=torch.float32).reshape(-1, 1)

## Model

### Parameters initialization

In [6]:
# Fix the RNG so the random starting weights are reproducible.
torch.manual_seed(42)

n_features = X_train.size(1)  # 8
# One weight per feature as a column vector, plus a scalar bias starting at 0;
# both are tracked by autograd.
weights = torch.randn(n_features, 1, requires_grad=True)
bias = torch.tensor(0.0, requires_grad=True)

In [7]:
def print_linear_func(w=None, b=None):
    """Show a linear model as a readable equation.

    Builds ``f(x) = w1 x_{1} + w2 x_{2} + ... + b`` with every number rounded
    to two decimals. Negative terms are written as ``- 1.12x_{5}`` rather than
    ``+ -1.12x_{5}``, and subscripts are wrapped in braces so indices of two
    or more digits render correctly in LaTeX.

    The equation is rendered through ``display(Markdown(...))``; if those
    names are not defined (``NameError``), the plain string is printed.

    Args:
        w: Tensor holding one coefficient per feature, e.g. of shape
            ``(n_features, 1)``. Defaults to the notebook-level ``weights``.
        b: Single-element tensor (or number) with the intercept. Defaults to
            the notebook-level ``bias``.
    """
    w = weights if w is None else w
    b = bias if b is None else b

    # One coefficient per feature, followed by the intercept (no x_i label).
    values = w.reshape(-1).tolist() + [float(b)]
    labels = [f'x_{{{idx}}}' for idx in range(1, len(values))] + ['']

    pieces = []
    for value, label in zip(values, labels):
        text = f'{value:.2f}'
        # Take the sign from the rounded text so it always matches the digits.
        negative = text.startswith('-')
        magnitude = text.lstrip('-') + label
        if not pieces:
            # The leading term carries its sign without a separating space.
            pieces.append(f'-{magnitude}' if negative else magnitude)
        else:
            pieces.append(f"{'-' if negative else '+'} {magnitude}")
    equation_str = 'f(x) = ' + ' '.join(pieces)

    try:
        display(Markdown(f'${equation_str}$'))
    except NameError:
        # display/Markdown are not in scope: fall back to plain text.
        print(equation_str)

In [8]:
print_linear_func()  # parameters are still at their random initial values (not trained yet)

$f(x) = 0.34x_1 + 0.13x_2 + 0.23x_3 + 0.23x_4 + -1.12x_5 + -0.19x_6 + 2.21x_7 + -0.64x_8 + 0.0$

### Training

In [9]:
# Hyperparameters for full-batch gradient descent; both are reused later by
# the high-level-API section.
learning_rate = 0.4
n_epochs = 100

for epoch in range(n_epochs):
    # Forward pass over the whole training set in a single matrix product.
    y_pred = X_train @ weights + bias  # calculates prediction

    loss = ((y_pred - y_train) ** 2).mean()  # MSE
    # Autograd fills weights.grad and bias.grad with d(loss)/d(param).
    loss.backward()

    # The update itself must not be recorded in the autograd graph.
    with torch.no_grad():
        bias -= learning_rate * bias.grad
        weights -= learning_rate * weights.grad
        # backward() accumulates into .grad, so clear it before the next epoch.
        bias.grad.zero_()
        weights.grad.zero_()

    # The reported loss was computed before this epoch's parameter update.
    print(f'Epoch {epoch+1:3}/{n_epochs}, Loss: {loss.item()}')

Epoch   1/100, Loss: 15.992842674255371
Epoch   2/100, Loss: 4.682830810546875
Epoch   3/100, Loss: 2.1424787044525146
Epoch   4/100, Loss: 1.2740927934646606
Epoch   5/100, Loss: 0.9412086009979248
Epoch   6/100, Loss: 0.8018259406089783
Epoch   7/100, Loss: 0.735872209072113
Epoch   8/100, Loss: 0.6991417407989502
Epoch   9/100, Loss: 0.6748808026313782
Epoch  10/100, Loss: 0.6565617322921753
Epoch  11/100, Loss: 0.6415442228317261
Epoch  12/100, Loss: 0.6286912560462952
Epoch  13/100, Loss: 0.617459774017334
Epoch  14/100, Loss: 0.6075484156608582
Epoch  15/100, Loss: 0.5987609028816223
Epoch  16/100, Loss: 0.5909507870674133
Epoch  17/100, Loss: 0.5839995741844177
Epoch  18/100, Loss: 0.5778065919876099
Epoch  19/100, Loss: 0.5722846984863281
Epoch  20/100, Loss: 0.5673576593399048
Epoch  21/100, Loss: 0.5629581212997437
Epoch  22/100, Loss: 0.5590269565582275
Epoch  23/100, Loss: 0.5555118322372437
Epoch  24/100, Loss: 0.5523662567138672
Epoch  25/100, Loss: 0.5495496392250061
Epo

In [10]:
print_linear_func()

$f(x) = 0.86x_1 + 0.12x_2 + -0.32x_3 + 0.37x_4 + -0.00x_5 + -0.05x_6 + -0.88x_7 + -0.86x_8 + 2.069737434387207$

### Prediction

In [11]:
# Use the first three (already standardized) test rows as stand-ins for new data.
X_new = X_test[:3]
X_new

tensor([[-1.1522, -0.2859, -0.5005, -0.1674, -0.0279,  0.0772,  0.1947,  0.2852],
        [-0.7065,  0.1102, -0.1620,  0.2087,  0.1230, -0.0343, -0.2362,  0.0602],
        [-0.2091,  1.8527, -0.5864,  0.1920, -0.0995, -0.1572,  1.0096, -1.4302]])

In [12]:
# Inference only: no_grad() keeps autograd from tracking this forward pass.
with torch.no_grad():
    y_pred = X_new @ weights + bias

In [13]:
y_pred

tensor([[0.7306],
        [1.7652],
        [2.7173]])

### Evaluating

In [15]:
# Predict on the full test split; this overwrites y_pred with one row per
# test sample, without tracking gradients.
with torch.no_grad():
    y_pred = X_test @ weights + bias

In [16]:
# Test-set metrics for the hand-written model: root of sklearn's MSE, and r².
print(f'RMSE: {np.sqrt(mean_squared_error(y_test.numpy(), y_pred.numpy())).item()}')
print(f'  r²: {r2_score(y_test.numpy(), y_pred.numpy())}')

RMSE: 0.7354057464293138
  r²: 0.5912830829620361


# Using PyTorch's High-Level API

## nn.Modules

In [19]:
import torch.nn as nn

In [20]:
# Seed first: nn.Linear draws its initial weight and bias from the RNG.
torch.manual_seed(42)

# Single linear layer: n_features inputs -> 1 output.
model = nn.Linear(n_features, 1)

In [27]:
def print_model_linear_func(module=None):
    """Show a single-output linear module as a readable equation.

    Builds ``f(x) = w1 x_{1} + w2 x_{2} + ... + b`` from the first row of
    ``module.weight`` and the single value in ``module.bias``, with every
    number rounded to two decimals. Negative terms are written as
    ``- 0.08x_{3}`` rather than ``+ -0.08x_{3}``, and subscripts are wrapped
    in braces so indices of two or more digits render correctly in LaTeX.

    The equation is rendered through ``display(Markdown(...))``; if those
    names are not defined (``NameError``), the plain string is printed.

    Args:
        module: Object exposing ``weight`` (2-D, one row per output) and a
            one-element ``bias``, such as ``nn.Linear(n, 1)``. Defaults to
            the notebook-level ``model``.
    """
    module = model if module is None else module

    # Coefficients of the first (only) output row, followed by the intercept.
    values = module.weight[0].tolist() + [module.bias.item()]
    labels = [f'x_{{{idx}}}' for idx in range(1, len(values))] + ['']

    pieces = []
    for value, label in zip(values, labels):
        text = f'{value:.2f}'
        # Take the sign from the rounded text so it always matches the digits.
        negative = text.startswith('-')
        magnitude = text.lstrip('-') + label
        if not pieces:
            # The leading term carries its sign without a separating space.
            pieces.append(f'-{magnitude}' if negative else magnitude)
        else:
            pieces.append(f"{'-' if negative else '+'} {magnitude}")
    equation_str = 'f(x) = ' + ' '.join(pieces)

    try:
        display(Markdown(f'${equation_str}$'))
    except NameError:
        # display/Markdown are not in scope: fall back to plain text.
        print(equation_str)

In [28]:
print_model_linear_func()

$f(x) = 0.27x_1 + 0.29x_2 + -0.08x_3 + 0.32x_4 + -0.08x_5 + 0.07x_6 + -0.17x_7 + 0.21x_8 + 0.31167247891426086$

## Exploring a model

In [21]:
model.weight

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)

In [22]:
model.bias

Parameter containing:
tensor([0.3117], requires_grad=True)

In [29]:
# parameters() yields the module's learnable tensors (weight, then bias);
# print each one on its own line.
print(*model.parameters(), sep='\n')

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
Parameter containing:
tensor([0.3117], requires_grad=True)


In [30]:
# named_parameters() yields (name, parameter) pairs; print each pair as a tuple.
for name, tensor in model.named_parameters():
    print((name, tensor))

('weight', Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True))
('bias', Parameter containing:
tensor([0.3117], requires_grad=True))


In [31]:
model(X_train[:2])  # make predictions

tensor([[0.6524],
        [0.5043]], grad_fn=<AddmmBackward0>)

Aurélien Géron, *Hands-On Machine Learning*, p. 553:

> When we use a module as a function, PyTorch internally **calls the module’s forward()** method. In the case of the nn.Linear module, the forward() method computes `X @ self.weight.T + self.bias` (where `X` is the input). That’s just what we need for linear regression!

## Optimizer and Loss function

In [38]:
# Reuse the hyperparameters defined for the manual training loop so both
# approaches are trained under the same settings.
print(f'{learning_rate=}', f'{n_epochs=}', sep='\n')

learning_rate=0.4
n_epochs=100


In [35]:
# Mean-squared-error criterion, and plain SGD over the model's parameters.
mse = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [36]:
def train_bgd(model, optimizer, criterion, X_train, y_train, n_epochs, log_every=1):
    """Train ``model`` with full-batch gradient descent.

    Each epoch runs one forward pass over all of ``X_train``, computes
    ``criterion(prediction, y_train)``, backpropagates, lets ``optimizer``
    update the parameters, and clears the gradients again.

    Args:
        model: Callable module mapping ``X_train`` to predictions.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss callable taking ``(predictions, targets)``.
        X_train: Training inputs, passed to ``model`` as one batch.
        y_train: Training targets, passed to ``criterion``.
        n_epochs: Number of full-batch updates to perform.
        log_every: Print the loss every ``log_every`` epochs. The default of
            1 prints every epoch; 0 or ``None`` disables printing.

    Returns:
        None. ``model``'s parameters are updated in place.
    """
    # Gradients left behind by an earlier backward() call would otherwise be
    # accumulated into the first update, so start from a clean slate.
    optimizer.zero_grad()

    for epoch in range(n_epochs):
        y_pred = model(X_train)

        loss = criterion(y_pred, y_train)  # calculate loss function
        loss.backward()

        optimizer.step()  # update parameters with grad
        optimizer.zero_grad()  # reset grad

        # The reported loss was computed before this epoch's update.
        if log_every and (epoch + 1) % log_every == 0:
            print(f"Epoch {epoch + 1:3}/{n_epochs}, Loss: {loss.item()}")

In [39]:
train_bgd(model, optimizer, mse, X_train, y_train, n_epochs)

Epoch   1/100, Loss: 4.27860164642334
Epoch   2/100, Loss: 0.7788285613059998
Epoch   3/100, Loss: 0.6250892877578735
Epoch   4/100, Loss: 0.605811595916748
Epoch   5/100, Loss: 0.5953069925308228
Epoch   6/100, Loss: 0.5868978500366211
Epoch   7/100, Loss: 0.5797048211097717
Epoch   8/100, Loss: 0.5734245777130127
Epoch   9/100, Loss: 0.5678966045379639
Epoch  10/100, Loss: 0.56301349401474
Epoch  11/100, Loss: 0.5586923956871033
Epoch  12/100, Loss: 0.5548644661903381
Epoch  13/100, Loss: 0.5514711737632751
Epoch  14/100, Loss: 0.5484612584114075
Epoch  15/100, Loss: 0.545789897441864
Epoch  16/100, Loss: 0.5434176921844482
Epoch  17/100, Loss: 0.5413098335266113
Epoch  18/100, Loss: 0.5394359230995178
Epoch  19/100, Loss: 0.5377690196037292
Epoch  20/100, Loss: 0.5362852215766907
Epoch  21/100, Loss: 0.5349637269973755
Epoch  22/100, Loss: 0.5337859392166138
Epoch  23/100, Loss: 0.5327355861663818
Epoch  24/100, Loss: 0.531798243522644
Epoch  25/100, Loss: 0.5309612154960632
Epoch  

In [48]:
print_model_linear_func()

$f(x) = 0.85x_1 + 0.12x_2 + -0.32x_3 + 0.36x_4 + -0.00x_5 + -0.05x_6 + -0.89x_7 + -0.86x_8 + 2.069737434387207$

### Prediction

In [40]:
# Same first three standardized test rows that were used in the
# tensors-and-autograd section.
X_new = X_test[:3]
X_new

tensor([[-1.1522, -0.2859, -0.5005, -0.1674, -0.0279,  0.0772,  0.1947,  0.2852],
        [-0.7065,  0.1102, -0.1620,  0.2087,  0.1230, -0.0343, -0.2362,  0.0602],
        [-0.2091,  1.8527, -0.5864,  0.1920, -0.0995, -0.1572,  1.0096, -1.4302]])

In [41]:
# Calling the module runs its forward pass; no_grad() skips gradient tracking
# because this is inference only.
with torch.no_grad():
    y_pred = model(X_new)

In [42]:
y_pred

tensor([[0.7285],
        [1.7669],
        [2.7148]])

### Evaluating

In [43]:
# Predict on the full test split with the trained module; this overwrites
# y_pred, and no gradients are tracked.
with torch.no_grad():
    y_pred = model(X_test)

In [45]:
# Test-set metrics for the nn.Linear model: root of sklearn's MSE, and r².
print(f'RMSE: {np.sqrt(mean_squared_error(y_test.numpy(), y_pred.numpy())).item()}')
print(f'  r²: {r2_score(y_test.numpy(), y_pred.numpy())}')

RMSE: 0.735153799833295
  r²: 0.5915631055831909


<hr>



# Conclusion

Comparison between the two training loops:

**Only tensors and Autograd:**

```python
for epoch in range(n_epochs):
    y_pred = X_train @ weights + bias             # 1. Predictions calculated directly

    loss = ((y_pred - y_train) ** 2).mean()       # 2. Loss calculated directly
    loss.backward()

    with torch.no_grad():                         # 3. Manually entering the no_grad() context
        bias -= learning_rate * bias.grad         #    bias update
        weights -= learning_rate * weights.grad   #    weights update
        bias.grad.zero_()                         #    bias zero grad
        weights.grad.zero_()                      #    weights zero grad
```

**High-level API:**

```python
for epoch in range(n_epochs):
    y_pred = model(X_train)                       # 1. Calculated through model() 

    loss = criterion(y_pred, y_train)             # 2. Calculated through the nn.MSELoss() module
    loss.backward()

    optimizer.step()                              # 3. weights and bias update in no_grad() context
    optimizer.zero_grad()                         #    weights and bias zero grad in no_grad() context
```

In [51]:
# tensors and Autograd
print_linear_func()

$f(x) = 0.86x_1 + 0.12x_2 + -0.32x_3 + 0.37x_4 + -0.00x_5 + -0.05x_6 + -0.88x_7 + -0.86x_8 + 2.069737434387207$

In [52]:
# High-level API
print_model_linear_func()

$f(x) = 0.85x_1 + 0.12x_2 + -0.32x_3 + 0.36x_4 + -0.00x_5 + -0.05x_6 + -0.89x_7 + -0.86x_8 + 2.069737434387207$