# Batch Gradient Descent

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

from IPython.display import display, Markdown
import numpy as np

import torch.nn as nn
import torch

## Data

In [4]:
housing = fetch_california_housing()

X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, test_size=0.1)

In [5]:
print(f'{X_train.shape=}')
print(f'{X_test.shape=}')
print(f'{X_val.shape=}')
print()
print(f'{y_train.shape=}')
print(f'{y_test.shape=}')
print(f'{y_val.shape=}')

X_train.shape=(13932, 8)
X_test.shape=(5160, 8)
X_val.shape=(1548, 8)

y_train.shape=(13932,)
y_test.shape=(5160,)
y_val.shape=(1548,)


In [6]:
X_train = torch.FloatTensor(X_train)
X_val = torch.FloatTensor(X_val)
X_test = torch.FloatTensor(X_test)

means = X_train.mean(dim=0, keepdims=True)
stds = X_train.std(dim=0, keepdims=True)

X_train = (X_train - means) / stds
X_val = (X_val - means) / stds
X_test = (X_test - means) / stds

In [7]:
y_train = torch.FloatTensor(y_train).reshape(-1, 1)
y_val = torch.FloatTensor(y_val).reshape(-1, 1)
y_test = torch.FloatTensor(y_test).reshape(-1, 1)

In [10]:
n_features = X_train.shape[1]  # 8

## Model

In [11]:
torch.manual_seed(42)

model = nn.Sequential(
    nn.Linear(n_features, 50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
)

i.e., it's a sequence of linear layers, with ReLU at the end of every hidden layer:

`(input) 8 -> 50 -> 40 -> 1 (output)`

In [15]:
learning_rate = 0.4
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
n_epochs = 100

In [16]:
def train_bgd(model, optimizer, criterion, X_train, y_train, n_epochs):
    for epoch in range(n_epochs):
        y_pred = model(X_train)

        loss = criterion(y_pred, y_train)  # calculate loss function
        loss.backward()

        optimizer.step()  # update parameters with grad
        optimizer.zero_grad()  # reset grad
        
        print(f"Epoch {epoch + 1:3}/{n_epochs}, Loss: {loss.item()}")

In [17]:
train_bgd(model, optimizer, criterion, X_train, y_train, n_epochs)

Epoch   1/100, Loss: 4.989955902099609
Epoch   2/100, Loss: 12.778043746948242
Epoch   3/100, Loss: 5.615721702575684
Epoch   4/100, Loss: 5.615721702575684
Epoch   5/100, Loss: 5.615721702575684
Epoch   6/100, Loss: 5.615721702575684
Epoch   7/100, Loss: 5.615721702575684
Epoch   8/100, Loss: 5.615721702575684
Epoch   9/100, Loss: 5.615721702575684
Epoch  10/100, Loss: 5.615721702575684
Epoch  11/100, Loss: 5.615721702575684
Epoch  12/100, Loss: 5.615721702575684
Epoch  13/100, Loss: 5.615721702575684
Epoch  14/100, Loss: 5.615721702575684
Epoch  15/100, Loss: 5.615721702575684
Epoch  16/100, Loss: 5.615721702575684
Epoch  17/100, Loss: 5.615721702575684
Epoch  18/100, Loss: 5.615721702575684
Epoch  19/100, Loss: 5.615721702575684
Epoch  20/100, Loss: 5.615721702575684
Epoch  21/100, Loss: 5.615721702575684
Epoch  22/100, Loss: 5.615721702575684
Epoch  23/100, Loss: 5.615721702575684
Epoch  24/100, Loss: 5.615721702575684
Epoch  25/100, Loss: 5.615721702575684
Epoch  26/100, Loss: 5.6

### Prediction

In [18]:
X_new = X_test[:3]
X_new

tensor([[-1.1522, -0.2859, -0.5005, -0.1674, -0.0279,  0.0772,  0.1947,  0.2852],
        [-0.7065,  0.1102, -0.1620,  0.2087,  0.1230, -0.0343, -0.2362,  0.0602],
        [-0.2091,  1.8527, -0.5864,  0.1920, -0.0995, -0.1572,  1.0096, -1.4302]])

In [19]:
with torch.no_grad():
    y_pred = model(X_new)

In [20]:
y_pred

tensor([[0.],
        [0.],
        [0.]])

### Evaluating

In [21]:
with torch.no_grad():
    y_pred = model(X_test)

In [22]:
print('RMSE:', np.sqrt(mean_squared_error(y_test.numpy(), y_pred.numpy())).item())
print('  r²:', r2_score(y_test.numpy(), y_pred.numpy()))

RMSE: 2.3621926566939595
  r²: -3.2169570922851562


# Mini-batch GD with DataLoader

In [23]:
from torch.utils.data import TensorDataset, DataLoader

In [24]:
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [29]:
torch.manual_seed(42)

model = nn.Sequential(
    nn.Linear(n_features, 50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
)

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model.to(device)

Sequential(
  (0): Linear(in_features=8, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=40, bias=True)
  (3): ReLU()
  (4): Linear(in_features=40, out_features=1, bias=True)
)

In [30]:
learning_rate = 0.2
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
n_epochs = 100

In [32]:
def train(model, optimizer, criterion, train_loader, n_epochs):
    model.train()  # Switches the model to training mode. The .eval() method switches to evaluation mode

    for epoch in range(n_epochs):
        total_loss = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            y_pred = model(X_batch)

            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward

            optimizer.step()
            optimizer.zero_grad()

        mean_loss = total_loss / len(train_loader)

        print(f"Epoch {epoch + 1:3}/{n_epochs}, Loss: {mean_loss:.4f}")

In [33]:
train(model, optimizer, criterion, train_loader, n_epochs)

Epoch   1/100, Loss: 4.9913
Epoch   2/100, Loss: 4.9920
Epoch   3/100, Loss: 4.9891
Epoch   4/100, Loss: 4.9888
Epoch   5/100, Loss: 4.9881
Epoch   6/100, Loss: 4.9881
Epoch   7/100, Loss: 4.9912
Epoch   8/100, Loss: 4.9916
Epoch   9/100, Loss: 4.9925
Epoch  10/100, Loss: 4.9888
Epoch  11/100, Loss: 4.9867
Epoch  12/100, Loss: 4.9905
Epoch  13/100, Loss: 4.9913
Epoch  14/100, Loss: 4.9889
Epoch  15/100, Loss: 4.9876
Epoch  16/100, Loss: 4.9900
Epoch  17/100, Loss: 4.9890
Epoch  18/100, Loss: 4.9910
Epoch  19/100, Loss: 4.9933
Epoch  20/100, Loss: 4.9882
Epoch  21/100, Loss: 4.9880
Epoch  22/100, Loss: 5.0010
Epoch  23/100, Loss: 4.9913
Epoch  24/100, Loss: 4.9896
Epoch  25/100, Loss: 4.9923
Epoch  26/100, Loss: 4.9905
Epoch  27/100, Loss: 4.9932
Epoch  28/100, Loss: 4.9930
Epoch  29/100, Loss: 4.9932
Epoch  30/100, Loss: 4.9888
Epoch  31/100, Loss: 4.9889
Epoch  32/100, Loss: 4.9900
Epoch  33/100, Loss: 4.9878
Epoch  34/100, Loss: 4.9877
Epoch  35/100, Loss: 4.9982
Epoch  36/100, Loss:

### Model Evaluating (done right)

In [34]:
def evaluate(model, data_loader, metric_fn, aggregate_fn=torch.mean):
    model.eval()
    metrics = []

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            y_pred = model(X_batch)
            metric = metric_fn(y_pred, y_batch)
            metrics.append(metric)

    return aggregate_fn(torch.stack(metrics))

In [35]:
valid_dataset = TensorDataset(X_val, y_val)
valid_loader = DataLoader(valid_dataset, batch_size=32)
valid_mse = evaluate(model, valid_loader, criterion)

In [36]:
valid_mse

tensor(5.0676)

In [46]:
valid_rmse = evaluate(
    model,
    valid_loader,
    criterion,  # mse
    aggregate_fn=lambda metrics: torch.sqrt(torch.mean(metrics))
)

valid_rmse

tensor(2.2511)

In [48]:
import torchmetrics  # continue...

ModuleNotFoundError: No module named 'torchmetrics'