In [1]:
import torch

In [2]:
X = torch.tensor([[1.0, 3.0, 5.5], [6.0, 3.2, 7.0]])

In [3]:
X

tensor([[1.0000, 3.0000, 5.5000],
        [6.0000, 3.2000, 7.0000]])

In [4]:
X.shape

torch.Size([2, 3])

In [5]:
X.dtype

torch.float32

In [6]:
X[0]

tensor([1.0000, 3.0000, 5.5000])

In [7]:
X[0, 1]

tensor(3.)

In [8]:
10 * (X + 1.0)

tensor([[20., 40., 65.],
        [70., 42., 80.]])

In [9]:
X.exp()

tensor([[   2.7183,   20.0855,  244.6919],
        [ 403.4288,   24.5325, 1096.6332]])

In [10]:
X.mean()

tensor(4.2833)

In [11]:
X.max()

tensor(7.)

In [12]:
X.max(dim = 0) # Maximum value per column

torch.return_types.max(
values=tensor([6.0000, 3.2000, 7.0000]),
indices=tensor([1, 1, 1]))

In [13]:
X.max(dim = 1) # Maximum value per rows

torch.return_types.max(
values=tensor([5.5000, 7.0000]),
indices=tensor([2, 2]))

In [14]:
X.T

tensor([[1.0000, 6.0000],
        [3.0000, 3.2000],
        [5.5000, 7.0000]])

In [15]:
X @ X.T  # Matrix Multiplication

tensor([[40.2500, 54.1000],
        [54.1000, 95.2400]])

In [16]:
import numpy as np

In [17]:
X.numpy()

array([[1. , 3. , 5.5],
       [6. , 3.2, 7. ]], dtype=float32)

In [18]:
torch.tensor(np.array([[1., 3., 4.], [2., 6., 7.]])) # deafult numpy float 64 

tensor([[1., 3., 4.],
        [2., 6., 7.]], dtype=torch.float64)

In [19]:
x1 = torch.FloatTensor(np.array([[1., 4., 7.], [2., 3., 6]])) # FloatTensor automatically converts to 32bit

In [20]:
x1.dtype

torch.float32

In [21]:
X[:, 1]

tensor([3.0000, 3.2000])

In [22]:
X[1, :] = 99

In [23]:
X

tensor([[ 1.0000,  3.0000,  5.5000],
        [99.0000, 99.0000, 99.0000]])

In [24]:
X.relu_()

tensor([[ 1.0000,  3.0000,  5.5000],
        [99.0000, 99.0000, 99.0000]])

In [25]:
X[:, 1] = -0.1

In [26]:
X

tensor([[ 1.0000, -0.1000,  5.5000],
        [99.0000, -0.1000, 99.0000]])

In [27]:
X.relu_()

tensor([[ 1.0000,  0.0000,  5.5000],
        [99.0000,  0.0000, 99.0000]])

In [28]:
X.sigmoid_()

tensor([[0.7311, 0.5000, 0.9959],
        [1.0000, 0.5000, 1.0000]])

In [29]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [30]:
device

'cuda'

In [31]:
M = torch.tensor([[1., 2., 3.], [4., 5., 6.]])

In [32]:
M = M.to(device)

In [33]:
M.device

device(type='cuda', index=0)

In [34]:
N = torch.tensor([[1., 3., 5.], [2., 4., 6.]])

In [35]:
N.device

device(type='cpu')

In [36]:
N = torch.tensor([[1., 3., 5.], [2., 4., 6.]], device = device)

In [37]:
N.device

device(type='cuda', index=0)

In [38]:
R = M @ N.T

In [39]:
R

tensor([[22., 28.],
        [49., 64.]], device='cuda:0')

In [40]:
M = torch.rand((1000, 1000))

In [41]:
%timeit M @ M.T

4.84 ms ± 149 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [42]:
M = torch.rand((1000, 1000), device="cuda")

In [43]:
%timeit M @ M.T

825 μs ± 9.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [44]:
x = torch.tensor(5.0, requires_grad = True)
# remember all computations on x which helps in autograd while backpropagation using requires_grad = True

In [45]:
f = x ** 2
f

tensor(25., grad_fn=<PowBackward0>)

In [46]:
f.backward()
# backpropagate the gradient through computational graph to starting point X

In [47]:
x.grad

tensor(10.)

In [48]:
learning_rate = 0.1
with torch.no_grad():
    x -= learning_rate * x.grad

# to disable the computational graph to track this step to avoid gradient descent step

In [49]:
x

tensor(4., requires_grad=True)

In [50]:
x_detached = x.detach()

In [51]:
x_detached -= learning_rate * x.grad
# same as no_grad

In [52]:
x_detached

tensor(3.)

In [53]:
learning_rate = 0.1
x = torch.tensor(5.0, requires_grad = True)
for iteration in range(100):
    f = x ** 2
    f.backward()

    with torch.no_grad():
        x -= learning_rate * x.grad

    x.grad.zero_()

    print(f'Iteration {iteration+1}: x = {x.item()}')

Iteration 1: x = 4.0
Iteration 2: x = 3.200000047683716
Iteration 3: x = 2.559999942779541
Iteration 4: x = 2.047999858856201
Iteration 5: x = 1.6383998394012451
Iteration 6: x = 1.3107198476791382
Iteration 7: x = 1.0485758781433105
Iteration 8: x = 0.8388606905937195
Iteration 9: x = 0.6710885763168335
Iteration 10: x = 0.5368708372116089
Iteration 11: x = 0.4294966757297516
Iteration 12: x = 0.3435973525047302
Iteration 13: x = 0.2748778760433197
Iteration 14: x = 0.21990230679512024
Iteration 15: x = 0.17592184245586395
Iteration 16: x = 0.14073747396469116
Iteration 17: x = 0.11258997768163681
Iteration 18: x = 0.09007198363542557
Iteration 19: x = 0.07205758988857269
Iteration 20: x = 0.057646073400974274
Iteration 21: x = 0.04611685872077942
Iteration 22: x = 0.036893486976623535
Iteration 23: x = 0.029514789581298828
Iteration 24: x = 0.023611832410097122
Iteration 25: x = 0.018889466300606728
Iteration 26: x = 0.015111573040485382
Iteration 27: x = 0.012089258059859276
Iterati

In [54]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

X, y = fetch_california_housing(return_X_y=True)  

X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4, 
    random_state=42
)

In [55]:
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42
)

print(X_train.shape, X_valid.shape, X_test.shape)
print(y_train.shape, y_valid.shape, y_test.shape)

(12384, 8) (4128, 8) (4128, 8)
(12384,) (4128,) (4128,)


In [56]:
X_train = torch.FloatTensor(X_train)
X_valid = torch.FloatTensor(X_valid)
X_test = torch.FloatTensor(X_test)
means = X_train.mean(dim=0, keepdims=True)
stds = X_train.std(dim=0, keepdims=True)
X_train = (X_train - means) / stds
X_valid = (X_valid - means) / stds
X_test = (X_test - means) / stds

In [57]:
y_train = torch.FloatTensor(y_train).reshape(-1, 1)
y_valid = torch.FloatTensor(y_valid).reshape(-1, 1)
y_test = torch.FloatTensor(y_test).reshape(-1, 1)

In [58]:
torch.manual_seed(42)
n_features = X_train.shape[1]
w = torch.randn((n_features, 1), requires_grad=True)
b = torch.tensor(0., requires_grad=True)

In [59]:
learning_rate = 0.4
n_epochs = 20
for epoch in range(n_epochs):
    y_pred = X_train @ w + b
    loss = ((y_pred - y_train) ** 2).mean()
    loss.backward()
    with torch.no_grad():
        b -= learning_rate * b.grad
        w -= learning_rate * w.grad
        b.grad.zero_()
        w.grad.zero_()
        print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {loss.item()}")

Epoch 1/20, Loss: 16.031423568725586
Epoch 2/20, Loss: 4.745883464813232
Epoch 3/20, Loss: 2.166027784347534
Epoch 4/20, Loss: 1.2763842344284058
Epoch 5/20, Loss: 0.9349923729896545
Epoch 6/20, Loss: 0.7927424311637878
Epoch 7/20, Loss: 0.7260330319404602
Epoch 8/20, Loss: 0.6892495155334473
Epoch 9/20, Loss: 0.665134847164154
Epoch 10/20, Loss: 0.6469970345497131
Epoch 11/20, Loss: 0.632148027420044
Epoch 12/20, Loss: 0.6194404363632202
Epoch 13/20, Loss: 0.6083313822746277
Epoch 14/20, Loss: 0.5985223054885864
Epoch 15/20, Loss: 0.5898195505142212
Epoch 16/20, Loss: 0.5820793509483337
Epoch 17/20, Loss: 0.5751853585243225
Epoch 18/20, Loss: 0.5690386295318604
Epoch 19/20, Loss: 0.5635537505149841
Epoch 20/20, Loss: 0.5586556196212769


In [60]:
X_new = X_test[:3]

with torch.no_grad():
    y_pred = X_new @ w + b

In [61]:
y_pred

tensor([[1.8848],
        [0.8700],
        [2.4256]])

In [62]:
import torch.nn as nn

torch.manual_seed(42)
model = nn.Linear(in_features = n_features, out_features = 1)

# The torch automatically create the Weight and Bias randomly based on input features and out feature count

In [63]:
model.bias

Parameter containing:
tensor([0.3117], requires_grad=True)

In [64]:
model.weight

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)

In [65]:
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
Parameter containing:
tensor([0.3117], requires_grad=True)


In [66]:
model(X_train[:3])

tensor([[ 0.9189],
        [ 0.5916],
        [-0.4145]], grad_fn=<AddmmBackward0>)

In [67]:
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate) # using the models parameters with Stochastic Gradient
mse = nn.MSELoss()

In [68]:
def train_bgd(model, optimizer, criterion, X_train, y_train, n_epochs):
    for epoch in range(n_epochs):
        y_pred = model(X_train)
        loss = criterion(y_pred, y_train)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Epoch {epoch + 1} / {n_epochs}, Loss: {loss.item()}")

In [69]:
train_bgd(model, optimizer, mse, X_train, y_train, n_epochs)

Epoch 1 / 20, Loss: 4.2471442222595215
Epoch 2 / 20, Loss: 0.7678685784339905
Epoch 3 / 20, Loss: 0.6155121922492981
Epoch 4 / 20, Loss: 0.5964327454566956
Epoch 5 / 20, Loss: 0.586022138595581
Epoch 6 / 20, Loss: 0.5776858925819397
Epoch 7 / 20, Loss: 0.5705541372299194
Epoch 8 / 20, Loss: 0.5643265247344971
Epoch 9 / 20, Loss: 0.5588439702987671
Epoch 10 / 20, Loss: 0.5540000200271606
Epoch 11 / 20, Loss: 0.5497127175331116
Epoch 12 / 20, Loss: 0.5459139943122864
Epoch 13 / 20, Loss: 0.542545735836029
Epoch 14 / 20, Loss: 0.5395573377609253
Epoch 15 / 20, Loss: 0.5369042158126831
Epoch 16 / 20, Loss: 0.5345473289489746
Epoch 17 / 20, Loss: 0.5324525237083435
Epoch 18 / 20, Loss: 0.5305894017219543
Epoch 19 / 20, Loss: 0.5289314389228821
Epoch 20 / 20, Loss: 0.5274549126625061


In [70]:
X_new = X_test[:3]

with torch.no_grad():
    y_pred = model(X_new)

In [71]:
y_pred

tensor([[1.8978],
        [0.8016],
        [2.4146]])

In [72]:
torch.manual_seed(42)

model = nn.Sequential(
    nn.Linear(n_features, 50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
)

# Neural Network with two hidden layers and one output layer
# FIrst Layer Feature_size X 50 neurons, second layer with 50 input X 40 neurons and final output layer with 40 inputs and one output neuron
# We used Rectified Linear Unnit here

In [73]:
model.parameters

<bound method Module.parameters of Sequential(
  (0): Linear(in_features=8, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=40, bias=True)
  (3): ReLU()
  (4): Linear(in_features=40, out_features=1, bias=True)
)>

In [74]:
learning_rate = 0.5

optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
mse = nn.MSELoss()

In [75]:
train_bgd(model, optimizer, mse, X_train, y_train, n_epochs)

Epoch 1 / 20, Loss: 4.961425304412842
Epoch 2 / 20, Loss: 31.237781524658203
Epoch 3 / 20, Loss: 30.767780303955078
Epoch 4 / 20, Loss: 2.052952766418457
Epoch 5 / 20, Loss: 3.2457525730133057
Epoch 6 / 20, Loss: 1.7149120569229126
Epoch 7 / 20, Loss: 1.3133938312530518
Epoch 8 / 20, Loss: 1.304430603981018
Epoch 9 / 20, Loss: 1.294636607170105
Epoch 10 / 20, Loss: 1.2797232866287231
Epoch 11 / 20, Loss: 1.256789207458496
Epoch 12 / 20, Loss: 1.2194485664367676
Epoch 13 / 20, Loss: 1.1553502082824707
Epoch 14 / 20, Loss: 1.047003984451294
Epoch 15 / 20, Loss: 0.9009128212928772
Epoch 16 / 20, Loss: 0.7937737107276917
Epoch 17 / 20, Loss: 0.7520601153373718
Epoch 18 / 20, Loss: 0.7288212180137634
Epoch 19 / 20, Loss: 0.7163874506950378
Epoch 20 / 20, Loss: 0.741695761680603


In [76]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)

In [77]:
torch.manual_seed(42)

model = nn.Sequential(
    nn.Linear(n_features, 50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
)

model = model.to(device)

In [78]:
def train(model, optimizer, criterion, train_loader, n_epochs, device):

    

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
           

        mean_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {mean_loss:.4f}")

        
            
    

In [79]:
train(model, optimizer, mse, train_loader, n_epochs, device)

# Due to Different scale of input features the model cannot escape the gradient and stuck at same loss

Epoch 1/20, Loss: 4.9614
Epoch 2/20, Loss: 4.9614
Epoch 3/20, Loss: 4.9614
Epoch 4/20, Loss: 4.9614
Epoch 5/20, Loss: 4.9614
Epoch 6/20, Loss: 4.9614
Epoch 7/20, Loss: 4.9614
Epoch 8/20, Loss: 4.9614
Epoch 9/20, Loss: 4.9614
Epoch 10/20, Loss: 4.9614
Epoch 11/20, Loss: 4.9614
Epoch 12/20, Loss: 4.9614
Epoch 13/20, Loss: 4.9614
Epoch 14/20, Loss: 4.9614
Epoch 15/20, Loss: 4.9614
Epoch 16/20, Loss: 4.9614
Epoch 17/20, Loss: 4.9614
Epoch 18/20, Loss: 4.9614
Epoch 19/20, Loss: 4.9614
Epoch 20/20, Loss: 4.9614


In [83]:
from sklearn.preprocessing import StandardScaler
import torch
import numpy as np

feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train.cpu().numpy())
X_valid_scaled = feature_scaler.transform(X_valid.cpu().numpy())
X_test_scaled = feature_scaler.transform(X_test.cpu().numpy())

def safe_reshape(y):
    if hasattr(y, 'numpy'):
        return y.cpu().numpy().reshape(-1, 1)
    elif hasattr(y, 'values'):
        return y.values.reshape(-1, 1)    
    else:
        return y.reshape(-1, 1)

target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(safe_reshape(y_train)).flatten()
y_valid_scaled = target_scaler.transform(safe_reshape(y_valid)).flatten()
y_test_scaled = target_scaler.transform(safe_reshape(y_test)).flatten()

In [86]:
X_train = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train = torch.tensor(y_train_scaled, dtype=torch.float32)
X_valid = torch.tensor(X_valid_scaled, dtype=torch.float32)
y_valid = torch.tensor(y_valid_scaled, dtype=torch.float32)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test = torch.tensor(y_test_scaled, dtype=torch.float32)

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

valid_dataset = TensorDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

print("NORMALIZATION:")
print(f"X_train - mean: {X_train.mean():.4f}, std: {X_train.std():.4f}")
print(f"y_train - mean: {y_train.mean():.4f}, std: {y_train.std():.4f}")
print(f"y_train range: {y_train.min():.4f} → {y_train.max():.4f}")


NORMALIZATION:
X_train - mean: -0.0000, std: 1.0000
y_train - mean: 0.0000, std: 1.0000
y_train range: -1.6612 → 2.5495


In [90]:
def train_val(model, optimizer, criterion, train_loader, valid_loader, n_epochs, device):
    for epoch in range(n_epochs):

        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred.squeeze(), y_batch)
            train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        
        print(f"Epoch {epoch+1}/{n_epochs}")
        print(f"  Train Loss: {train_loss/len(train_loader):.4f}")


In [91]:
torch.manual_seed(42)
n_features = X_train.shape[1]

model = nn.Sequential(
    nn.Linear(n_features, 50),
    nn.ReLU(),
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1)
).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.MSELoss()


In [92]:
train_val(model, optimizer, criterion, train_loader, valid_loader, n_epochs=20, device=device)

Epoch 1/20
  Train Loss: 0.4934
Epoch 2/20
  Train Loss: 0.2946
Epoch 3/20
  Train Loss: 0.2789
Epoch 4/20
  Train Loss: 0.2713
Epoch 5/20
  Train Loss: 0.2671
Epoch 6/20
  Train Loss: 0.2609
Epoch 7/20
  Train Loss: 0.2527
Epoch 8/20
  Train Loss: 0.2471
Epoch 9/20
  Train Loss: 0.2438
Epoch 10/20
  Train Loss: 0.2402
Epoch 11/20
  Train Loss: 0.2349
Epoch 12/20
  Train Loss: 0.2327
Epoch 13/20
  Train Loss: 0.2301
Epoch 14/20
  Train Loss: 0.2294
Epoch 15/20
  Train Loss: 0.2280
Epoch 16/20
  Train Loss: 0.2201
Epoch 17/20
  Train Loss: 0.2200
Epoch 18/20
  Train Loss: 0.2208
Epoch 19/20
  Train Loss: 0.2208
Epoch 20/20
  Train Loss: 0.2182


In [93]:
def evaluate(model, data_loader, metric_fn, aggregate_fn = torch.mean):
    model.eval()

    metrics = []
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            y_pred = model(X_batch)
            metric = metric_fn(y_pred, y_batch)
            metrics.append(metric)
        return aggregate_fn(torch.stack(metrics))

In [94]:
valid_mse = evaluate(model, valid_loader, mse)

  return F.mse_loss(input, target, reduction=self.reduction)


In [95]:
valid_mse

tensor(1.8546, device='cuda:0')

In [96]:
def rmse(y_pred, y_true):
    return ((y_pred - y_true) ** 2).mean().sqrt()

evaluate(model, valid_loader, rmse)
    

tensor(1.3488, device='cuda:0')

In [97]:
valid_mse.sqrt()

# not same as direct RMSE calculation because of batch calculation of RMSE values over each batch

tensor(1.3619, device='cuda:0')

In [102]:
evaluate(model, valid_loader, mse, aggregate_fn = lambda metrics: torch.sqrt(torch.mean(metrics)))

# Now it gives the correct RMSE value by keeping rmse value for each batch using aggregate_fn

tensor(1.3619, device='cuda:0')

In [104]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
   ---------------------------------------- 0.0/983.2 kB ? eta -:--:--
   ---------------------------------------- 0.0/983.2 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/983.2 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/983.2 kB ? eta -:--:--
   ------------------------------- -------- 786.4/983.2 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 983.2/983.2 kB 1.3 MB/s  0:00:00
Downloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities, torchmetrics

   -------------------- ------------------- 1/2 [torchmetrics]
   -------------------- ------------------- 1/2 [torchmetrics]
   --------------

In [108]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    model.eval()

    metric.reset()
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            y_pred = model(X_batch)
            metric.update(y_pred.squeeze(), y_batch)

        return metric.compute()

In [109]:
rmse = torchmetrics.MeanSquaredError(squared = False).to(device)

In [110]:
evaluate_tm(model, valid_loader, rmse)

tensor(0.4926, device='cuda:0')

In [111]:
def train(model, optimizer, criterion, train_loader, valid_loader, n_epochs, device):
   
    for epoch in range(n_epochs):
        
        model.train()
        train_loss = 0.0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred.squeeze(), y_batch)
            
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_loader)
        
      
        model.eval() 
        valid_loss = 0.0
        
        with torch.no_grad():
            for X_batch, y_batch in valid_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                
                y_pred = model(X_batch)
                loss = criterion(y_pred.squeeze(), y_batch)
                valid_loss += loss.item()
        
        avg_valid_loss = valid_loss / len(valid_loader)
        
    
        print(f"Epoch {epoch + 1}/{n_epochs}")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Valid Loss: {avg_valid_loss:.4f}")
        
        if avg_train_loss + 0.1 < avg_valid_loss:
            print(f" WARNING: Possible OVERFITTING (Train: {avg_train_loss:.4f} << Valid: {avg_valid_loss:.4f})")
        
        print("-" * 50)

train(model, optimizer, mse, train_loader, valid_loader, n_epochs=20, device=device)


Epoch 1/20
  Train Loss: 0.2158
  Valid Loss: 0.2311
--------------------------------------------------
Epoch 2/20
  Train Loss: 0.2143
  Valid Loss: 0.2329
--------------------------------------------------
Epoch 3/20
  Train Loss: 0.2122
  Valid Loss: 0.2306
--------------------------------------------------
Epoch 4/20
  Train Loss: 0.2119
  Valid Loss: 0.2348
--------------------------------------------------
Epoch 5/20
  Train Loss: 0.2131
  Valid Loss: 0.2253
--------------------------------------------------
Epoch 6/20
  Train Loss: 0.2070
  Valid Loss: 0.2284
--------------------------------------------------
Epoch 7/20
  Train Loss: 0.2072
  Valid Loss: 0.2437
--------------------------------------------------
Epoch 8/20
  Train Loss: 0.2070
  Valid Loss: 0.2408
--------------------------------------------------
Epoch 9/20
  Train Loss: 0.2047
  Valid Loss: 0.2293
--------------------------------------------------
Epoch 10/20
  Train Loss: 0.2069
  Valid Loss: 0.2325
----------