In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from pathlib import Path
import importlib.util

module_path = Path.cwd().resolve() / ".." /"src" / "nnknn" / "improved_nnknn.py"
spec = importlib.util.spec_from_file_location("improved_nnknn", str(module_path))
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
NNkNNRegression = mod.NNkNNRegression



In [2]:
def load_california_dataset():
    data = fetch_california_housing()
    X = data.data.astype(np.float32)
    y = data.target.reshape(-1, 1).astype(np.float32)
    return X, y

# Common preprocessing function
def preprocess_dataset(X, y, test_size=0.2):
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X = scaler_X.fit_transform(X)
    y = scaler_y.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    return X_train, X_test, y_train, y_test, scaler_X, scaler_y


In [3]:
X,y = load_california_dataset()
X_train, X_test, y_train, y_test, scaler_X, scaler_y = preprocess_dataset(X, y)

# Convert to torch
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32)
X_test_t  = torch.tensor(X_test, dtype=torch.float32)
y_test_t  = torch.tensor(y_test, dtype=torch.float32)

# Stored cases = training samples
stored_cases   = X_train_t.clone()
stored_targets = y_train_t.clone()

In [4]:
##############################################
# Train NN-kNN model
##############################################
model = NNkNNRegression(
        stored_cases=stored_cases,
        stored_targets=stored_targets,
        input_dim=stored_cases.shape[1],
        embed_dim=64,
        k=16,
        neighbor_dropout=0.1,
        use_local_linear=True
    )

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

batch_size = 128
epochs = 10

print("Training NN-kNN Regression Model...\n")
model.train()
for epoch in range(epochs):

    perm = torch.randperm(X_train_t.size(0))
    losses = []

    for i in range(0, X_train_t.size(0), batch_size):

        idx = perm[i:i+batch_size]
        xb = X_train_t[idx] 
        yb = y_train_t[idx]

        pred, info = model(xb)

        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        
    print("epoch", epoch, "loss", np.mean(losses))


Training NN-kNN Regression Model...

epoch 0 loss 0.5498562095932258
epoch 1 loss 0.24249962330326552
epoch 2 loss 0.22671665297460186
epoch 3 loss 0.2024166994663172
epoch 4 loss 0.1915012614787087
epoch 5 loss 0.18177359285511713
epoch 6 loss 0.1753191093719283
epoch 7 loss 0.17228222730778908
epoch 8 loss 0.1690385952707409
epoch 9 loss 0.1642266592198564


In [5]:
# ---------------------------------------------------------
# Calculate NNkNN evaluation metrics
# ---------------------------------------------------------
model.eval()
with torch.no_grad():
    preds, info = model(X_test_t)

# Inverse transform
preds_np = scaler_y.inverse_transform(preds.numpy())
ytrue_np = scaler_y.inverse_transform(y_test)

# rmse = np.sqrt(np.mean((preds_np - ytrue_np) ** 2))
nnknn_rmse = mean_squared_error(ytrue_np, preds_np)
nnknn_mae = mean_absolute_error(ytrue_np, preds_np)
nnknn_r2 = r2_score(ytrue_np, preds_np)

In [6]:

# ---------------------------------------------------------
# Baseline: Linear Regression
# ---------------------------------------------------------

lr = LinearRegression()
lr.fit(X_train, y_train)

preds_lr = scaler_y.inverse_transform(lr.predict(X_test))
ytrue_lr = scaler_y.inverse_transform(y_test)

lr_rmse = np.sqrt(mean_squared_error(ytrue_lr, preds_lr))
lr_mae = mean_absolute_error(ytrue_lr, preds_lr)
lr_r2 = r2_score(ytrue_lr, preds_lr)


# ---------------------------------------------------------
# Baseline: KNN Regressor
# ---------------------------------------------------------

knn = KNeighborsRegressor(n_neighbors=8)
knn.fit(X_train, y_train)

preds_knn = scaler_y.inverse_transform(knn.predict(X_test))
ytrue_knn = scaler_y.inverse_transform(y_test)

knn_rmse = np.sqrt(mean_squared_error(ytrue_knn, preds_knn))
knn_mae = mean_absolute_error(ytrue_knn, preds_knn)
knn_r2 = r2_score(ytrue_knn, preds_knn)


# ---------------------------------------------------------
# 8. Print comparison table
# ---------------------------------------------------------

print("\n==================== CALIFORNIA RESULTS ====================")
print(f"Linear Regression:  RMSE={lr_rmse:.4f}, MAE={lr_mae:.4f}, R²={lr_r2:.4f}")
print(f"KNN (k=8):          RMSE={knn_rmse:.4f}, MAE={knn_mae:.4f}, R²={knn_r2:.4f}")
print(f"NN-kNN Regression:  RMSE={nnknn_rmse:.4f}, MAE={nnknn_mae:.4f}, R²={nnknn_r2:.4f}")
print("=================================================")



Linear Regression:  RMSE=0.7456, MAE=0.5332, R²=0.5758
KNN (k=8):          RMSE=0.6485, MAE=0.4385, R²=0.6791
NN-kNN Regression:  RMSE=0.3209, MAE=0.3844, R²=0.7551
