The goal is to find the set of hyperparameters that maximizes the generalization performance of the model, as estimated by cross-validation on the training set.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the housing table and discard the unnecessary "ocean_proximity" column.
df = pd.read_csv("../dataset/california-housing.csv").drop(
    columns=["ocean_proximity"]
)

# target: the column the model has to predict
target = df["median_house_value"]

# features: every column except the target
data = df.drop(columns="median_house_value")

# Split into train and test parts; the fixed random_state keeps the split
# identical from one run to the next.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42
)


Define the regression pipeline:

In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Two-step pipeline: the scaler runs first, then the k-nearest-neighbours regressor.
# Both steps keep their default settings here.
scaler = StandardScaler()
knn_regressor = KNeighborsRegressor()
model = make_pipeline(scaler, knn_regressor)

Tune the hyperparameters and print the best combination of parameters. `RandomizedSearchCV` is used to evaluate 20 random combinations of the KNN `n_neighbors` value and the `StandardScaler` settings, scored with the negative mean absolute error (MAE), and the best hyperparameter set is reported.

In [16]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Candidate neighbour counts: 10 log-spaced values between 10**0 and 10**3,
# cast to integers.
neighbor_counts = np.logspace(0, 3, num=10).astype(np.int32)

# Keys follow the "<pipeline step>__<parameter>" convention so the search can
# reach into the individual steps of `model`.
param_distributions = {
    "kneighborsregressor__n_neighbors": neighbor_counts,
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
}

# Search configuration: 20 sampled candidates, scored by negated mean absolute
# error, with a fixed seed so the same candidates are drawn on every run.
search_options = {
    "scoring": "neg_mean_absolute_error",
    "n_iter": 20,
    "n_jobs": 2,
    "verbose": 1,
    "random_state": 1,
}

model_random_search = RandomizedSearchCV(
    model, param_distributions=param_distributions, **search_options
)
model_random_search.fit(data_train, target_train)

# Display the winning parameter combination.
model_random_search.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


{'standardscaler__with_std': True,
 'standardscaler__with_mean': False,
 'kneighborsregressor__n_neighbors': np.int32(10)}

In [17]:
import pandas as pd

# Tabulate the cross-validation results of the search for inspection.
raw_search_results = model_random_search.cv_results_
cv_results = pd.DataFrame(raw_search_results)

In [18]:
# The search was scored with "neg_mean_absolute_error", i.e. the error is
# reported negated, so every score is <= 0. Taking the absolute value recovers
# the (positive) MAE and, unlike an in-place `*= -1`, gives the same result if
# this cell is executed more than once.
cv_results["mean_test_score"] = cv_results["mean_test_score"].abs()

In [19]:
# Original cv_results column -> shorter, readable label.
column_name_mapping = {
    "param_kneighborsregressor__n_neighbors": "n_neighbors",
    "param_standardscaler__with_mean": "centering",
    "param_standardscaler__with_std": "scaling",
    "mean_test_score": "mean test score",
}

# Rename, keep only the relabelled columns (in mapping order), and sort by
# ascending mean test score.
cv_results = (
    cv_results.rename(columns=column_name_mapping)
    .loc[:, list(column_name_mapping.values())]
    .sort_values("mean test score")
)

In [20]:
# Cast the two scaler flags and the neighbour count to 64-bit integers in a
# single astype call (the flags become 0/1).
column_scaler = ["centering", "scaling"]
integer_columns = [*column_scaler, "n_neighbors"]
cv_results = cv_results.astype(dict.fromkeys(integer_columns, np.int64))
cv_results

Unnamed: 0,n_neighbors,centering,scaling,mean test score
17,10,0,1,42876.58595
18,4,0,1,43410.858156
6,46,0,1,45969.387982
16,2,1,1,46128.974451
9,100,0,1,48863.247775
0,1,0,1,51124.989406
15,215,1,1,52473.595052
12,215,0,1,52473.595052
10,464,1,1,57468.092962
13,1000,1,1,63644.047441


In [22]:
import plotly.express as px

# Parallel-coordinates view of the search results: one axis per tuned setting
# plus the score, with the colour also driven by the score.
score_column = "mean test score"
plot_dimensions = ["n_neighbors", "centering", "scaling", score_column]

fig = px.parallel_coordinates(
    cv_results,
    dimensions=plot_dimensions,
    color=score_column,
)

fig.show()