# Pipelines to compare regression models
Dataset: `kaggle: piyushagni5/white-wine-quality`

In [22]:
import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import optuna
from scipy.stats import wilcoxon

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
# Fetch the white-wine-quality dataset from Kaggle and keep the returned value in `path`.
# NOTE(review): `path` is not referenced again in the visible cells (the next cell loads
# the CSV by dataset handle) — confirm whether this download step is still needed.
path  = kagglehub.dataset_download("piyushagni5/white-wine-quality")

In [3]:
# The CSV is semicolon-delimited, so the separator is forwarded to pandas explicitly.
load_options = dict(
    adapter=kagglehub.KaggleDatasetAdapter.PANDAS,
    handle="piyushagni5/white-wine-quality",
    path="winequality-white.csv",
    pandas_kwargs={"sep": ";"},
)
df = kagglehub.datasets.dataset_load(**load_options)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
# Target is the `quality` column; every other column is a feature.
y = df["quality"]
X = df.drop(columns="quality")

# 80/20 hold-out split, stratified on the target values, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [13]:
def build_pipeline(regressor):
    """Wrap ``regressor`` in the preprocessing steps shared by both candidate models.

    Steps, in order: ``scaler`` (scale to unit variance without centering),
    ``featureSelector`` (univariate selection scored with ``f_regression``) and
    ``regressor``. The step names are the prefixes used by the ``set_params``
    calls in the tuning cells (``featureSelector__k``, ``regressor__...``).

    Parameters
    ----------
    regressor : estimator
        Unfitted scikit-learn regressor placed as the final step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline.
    """
    return Pipeline([
        ("scaler", StandardScaler(with_mean=False, with_std=True)),
        ("featureSelector", SelectKBest(score_func=f_regression)),
        ("regressor", regressor),
    ])


# The forest is seeded so its bootstrap samples and split candidates are
# repeatable from one run of the notebook to the next.
models = {
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "kNNRegressor": KNeighborsRegressor(),
}

pipeRFR = build_pipeline(RandomForestRegressor(random_state=42))
pipeKNN = build_pipeline(KNeighborsRegressor())

### Optimizing hyperparameters

In [6]:
# Hide Optuna's per-trial INFO messages but keep warnings and errors visible;
# removing the default handler altogether would silence those as well.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [15]:
def objectiveRFR(trial):
    """Optuna objective for the random-forest pipeline.

    Draws the number of selected features and the forest hyperparameters from
    ``trial``, applies them to ``pipeRFR`` and scores the pipeline with
    shuffled 5-fold cross-validation on the training split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameter values.

    Returns
    -------
    float
        Mean cross-validated mean absolute error (lower is better).
    """
    # Upper bound follows the data instead of a hard-coded 11, as in objectiveKNN.
    n_features_total = X_train.shape[1]

    k = trial.suggest_int("n_features", 5, n_features_total)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    pipeRFR.set_params(
        featureSelector__k=k,
        regressor__n_estimators=n_estimators,
        regressor__max_depth=max_depth,
        regressor__min_samples_split=min_samples_split,
        regressor__min_samples_leaf=min_samples_leaf,
        # Fixed seed: without it the forest's own randomness adds noise to the
        # score, so trials are neither comparable nor repeatable.
        regressor__random_state=42,
    )

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_val_score(
        pipeRFR,
        X_train,
        y_train,
        cv=cv,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,  # run the folds in parallel, as objectiveKNN does
    )
    # The scorer returns negated MAE; flip the sign so the study minimises MAE.
    return -scores.mean()

def objectiveKNN(trial):
    """Optuna objective for the k-nearest-neighbours pipeline.

    Samples the number of selected features together with the kNN settings
    (neighbour count, weighting scheme, Minkowski power), applies them to
    ``pipeKNN`` and returns the mean absolute error averaged over a shuffled
    5-fold cross-validation of the training split.
    """
    sampled = {
        # At least 5 features, at most every column of the training matrix.
        "featureSelector__k": trial.suggest_int("n_features", 5, X_train.shape[1]),
        "regressor__n_neighbors": trial.suggest_int("n_neighbors", 3, 50),
        "regressor__weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "regressor__p": trial.suggest_int("p", 1, 2),
    }
    pipeKNN.set_params(**sampled)

    folds = KFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = cross_val_score(
        pipeKNN,
        X_train,
        y_train,
        cv=folds,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
    )
    # Scores are negated MAE; negate again so that smaller means better.
    return -fold_scores.mean()

In [17]:
# Seed the TPE sampler so the sequence of suggested hyperparameters can be
# reproduced after a kernel restart.
studyRFR = optuna.create_study(
    study_name="RFR",
    sampler=optuna.samplers.TPESampler(seed=42),
    direction="minimize",  # the objective returns MAE
)

studyRFR.optimize(objectiveRFR, n_trials=30)


In [18]:
# Seed the TPE sampler so the sequence of suggested hyperparameters can be
# reproduced after a kernel restart.
studyKNN = optuna.create_study(
    study_name="KNN",
    sampler=optuna.samplers.TPESampler(seed=42),
    direction="minimize",  # the objective returns MAE
)
studyKNN.optimize(objectiveKNN, n_trials=30)

In [25]:
# Pipeline parameter -> name under which the objective registered it with the trial.
rfr_param_names = {
    "featureSelector__k": "n_features",
    "regressor__n_estimators": "n_estimators",
    "regressor__max_depth": "max_depth",
    "regressor__min_samples_split": "min_samples_split",
    "regressor__min_samples_leaf": "min_samples_leaf",
}
knn_param_names = {
    "featureSelector__k": "n_features",
    "regressor__n_neighbors": "n_neighbors",
    "regressor__weights": "weights",
    "regressor__p": "p",
}

best_rfr = studyRFR.best_params
best_knn = studyKNN.best_params

# Configure each pipeline with the best trial of its study.
pipeRFR.set_params(
    **{step_param: best_rfr[trial_name] for step_param, trial_name in rfr_param_names.items()}
)
# Last expression of the cell, so the configured kNN pipeline is displayed.
pipeKNN.set_params(
    **{step_param: best_knn[trial_name] for step_param, trial_name in knn_param_names.items()}
)

0,1,2
,steps,"[('scaler', ...), ('featureSelector', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,False
,with_std,True

0,1,2
,score_func,<function f_r...0019EC2E97BA0>
,k,11

0,1,2
,n_neighbors,12
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


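### Wilcoxon signed-rank test

Paired comparison of the two tuned pipelines: their absolute errors on the held-out test set are compared sample by sample at a significance level of 0.05.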

In [34]:
# Fit both tuned pipelines on the training split and predict the held-out test split.
pipeRFR.fit(X_train, y_train)
y_pred_rf = pipeRFR.predict(X_test)

pipeKNN.fit(X_train, y_train)
y_pred_knn = pipeKNN.predict(X_test)

# Paired absolute errors: one value per test sample for each model.
err_rf = np.abs(y_test - y_pred_rf)
err_knn = np.abs(y_test - y_pred_knn)

# The test below only says whether the paired errors differ; the two MAEs
# show which model is actually ahead.
print(f"Test MAE - RFR: {err_rf.mean():.4f}, kNN: {err_knn.mean():.4f}")

stat, p_value = wilcoxon(err_rf, err_knn)
alpha = 0.05
# True when the difference between the paired errors is significant at level alpha.
bool(p_value < alpha)

True