In [2]:
import pandas as pd

# Load the raw penguins table from disk.
penguins = pd.read_csv("../datasets/penguins.csv")

# Feature columns used as model inputs, and the column holding the class label.
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Restrict the table to the columns of interest, then discard every row that
# has a missing value in any of them.
penguins_non_missing = penguins.loc[:, [*columns, target_name]].dropna(
    axis=0, how="any"
)

# Split the cleaned frame into the feature matrix and the label vector.
data = penguins_non_missing.loc[:, columns]
target = penguins_non_missing.loc[:, target_name]

In [3]:
# Number of rows per species label in the cleaned data.
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [11]:
# Per-feature mean and standard deviation, displayed as a (mean, std) tuple.
# The recorded output shows the three features on very different scales.
data.mean(), data.std()

(Body Mass (g)          4201.754386
 Flipper Length (mm)     200.915205
 Culmen Length (mm)       43.921930
 dtype: float64,
 Body Mass (g)          801.954536
 Flipper Length (mm)     14.061714
 Culmen Length (mm)       5.459584
 dtype: float64)

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Standardise every feature, then classify with k-nearest neighbours (k=51).
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=51)),
])

# 10-fold cross-validation scored with balanced accuracy. Train scores are
# requested too so they can be compared with the test scores per fold.
# The raw result dict gets its own name so `cv_scores` is only ever a DataFrame.
cv_results = cross_validate(
    model,
    data,
    target,
    scoring="balanced_accuracy",
    return_train_score=True,
    cv=10,
)
cv_scores = pd.DataFrame(cv_results)

# Display the per-fold table together with the mean test score.
cv_scores, cv_scores["test_score"].mean()

(   fit_time  score_time  test_score  train_score
 0  0.003624    0.002946    0.952381     0.950812
 1  0.003318    0.002708    0.977778     0.945338
 2  0.003042    0.002708    1.000000     0.943681
 3  0.002984    0.002676    0.863248     0.960389
 4  0.002964    0.002679    0.882540     0.959308
 5  0.002958    0.002671    0.952381     0.941016
 6  0.002953    0.002657    0.955556     0.937450
 7  0.002946    0.002654    0.952381     0.943467
 8  0.002935    0.002646    0.930159     0.954396
 9  0.002943    0.002657    0.952381     0.945928,
 0.9418803418803419)

In [29]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Pipeline with no preprocessing step: k-nearest neighbours (k=5) is fitted
# directly on the unscaled features.
model = Pipeline(steps=[
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

# 10-fold cross-validation scored with balanced accuracy. Train scores are
# requested too so they can be compared with the test scores per fold.
# The raw result dict gets its own name so `cv_scores` is only ever a DataFrame.
cv_results = cross_validate(
    model,
    data,
    target,
    scoring="balanced_accuracy",
    return_train_score=True,
    cv=10,
)
cv_scores = pd.DataFrame(cv_results)

# Display the per-fold table together with the mean test score.
cv_scores, cv_scores["test_score"].mean()

(   fit_time  score_time  test_score  train_score
 0  0.002360    0.002806    0.664683     0.804811
 1  0.001856    0.002423    0.736020     0.787876
 2  0.001839    0.002403    0.741026     0.791960
 3  0.001832    0.002407    0.704274     0.799418
 4  0.001887    0.002450    0.584127     0.802599
 5  0.001854    0.002444    0.669841     0.802589
 6  0.001856    0.002453    0.834921     0.800690
 7  0.001850    0.002443    0.742857     0.768687
 8  0.001848    0.002437    0.882540     0.783182
 9  0.001813    0.002373    0.838095     0.780741,
 0.7398382173382173)

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Preprocessing options to compare. The `None` entry leaves the preprocessor
# step empty, so the classifier is fitted on the features as they are.
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]


# Evaluate every (preprocessor, number of neighbours) combination with 10-fold
# cross-validation and report the mean and standard deviation of the test
# balanced accuracy.
for pp in all_preprocessors:
    for k in [5, 51, 101]:
        print("Preprocessor: ", pp, "K-neighbor: ", k)
        model = Pipeline(steps=[
            ("preprocessor", pp),
            ("classifier", KNeighborsClassifier(n_neighbors=k)),
        ])

        cv_results = cross_validate(
            model,
            data,
            target,
            scoring="balanced_accuracy",
            return_train_score=True,
            cv=10,
        )
        cv_scores = pd.DataFrame(cv_results)
        test_scores = cv_scores["test_score"]
        # Label the result line with the k actually used; it was previously
        # hard-coded to "51 neighbor" for every run.
        print(f"{k} neighbor", test_scores.mean(), test_scores.std())

Preprocessor:  None K-neighbor:  5
51 neighbor 0.7398382173382173 0.09137390105837827
Preprocessor:  None K-neighbor:  51
51 neighbor 0.6051816239316239 0.03845307130868317
Preprocessor:  None K-neighbor:  101
51 neighbor 0.6138568376068376 0.03317445035717024
Preprocessor:  StandardScaler() K-neighbor:  5
51 neighbor 0.9521978021978021 0.04206050394033833
Preprocessor:  StandardScaler() K-neighbor:  51
51 neighbor 0.9418803418803419 0.041009971747782106
Preprocessor:  StandardScaler() K-neighbor:  101
51 neighbor 0.8766422466422465 0.04386966180108448
Preprocessor:  MinMaxScaler() K-neighbor:  5
51 neighbor 0.9477777777777779 0.03612210813810691
Preprocessor:  MinMaxScaler() K-neighbor:  51
51 neighbor 0.9202930402930404 0.04797817747635613
Preprocessor:  MinMaxScaler() K-neighbor:  101
51 neighbor 0.8623565323565323 0.04874594277907138
Preprocessor:  QuantileTransformer(n_quantiles=100) K-neighbor:  5
51 neighbor 0.9470940170940171 0.03562497834842707
Preprocessor:  QuantileTransform