# ЛР1 Регрессия: KNN (Car Prices)

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
import kagglehub

# Fetch the Kaggle dataset; the returned value is the local directory holding the files.
path = kagglehub.dataset_download("sidharth178/car-prices-dataset")
print("Path to dataset files:", path)

# Wrap the location in a Path and peek at (at most) the first 30 entries below it.
data_dir = Path(path)
list(data_dir.rglob("*"))[:30]

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/sidharth178/car-prices-dataset?dataset_version_number=1...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 592k/592k [00:05<00:00, 103kB/s]

Extracting files...
Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1





[PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/test.csv'),
 PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/train.csv')]

In [16]:
# Collect every CSV below the dataset directory.
# The order in which a directory glob yields entries is not guaranteed, so the
# list is sorted to make both the displayed listing and any positional access
# deterministic across machines and file systems.
csv_files = sorted(data_dir.glob("**/*.csv"))
csv_files

[PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/test.csv'),
 PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/train.csv')]

### Для задачи регрессии используется файл `train.csv`, содержащий целевую переменную; файл `test.csv` будем использовать позже для предсказаний

In [17]:
# Select the training file by name instead of by position: `csv_files` comes
# from a directory glob, whose ordering is not guaranteed, so `csv_files[1]`
# could silently load test.csv (which has no usable target) on another machine.
train_csv_path = next((p for p in csv_files if p.name == "train.csv"), None)
if train_csv_path is None:
    raise FileNotFoundError(f"train.csv not found under {data_dir}")

df = pd.read_csv(train_csv_path)
df.shape, df.head()

((19237, 18),
          ID  Price  Levy Manufacturer    Model  Prod. year   Category  \
 0  45654403  13328  1399        LEXUS   RX 450        2010       Jeep   
 1  44731507  16621  1018    CHEVROLET  Equinox        2011       Jeep   
 2  45774419   8467     -        HONDA      FIT        2006  Hatchback   
 3  45769185   3607   862         FORD   Escape        2011       Jeep   
 4  45809263  11726   446        HONDA      FIT        2014  Hatchback   
 
   Leather interior Fuel type Engine volume    Mileage  Cylinders  \
 0              Yes    Hybrid           3.5  186005 km        6.0   
 1               No    Petrol             3  192000 km        6.0   
 2               No    Petrol           1.3  200000 km        4.0   
 3              Yes    Hybrid           2.5  168966 km        4.0   
 4              Yes    Petrol           1.3   91901 km        4.0   
 
   Gear box type Drive wheels   Doors             Wheel   Color  Airbags  
 0     Automatic          4x4  04-May        Left

In [19]:
target_col = "Price"

# Keep only the rows whose target value is present, as an independent copy.
df = df[df[target_col].notna()].copy()
df.shape

(19237, 18)

In [20]:
# Preview the first rows of the frame after the missing-target filter.
df.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [21]:
# Separate the target vector from the feature matrix (all remaining columns).
y = df[target_col]
X = df.drop(columns=[target_col])
X.shape, y.shape

((19237, 17), (19237,))

### Числовые признаки `Mileage` и `Engine volume`, представленные в строковом формате, приводим к числовому виду

In [22]:
# "Mileage" is stored as text such as "186005 km": remove the unit suffix, then parse.
mileage_text = X["Mileage"].astype(str).str.replace(" km", "", regex=False)
X["Mileage"] = pd.to_numeric(mileage_text, errors="coerce")

# "Engine volume": keep only the first run of digits/dots and parse it;
# anything that cannot be parsed becomes NaN.
engine_text = X["Engine volume"].astype(str).str.extract(r"([\d\.]+)")[0]
X["Engine volume"] = pd.to_numeric(engine_text, errors="coerce")

# NOTE(review): "Levy" also looks numeric but contains "-" placeholders, so it
# stays textual here and is later one-hot encoded - confirm this is intended.

X[["Mileage", "Engine volume"]].head()

Unnamed: 0,Mileage,Engine volume
0,186005,3.5
1,192000,3.0
2,200000,1.3
3,168966,2.5
4,91901,1.3


In [23]:
# A row is kept only if none of its feature values is missing;
# the same boolean mask keeps the target aligned with the features.
mask = ~X.isna().any(axis=1)
X, y = X.loc[mask], y.loc[mask]

X.shape, y.shape

((19237, 17), (19237,))

### Разделяем данные на обучающую и тестовую выборки

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
split_kwargs = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

X_train.shape, X_test.shape

((15389, 17), (3848, 17))

### Для числовых признаков применяется масштабирование, а для категориальных — one-hot-кодирование

In [29]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Partition the feature columns by dtype: numeric ones vs. everything else.
# NOTE(review): the identifier column "ID" is numeric and therefore ends up in
# num_cols, i.e. it is scaled and used in the KNN distance - confirm this is intended.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric columns are standardised; the remaining columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_pipeline = Pipeline([("scaler", StandardScaler())])
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_encoder, cat_cols),
    ],
    remainder="drop",
)

len(num_cols), len(cat_cols), num_cols[:10], cat_cols[:10]

(6,
 11,
 ['ID', 'Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags'],
 ['Levy',
  'Manufacturer',
  'Model',
  'Category',
  'Leather interior',
  'Fuel type',
  'Gear box type',
  'Drive wheels',
  'Doors',
  'Wheel'])

### Строим базовый KNN-регрессор

In [31]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Baseline model: shared preprocessing followed by KNN with k=5 and uniform weights.
baseline_reg = Pipeline(steps=[
    ("prep", preprocess),
    ("knn", KNeighborsRegressor(n_neighbors=5, weights="uniform"))
])

baseline_reg.fit(X_train, y_train)
pred = baseline_reg.predict(X_test)

# `mae` has to be computed in this cell: it is printed right below and reused
# by the summary table. Without this line a fresh "Restart & Run All" fails
# with NameError (the value previously only existed as leftover kernel state).
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

print(f"Baseline MAE:  {mae:.4f}")
print(f"Baseline RMSE: {rmse:.4f}")

Baseline MAE:  5243.0049
Baseline RMSE: 11048.7042


In [32]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the "knn" step of the pipeline
# (7 neighbour counts x 2 weighting schemes x 2 metrics = 28 candidates).
param_grid = dict(
    knn__n_neighbors=[3, 5, 7, 11, 15, 21, 31],
    knn__weights=["uniform", "distance"],
    knn__metric=["euclidean", "manhattan"],
)

# 3-fold cross-validated search that selects the candidate with the lowest MAE.
grid = GridSearchCV(
    baseline_reg,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

# The scorer is negated MAE, so flip the sign back for reporting.
print("Best params:", grid.best_params_)
print("Best CV MAE:", -grid.best_score_)

Fitting 3 folds for each of 28 candidates, totalling 84 fits




Best params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 15, 'knn__weights': 'distance'}
Best CV MAE: 8739.476467796667


### Оценено качество улучшенного KNN-регрессора на тестовой выборке и выполнено сравнение с бейзлайном

In [36]:
# The pipeline refit by the grid search with the winning parameters is the "improved" model.
best_reg = grid.best_estimator_
pred_best = best_reg.predict(X_test)

# Hold-out metrics for the tuned model.
mse_best = mean_squared_error(y_test, pred_best)
rmse_best = np.sqrt(mse_best)
mae_best = mean_absolute_error(y_test, pred_best)

print(f"Improved MAE:  {mae_best:.4f}")
print(f"Improved RMSE: {rmse_best:.4f}")

Improved MAE:  5851.3976
Improved RMSE: 42670.8872


### Собственная реализация: сначала преобразуем признаки тем же препроцессингом, что и в sklearn-пайплайне

In [37]:
import numpy as np

# Reuse the preprocessor from the tuned pipeline. It was already fitted on
# X_train when the grid search refit the best estimator, so only `transform`
# is needed; calling `fit_transform` here would refit (and thereby mutate) a
# component of the fitted `best_reg` pipeline.
prep = best_reg.named_steps["prep"]

X_train_tr = prep.transform(X_train)
X_test_tr  = prep.transform(X_test)

# The one-hot encoder may produce a sparse matrix; the hand-written KNN works on dense arrays.
if hasattr(X_train_tr, "toarray"):
    X_train_tr = X_train_tr.toarray()
if hasattr(X_test_tr, "toarray"):
    X_test_tr = X_test_tr.toarray()

X_train_tr = np.asarray(X_train_tr, dtype=float)
X_test_tr  = np.asarray(X_test_tr, dtype=float)

# Targets as plain float arrays so they can be indexed by neighbour positions.
y_train_np = np.asarray(y_train, dtype=float)
y_test_np  = np.asarray(y_test, dtype=float)

X_train_tr.shape, X_test_tr.shape

((15389, 2031), (3848, 2031))

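### Для снижения вычислительной нагрузки ограничиваем размер обучающей выборки для собственной реализации (не более 50 000 объектов; при 15 389 объектах ограничение не срабатывает)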

In [38]:
# Upper bound on the number of training rows handed to the hand-written KNN.
MAX_TRAIN_FOR_MYKNN = 50_000

if X_train_tr.shape[0] <= MAX_TRAIN_FOR_MYKNN:
    # Small enough: use the full training set unchanged.
    X_train_tr_small, y_train_np_small = X_train_tr, y_train_np
else:
    # Too large: draw a fixed-seed random subset without replacement.
    rng = np.random.RandomState(42)
    idx = rng.choice(X_train_tr.shape[0], MAX_TRAIN_FOR_MYKNN, replace=False)
    X_train_tr_small, y_train_np_small = X_train_tr[idx], y_train_np[idx]

X_train_tr_small.shape

(15389, 2031)

### Реализация собственного KNN-регрессора

In [43]:
class MyKNNRegressor:
    """Brute-force k-nearest-neighbours regressor for dense NumPy arrays.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of nearest training samples used for each prediction.
    weights : {"uniform", "distance"}, default="uniform"
        "uniform" averages the neighbours' targets; "distance" weights each
        neighbour by ``1 / (distance + eps)``.
    metric : {"euclidean", "manhattan"}, default="euclidean"
        Distance used to rank the training samples.
    eps : float, default=1e-9
        Added to the distances in "distance" mode to avoid division by zero.
    """

    def __init__(self, n_neighbors=5, weights="uniform", metric="euclidean", eps=1e-9):
        self.n_neighbors = int(n_neighbors)
        self.weights = weights
        self.metric = metric
        self.eps = eps
        self.X_train_ = None
        self.y_train_ = None

    def fit(self, X, y):
        """Store the training data (KNN has no training phase) and return ``self``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent numbers of samples: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train_ = X
        self.y_train_ = y
        return self

    def _distances(self, X):
        """Return the (n_queries, n_train) matrix of distances to the training set."""
        X = np.asarray(X, dtype=float)

        if self.metric == "euclidean":
            # ||x - t||^2 = ||x||^2 + ||t||^2 - 2 x.t, evaluated with one matrix product.
            X2 = np.sum(X**2, axis=1, keepdims=True)                 # (batch, 1)
            T2 = np.sum(self.X_train_**2, axis=1, keepdims=True).T    # (1, n_train)
            d2 = X2 + T2 - 2.0 * (X @ self.X_train_.T)
            # Rounding can push tiny squared distances slightly below zero.
            d2 = np.maximum(d2, 0.0)
            return np.sqrt(d2)

        if self.metric == "manhattan":
            # A broadcasted |X[:, None, :] - T[None, :, :]| would materialise a
            # (batch, n_train, n_features) tensor, which does not fit in memory
            # for realistic sizes. cdist computes the same L1 distances while
            # allocating only the (batch, n_train) result.
            from scipy.spatial.distance import cdist

            return cdist(X, self.X_train_, metric="cityblock")

        raise ValueError("metric must be 'euclidean' or 'manhattan'")

    def predict(self, X, batch_size=512):
        """Predict targets for ``X``, processing ``batch_size`` query rows at a time.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``weights``/``metric`` are unsupported, ``batch_size`` < 1, or
            ``n_neighbors`` is not in ``[1, n_train]``.
        """
        if self.X_train_ is None:
            raise RuntimeError("Call fit() before predict().")

        # Validate the configuration up front so a bad setting fails before
        # any (potentially expensive) distance computation.
        if self.weights not in ("uniform", "distance"):
            raise ValueError("weights must be 'uniform' or 'distance'")
        if self.metric not in ("euclidean", "manhattan"):
            raise ValueError("metric must be 'euclidean' or 'manhattan'")
        batch_size = int(batch_size)
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        k = self.n_neighbors
        n_train = self.X_train_.shape[0]
        if not 1 <= k <= n_train:
            raise ValueError(
                f"n_neighbors must be between 1 and the number of training samples "
                f"({n_train}), got {k}"
            )

        X = np.asarray(X, dtype=float)
        n = X.shape[0]
        preds = np.empty(n, dtype=float)

        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            Xb = X[start:end]

            dist = self._distances(Xb)  # (batch, n_train)
            # argpartition places the k smallest distances in the first k
            # columns (in no particular order), which is all the averaging needs.
            nn_idx = np.argpartition(dist, kth=k-1, axis=1)[:, :k]
            nn_dist = np.take_along_axis(dist, nn_idx, axis=1)
            nn_y = self.y_train_[nn_idx]

            if self.weights == "uniform":
                preds[start:end] = np.mean(nn_y, axis=1)
            else:  # "distance" (validated above)
                w = 1.0 / (nn_dist + self.eps)
                preds[start:end] = np.sum(w * nn_y, axis=1) / np.sum(w, axis=1)

        return preds

### Бейзлайн реализации + метрики

In [46]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Hand-written regressor with the baseline settings (k=5, uniform weights, euclidean metric);
# fit() returns the estimator itself, so construction and fitting can be chained.
my_reg = MyKNNRegressor(n_neighbors=5, weights="uniform", metric="euclidean").fit(
    X_train_tr_small, y_train_np_small
)
pred_my = my_reg.predict(X_test_tr, batch_size=512)

# Hold-out metrics on the preprocessed test matrix.
rmse_my = np.sqrt(mean_squared_error(y_test_np, pred_my))
mae_my = mean_absolute_error(y_test_np, pred_my)

print(f"MyKNNReg (baseline) MAE:  {mae_my:.4f}")
print(f"MyKNNReg (baseline) RMSE: {rmse_my:.4f}")

MyKNNReg (baseline) MAE:  5243.0049
MyKNNReg (baseline) RMSE: 11048.7042


### Сравнение со sklearn на тех же данных и параметрах

In [47]:
from sklearn.neighbors import KNeighborsRegressor

# Reference run: sklearn's KNN on exactly the same preprocessed arrays and settings.
sk_reg = KNeighborsRegressor(n_neighbors=5, weights="uniform", metric="euclidean").fit(
    X_train_tr_small, y_train_np_small
)
pred_sk = sk_reg.predict(X_test_tr)

rmse_sk = np.sqrt(mean_squared_error(y_test_np, pred_sk))
mae_sk = mean_absolute_error(y_test_np, pred_sk)

# A zero delta means the hand-written implementation reproduces sklearn's MAE.
print(f"sklearn KNN (same data/params) MAE:  {mae_sk:.4f}")
print(f"sklearn KNN (same data/params) RMSE: {rmse_sk:.4f}")
print("delta MAE (my - sklearn):", mae_my - mae_sk)

sklearn KNN (same data/params) MAE:  5243.0049
sklearn KNN (same data/params) RMSE: 11048.7042
delta MAE (my - sklearn): 0.0


In [48]:
# Best hyper-parameters found by the grid search; keys carry the "knn__" pipeline-step prefix.
bp = grid.best_params_
bp

{'knn__metric': 'euclidean',
 'knn__n_neighbors': 15,
 'knn__weights': 'distance'}

In [51]:
# Hand-written regressor configured with the grid-search winners
# (the "knn__" prefix of each key is the pipeline step name).
my_reg_best = MyKNNRegressor(
    n_neighbors=bp["knn__n_neighbors"],
    weights=bp["knn__weights"],
    metric=bp["knn__metric"],
).fit(X_train_tr_small, y_train_np_small)

pred_my_best = my_reg_best.predict(X_test_tr, batch_size=512)

# Hold-out metrics for the tuned configuration.
rmse_my_best = np.sqrt(mean_squared_error(y_test_np, pred_my_best))
mae_my_best = mean_absolute_error(y_test_np, pred_my_best)

print(f"MyKNNReg (improved) MAE:  {mae_my_best:.4f}")
print(f"MyKNNReg (improved) RMSE: {rmse_my_best:.4f}")

MyKNNReg (improved) MAE:  5851.3978
MyKNNReg (improved) RMSE: 42670.8872


### Сводная таблица результатов

In [54]:
# One row per evaluated model: (label, MAE, RMSE) on the hold-out set.
summary_rows = [
    ("sklearn KNN (baseline)", mae, rmse),
    ("sklearn KNN (improved)", mae_best, rmse_best),
    ("MyKNNReg (baseline)", mae_my, rmse_my),
    ("MyKNNReg (improved)", mae_my_best, rmse_my_best),
]
results = pd.DataFrame(summary_rows, columns=["Model", "MAE", "RMSE"])

results

Unnamed: 0,Model,MAE,RMSE
0,sklearn KNN (baseline),5243.004886,11048.704161
1,sklearn KNN (improved),5851.397561,42670.88722
2,MyKNNReg (baseline),5243.004886,11048.704161
3,MyKNNReg (improved),5851.397836,42670.887211


В результате подбора гиперпараметров с использованием кросс-валидации качество KNN-регрессора на тестовой выборке не улучшилось. Более того, базовая модель показала меньшую ошибку (MAE ≈ 5243, RMSE ≈ 11049) по сравнению с улучшенной конфигурацией (MAE ≈ 5851, RMSE ≈ 42671). Это указывает на чувствительность алгоритма KNN к распределению данных и на наличие переобучения на этапе кросс-валидации (оценка MAE на кросс-валидации, ≈ 8739, заметно отличается от ошибки на тестовой выборке), что особенно заметно при наличии выбросов в целевой переменной.