# ЛР 5 - градиентный бустинг

In [1]:
import kagglehub
from pathlib import Path
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = kagglehub.dataset_download("uciml/forest-cover-type-dataset")
print("Path to dataset files:", path)

data_dir = Path(path)
csv_files = list(data_dir.glob("**/*.csv"))
df_forest = pd.read_csv(csv_files[0])
df_forest.shape, df_forest.head()

Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/uciml/forest-cover-type-dataset/versions/1


((581012, 55),
    Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
 0       2596      51      3                               258   
 1       2590      56      2                               212   
 2       2804     139      9                               268   
 3       2785     155     18                               242   
 4       2595      45      2                               153   
 
    Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
 0                               0                              510   
 1                              -6                              390   
 2                              65                             3180   
 3                             118                             3090   
 4                              -1                              391   
 
    Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
 0            221             232            148   
 1            220             235            151   
 2 

In [3]:
from sklearn.model_selection import train_test_split

target_col = "Cover_Type"
X_clf = df_forest.drop(columns=[target_col])
y_clf = df_forest[target_col]

# (опционально) подвыборка для скорости
MAX_ROWS = 120_000
if len(X_clf) > MAX_ROWS:
    X_clf, _, y_clf, _ = train_test_split(
        X_clf, y_clf, train_size=MAX_ROWS, stratify=y_clf, random_state=42
    )

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

### Baseline HistGradientBoostingClassifier
Построен бейзлайн градиентного бустинга для классификации без подбора гиперпараметров

In [5]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

hgb_clf_base = HistGradientBoostingClassifier(
    random_state=42
)

hgb_clf_base.fit(Xc_train, yc_train)
pred = hgb_clf_base.predict(Xc_test)

acc = accuracy_score(yc_test, pred)
f1m = f1_score(yc_test, pred, average="macro")

print(f"HGB baseline Accuracy: {acc:.4f}")
print(f"HGB baseline Macro-F1: {f1m:.4f}")
print("\nClassification report:\n", classification_report(yc_test, pred))

HGB baseline Accuracy: 0.8547
HGB baseline Macro-F1: 0.8107

Classification report:
               precision    recall  f1-score   support

           1       0.85      0.83      0.84      8751
           2       0.86      0.89      0.87     11703
           3       0.87      0.91      0.89      1477
           4       0.77      0.81      0.79       113
           5       0.77      0.50      0.61       392
           6       0.83      0.74      0.78       717
           7       0.93      0.85      0.89       847

    accuracy                           0.85     24000
   macro avg       0.84      0.79      0.81     24000
weighted avg       0.85      0.85      0.85     24000



### Улучшение GridSearchCV

In [6]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "learning_rate": [0.05, 0.1],
    "max_depth": [None, 6, 10],
    "max_leaf_nodes": [31, 63],
    "min_samples_leaf": [20, 50],
}

grid_hgb_clf = GridSearchCV(
    HistGradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_hgb_clf.fit(Xc_train, yc_train)

print("Best params:", grid_hgb_clf.best_params_)
print("Best CV Macro-F1:", grid_hgb_clf.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best params: {'learning_rate': 0.05, 'max_depth': None, 'max_leaf_nodes': 63, 'min_samples_leaf': 50}
Best CV Macro-F1: 0.8180512080496651


### Оценка улучшенной модели

In [9]:
best_hgb_clf = grid_hgb_clf.best_estimator_
pred_best = best_hgb_clf.predict(Xc_test)

acc_best = accuracy_score(yc_test, pred_best)
f1m_best = f1_score(yc_test, pred_best, average="macro")

print(f"HGB improved Accuracy: {acc_best:.4f}")
print(f"HGB improved Macro-F1: {f1m_best:.4f}")

HGB improved Accuracy: 0.8616
HGB improved Macro-F1: 0.8305


### CarPrice regression

In [10]:
path = kagglehub.dataset_download("sidharth178/car-prices-dataset")
print("Path to dataset files:", path)

data_dir = Path(path)
csv_files = list(data_dir.glob("**/*.csv"))
df = pd.read_csv(csv_files[1])
df.shape, df.head()

Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1


((19237, 18),
          ID  Price  Levy Manufacturer    Model  Prod. year   Category  \
 0  45654403  13328  1399        LEXUS   RX 450        2010       Jeep   
 1  44731507  16621  1018    CHEVROLET  Equinox        2011       Jeep   
 2  45774419   8467     -        HONDA      FIT        2006  Hatchback   
 3  45769185   3607   862         FORD   Escape        2011       Jeep   
 4  45809263  11726   446        HONDA      FIT        2014  Hatchback   
 
   Leather interior Fuel type Engine volume    Mileage  Cylinders  \
 0              Yes    Hybrid           3.5  186005 km        6.0   
 1               No    Petrol             3  192000 km        6.0   
 2               No    Petrol           1.3  200000 km        4.0   
 3              Yes    Hybrid           2.5  168966 km        4.0   
 4              Yes    Petrol           1.3   91901 km        4.0   
 
   Gear box type Drive wheels   Doors             Wheel   Color  Airbags  
 0     Automatic          4x4  04-May        Left

In [11]:
df_reg = df.copy()

X_reg = df_reg.drop(columns=["Price"]).copy()
y_reg = df_reg["Price"].copy()

X_reg["Mileage"] = (
    X_reg["Mileage"].astype(str).str.replace(" km", "", regex=False)
)
X_reg["Mileage"] = pd.to_numeric(X_reg["Mileage"], errors="coerce")

X_reg["Engine volume"] = (
    X_reg["Engine volume"].astype(str).str.extract(r"([\d\.]+)")[0]
)
X_reg["Engine volume"] = pd.to_numeric(X_reg["Engine volume"], errors="coerce")

if "Levy" in X_reg.columns:
    X_reg["Levy"] = X_reg["Levy"].astype(str).replace("-", np.nan)
    X_reg["Levy"] = pd.to_numeric(X_reg["Levy"], errors="coerce")

X_reg.shape, y_reg.shape

((19237, 17), (19237,))

### Baseline HistGradientBoostingRegressor
Построен бейзлайн градиентного бустинга для регрессии с использованием предобработки признаков

In [14]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

numeric_features = selector(dtype_include=np.number)
categorical_features = selector(dtype_exclude=np.number)

preprocess_reg = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numeric_features),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), categorical_features),
    ],
    remainder="drop"
)

hgb_reg_base = Pipeline(steps=[
    ("prep", preprocess_reg),
    ("hgb", HistGradientBoostingRegressor(
        random_state=42
    ))
])

hgb_reg_base.fit(X_reg_train, y_reg_train)
pred = hgb_reg_base.predict(X_reg_test)

mae = mean_absolute_error(y_reg_test, pred)
rmse = np.sqrt(mean_squared_error(y_reg_test, pred))

print(f"HGB baseline MAE:  {mae:.4f}")
print(f"HGB baseline RMSE: {rmse:.4f}")

HGB baseline MAE:  9389.5720
HGB baseline RMSE: 30764.9932


### Улучшение GridSearchCV

In [15]:
param_grid = {
    "hgb__learning_rate": [0.05, 0.1],
    "hgb__max_depth": [None, 6, 10],
    "hgb__max_leaf_nodes": [31, 63],
    "hgb__min_samples_leaf": [20, 50],
}

grid_hgb_reg = GridSearchCV(
    hgb_reg_base,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_hgb_reg.fit(X_reg_train, y_reg_train)

print("Best params:", grid_hgb_reg.best_params_)
print("Best CV MAE:", -grid_hgb_reg.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits




Best params: {'hgb__learning_rate': 0.1, 'hgb__max_depth': None, 'hgb__max_leaf_nodes': 63, 'hgb__min_samples_leaf': 50}
Best CV MAE: 9688.763834031546


### Оценка

In [18]:
best_hgb_reg = grid_hgb_reg.best_estimator_
pred_best = best_hgb_reg.predict(X_reg_test)

mae_best = mean_absolute_error(y_reg_test, pred_best)
rmse_best = np.sqrt(mean_squared_error(y_reg_test, pred_best))

print(f"HGB improved MAE:  {mae_best:.4f}")
print(f"HGB improved RMSE: {rmse_best:.4f}")

HGB improved MAE:  8647.5710
HGB improved RMSE: 18555.7868


### Итоговая сводка

In [21]:
summary_lr5 = pd.DataFrame({
    "Task": ["Classification", "Classification", "Regression", "Regression"],
    "Model": [
        "HGB baseline",
        "HGB improved",
        "HGB baseline",
        "HGB improved",
    ],
    "Metric 1": [f1m, f1m_best, mae, mae_best],
    "Metric 2": [acc, acc_best, rmse, rmse_best],
})

summary_lr5

Unnamed: 0,Task,Model,Metric 1,Metric 2
0,Classification,HGB baseline,0.810662,0.854708
1,Classification,HGB improved,0.830502,0.861625
2,Regression,HGB baseline,9389.572041,30764.993174
3,Regression,HGB improved,8647.571046,18555.786785


Градиентный бустинг показал высокую эффективность как в задаче классификации, так и в задаче регрессии. Подбор гиперпараметров позволил существенно улучшить качество модели, особенно в задаче регрессии. По сравнению с Random Forest, бустинг показал сопоставимые результаты в классификации, однако уступил по качеству в задаче прогнозирования стоимости автомобилей, что подчёркивает различия в поведении ансамблевых методов на различных типах задач