# ЛР 3 - решающие деревья

In [4]:
import kagglehub
from pathlib import Path
import pandas as pd

In [2]:
# Download the Forest Cover Type dataset and keep the returned location as a Path.
path = kagglehub.dataset_download("uciml/forest-cover-type-dataset")
data_dir = Path(path)
print("Path to dataset files:", path)

# Preview at most 20 entries found recursively under the dataset directory.
[*data_dir.glob("**/*")][:20]

Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/uciml/forest-cover-type-dataset/versions/1


[PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/uciml/forest-cover-type-dataset/versions/1/covtype.csv')]

In [5]:
# Collect every CSV under the dataset directory and load the first one found
# (the directory listing above shows a single file, covtype.csv).
csv_files = [*data_dir.glob("**/*.csv")]
df = pd.read_csv(csv_files[0])

# Show the frame's dimensions together with its first rows.
(df.shape, df.head())

((581012, 55),
    Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
 0       2596      51      3                               258   
 1       2590      56      2                               212   
 2       2804     139      9                               268   
 3       2785     155     18                               242   
 4       2595      45      2                               153   
 
    Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
 0                               0                              510   
 1                              -6                              390   
 2                              65                             3180   
 3                             118                             3090   
 4                              -1                              391   
 
    Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
 0            221             232            148   
 1            220             235            151   
 2 

In [7]:
from sklearn.model_selection import train_test_split

MAX_ROWS = 90_000
target_col = "Cover_Type"

# Frames larger than MAX_ROWS are reduced to a seeded random subsample before
# separating features from the target; smaller frames are used as-is.
if len(df) > MAX_ROWS:
    df_s = df.sample(n=MAX_ROWS, random_state=42)
    X, y = df_s.drop(columns=[target_col]), df_s[target_col]
else:
    X, y = df.drop(columns=[target_col]), df[target_col]

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

(X_train.shape, X_test.shape)

((72000, 54), (18000, 54))

### DecisionTree baseline
Построен бейзлайн классификатора Decision Tree без ограничения глубины, что позволяет оценить склонность модели к переобучению

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Baseline: a decision tree with default hyperparameters and a fixed seed.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)

# Hold-out metrics: overall accuracy and the unweighted (macro) F1 across classes.
acc, f1m = (
    accuracy_score(y_test, pred),
    f1_score(y_test, pred, average="macro"),
)

print(f"DT baseline Accuracy: {acc:.4f}")
print(f"DT baseline Macro-F1: {f1m:.4f}")
print("\nClassification report:\n", classification_report(y_test, pred))

DT baseline Accuracy: 0.8564
DT baseline Macro-F1: 0.7937

Classification report:
               precision    recall  f1-score   support

           1       0.85      0.85      0.85      6595
           2       0.88      0.87      0.88      8762
           3       0.84      0.85      0.85      1104
           4       0.77      0.71      0.74        80
           5       0.66      0.65      0.65       304
           6       0.73      0.72      0.72       530
           7       0.86      0.87      0.87       625

    accuracy                           0.86     18000
   macro avg       0.80      0.79      0.79     18000
weighted avg       0.86      0.86      0.86     18000



### Контроль глубины и размера листа
\+ оценка

In [11]:
from sklearn.model_selection import GridSearchCV

# Search space: tree depth, minimum leaf size and split criterion
# (5 * 4 * 2 = 40 candidate combinations).
param_grid = {
    "max_depth": [5, 10, 15, 20, None],
    "min_samples_leaf": [1, 5, 10, 20],
    "criterion": ["gini", "entropy"],
}

# Exhaustive 3-fold search on the training split, ranked by macro-F1;
# fit() returns the fitted search object itself.
grid_clf = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

print("Best params:", grid_clf.best_params_)
print("Best CV Macro-F1:", grid_clf.best_score_)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
Best params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1}
Best CV Macro-F1: 0.7548268336989494


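Ограничение глубины дерева и минимального размера листа обычно снижает переобучение, однако в данном эксперименте лучшим по Macro-F1 на кросс-валидации оказалось дерево без ограничений (`max_depth=None`, `min_samples_leaf=1`), то есть совпадающее с бейзлайном; ограничения не улучшили обобщающую способность модели.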

In [14]:
# Score the estimator refit with the best grid-search parameters on the hold-out split.
best_dt_clf = grid_clf.best_estimator_
pred_best = best_dt_clf.predict(X_test)

acc_best, f1m_best = (
    accuracy_score(y_test, pred_best),
    f1_score(y_test, pred_best, average="macro"),
)

print(f"DT improved Accuracy: {acc_best:.4f}")
print(f"DT improved Macro-F1: {f1m_best:.4f}")

DT improved Accuracy: 0.8564
DT improved Macro-F1: 0.7937


### CarPrices regression

In [39]:
# Download the car prices dataset and keep the returned location as a Path.
path = kagglehub.dataset_download("sidharth178/car-prices-dataset")
data_dir = Path(path)
print("Path to dataset files:", path)

# Preview at most 30 entries found recursively under the dataset directory.
[*data_dir.glob("**/*")][:30]

Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1


[PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/test.csv'),
 PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/train.csv')]

In [55]:
# Gather every CSV found recursively under the car prices dataset directory.
csv_files = [*data_dir.glob("**/*.csv")]
csv_files

[PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/test.csv'),
 PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/train.csv')]

In [56]:
# Pick the training file by name instead of by position: the order in which
# glob yields files depends on the filesystem, so csv_files[1] could be
# test.csv on another machine.
train_csv = next((p for p in csv_files if p.name == "train.csv"), None)
if train_csv is None:
    raise FileNotFoundError(f"train.csv not found among: {csv_files}")

df = pd.read_csv(train_csv)
df.shape, df.head()

((19237, 18),
          ID  Price  Levy Manufacturer    Model  Prod. year   Category  \
 0  45654403  13328  1399        LEXUS   RX 450        2010       Jeep   
 1  44731507  16621  1018    CHEVROLET  Equinox        2011       Jeep   
 2  45774419   8467     -        HONDA      FIT        2006  Hatchback   
 3  45769185   3607   862         FORD   Escape        2011       Jeep   
 4  45809263  11726   446        HONDA      FIT        2014  Hatchback   
 
   Leather interior Fuel type Engine volume    Mileage  Cylinders  \
 0              Yes    Hybrid           3.5  186005 km        6.0   
 1               No    Petrol             3  192000 km        6.0   
 2               No    Petrol           1.3  200000 km        4.0   
 3              Yes    Hybrid           2.5  168966 km        4.0   
 4              Yes    Petrol           1.3   91901 km        4.0   
 
   Gear box type Drive wheels   Doors             Wheel   Color  Airbags  
 0     Automatic          4x4  04-May        Left

In [57]:
target_col = "Price"

# Discard rows without a price, then separate the features from the target.
df = df.dropna(subset=[target_col]).copy()
X, y = df.drop(columns=[target_col]), df[target_col]

(df.shape, X.shape, y.shape)

((19237, 18), (19237, 17), (19237,))

In [58]:
# Work on copies so the loaded frame stays untouched by the cleaning below.
df_reg = df.copy()

X_reg = df_reg.drop(columns=["Price"]).copy()
y_reg = df_reg["Price"].copy()

# Mileage is stored as text such as "186005 km": strip the unit, then parse.
X_reg["Mileage"] = pd.to_numeric(
    X_reg["Mileage"].astype(str).str.replace(" km", "", regex=False),
    errors="coerce",
)

# Engine volume is stored as text: keep the leading run of digits/dots and
# parse it; anything that still fails to parse becomes NaN.
X_reg["Engine volume"] = pd.to_numeric(
    X_reg["Engine volume"].astype(str).str.extract(r"([\d\.]+)")[0],
    errors="coerce",
)

if "Levy" in X_reg.columns:
    # Missing levies are written as "-". errors="coerce" turns that (and any
    # other non-numeric text) into NaN, so no explicit replace with np.nan is
    # needed; this also keeps the cell independent of numpy, which is only
    # imported in a later cell.
    X_reg["Levy"] = pd.to_numeric(X_reg["Levy"].astype(str), errors="coerce")

X_reg.shape, y_reg.shape

((19237, 17), (19237,))

### Baseline DecisionTreeRegressor
Бейзлайн регрессионного дерева обучен без ограничений, что позволяет оценить влияние переобучения

In [59]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 80/20 hold-out split with a fixed seed.
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Column selectors: numeric dtypes versus everything else.
numeric_features = selector(dtype_include=np.number)
categorical_features = selector(dtype_exclude=np.number)

# Numeric columns: median imputation.
# Other columns: most-frequent imputation followed by one-hot encoding that
# ignores categories unseen during fit.
preprocess_reg = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numeric_features),
        (
            "cat",
            Pipeline(
                [
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("ohe", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            categorical_features,
        ),
    ],
    remainder="drop",
)

# Baseline: preprocessing followed by a default-parameter regression tree.
dt_reg_base = Pipeline(
    steps=[
        ("prep", preprocess_reg),
        ("dt", DecisionTreeRegressor(random_state=42)),
    ]
)
dt_reg_base.fit(X_reg_train, y_reg_train)

pred = dt_reg_base.predict(X_reg_test)
mae_dt = mean_absolute_error(y_reg_test, pred)
rmse_dt = np.sqrt(mean_squared_error(y_reg_test, pred))

print("DT baseline MAE:", mae_dt)
print("DT baseline RMSE:", rmse_dt)

DT baseline MAE: 4935.571725571725
DT baseline RMSE: 11453.163058050974


In [60]:
from sklearn.model_selection import GridSearchCV

# Search space for the "dt" pipeline step (5 * 4 * 3 = 60 candidate combinations).
param_grid = {
    "dt__max_depth": [5, 10, 15, 20, None],
    "dt__min_samples_leaf": [1, 5, 10, 20],
    "dt__min_samples_split": [2, 10, 20],
}

# Exhaustive 3-fold search over the whole pipeline. The scorer is negated MAE,
# so the sign is flipped back when the best score is printed.
grid_reg = GridSearchCV(
    dt_reg_base,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_reg_train, y_reg_train)

print("Best params:", grid_reg.best_params_)
print("Best CV MAE:", -grid_reg.best_score_)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
Best params: {'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2}
Best CV MAE: 7190.5923186340815


### Качество улучшенного дерева регрессии оценено на тестовой выборке и сопоставлено с бейзлайном

In [61]:
# Score the pipeline refit with the best grid-search parameters on the hold-out split.
best_dt_reg = grid_reg.best_estimator_
pred_best = best_dt_reg.predict(X_reg_test)

mae_dt_best, rmse_dt_best = (
    mean_absolute_error(y_reg_test, pred_best),
    np.sqrt(mean_squared_error(y_reg_test, pred_best)),
)

print(f"DT improved MAE:  {mae_dt_best:.4f}")
print(f"DT improved RMSE: {rmse_dt_best:.4f}")

DT improved MAE:  4935.5717
DT improved RMSE: 11453.1631


### Сводная таблица по результатам

In [62]:
# One row per regression model with its hold-out MAE and RMSE.
summary_lr3_reg = pd.DataFrame(
    [
        {"Model": "Decision Tree (baseline)", "MAE": mae_dt, "RMSE": rmse_dt},
        {"Model": "Decision Tree (improved)", "MAE": mae_dt_best, "RMSE": rmse_dt_best},
    ]
)

summary_lr3_reg

Unnamed: 0,Model,MAE,RMSE
0,Decision Tree (baseline),4935.571726,11453.163058
1,Decision Tree (improved),4935.571726,11453.163058


In [63]:
# Combined results table. The metric columns hold different quantities per task:
# classification rows -> (macro-F1, accuracy); regression rows -> (MAE, RMSE).
summary_lr3 = pd.DataFrame(
    [
        ("Classification", "DT baseline", f1m, acc),
        ("Classification", "DT improved", f1m_best, acc_best),
        ("Regression", "DT baseline", mae_dt, rmse_dt),
        ("Regression", "DT improved", mae_dt_best, rmse_dt_best),
    ],
    columns=["Task", "Model", "Metric 1", "Metric 2"],
)

summary_lr3

Unnamed: 0,Task,Model,Metric 1,Metric 2
0,Classification,DT baseline,0.793726,0.856444
1,Classification,DT improved,0.793726,0.856444
2,Regression,DT baseline,4935.571726,11453.163058
3,Regression,DT improved,4935.571726,11453.163058


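Для одиночных деревьев решений подбор гиперпараметров не привёл к улучшению качества по сравнению с бейзлайном: в обеих задачах поиск по сетке выбрал параметры по умолчанию (без ограничения глубины, `min_samples_leaf=1`), поэтому «улучшенные» модели совпадают с бейзлайном и их метрики на тесте идентичны. Это указывает на ограниченные возможности одного дерева решений в задачах с высокой вариативностью данных. Полученные результаты подтверждают необходимость использования ансамблевых методов для дальнейшего повышения качества моделей.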