# ЛР4 - random forest

In [18]:
import kagglehub
from pathlib import Path
import pandas as pd
import numpy as np

In [19]:
# Download the Forest Cover Type dataset via kagglehub and report where the
# files were placed.
path = kagglehub.dataset_download("uciml/forest-cover-type-dataset")
print("Path to dataset files:", path)

data_dir = Path(path)
# Sort the matches: glob order depends on the filesystem, so taking an element
# of an unsorted list by position is not reproducible across machines.
csv_files = sorted(data_dir.glob("**/*.csv"))
if not csv_files:
    # Fail with a clear message instead of an IndexError on csv_files[0].
    raise FileNotFoundError(f"No CSV files found under {data_dir}")

df_forest = pd.read_csv(csv_files[0])
df_forest.shape, df_forest.head()

Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/uciml/forest-cover-type-dataset/versions/1


((581012, 55),
    Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
 0       2596      51      3                               258   
 1       2590      56      2                               212   
 2       2804     139      9                               268   
 3       2785     155     18                               242   
 4       2595      45      2                               153   
 
    Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
 0                               0                              510   
 1                              -6                              390   
 2                              65                             3180   
 3                             118                             3090   
 4                              -1                              391   
 
    Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
 0            221             232            148   
 1            220             235            151   
 2 

In [9]:
from sklearn.model_selection import train_test_split

# Separate the class label from the feature matrix.
target_col = "Cover_Type"
y_clf = df_forest[target_col]
X_clf = df_forest.drop(columns=target_col)

# Optional speed-up: cap the working set at MAX_ROWS rows. The draw is
# stratified on the label and seeded, so it is repeatable.
MAX_ROWS = 120_000
if X_clf.shape[0] > MAX_ROWS:
    X_clf, y_clf = train_test_split(
        X_clf,
        y_clf,
        train_size=MAX_ROWS,
        random_state=42,
        stratify=y_clf,
    )[::2]  # keep only the sampled ("train") halves: features, then labels

# Stratified 80/20 hold-out split used by the classification cells below.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_clf,
    y_clf,
    stratify=y_clf,
    test_size=0.2,
    random_state=42,
)

### Baseline RandomForestClassifier
Бейзлайн ансамблевой модели Random Forest, построенный без подбора гиперпараметров.

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Untuned reference forest: 200 trees, fixed seed, all cores; every other
# hyperparameter is left at the library default.
rf_clf_base = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)

# fit() returns the estimator itself, so training and inference are chained.
pred = rf_clf_base.fit(Xc_train, yc_train).predict(Xc_test)

# Hold-out metrics: overall accuracy and macro-averaged F1.
acc = accuracy_score(y_true=yc_test, y_pred=pred)
f1m = f1_score(y_true=yc_test, y_pred=pred, average="macro")

print(f"RF baseline Accuracy: {acc:.4f}\nRF baseline Macro-F1: {f1m:.4f}")
print("\nClassification report:\n", classification_report(yc_test, pred))

RF baseline Accuracy: 0.9103
RF baseline Macro-F1: 0.8516

Classification report:
               precision    recall  f1-score   support

           1       0.93      0.89      0.91      8751
           2       0.90      0.95      0.93     11703
           3       0.88      0.93      0.90      1477
           4       0.87      0.82      0.85       113
           5       0.90      0.53      0.66       392
           6       0.88      0.75      0.81       717
           7       0.94      0.87      0.90       847

    accuracy                           0.91     24000
   macro avg       0.90      0.82      0.85     24000
weighted avg       0.91      0.91      0.91     24000



### Улучшение GridSearchCV

In [13]:
from sklearn.model_selection import GridSearchCV

# Search space: 2 * 3 * 2 * 2 = 24 hyperparameter combinations.
param_grid = dict(
    n_estimators=[200, 400],
    max_depth=[None, 20, 30],
    min_samples_leaf=[1, 5],
    max_features=["sqrt", "log2"],
)

# Exhaustive search with 3-fold cross-validation, ranked by macro-F1.
# NOTE(review): rf_clf_base was built with n_jobs=-1 and the search also uses
# n_jobs=-1 — possible CPU oversubscription; confirm this is acceptable.
grid_rf_clf = GridSearchCV(
    estimator=rf_clf_base,
    param_grid=param_grid,
    cv=3,
    scoring="f1_macro",
    verbose=1,
    n_jobs=-1,
).fit(Xc_train, yc_train)

print("Best params:", grid_rf_clf.best_params_)
print("Best CV Macro-F1:", grid_rf_clf.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits




Best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200}
Best CV Macro-F1: 0.8307428015132855


### Оценка улучшенной модели

In [15]:
# Take the best configuration found by the grid search and score it on the
# same hold-out split that was used for the baseline.
best_rf_clf = grid_rf_clf.best_estimator_
pred_best = best_rf_clf.predict(Xc_test)

f1m_best = f1_score(y_true=yc_test, y_pred=pred_best, average="macro")
acc_best = accuracy_score(y_true=yc_test, y_pred=pred_best)

print(f"RF improved Accuracy: {acc_best:.4f}\nRF improved Macro-F1: {f1m_best:.4f}")

RF improved Accuracy: 0.9103
RF improved Macro-F1: 0.8516


### CarPrice regression

In [21]:
# Download the car prices dataset via kagglehub and report where the files
# were placed.
path = kagglehub.dataset_download("sidharth178/car-prices-dataset")
print("Path to dataset files:", path)

data_dir = Path(path)
# Sort the matches: glob order depends on the filesystem, so the positional
# pick below would otherwise not be reproducible across machines.
csv_files = sorted(data_dir.glob("**/*.csv"))
if len(csv_files) < 2:
    # Fail with a clear message instead of an IndexError on csv_files[1].
    raise FileNotFoundError(
        f"Expected at least two CSV files under {data_dir}, found {len(csv_files)}"
    )

# NOTE(review): the second CSV in sorted order is presumed to be the labelled
# (training) file — confirm against the dataset's file listing.
df = pd.read_csv(csv_files[1])
# The regression cells need the target column; stop early if the wrong file
# was picked.
assert "Price" in df.columns, f"'Price' column missing in {csv_files[1].name}"
df.shape, df.head()

Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1


((19237, 18),
          ID  Price  Levy Manufacturer    Model  Prod. year   Category  \
 0  45654403  13328  1399        LEXUS   RX 450        2010       Jeep   
 1  44731507  16621  1018    CHEVROLET  Equinox        2011       Jeep   
 2  45774419   8467     -        HONDA      FIT        2006  Hatchback   
 3  45769185   3607   862         FORD   Escape        2011       Jeep   
 4  45809263  11726   446        HONDA      FIT        2014  Hatchback   
 
   Leather interior Fuel type Engine volume    Mileage  Cylinders  \
 0              Yes    Hybrid           3.5  186005 km        6.0   
 1               No    Petrol             3  192000 km        6.0   
 2               No    Petrol           1.3  200000 km        4.0   
 3              Yes    Hybrid           2.5  168966 km        4.0   
 4              Yes    Petrol           1.3   91901 km        4.0   
 
   Gear box type Drive wheels   Doors             Wheel   Color  Airbags  
 0     Automatic          4x4  04-May        Left

In [22]:
# Work on a copy so the raw frame loaded above stays untouched.
df_reg = df.copy()

# Target is "Price"; every remaining column is kept as a candidate feature.
# NOTE(review): the ID column therefore stays in X_reg and is treated as a
# numeric feature downstream — confirm this is intended.
y_reg = df_reg["Price"].copy()
X_reg = df_reg.drop(columns=["Price"]).copy()

# "Mileage" is text with a " km" suffix: strip the suffix and parse the rest;
# anything unparseable becomes NaN.
X_reg["Mileage"] = pd.to_numeric(
    X_reg["Mileage"].astype(str).str.replace(" km", "", regex=False),
    errors="coerce",
)

# "Engine volume": keep the first run of digits/dots; any other text in the
# cell is discarded.
X_reg["Engine volume"] = pd.to_numeric(
    X_reg["Engine volume"].astype(str).str.extract(r"([\d\.]+)", expand=False),
    errors="coerce",
)

# "Levy" uses "-" as a missing-value marker; convert it to NaN, then parse.
if "Levy" in X_reg.columns:
    X_reg["Levy"] = pd.to_numeric(
        X_reg["Levy"].astype(str).replace("-", np.nan),
        errors="coerce",
    )

X_reg.shape, y_reg.shape

((19237, 17), (19237,))

### Бейзлайн RandomForestRegressor

In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Seeded 80/20 hold-out split for the regression task.
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, random_state=42, test_size=0.2
)

# Column selectors keyed on dtype: numeric columns vs. everything else.
numeric_features = selector(dtype_include=np.number)
categorical_features = selector(dtype_exclude=np.number)

# Non-numeric columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric columns only get median imputation.
preprocess_reg = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("num", SimpleImputer(strategy="median"), numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ],
)

# Baseline model: preprocessing followed by an untuned 300-tree forest.
# The step names "prep" and "rf" are referenced by the grid-search parameters.
rf_reg_base = Pipeline(steps=[
    ("prep", preprocess_reg),
    ("rf", RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=42)),
])

# fit() returns the pipeline itself, so training and inference are chained.
pred = rf_reg_base.fit(X_reg_train, y_reg_train).predict(X_reg_test)

mae = mean_absolute_error(y_true=y_reg_test, y_pred=pred)
rmse = np.sqrt(mean_squared_error(y_true=y_reg_test, y_pred=pred))

print(f"RF baseline MAE:  {mae:.4f}\nRF baseline RMSE: {rmse:.4f}")

RF baseline MAE:  3904.2351
RF baseline RMSE: 8760.4173


### Улучшение GridSearchCV

In [29]:
# Search space over the pipeline's "rf" step: 2 * 3 * 2 * 2 = 24 combinations.
param_grid = dict(
    rf__n_estimators=[300, 600],
    rf__max_depth=[None, 20, 30],
    rf__min_samples_leaf=[1, 5],
    rf__max_features=["sqrt", 0.7],
)

# Exhaustive search with 3-fold cross-validation. The scorer is the negated
# MAE, so a higher score means a lower error.
grid_rf_reg = GridSearchCV(
    estimator=rf_reg_base,
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_absolute_error",
    verbose=1,
    n_jobs=-1,
).fit(X_reg_train, y_reg_train)

print("Best params:", grid_rf_reg.best_params_)
# Flip the sign back to report a plain (positive) MAE.
print("Best CV MAE:", -grid_rf_reg.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits




Best params: {'rf__max_depth': None, 'rf__max_features': 0.7, 'rf__min_samples_leaf': 1, 'rf__n_estimators': 300}
Best CV MAE: 7169.449509070951


In [36]:
# Score the best pipeline found by the grid search on the regression hold-out.
best_rf_reg = grid_rf_reg.best_estimator_
pred_best = best_rf_reg.predict(X_reg_test)

rmse_best = np.sqrt(mean_squared_error(y_true=y_reg_test, y_pred=pred_best))
mae_best = mean_absolute_error(y_true=y_reg_test, y_pred=pred_best)

print(f"RF improved MAE:  {mae_best:.4f}\nRF improved RMSE: {rmse_best:.4f}")

RF improved MAE:  4107.3356
RF improved RMSE: 15524.6320


### Сводная таблица

In [39]:
# One row per (task, model). The two metric columns are generic:
#   classification rows: Metric 1 = macro-F1, Metric 2 = accuracy
#   regression rows:     Metric 1 = MAE,      Metric 2 = RMSE
summary_lr4 = pd.DataFrame(
    [
        ("Classification", "RF baseline", f1m, acc),
        ("Classification", "RF improved", f1m_best, acc_best),
        ("Regression", "RF baseline", mae, rmse),
        ("Regression", "RF improved", mae_best, rmse_best),
    ],
    columns=["Task", "Model", "Metric 1", "Metric 2"],
)

summary_lr4

Unnamed: 0,Task,Model,Metric 1,Metric 2
0,Classification,RF baseline,0.851563,0.910292
1,Classification,RF improved,0.851563,0.910292
2,Regression,RF baseline,3904.235141,8760.417346
3,Regression,RF improved,4107.335552,15524.632006


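Ансамблевый метод Random Forest показал существенное улучшение качества по сравнению с одиночным деревом решений (результаты одиночного дерева в этом ноутбуке не приведены). При этом подбор гиперпараметров не привёл к дополнительному улучшению качества на тестовой выборке: в задаче классификации лучшая по кросс-валидации комбинация совпала с параметрами бейзлайна, поэтому метрики идентичны, а в задаче регрессии подобранная модель показала на тесте худшие MAE и RMSE, чем бейзлайн. Это демонстрирует, что кросс-валидация не всегда гарантирует оптимальное обобщение, и подчёркивает важность сравнения моделей на независимой тестовой выборке.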