# Лабораторная работа №4 (Проведение исследований со случайным лесом)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from typing import Optional
from scipy.stats import randint

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [2]:
df_class = pd.read_csv('classification.csv')
df_reg = pd.read_csv('regression.csv')

  df_reg = pd.read_csv('regression.csv')


Датасет для классификации (Air Quality & Health Impact Analysis):
* **RecordID:** Уникальный идентификатор, присваиваемый каждой записи
* **AQI:** Индекс качества воздуха, показывающий, насколько загрязнен воздух в настоящее время или насколько загрязненным он, по прогнозам, станет в будущем
* **PM10**:  Концентрация твердых частиц диаметром менее 10 микрометров (μg/m³)
* **PM2_5**: Концентрация твердых частиц диаметром менее 2,5 микрометров (μg/m³)
* **NO2**: Концентрация диоксида азота (ppb)
* **SO2**: Концентрация диоксида серы (ppb)
* **O3**: Концентрация озона (ppb)
* **Temperature**: Температура в градусах Цельсия (°C)
* **Humidity**: Процент влажности (%)
* **WindSpeed**: Скорость ветра (m/s)
* **RespiratoryCases**: Количество зарегистрированных респираторных случаев.
* **CardiovascularCases**: Количество зарегистрированных сердечно-сосудистых случаев
* **HospitalAdmissions**: Количество зарегистрированных случаев госпитализации
* **Target Variable: HealthImpactClass**

Датасет для регрессии (Electrity Prices):
* **DateTime**: дата и время
* **Holiday**: название праздника, если день нерабочий день
* **HolidayFlag**: целое число, 1, если день нерабочий день, ноль в противном случае
* **DayOfWeek**: целое число (0-6), 0 понедельник, день недели
* **WeekOfYear**: текущая неделя в течение года, начинающегося с этой даты
* **Day integer**: день
* **Month integer**: месяц
* **Year integer**: год
* **PeriodOfDay**: период суток
* **ForecastWindProduction**: прогнозируемая мощность ветра на этот период
* **SystemLoadEA**: национальный прогноз нагрузки на этот период
* **SMPEA**: прогноз цен на данный период
* **ORKTemperature**: фактическая температура
* **ORKWindspeed**: фактическая скорость ветра
* **CO2Intensity**: фактическая интенсивность выбросов CO2 в произведенной электроэнергии (г/кВт*ч)
* **ActualWindProduction**: фактическая нагрузка на национальную систему за этот период
* **SystemLoadEP2**: фактическая цена за данный период времени, прогнозируемое значение.
* **Target Variable: SystemLoadEP2**

## Создание бейзлайна

In [3]:
def simple_classification(df):
  X_class = df.drop(['HealthImpactClass','HealthImpactScore','RecordID'], axis=1)
  y_class = df['HealthImpactClass']

  X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.3, random_state=42)

  return X_train_class, X_test_class, y_train_class, y_test_class

In [4]:
X_train, X_test, y_train, y_test = simple_classification(df_class)

model = RandomForestClassifier()

model.fit(X_train, y_train)

predictions = model.predict(X_test)

accuracy_class = accuracy_score(y_test, predictions)
print(f"Точность RandomForestClassifier: {accuracy_class:.4f}")

Точность RandomForestClassifier: 0.8997


### Регрессия

In [5]:
def simple_regression(df):
  X_reg = df.drop(['SMPEP2', 'DateTime','Holiday'], axis=1, errors='ignore')
  y_reg = df['SMPEP2']

  for col in X_reg.columns:
      X_reg[col] = pd.to_numeric(X_reg[col], errors='coerce')

  y_reg = pd.to_numeric(y_reg, errors='coerce')

  X_reg = X_reg.fillna(0)
  y_reg = y_reg.fillna(0)

  X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

  return X_train_reg, X_test_reg, y_train_reg, y_test_reg

In [6]:
X_train, X_test, y_train, y_test = simple_regression(df_reg)

model = RandomForestRegressor()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse_reg = mean_squared_error(y_test, predictions)
print(f"Среднеквадратичная ошибка RandomForestRegressor: {mse_reg:.4f}")

Среднеквадратичная ошибка RandomForestRegressor: 470.4373


## Улучшение бейзлайна

In [7]:
def upgraded_classification(df):
  df = df.drop(columns=["RecordID"], errors="ignore")

  low_PM2_5 = df["PM2_5"].quantile(0.25)
  strong_PM2_5 = df["PM2_5"].quantile(0.75)
  df["LowPM2_5"] = (df["PM2_5"] <= low_PM2_5).astype(int)
  df["StrongPM2_5"] = (df["PM2_5"] >= strong_PM2_5).astype(int)

  low_PM10 = df["PM10"].quantile(0.25)
  strong_PM10 = df["PM10"].quantile(0.75)
  df["LowPM10"] = (df["PM10"] <= low_PM10).astype(int)
  df["StrongPM10"] = (df["PM10"] >= strong_PM10).astype(int)


  low_AQI = df["AQI"].quantile(0.25)
  strong_AQI = df["AQI"].quantile(0.75)
  df["LowAQI"] = (df["AQI"] <= low_AQI).astype(int)
  df["StrongAQI"] = (df["AQI"] >= strong_AQI).astype(int)

  df = df.drop(columns=["RespiratoryCases", "CardiovascularCases","WindSpeed","Temperature","Humidity",'PM2_5'], axis=1)

  df = df[[
      "HealthImpactClass",
      "LowAQI",
      "LowPM2_5",
      "LowPM10",
      "StrongPM2_5",
      "PM10",
      "O3",
      "StrongAQI",
      "AQI",
      "HealthImpactScore"
  ]]

  X_class = df.drop(['HealthImpactClass','HealthImpactScore'], axis=1)
  y_class = df['HealthImpactClass']

  X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.3, random_state=42)

  return X_train_class, X_test_class, y_train_class, y_test_class

In [8]:
X_train, X_test, y_train, y_test = upgraded_classification(df_class)

In [9]:
rf = RandomForestClassifier(random_state=42)

param_dist = {
    "n_estimators": randint(50, 300),
    "max_depth": randint(3, 30),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 20),
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy", "log_loss"]
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Лучшие параметры:", random_search.best_params_)
print("Лучший score:", random_search.best_score_)

print("Точность на тесте:", random_search.score(X_test, y_test))

Лучшие параметры: {'bootstrap': True, 'criterion': 'log_loss', 'max_depth': 19, 'max_features': None, 'min_samples_leaf': 6, 'min_samples_split': 6, 'n_estimators': 165}
Лучший score: 0.88960080509896
Точность на тесте: 0.8876146788990825


### Регрессия

In [10]:
def upgraded_regression(df):
  df['ForecastWindProduction'] = pd.to_numeric(df['ForecastWindProduction'], errors='coerce')
  df['SystemLoadEA'] = pd.to_numeric(df['SystemLoadEA'], errors='coerce')
  df['SMPEA'] = pd.to_numeric(df['SMPEA'], errors='coerce')
  df['ORKTemperature'] = pd.to_numeric(df['ORKTemperature'], errors='coerce')
  df['ORKWindspeed'] = pd.to_numeric(df['ORKWindspeed'], errors='coerce')
  df['CO2Intensity'] = pd.to_numeric(df['CO2Intensity'], errors='coerce')
  df['ActualWindProduction'] = pd.to_numeric(df['ActualWindProduction'], errors='coerce')
  df['SystemLoadEP2'] = pd.to_numeric(df['SystemLoadEP2'], errors='coerce')
  df['SMPEP2'] = pd.to_numeric(df['SMPEP2'], errors='coerce')

  df = df.drop(["DateTime","Holiday"],axis = 1)

  df = df.dropna()

  df = df[df['SMPEP2'] > 0]
  df = df[df['SMPEP2'] != 1000]

  X_reg = df.drop(['SMPEP2'], axis=1, errors='ignore')
  y_reg = df['SMPEP2']

  X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

  return X_train_reg, X_test_reg, y_train_reg, y_test_reg

In [11]:
X_train, X_test, y_train, y_test = upgraded_regression(df_reg)

In [12]:
rf = RandomForestRegressor(random_state=42)

param_dist = {
    "n_estimators": randint(100, 500),
    "max_depth": randint(3, 30),
    "min_samples_split": randint(2, 20)
}


random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Лучшие параметры:", random_search.best_params_)
print("Лучший MSE:", -random_search.best_score_)


y_pred = random_search.predict(X_test)
print("MSE на тесте:", mean_squared_error(y_test, y_pred))

Лучшие параметры: {'max_depth': 26, 'min_samples_split': 4, 'n_estimators': 249}
Лучший MSE: 489.6096713909945
MSE на тесте: 433.5225540991735


Стандартный бейзлайн:

* Точность RandomForestClassifier: 0.8962
* Среднеквадратичная ошибка (MSE) RandomForestRegressor: 476.6302

Улучшенный байзлайн:

* Точность RandomForestClassifier: 0.8876
* Среднеквадратичная ошибка (MSE) RandomForestRegressor: 433.52

## Имплементация алгоритма

In [16]:
# Код дерева из Лабы №3
class My_Decision_Tree_Classifier:
    def __init__(self,
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None


    def _gini(self, y):
        m = len(y)
        if m == 0:
            return 0
        _, counts = np.unique(y, return_counts=True)
        p = counts / m
        return 1 - np.sum(p * p)

    def _best_split(self, X, y):
        m, n = X.shape
        if m < self.min_samples_split:
            return None, None

        best_gini = 1e10
        best_feat = None
        best_thr = None

        for feature in range(n):
            sorted_idx = X[:, feature].argsort()
            X_f = X[sorted_idx, feature]
            y_f = y[sorted_idx]

            unique_vals = np.unique(X_f)
            if len(unique_vals) == 1:
                continue


            left_counts = {}
            right_counts = dict(zip(*np.unique(y_f, return_counts=True)))
            left_n = 0
            right_n = len(y_f)

            for i in range(1, m):
                cls = y_f[i - 1]


                left_counts[cls] = left_counts.get(cls, 0) + 1
                right_counts[cls] -= 1
                left_n += 1
                right_n -= 1

                if X_f[i] == X_f[i - 1]:
                    continue
                if left_n < self.min_samples_leaf or right_n < self.min_samples_leaf:
                    continue


                left_gini = 1.0
                if left_n > 0:
                    p = np.fromiter(left_counts.values(), dtype=float) / left_n
                    left_gini = 1 - np.sum(p * p)

                right_gini = 1.0
                if right_n > 0:
                    p = np.fromiter(right_counts.values(), dtype=float) / right_n
                    right_gini = 1 - np.sum(p * p)

                gini = (left_n * left_gini + right_n * right_gini) / m

                if gini < best_gini:
                    best_gini = gini
                    best_feat = feature
                    best_thr = (X_f[i] + X_f[i - 1]) / 2

        return best_feat, best_thr

    def _build_tree(self, X, y, depth=0):
        num_samples = len(y)
        num_labels = len(np.unique(y))

        if (self.max_depth is not None and depth >= self.max_depth) or \
           num_labels == 1 or \
           num_samples < self.min_samples_split:
            values, counts = np.unique(y, return_counts=True)
            return {"leaf": True, "class": values[np.argmax(counts)]}

        feat, thr = self._best_split(X, y)
        if feat is None:
            values, counts = np.unique(y, return_counts=True)
            return {"leaf": True, "class": values[np.argmax(counts)]}

        left_idx = X[:, feat] <= thr
        right_idx = X[:, feat] > thr

        return {
            "leaf": False,
            "feature": feat,
            "threshold": thr,
            "left": self._build_tree(X[left_idx], y[left_idx], depth + 1),
            "right": self._build_tree(X[right_idx], y[right_idx], depth + 1)
        }

    def fit(self, X, y):
        X, y = np.array(X), np.array(y)
        self.tree = self._build_tree(X, y)
        return self

    def _predict_one(self, x, node):
        while not node["leaf"]:
            if x[node["feature"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node["class"]

    def predict(self, X):
        X = np.array(X)
        return np.array([self._predict_one(x, self.tree) for x in X])


class My_Random_Forest_Classifier:
    def __init__(self,
                 n_estimators=100,
                 max_features="sqrt",
                 max_depth=None,
                 min_samples_split=2,
                 bootstrap=True,
                 random_state=None):

        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.trees = []
        self.feature_indices = []

        if self.random_state is not None:
            np.random.seed(self.random_state)

    def _slice_rows(self, X, idx):
        if isinstance(X, np.ndarray):
            return X[idx]
        else:
            return X.iloc[idx]

    def _slice_cols(self, X, idx):
        if isinstance(X, np.ndarray):
            return X[:, idx]
        else:
            return X.iloc[:, idx]

    def _sample_features(self, n_features):
        if self.max_features == "sqrt":
            k = int(np.sqrt(n_features))
        elif self.max_features == "log2":
            k = int(np.log2(n_features))
        elif isinstance(self.max_features, int):
            k = self.max_features
        else:
            k = n_features

        return np.random.choice(n_features, k, replace=False)

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idx = np.random.choice(n_samples, n_samples, replace=True)
        return self._slice_rows(X, idx), self._slice_rows(y, idx)

    def fit(self, X, y):
        self.trees = []
        self.feature_indices = []

        n_samples, n_features = X.shape

        for _ in range(self.n_estimators):
            if self.bootstrap:
                X_sample, y_sample = self._bootstrap_sample(X, y)
            else:
                X_sample, y_sample = X, y

            feat_idx = self._sample_features(n_features)
            self.feature_indices.append(feat_idx)

            tree = My_Decision_Tree_Classifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )
            tree.fit(self._slice_cols(X_sample, feat_idx), y_sample)
            self.trees.append(tree)

    def predict(self, X):
        predictions = []

        for tree, feat_idx in zip(self.trees, self.feature_indices):
            preds = tree.predict(self._slice_cols(X, feat_idx))
            predictions.append(preds)

        predictions = np.array(predictions).T

        final_preds = []
        for row in predictions:
            final_preds.append(Counter(row).most_common(1)[0][0])

        return np.array(final_preds)

In [19]:
# Код дерева из Лабы №3
class My_Decision_Tree_Regressor:
    def __init__(self,
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None


    def _variance(self, y):
        return np.var(y) if len(y) > 0 else 0

    def _best_split(self, X, y):
        m, n = X.shape
        if m < self.min_samples_split:
            return None, None

        best_feat, best_thr = None, None
        best_var = 1e18

        total_sum = y.sum()
        total_sq_sum = np.dot(y, y)

        for feature in range(n):
            sorted_idx = np.argsort(X[:, feature])
            X_f = X[sorted_idx, feature]
            y_f = y[sorted_idx]

            left_sum = 0.0
            left_sq_sum = 0.0
            left_n = 0


            right_sum = total_sum
            right_sq_sum = total_sq_sum
            right_n = m

            for i in range(1, m):
                yi = y_f[i - 1]


                left_sum += yi
                left_sq_sum += yi * yi
                left_n += 1

                right_sum -= yi
                right_sq_sum -= yi * yi
                right_n -= 1


                if X_f[i] == X_f[i - 1]:
                    continue

                if left_n < self.min_samples_leaf or right_n < self.min_samples_leaf:
                    continue


                left_var = left_sq_sum / left_n - (left_sum / left_n)**2

                right_var = right_sq_sum / right_n - (right_sum / right_n)**2


                weighted_var = left_n * left_var + right_n * right_var

                if weighted_var < best_var:
                    best_var = weighted_var
                    best_feat = feature
                    best_thr = (X_f[i] + X_f[i - 1]) / 2

        return best_feat, best_thr

    def _build_tree(self, X, y, depth=0):
        m = len(y)

        if (self.max_depth is not None and depth >= self.max_depth) or \
           m < self.min_samples_split or \
           np.var(y) < 1e-10:
            return {"leaf": True, "value": np.mean(y)}

        feat, thr = self._best_split(X, y)

        if feat is None:
            return {"leaf": True, "value": np.mean(y)}

        left_idx = X[:, feat] <= thr
        right_idx = ~left_idx

        return {
            "leaf": False,
            "feature": feat,
            "threshold": thr,
            "left": self._build_tree(X[left_idx], y[left_idx], depth + 1),
            "right": self._build_tree(X[right_idx], y[right_idx], depth + 1)
        }


    def fit(self, X, y):
        X, y = np.array(X), np.array(y, dtype=float)
        self.tree = self._build_tree(X, y)
        return self

    def _predict_one(self, x, node):
        while not node["leaf"]:
            if x[node["feature"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node["value"]

    def predict(self, X):
        X = np.array(X)
        return np.array([self._predict_one(x, self.tree) for x in X])


class My_Random_Forest_Regressor:
    def __init__(self,
                 n_estimators=100,
                 max_features="sqrt",
                 max_depth=None,
                 min_samples_split=2,
                 bootstrap=True,
                 random_state=None):

        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.bootstrap = bootstrap
        self.random_state = random_state

        self.trees = []
        self.feature_indices = []

        if random_state is not None:
            np.random.seed(random_state)

    def _slice_rows(self, X, idx):
        if isinstance(X, np.ndarray):
            return X[idx]
        else:
            return X.iloc[idx]

    def _slice_cols(self, X, idx):
        if isinstance(X, np.ndarray):
            return X[:, idx]
        else:
            return X.iloc[:, idx]

    def _sample_features(self, n_features):
        if self.max_features == "sqrt":
            k = int(np.sqrt(n_features))
        elif self.max_features == "log2":
            k = int(np.log2(n_features))
        elif isinstance(self.max_features, int):
            k = self.max_features
        else:
            k = n_features

        return np.random.choice(n_features, k, replace=False)

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idx = np.random.choice(n_samples, n_samples, replace=True)
        return self._slice_rows(X, idx), self._slice_rows(y, idx)

    def fit(self, X, y):
        n_samples, n_features = X.shape

        self.trees = []
        self.feature_indices = []

        for _ in range(self.n_estimators):
            if self.bootstrap:
                X_sample, y_sample = self._bootstrap_sample(X, y)
            else:
                X_sample, y_sample = X, y

            feat_idx = self._sample_features(n_features)
            self.feature_indices.append(feat_idx)

            tree = My_Decision_Tree_Regressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )

            tree.fit(self._slice_cols(X_sample, feat_idx), y_sample)

            self.trees.append(tree)

    def predict(self, X):
        preds = []

        for tree, feat_idx in zip(self.trees, self.feature_indices):
            p = tree.predict(self._slice_cols(X, feat_idx))
            preds.append(p)

        preds = np.array(preds)
        return np.mean(preds, axis=0)

In [17]:
X_train, X_test, y_train, y_test = simple_classification(df_class)

model = My_Random_Forest_Classifier()

model.fit(X_train, y_train)

predictions = model.predict(X_test)

accuracy_class = accuracy_score(y_test, predictions)
print(f"Точность My_Random_Forest_Classifier: {accuracy_class:.4f}")

Точность My_Random_Forest_Classifier: 0.8268


In [20]:
X_train, X_test, y_train, y_test = simple_regression(df_reg)

model = My_Random_Forest_Regressor()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse_reg = mean_squared_error(y_test, predictions)
print(f"Среднеквадратичная ошибка My_Random_Forest_Regressor: {mse_reg:.4f}")

Среднеквадратичная ошибка My_Random_Forest_Regressor: 677.0049


In [21]:
X_train, X_test, y_train, y_test = upgraded_classification(df_class)

In [22]:
def search_params(type_task, my_model,
                  X_train, X_test, y_train, y_test,
                  n_estimators=(250, 500),
                  max_depth=(15, 18),
                  min_samples_split=(3, 5)):

    best_score = -1e18 if type_task == 'clf' else 1e18
    best_params = None

    for n_est in n_estimators:
        for depth in max_depth:
            for splt in min_samples_split:

                model = my_model(
                    n_estimators=n_est,
                    max_depth=depth,
                    min_samples_split=splt,
                )

                model.fit(X_train, y_train)
                preds = model.predict(X_test)

                if type_task == 'clf':
                    score = accuracy_score(y_test, preds)
                    if score > best_score:
                        best_score = score
                        best_params = (n_est, depth, splt)

                else:
                    score = mean_squared_error(y_test, preds)
                    if score < best_score:
                        best_score = score
                        best_params = (n_est, depth, splt)

    return best_score, best_params


In [23]:
acc, best_params = search_params('clf',My_Random_Forest_Classifier,X_train,X_test,y_train,y_test)
print(f"Лучшая точность My_Random_Forest_Classifier: {acc:.4f}")
print(f"Best params: max_depth={best_params[0]}, min_sample_split={best_params[1]}, min_sample_leaf={best_params[2]}")

Лучшая точность My_Random_Forest_Classifier: 0.8268
Best params: max_depth=250, min_sample_split=15, min_sample_leaf=3


In [27]:
X_train, X_test, y_train, y_test = upgraded_regression(df_reg)

In [28]:
mse, best_params = search_params('reg',My_Random_Forest_Regressor,X_train,X_test,y_train,y_test)
print(f"Лучшая среднеквадратичная ошибка My_Random_Forest_Regressor: {mse:.4f}")
print(f"Best params: n_estimators={best_params[0]}, max_depth={best_params[1]}, min_samples_split={best_params[2]}")

Лучшая среднеквадратичная ошибка My_Random_Forest_Regressor: 671.4459
Best params: n_estimators=250, max_depth=18, min_samples_split=3


Стандартный бейзлайн:

Библиотечная реализация:
* Точность RandomForestClassifier: 0.8962
* Среднеквадратичная ошибка RandomForestRegressor: 476.6302

Имплементация алгоритма:
* Точность My_Random_Forest_Classifier: 0.8268
* Среднеквадратичная ошибка My_Random_Forest_Regressor: 677.0049

Улучшенный байзлайн:

Библиотечная реализация:
* Точность RandomForestClassifier: 0.8876
* Среднеквадратичная ошибка RandomForestRegressor: 433.52

Имплементация алгоритма:
* точность My_Random_Forest_Classifier: 0.8268
* среднеквадратичная ошибка My_Random_Forest_Regressor: 671.4459

Итог: Библиотечные модели демонстрируют высокое качество предсказаний, особенно для регрессии, и реагируют на улучшение бейзлайна снижением ошибки.

Собственная реализация алгоритма показала стабильную, но более низкую точность по сравнению с библиотечными моделями, с незначительным улучшением в регрессии.

Итоги подтверждают, что для практических задач библиотечные реализации Random Forest обеспечивают более высокую точность и надёжность, тогда как самостоятельная реализация требует дополнительной оптимизации для достижения сопоставимого качества.