# Реализация алгоритма градиентного бустинга над решающими деревьями

In [5]:
# Импорт необходимых библиотек
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [6]:
# Создание класса градиентного бустинга с двумя функциями потерь (mse и mae)
class GradientBoostingRegressor:
    def __init__(
            self,
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            min_samples_split=2,
            loss="mse",
            verbose=False,
            subsample_size=0.5,
            replace=False,
    ):
        if n_estimators <= 0:
            raise ValueError("n_estimators должно быть положительным числом.")

        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.verbose = verbose
        self.base_pred_ = None
        self.trees_ = []
        self.loss = self._get_loss_function(loss)
        self.subsample_size = subsample_size
        self.replace = replace

    def _get_loss_function(self, loss):
        if loss == "mse":
            return self._mse
        elif loss == "mae":
            return self._mae
        elif callable(loss):
            return loss
        else:
            raise ValueError(f"Функция потерь '{loss}' не поддерживается.")

    def _mse(self, y_true, y_pred):
        """Среднеквадратичная функция потерь и градиент"""
        loss = np.mean(np.square(y_pred - y_true))
        grad = y_pred - y_true
        return loss, grad

    def _mae(self, y_true, y_pred):
        """Средняя абсолютная функция потерь и градиент."""
        loss = np.mean(np.abs(y_pred - y_true))
        grad = np.sign(y_pred - y_true)
        return loss, grad

    def _subsample(self, X, y):
        """Разбиение данных на подмножества"""
        n_samples = X.shape[0]
        sample_size = int(self.subsample_size * n_samples)
        indices = np.random.choice(n_samples, size=sample_size, replace=self.replace)
        return X[indices], y[indices]

    def fit(self, X, y):
        """
        Обучение модели на основе данных.

        Args:
            X: массив размера (n_samples, n_features)
            y: массив размера (n_samples,)

        Returns:
            GradientBoostingRegressor: Обученная модель.
        """
        self.base_pred_ = np.mean(y)
        y_pred = np.full_like(y, self.base_pred_)

        for _ in range(self.n_estimators):
            loss, grad = self.loss(y, y_pred)
            sub_X, sub_y = self._subsample(X, -grad)
            tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                         min_samples_split=self.min_samples_split)
            tree.fit(sub_X, sub_y)
            self.trees_.append(tree)
            y_pred = y_pred.astype(float) + self.learning_rate * tree.predict(X).astype(float)
            if self.verbose:
                print("Iteration:", _, "MSE loss:", loss)

    def predict(self, X):
        """Предсказание переменной таргета по новым данным.

        Args:
            X: массив размера (n_samples, n_features)

        Returns:
            y: массив размера (n_samples,)
            Предсказанное моделью значение.

        """
        predictions = np.full(X.shape[0], self.base_pred_)
        for tree in self.trees_:
            predictions = predictions.astype(float) + self.learning_rate * tree.predict(X).astype(float)
        return predictions

In [7]:
# Проверка работы класса GradientBoostingRegressor, в качестве переменной таргета колонка delay_days
df = pd.read_csv("Gradient boosting.csv")
df.head()

Unnamed: 0,age,income,dependents,has_property,has_car,credit_score,job_tenure,has_education,loan_amount,loan_period,delay_days
0,76,32181,3,0,1,814,28,1,142434,1770,0
1,69,52789,8,1,0,501,28,1,120887,1590,7
2,19,70535,1,0,1,325,26,1,188766,810,0
3,31,85271,1,0,1,525,29,1,406792,330,0
4,18,19974,2,0,1,618,34,1,155240,1560,43


In [8]:
X = df.drop("delay_days", axis=1)
y = df["delay_days"]
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_split=2, loss="mse",
                               verbose=True)
gb.fit(X.values, y.values)
gb.predict(X.values)

Iteration: 0 MSE loss: 409.149
Iteration: 1 MSE loss: 353.3357442753931
Iteration: 2 MSE loss: 307.10733521641896
Iteration: 3 MSE loss: 269.1502956450304
Iteration: 4 MSE loss: 235.45344600583047
Iteration: 5 MSE loss: 208.9611316640236
Iteration: 6 MSE loss: 186.90793301474463
Iteration: 7 MSE loss: 169.96178039798937
Iteration: 8 MSE loss: 149.1939358814003
Iteration: 9 MSE loss: 132.408750550455
Iteration: 10 MSE loss: 116.83191546169654
Iteration: 11 MSE loss: 104.83120127082117
Iteration: 12 MSE loss: 94.55201594317239
Iteration: 13 MSE loss: 84.81250801795605
Iteration: 14 MSE loss: 76.18338249582384
Iteration: 15 MSE loss: 68.94673626543113
Iteration: 16 MSE loss: 61.872375977291185
Iteration: 17 MSE loss: 56.10056870241931
Iteration: 18 MSE loss: 51.129471736190276
Iteration: 19 MSE loss: 46.58301233971029
Iteration: 20 MSE loss: 42.51632094405663
Iteration: 21 MSE loss: 39.382860285903355
Iteration: 22 MSE loss: 36.02459022060492
Iteration: 23 MSE loss: 33.17980560994534
Iter

array([ 1.98028571e+01,  7.74534573e+00,  6.59110976e-01,  2.01660990e-01,
        4.14109874e+01,  3.50410718e-01,  4.88730216e+00,  2.56589589e+00,
       -6.09347656e-01,  1.65500002e+01,  9.39623126e+00,  3.06602226e+01,
        3.60839558e+01,  5.93680997e+00, -4.67061420e-01,  1.13230299e+01,
       -8.08469063e-01,  3.25338519e+01,  1.67306456e+01,  1.93838867e+00,
        2.33178504e+01,  5.13339270e+00,  1.22815680e+01,  4.09314984e+00,
        9.93528845e-01, -6.54839446e-01,  3.61702953e+01,  4.37336222e+00,
        2.12074031e+01,  2.87684984e+01,  2.61270434e+00,  6.66168834e+00,
        1.98724559e+01,  2.25937761e+01,  1.18710837e+01,  2.60771185e+01,
        3.48476646e+00,  5.22875372e-01,  8.82982851e+00,  6.79158814e-01,
        3.65507512e+01, -8.86539897e-01,  5.76175940e+01,  7.69264176e-01,
        6.00230422e+00, -6.87126812e-01,  3.39298071e+00,  2.22408150e+00,
        8.24952108e+01,  2.37669205e+01,  1.71932106e+01,  1.26374186e+01,
        8.77746079e+00,  