In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

Регрессор

In [3]:
class GradientBoostingRegressorCustom:
    def __init__(self, n_estimators=10, max_depth=3, learning_rate=0.1, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        self.F0 = np.mean(y)
        F = np.full(y.shape, self.F0)

        for _ in range(self.n_estimators):
            residuals = y - F
            tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X, residuals)
            F += self.learning_rate * tree.predict(X)
            self.trees.append(tree)
        return self

    def predict(self, X):
        F = np.full(X.shape[0], self.F0)
        for tree in self.trees:
            F += self.learning_rate * tree.predict(X)
        return F

Классификатор

In [4]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class GradientBoostingClassifierCustom:
    def __init__(self, n_estimators=100, max_depth=3, learning_rate=0.1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.trees = []

    def fit(self, X, y):
        self.lb = LabelBinarizer()
        y_bin = self.lb.fit_transform(y).ravel()
        self.classes_ = self.lb.classes_

        p = np.clip(np.mean(y_bin), 1e-5, 1 - 1e-5)
        F = np.full(y_bin.shape, np.log(p / (1 - p)))
        self.F0 = F[0]

        for _ in range(self.n_estimators):
            p = sigmoid(F)
            g = p - y_bin
            h = p * (1 - p) + 1e-6

            pseudo_residual = - g / h
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, pseudo_residual, sample_weight=h)
            F += self.learning_rate * tree.predict(X)
            self.trees.append(tree)

        return self

    def predict(self, X):
        F = np.full(X.shape[0], self.F0)
        for tree in self.trees:
            F += self.learning_rate * tree.predict(X)
        p = sigmoid(F)
        return np.where(p >= 0.5, self.classes_[1], self.classes_[0])

Датасет классификации

In [52]:
df_fraud = pd.read_csv('creditcard.csv')

Xc = df_fraud.drop('Class', axis=1)
yc = df_fraud['Class']

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.2, random_state=42, stratify=yc
)

Запуск классификации sklearn

In [53]:
sk_gb_c = GradientBoostingClassifier(n_estimators=10, max_depth=3, learning_rate=0.1, random_state=42)
sk_gb_c.fit(Xc_train, yc_train)
yc_pred_sk = sk_gb_c.predict(Xc_test)

Запуск собственной классификации

In [None]:
my_gb_c = GradientBoostingClassifierCustom(n_estimators=10, max_depth=3, learning_rate=0.1)
my_gb_c.fit(Xc_train, yc_train)
yc_pred_my = my_gb_c.predict(Xc_test)

Метрики

In [55]:

print(f"Sklearn accuracy={accuracy_score(yc_test, yc_pred_sk):.4f}, f1={f1_score(yc_test, yc_pred_sk, average='weighted'):.4f}, recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(yc_test, yc_pred_my):.4f}, f1={f1_score(yc_test, yc_pred_my, average='weighted'):.4f}, recall={recall_score(yc_test, yc_pred_my, pos_label=1)}")

Sklearn accuracy=0.9992, f1=0.9992, recall=0.7448979591836735
Custom  accuracy=0.9987, f1=0.9988, recall=0.7755102040816326


Регрессия

Загрузка датасета

In [18]:
df_crop = pd.read_csv('crop_yield.csv')

# df_crop = df_crop.sample(n=10000, random_state=42)

Xr = df_crop.drop('Yield_tons_per_hectare', axis=1)
yr = df_crop['Yield_tons_per_hectare']

Xr = pd.get_dummies(Xr, drop_first=True)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, test_size=0.2, random_state=42
)


Запуск регресии sklearn

In [19]:
sk_gb_r = GradientBoostingRegressor(n_estimators=10, max_depth=3, learning_rate=0.05, random_state=42)
sk_gb_r.fit(Xr_train, yr_train)
yr_pred_sk = sk_gb_r.predict(Xr_test)

Запуск собственной регресии

In [20]:
my_gb_r = GradientBoostingRegressorCustom(n_estimators=10, max_depth=3, learning_rate=0.05)
my_gb_r.fit(Xr_train, yr_train)
yr_pred_my = my_gb_r.predict(Xr_test)

Метрики

In [21]:

print(f"SkLearn RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_sk)):.4f}, R2={r2_score(yr_test, yr_pred_sk):.4f}")
print(f"Custom  RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_my)):.4f}, R2={r2_score(yr_test, yr_pred_my):.4f}")

SkLearn RMSE=1.1881, R2=0.5104
Custom  RMSE=1.1881, R2=0.5104


Теперь применим улучшения полученные на этапе анализа данных

Классификация

Скейлинг

In [60]:
scaler = StandardScaler()
Xc_train_scaled = scaler.fit_transform(Xc_train)
Xc_test_scaled = scaler.transform(Xc_test)

sklearn

In [None]:
sk_gb_c = GradientBoostingClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

grid_search = GridSearchCV(sk_gb_c, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(Xc_train_scaled, yc_train)

best_model = grid_search.best_estimator_
best_model.fit(Xc_train_scaled, yc_train)
yc_pred_sk = best_model.predict(Xc_test_scaled)

Метрики

In [None]:
print(f"accuracy={accuracy_score(yc_test, yc_pred_sk):.4f}")
print(f"f1={f1_score(yc_test, yc_pred_sk, average='weighted'):.4f}")
print(f"recall={recall_score(yc_test, yc_pred_sk, pos_label=1)}")


Собственная реализация

In [None]:
my_gb_c = GradientBoostingClassifierCustom(n_estimators=10, max_depth=3, learning_rate=0.1)
my_gb_c.fit(Xc_train_scaled, yc_train)
yc_pred_my = my_gb_c.predict(Xc_test)

Метрики

In [64]:
print(f"accuracy={accuracy_score(yc_test, yc_pred_my):.4f}")
print(f"f1={f1_score(yc_test, yc_pred_my, average='weighted'):.4f}")
print(f"recall={recall_score(yc_test, yc_pred_my, pos_label=1)}")

accuracy=0.9984
f1=0.9985
recall=0.6326530612244898


Регрессия

Скейлинг

In [6]:
scaler = StandardScaler()
Xr_train_scaled = scaler.fit_transform(Xr_train)
Xr_test_scaled = scaler.transform(Xr_test)

sklearn

In [None]:
sk_gb_r = GradientBoostingRegressor(random_state=42)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.1]
}

grid_search = GridSearchCV(sk_gb_r, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xr_train_scaled, yr_train)

best_model = grid_search.best_estimator_

best_model.fit(Xr_train_scaled, yr_train)

Метрики

In [22]:
yr_pred_sk = best_model.predict(Xr_test_scaled)

print(f"RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_sk)):.4f}")
print(f"R2={r2_score(yr_test, yr_pred_sk):.4f}")

RMSE=0.5024
R2=0.9125


Собственная реализация

In [24]:
my_gb_r = GradientBoostingRegressorCustom(n_estimators=10, max_depth=3, learning_rate=0.1)
my_gb_r.fit(Xr_train_scaled, yr_train)


<__main__.GradientBoostingRegressorCustom at 0x1da27f68710>

Метрики

In [25]:
yr_pred_my = my_gb_r.predict(Xr_test_scaled)

print(f"RMSE={np.sqrt(mean_squared_error(yr_test, yr_pred_my)):.4f}")
print(f"R2={r2_score(yr_test, yr_pred_my):.4f}")

RMSE=0.8781
R2=0.7326
