# Лабораторная работа №5: Проведение исследований с градиентным бустингом

## 2. Создание бейзлайна и оценка качества

### Классификация

In [None]:
# Baseline classifier: gradient boosting with default hyperparameters.
# Features are every column except 'HeartDisease'; 'HeartDisease' is the target.
X_c = bl_cdata.drop('HeartDisease', axis=1)
y_c = bl_cdata['HeartDisease']  # previously missing: the split below needs y_c

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_c_train, X_c_test, y_c_train, y_c_test = train_test_split(X_c, y_c, test_size=0.2, random_state=42)

# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics of the test rows are used during fitting.
scaler_c = StandardScaler()
X_c_train_scaled = scaler_c.fit_transform(X_c_train)
X_c_test_scaled = scaler_c.transform(X_c_test)

model_gb_c = GradientBoostingClassifier(random_state=42)
model_gb_c.fit(X_c_train_scaled, y_c_train)

y_c_pred = model_gb_c.predict(X_c_test_scaled)

# Precision and recall are averaged over classes weighted by support;
# zero_division=0 scores a class without predictions as 0 instead of warning.
accuracy_c = accuracy_score(y_c_test, y_c_pred)
precision_c = precision_score(y_c_test, y_c_pred, average='weighted', zero_division=0)
recall_c = recall_score(y_c_test, y_c_pred, average='weighted', zero_division=0)

print(f"\nМетрики для классификации (Градиентный бустинг, bl_cdata):")
print(f"  Accuracy: {accuracy_c:.4f}")
print(f"  Precision: {precision_c:.4f}")
print(f"  Recall: {recall_c:.4f}")

In [None]:
  Accuracy: 0.8804
  Precision: 0.8820
  Recall: 0.8804

### Регрессия

In [None]:
# Baseline regressor: GradientBoostingRegressor with default hyperparameters.
# 'Depression_Yes' is the target column; every other column is a feature.
y_r = bl_rdata['Depression_Yes']
X_r = bl_rdata.drop('Depression_Yes', axis=1)

# 80/20 train/test split with a fixed seed.
X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
    X_r, y_r, test_size=0.2, random_state=42
)

# Scaling statistics come from the training split only.
scaler_r = StandardScaler()
X_r_train_scaled = scaler_r.fit_transform(X_r_train)
X_r_test_scaled = scaler_r.transform(X_r_test)

# fit() returns the estimator itself, so construction and training are chained.
model_gb_r = GradientBoostingRegressor(random_state=42).fit(X_r_train_scaled, y_r_train)
y_r_pred = model_gb_r.predict(X_r_test_scaled)

mae_r = mean_absolute_error(y_r_test, y_r_pred)
r2_r = r2_score(y_r_test, y_r_pred)

# One print call with a newline separator emits the same three lines.
print(
    f"\nМетрики для регрессии (Градиентный бустинг, bl_rdata):",
    f"  MAE: {mae_r:.4f}",
    f"  R2: {r2_r:.4f}",
    sep="\n",
)

In [None]:
  MAE: 0.2185
  R2: 0.7108

## 3. Улучшение бейзлайна

Подбор оптимальных гиперпараметров с помощью GridSearchCV должен улучшить качество модели по сравнению с параметрами по умолчанию: гиперпараметры существенно влияют на способность модели к обобщению и на точность предсказаний. Ниже для классификации перебирается сетка параметров случайного леса (RandomForestClassifier), а для регрессии — градиентного бустинга (GradientBoostingRegressor).

### Для классификации

In [None]:
# Hyperparameter search for the classification task.
# NOTE(review): the search below tunes a RandomForestClassifier, whereas the
# baseline cell trains a GradientBoostingClassifier - confirm that comparing
# the two model families is intended.
y_c = bl_cdata['HeartDisease']
X_c = bl_cdata.drop('HeartDisease', axis=1)

# Same 80/20 split and seed as the baseline cell.
X_c_train, X_c_test, y_c_train, y_c_test = train_test_split(
    X_c, y_c, test_size=0.2, random_state=42
)

scaler_c = StandardScaler()
X_c_train_scaled = scaler_c.fit_transform(X_c_train)
X_c_test_scaled = scaler_c.transform(X_c_test)

# 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validation scored by accuracy; n_jobs=-1 runs fits in parallel.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_c_train_scaled, y_c_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Best score: 0.8773558603769377

Обучаем модели с лучшими параметрами

In [None]:
# GridSearchCV was created with the default refit=True, so after fit() it has
# already retrained the winning configuration on the whole training split.
# Reusing that estimator avoids a second, identical training run and keeps the
# evaluated model tied to whatever estimator the search actually used.
best_model = grid_search.best_estimator_

y_c_pred = best_model.predict(X_c_test_scaled)

# Weighted averaging accounts for class support; zero_division=0 scores a
# class that received no predictions as 0 instead of emitting a warning.
accuracy_c = accuracy_score(y_c_test, y_c_pred)
precision_c = precision_score(y_c_test, y_c_pred, average='weighted', zero_division=0)
recall_c = recall_score(y_c_test, y_c_pred, average='weighted', zero_division=0)

print(f"\nМетрики для классификации (Улучшенный бейзлайн - случайный лес):")
print(f"  Accuracy: {accuracy_c:.4f}")
print(f"  Precision: {precision_c:.4f}")
print(f"  Recall: {recall_c:.4f}")

In [None]:
  Accuracy: 0.8859
  Precision: 0.8857
  Recall: 0.8859

### Для регрессии

In [None]:
# Hyperparameter search for the regression task (gradient boosting).
y_r = bl_rdata['Depression_Yes']
X_r = bl_rdata.drop('Depression_Yes', axis=1)

# Same 80/20 split and seed as the baseline cell.
X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
    X_r, y_r, test_size=0.2, random_state=42
)

scaler_r = StandardScaler()
X_r_train_scaled = scaler_r.fit_transform(X_r_train)
X_r_test_scaled = scaler_r.transform(X_r_test)

# 3 ** 5 = 243 candidate combinations.
param_grid_r = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validation; the scorer is the negated MAE, so larger is better.
grid_search_r = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid_r,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
)
grid_search_r.fit(X_r_train_scaled, y_r_train)

print("Best hyperparameters (regr):", grid_search_r.best_params_)
# Flip the sign back so the printed value is a plain (positive) MAE.
print("Best score (regr):", -grid_search_r.best_score_)

Обучаем модели с лучшими параметрами

In [None]:
  MAE: 0.1411
  R2: 0.7122

Улучшения видны как для классификации, так и для регрессии

## 4. Имплементация алгоритма машинного обучения

Напишем собственную реализацию градиентного бустинга для классификации и регрессии

In [None]:
class SimpleGradientBoostingClassifier:
    """Minimal gradient-boosting classifier built from regression trees.

    Numeric class labels are treated as a regression target. The model starts
    from the mean label, and every boosting round fits a regression tree to
    the current residual and adds its shrunken prediction to the score.
    ``predict`` maps the final score to the nearest label seen during ``fit``.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds (one tree per round).
    learning_rate : float, default=0.1
        Shrinkage factor applied to each tree's contribution.
    max_depth : int, default=3
        Maximum depth of every tree.
    min_samples_split : int, default=2
        Forwarded to ``DecisionTreeRegressor``.
    min_samples_leaf : int, default=1
        Forwarded to ``DecisionTreeRegressor``.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3,
                 min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.trees = []
        # Constant starting score; replaced by the mean label in fit().
        self.init_ = 0.0
        # Sorted unique training labels; None until fit() has been called.
        self.classes_ = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and numeric labels ``y``.

        Returns ``self`` so that calls can be chained.
        """
        labels = np.asarray(y)
        self.classes_ = np.unique(labels)
        target = labels.astype(float)

        # Starting from the mean (instead of 0) keeps the score unbiased even
        # when learning_rate * n_estimators is too small to reach the target.
        self.init_ = float(target.mean())
        residual = target - self.init_

        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X, residual)
            self.trees.append(tree)
            # Remove the part of the residual this tree now explains.
            residual = residual - self.learning_rate * tree.predict(X)
        return self

    def _raw_score(self, X):
        """Return the boosted score: initial constant plus shrunken trees."""
        score = np.full(X.shape[0], self.init_, dtype=float)
        for tree in self.trees:
            score += self.learning_rate * tree.predict(X)
        return score

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        The score is snapped to the closest training label, so the result can
        never be a label absent from the training data (plain rounding could
        yield e.g. 2 or -1 for a 0/1 problem). Ties go to the smaller label.
        Before ``fit`` no labels are known and the rounded score is returned.
        """
        score = self._raw_score(X)
        if self.classes_ is None:
            return np.round(score)
        nearest = np.abs(score[:, None] - self.classes_[None, :]).argmin(axis=1)
        return self.classes_[nearest]


class SimpleGradientBoostingRegressor:
    """Minimal gradient-boosting regressor for squared-error loss.

    The model starts from the mean of the training target. Every boosting
    round fits a regression tree to the current residual and adds its
    prediction, scaled by ``learning_rate``, to the running estimate.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds (one tree per round).
    learning_rate : float, default=0.1
        Shrinkage factor applied to each tree's contribution.
    max_depth : int, default=3
        Maximum depth of every tree.
    min_samples_split : int, default=2
        Forwarded to ``DecisionTreeRegressor``.
    min_samples_leaf : int, default=1
        Forwarded to ``DecisionTreeRegressor``.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3,
                 min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.trees = []
        # Constant starting prediction; replaced by the target mean in fit().
        self.init_ = 0.0

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and numeric target ``y``.

        Returns ``self`` so that calls can be chained.
        """
        target = np.asarray(y, dtype=float)

        # Starting from the mean (instead of 0) keeps predictions unbiased
        # even when learning_rate * n_estimators is too small to converge.
        self.init_ = float(target.mean())
        residual = target - self.init_

        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X, residual)
            self.trees.append(tree)
            # Remove the part of the residual this tree now explains.
            residual = residual - self.learning_rate * tree.predict(X)
        return self

    def predict(self, X):
        """Return the initial constant plus all shrunken tree predictions."""
        estimate = np.full(X.shape[0], self.init_, dtype=float)
        for tree in self.trees:
            estimate += self.learning_rate * tree.predict(X)
        return estimate


Обучим модели и оценим их качество

In [None]:
# Train the hand-written classifier with the same settings that
# GradientBoostingClassifier uses by default, then score it on the test split.
model_c_imp = SimpleGradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
)
model_c_imp.fit(X_c_train_scaled, y_c_train)
y_c_pred_imp = model_c_imp.predict(X_c_test_scaled)

# Shared options: support-weighted averaging, 0 for classes never predicted.
_weighted_kwargs = dict(average='weighted', zero_division=0)
accuracy_c_imp = accuracy_score(y_c_test, y_c_pred_imp)
precision_c_imp = precision_score(y_c_test, y_c_pred_imp, **_weighted_kwargs)
recall_c_imp = recall_score(y_c_test, y_c_pred_imp, **_weighted_kwargs)

print(
    "\nCustom Gradient Boosting Classifier Metrics:",
    f"  Accuracy: {accuracy_c_imp:.4f}",
    f"  Precision: {precision_c_imp:.4f}",
    f"  Recall: {recall_c_imp:.4f}",
    sep="\n",
)


In [None]:
  Accuracy: 0.8641
  Precision: 0.8680
  Recall: 0.8641

In [None]:
# Train the hand-written regressor and score it on the held-out test split.
model_r_imp = SimpleGradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
)
model_r_imp.fit(X_r_train_scaled, y_r_train)
y_r_pred_imp = model_r_imp.predict(X_r_test_scaled)

r2_r_imp = r2_score(y_r_test, y_r_pred_imp)
mae_r_imp = mean_absolute_error(y_r_test, y_r_pred_imp)

print(
    "\nCustom Gradient Boosting Regressor Metrics:",
    f"  MAE: {mae_r_imp:.4f}",
    f"  R2: {r2_r_imp:.4f}",
    sep="\n",
)

In [None]:
  MAE: 0.2185
  R2: 0.7108

Как видно, имплементированный градиентный бустинг работает немного хуже. Добавим техники из улучшенного бейзлайна

In [None]:
# Tune the hand-written models, then retrain them with the best settings.
# NOTE(review): the test split is handed to tune_hyperparameters; if that
# helper ranks candidates by their test-set score, the metrics printed at the
# end of this cell are optimistically biased - confirm how it selects.
# NOTE(review): both grids contain min_samples_split / min_samples_leaf, and
# best_params_* are later unpacked into the Simple* constructors - confirm
# that the helper returns only keyword arguments those constructors accept.
param_grid_c = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

best_params_c, best_score_c, results_c = tune_hyperparameters(
    X_c_train_scaled, y_c_train, X_c_test_scaled, y_c_test,
    'classification', param_grid_c,
)
print("Best classification params:", best_params_c)
print("Best classification accuracy:", best_score_c)

# The regression grid has the same values as the classification grid.
param_grid_r = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

best_params_r, best_score_r, results_r = tune_hyperparameters(
    X_r_train_scaled, y_r_train, X_r_test_scaled, y_r_test,
    'regression', param_grid_r,
)
print("Best regression params:", best_params_r)
print("Best regression MAE:", best_score_r)

# --- Classification: retrain with the selected parameters and evaluate ---
best_model_c = SimpleGradientBoostingClassifier(**best_params_c)
best_model_c.fit(X_c_train_scaled, y_c_train)
y_c_pred_imp_best = best_model_c.predict(X_c_test_scaled)

# Shared options: support-weighted averaging, 0 for classes never predicted.
_weighted_kwargs = dict(average='weighted', zero_division=0)
accuracy_c_imp_best = accuracy_score(y_c_test, y_c_pred_imp_best)
precision_c_imp_best = precision_score(y_c_test, y_c_pred_imp_best, **_weighted_kwargs)
recall_c_imp_best = recall_score(y_c_test, y_c_pred_imp_best, **_weighted_kwargs)

print(
    "\nCustom Gradient Boosting Classifier Metrics (with best params):",
    f"  Accuracy: {accuracy_c_imp_best:.4f}",
    f"  Precision: {precision_c_imp_best:.4f}",
    f"  Recall: {recall_c_imp_best:.4f}",
    sep="\n",
)

# --- Regression: retrain with the selected parameters and evaluate ---
best_model_r = SimpleGradientBoostingRegressor(**best_params_r)
best_model_r.fit(X_r_train_scaled, y_r_train)
y_r_pred_imp_best = best_model_r.predict(X_r_test_scaled)

mae_r_imp_best = mean_absolute_error(y_r_test, y_r_pred_imp_best)
r2_r_imp_best = r2_score(y_r_test, y_r_pred_imp_best)

print(
    "\nCustom Gradient Boosting Regressor Metrics (with best params):",
    f"  MAE: {mae_r_imp_best:.4f}",
    f"  R2: {r2_r_imp_best:.4f}",
    sep="\n",
)

In [None]:
Custom Gradient Boosting Classifier Metrics (with best params):
  Accuracy: 0.8967
  Precision: 0.8989
  Recall: 0.8967

Custom Gradient Boosting Regressor Metrics (with best params):
  MAE: 0.1399
  R2: 0.5847

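Видно, что после улучшения задача классификации стала решаться немного лучше (Accuracy выросла с 0.8641 до 0.8967). Для регрессии MAE снизилась с 0.2185 до 0.1399, однако R2 уменьшился с 0.7108 до 0.5847.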