# Лабораторная работа 5 (Gradient Boosting)

In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from utils import regression_cross_validate, display_metrics_table, classification_cross_validate, display_metrics_classification_table


In [2]:
import warnings
warnings.filterwarnings("ignore")

### Regression

#### 1. Обработка данных

In [46]:
df = pd.read_csv('data/Salary_Data.csv')
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32,Male,Bachelor's,Software Engineer,5.0,90000
1,28,Female,Master's,Data Analyst,3.0,65000
2,45,Male,PhD,Senior Manager,15.0,150000
3,36,Female,Bachelor's,Sales Associate,7.0,60000
4,52,Male,Master's,Director,20.0,200000


In [47]:
X, y = df.drop(columns=['Salary', 'Job Title']), df['Salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

gender_le = LabelEncoder()
el_le = LabelEncoder()

X_train['Gender'] = gender_le.fit_transform(X_train['Gender'])
X_test['Gender'] = gender_le.transform(X_test['Gender'])

X_train['Education Level'] = el_le.fit_transform(X_train['Education Level'])
X_test['Education Level'] = el_le.transform(X_test['Education Level'])


#### 2. Построение бейзлайна

In [18]:
metrics = regression_cross_validate(GradientBoostingRegressor, X_train.to_numpy(), y_train.to_numpy(), n_folds=5, random_state=42, n_estimators=50, max_depth=10)
display_metrics_table(*metrics)

gb = GradientBoostingRegressor(n_estimators=50, max_depth=10, random_state=42)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

# Метрики
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Вывод метрик
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")


| Metric   |            Mean |        Std Dev |
|:---------|----------------:|---------------:|
| MAE      | 11563.4         | 1997.19        |
| MSE      |     3.71009e+08 |    1.05949e+08 |
| R2       |     0.835889    |    0.0595874   |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 298160016.58
Средняя абсолютная ошибка (MAE): 11296.05
Коэффициент детерминации (R^2): 0.86


Как можем увидеть значение метрики $R^2$ около 0.86, что означает что около 86% дисперсии данных объясняется моделью.

#### 3. Формулировка гипотез

Сформулируем несколько гипотез, которые могут помочь улучшить качество модели

1) Поменять Encoder категориальных признаков с `LabelEncoder` на `OneHotEncoder`
2) Отмасштабировать численные признаки
3) Уменьшить глубину и  увеличить число деревьев


In [50]:
onehot = OneHotEncoder(sparse_output=False, drop='first')

categorical_features = ['Gender', 'Education Level']

encoded_train_data = onehot.fit_transform(X_train[categorical_features])
encoded_test_data = onehot.transform(X_test[categorical_features])

encoded_df = pd.DataFrame(encoded_train_data, columns=onehot.get_feature_names_out(categorical_features))
X_train_upd = X_train.drop(columns=categorical_features).reset_index(drop=True)
X_train_upd = pd.concat([X_train_upd, encoded_df], axis=1)

encoded_df = pd.DataFrame(encoded_test_data, columns=onehot.get_feature_names_out(categorical_features))
X_test_upd = X_test.drop(columns=categorical_features).reset_index(drop=True)
X_test_upd = pd.concat([X_test_upd, encoded_df], axis=1)

scaler = StandardScaler()
num_features = ['Age', 'Years of Experience']
X_train_upd[num_features] = scaler.fit_transform(X_train[num_features])
X_test_upd[num_features] = scaler.transform(X_test[num_features])


In [59]:
metrics = regression_cross_validate(GradientBoostingRegressor, X_train_upd.to_numpy(), y_train.to_numpy(), n_folds=5, random_state=42, n_estimators=100, max_depth=3)
display_metrics_table(*metrics)

gb = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
gb.fit(X_train_upd, y_train)

y_pred = gb.predict(X_test_upd)

# Метрики
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Вывод метрик
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

| Metric   |            Mean |       Std Dev |
|:---------|----------------:|--------------:|
| MAE      | 10431.1         | 896.828       |
| MSE      |     2.39143e+08 |   3.99802e+07 |
| R2       |     0.895984    |   0.0233652   |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 280076428.74
Средняя абсолютная ошибка (MAE): 10954.94
Коэффициент детерминации (R^2): 0.87


Можно увидеть, что в целом значения метрик улучшились. Результаты на тестовой выборке также показывают приросты.

#### 4. Реализация своего класса

In [None]:
class MyGradientBoostingRegressor:
    def __init__(self, n_estimators=100, learning_rate=0.05, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.initial_value = None

    def fit(self, X, y):
        # Initialize model with the mean of y
        self.initial_value = np.mean(y)
        residual = y - self.initial_value

        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            predictions = tree.predict(X)
            residual -= self.learning_rate * predictions
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.full(X.shape[0], self.initial_value)
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X)
        return predictions



In [48]:
metrics = regression_cross_validate(MyGradientBoostingRegressor, X_train.to_numpy(), y_train.to_numpy(), n_folds=5, n_estimators=50, max_depth=10)
display_metrics_table(*metrics)

gb = MyGradientBoostingRegressor(n_estimators=50, max_depth=10)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

# Метрики
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Вывод метрик
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")


| Metric   |            Mean |        Std Dev |
|:---------|----------------:|---------------:|
| MAE      | 11477.3         | 2011.06        |
| MSE      |     3.67874e+08 |    1.06694e+08 |
| R2       |     0.83715     |    0.0599519   |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 292541898.19
Средняя абсолютная ошибка (MAE): 11204.65
Коэффициент детерминации (R^2): 0.87


In [56]:
metrics = regression_cross_validate(MyGradientBoostingRegressor, X_train_upd.to_numpy(), y_train.to_numpy(), n_folds=5, n_estimators=300, max_depth=3)
display_metrics_table(*metrics)

gb = GradientBoostingRegressor(n_estimators=100, max_depth=3)
gb.fit(X_train_upd, y_train)

y_pred = gb.predict(X_test_upd)

# Метрики
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Вывод метрик
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

| Metric   |            Mean |       Std Dev |
|:---------|----------------:|--------------:|
| MAE      | 10618.1         | 966.691       |
| MSE      |     2.62705e+08 |   3.46773e+07 |
| R2       |     0.885892    |   0.0223521   |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 283508373.85
Средняя абсолютная ошибка (MAE): 10920.14
Коэффициент детерминации (R^2): 0.87


### Classification

 #### 1. Обработка данных

In [61]:
df = pd.read_csv('data/Student_Depression_Dataset.csv')
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [62]:
X, y = df.drop(columns=['Depression', 'id']), df['Depression']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

categorical_features = ['Gender', 'City', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
num_features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']

le = OrdinalEncoder(handle_unknown='use_encoded_value',
                    unknown_value=99)

X_train[categorical_features] = le.fit_transform(X_train[categorical_features])
X_test[categorical_features] = le.transform(X_test[categorical_features])

imputer = SimpleImputer(strategy='most_frequent') 
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

#### 2. Построение бейзлайна 

In [30]:
metrics = classification_cross_validate(GradientBoostingClassifier, X_train.to_numpy(), y_train.to_numpy(), n_folds=5, n_estimators=50, max_depth=10)
display_metrics_classification_table(*metrics)

gb = GradientBoostingClassifier(random_state=42, n_estimators=50, max_depth=10)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Выводим результаты
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |    Std Dev |
|:----------|---------:|-----------:|
| Accuracy  | 0.833309 | 0.00314687 |
| Precision | 0.83273  | 0.00320975 |
| Recall    | 0.833309 | 0.00314687 |
| F1-score  | 0.832694 | 0.0030869  |

=== Результаты на Тесте ===
1. Accuracy: 82.50%
2. Precision: 82.46%
3. Recall: 82.50%
4. F1-score: 82.47%


#### 3. Формулировка гипотез

Сформулируем несколько гипотез, которые могут помочь улучшить качество модели

1) Поменять Encoder категориальных признаков с `LabelEncoder` на `OneHotEncoder`
2) Отмасштабировать численные признаки
3) Уменьшить глубину и  увеличить число деревьев

In [69]:
onehot = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

encoded_train_data = onehot.fit_transform(X_train[categorical_features])
encoded_test_data = onehot.transform(X_test[categorical_features])

encoded_df = pd.DataFrame(encoded_train_data, columns=onehot.get_feature_names_out(categorical_features))
X_train_upd = X_train.drop(columns=categorical_features).reset_index(drop=True)
X_train_upd = pd.concat([X_train_upd, encoded_df], axis=1)

encoded_df = pd.DataFrame(encoded_test_data, columns=onehot.get_feature_names_out(categorical_features))
X_test_upd = X_test.drop(columns=categorical_features).reset_index(drop=True)
X_test_upd = pd.concat([X_test_upd, encoded_df], axis=1)

X_train_upd[num_features] = scaler.fit_transform(X_train[num_features])
X_test_upd[num_features] = scaler.transform(X_test[num_features])


In [29]:
metrics = classification_cross_validate(GradientBoostingClassifier, X_train_upd.to_numpy(), y_train.to_numpy(), n_folds=5, random_state=42, n_estimators=100, max_depth=5)
display_metrics_classification_table(*metrics)

gb = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
gb.fit(X_train_upd, y_train)

y_pred = gb.predict(X_test_upd)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Выводим результаты
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |    Std Dev |
|:----------|---------:|-----------:|
| Accuracy  | 0.847407 | 0.00576178 |
| Precision | 0.846933 | 0.00585733 |
| Recall    | 0.847407 | 0.00576178 |
| F1-score  | 0.846854 | 0.00575767 |

=== Результаты на Тесте ===
1. Accuracy: 83.60%
2. Precision: 83.55%
3. Recall: 83.60%
4. F1-score: 83.55%


Можно увидеть что в среднем значения метрик улучшились. 
Результаты на тестовой выборке также показывают приросты.

#### 4. Реализация своего класса

In [74]:
class MyGradientBoostingClassifier:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.initial_value = None

    def _log_odds(self, y):
        p = np.clip(np.mean(y), 1e-15, 1 - 1e-15)
        return np.log(p / (1 - p))

    def fit(self, X, y):
        # Initialize with log-odds
        self.initial_value = self._log_odds(y)
        residual = y - self._sigmoid(self.initial_value)

        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            predictions = tree.predict(X)
            residual -= self.learning_rate * predictions
            self.trees.append(tree)

    def predict_proba(self, X):
        log_odds = np.full(X.shape[0], self.initial_value)
        for tree in self.trees:
            log_odds += self.learning_rate * tree.predict(X)
        proba = self._sigmoid(log_odds)
        return np.vstack([1 - proba, proba]).T

    def predict(self, X):
        proba = self.predict_proba(X)
        return (proba[:, 1] > 0.6).astype(int)

    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

In [75]:
metrics = classification_cross_validate(MyGradientBoostingClassifier, X_train.to_numpy(), y_train.to_numpy(), n_folds=5, n_estimators=50, max_depth=10)
display_metrics_classification_table(*metrics)

gb = MyGradientBoostingClassifier(n_estimators=50, max_depth=10)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Выводим результаты
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |    Std Dev |
|:----------|---------:|-----------:|
| Accuracy  | 0.818877 | 0.00498481 |
| Precision | 0.824859 | 0.00390499 |
| Recall    | 0.818877 | 0.00498481 |
| F1-score  | 0.819985 | 0.00481491 |

=== Результаты на Тесте ===
1. Accuracy: 81.48%
2. Precision: 82.13%
3. Recall: 81.48%
4. F1-score: 81.59%


In [77]:
metrics = classification_cross_validate(MyGradientBoostingClassifier, X_train_upd.to_numpy(), y_train.to_numpy(), n_folds=5, n_estimators=100, max_depth=5)
display_metrics_classification_table(*metrics)

gb = MyGradientBoostingClassifier(n_estimators=100, max_depth=5)
gb.fit(X_train_upd, y_train)

y_pred = gb.predict(X_test_upd)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Выводим результаты
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |    Std Dev |
|:----------|---------:|-----------:|
| Accuracy  | 0.837372 | 0.00630136 |
| Precision | 0.844266 | 0.00526634 |
| Recall    | 0.837372 | 0.00630136 |
| F1-score  | 0.838431 | 0.00611599 |

=== Результаты на Тесте ===
1. Accuracy: 82.10%
2. Precision: 82.94%
3. Recall: 82.10%
4. F1-score: 82.21%


Результаты показывают что собственная имплементация модели в среднем работает примерно на том же уровне качества, что и модель из `sklearn`

### Заключение

Внесённые изменения, включая нормализацию данных и увеличение числа деревьев и уменьшение глубины в алгоритме бустинга, улучшают точность модели. Проведённые эксперименты демонстрируют, что как собственная реализация, так и применение моделей из sklearn, дают схожие результаты.

| Модель                    |      MSE  |        MAE |      $R^2$ |
|:--------------------------|----------:|-----------:|-----------:|
| Sklearn (до улучшения)    | 2.98e+08  | 11296.05   |  0.86      |
| Sklearn (после улучшения) | 2.20e+08  | 10954.36   |  0.87      |
| Собственная имплементация (до улучшения)   | 2.92e+08   | 11204.65    |  0.87      |
| Собственная имплементация (после улучшения)| 2.835e+08  | 10920.08    |  0.87      |

| Модель                    | Accuracy | Precision | Recall | F1-score |
|:--------------------------|----------:|-----------:|--------:|---------:|
| Sklearn (до улучшения)    |  82.50%   |   82.46%   |  82.50% |   82.47% |
| Sklearn (после улучшения) |  83.60%   |   84.55%   |  83.60% |   83.55% |
| Собственная имплементация (до улучшения)   |  81.48%   |   82.13%   |  81.48% |   81.59% |
| Собственная имплементация (после улучшения)|  82.10%   |   82.94%   |  82.10% |   82.21% |