## Градиентный бустинг
[Статья на Хабре про градиентный бустинг](https://habr.com/ru/company/ods/blog/327250/)

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


In [16]:
df = pd.read_csv('../data/bank.csv') 
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [17]:
df_dirty = df.copy()

df.loc[df['Tenure'].isna(),'Tenure'] = df['Tenure'].median()
df['Gender'] = df['Gender'].map({'Female':0, 'Male':1})
df_binary = df[['Gender', 'HasCrCard', 'IsActiveMember', 'Exited']]
df_dummies = pd.get_dummies(df['Geography'])

y = df['EstimatedSalary']
df = df.drop(columns=df_binary.columns)
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Geography','EstimatedSalary'])

scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

X = pd.concat([df,df_binary, df_dummies], axis=1)
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,Gender,HasCrCard,IsActiveMember,Exited,France,Germany,Spain
0,-0.326221,0.293517,-1.086246,-1.225848,-0.911583,0,1,1,1,1,0,0
1,-0.440036,0.198164,-1.448581,0.11735,-0.911583,0,0,1,0,0,0,1
2,-1.536794,0.293517,1.087768,1.333053,2.527057,0,1,0,1,1,0,0
3,0.501521,0.007457,-1.448581,-1.225848,0.807737,0,0,0,0,1,0,0
4,2.063884,0.388871,-1.086246,0.785728,-0.911583,0,1,1,0,0,0,1


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=10)

### AdaBoost
[Пошаговое описание на английском](https://www.youtube.com/watch?v=LsK-xG1cLYA)

In [19]:
model = AdaBoostRegressor(random_state=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)

49811.68758083673

### GradientBoosting
[Пошаговое описание на английском](https://www.youtube.com/watch?v=3CC4N4z3GJc)


In [20]:
model = GradientBoostingRegressor(random_state=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)

49923.9328619293

### XGBoost

In [24]:
xgb_model = XGBRegressor(random_state=10, n_jobs=-1)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mean_absolute_error(y_test, y_pred)

52109.81890140626

In [29]:
params = {
    'learning_rate': [0.05, 0.1],
    'max_depth': [2, 4, 8],
    'n_estimators': range(10, 150, 50),
    'colsample_bytree': np.arange(0.2, 0.5, 0.1)
}

clf = GridSearchCV(xgb_model, params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
clf.fit(X_train,y_train)

print(clf.best_score_)
print(clf.best_params_)

-49657.01877133928
{'colsample_bytree': 0.2, 'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 110}


### LightGBM

In [31]:
lgb_model = LGBMRegressor()
clf = GridSearchCV(lgb_model, params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
clf.fit(X_train,y_train)

print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
-49603.83863353761
{'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 10}


### CatBoost

In [32]:
cb_model = CatBoostRegressor()
cb_model.fit(X_train, y_train)
y_pred = cb_model.predict(X_test)
mean_absolute_error(y_test, y_pred)

Learning rate set to 0.055681
0:	learn: 57434.6462667	total: 171ms	remaining: 2m 50s
1:	learn: 57406.8273475	total: 177ms	remaining: 1m 28s
2:	learn: 57396.4308247	total: 181ms	remaining: 1m
3:	learn: 57373.3408020	total: 185ms	remaining: 46s
4:	learn: 57364.7615379	total: 189ms	remaining: 37.6s
5:	learn: 57352.4214856	total: 192ms	remaining: 31.9s
6:	learn: 57340.9579372	total: 195ms	remaining: 27.6s
7:	learn: 57323.5216874	total: 197ms	remaining: 24.4s
8:	learn: 57303.1776851	total: 200ms	remaining: 22s
9:	learn: 57272.1953184	total: 202ms	remaining: 20s
10:	learn: 57246.1351830	total: 204ms	remaining: 18.4s
11:	learn: 57224.2635037	total: 208ms	remaining: 17.1s
12:	learn: 57209.2790304	total: 210ms	remaining: 15.9s
13:	learn: 57184.4624287	total: 212ms	remaining: 14.9s
14:	learn: 57153.0332842	total: 215ms	remaining: 14.1s
15:	learn: 57127.7686959	total: 217ms	remaining: 13.4s
16:	learn: 57112.1779637	total: 219ms	remaining: 12.7s
17:	learn: 57095.3396129	total: 221ms	remaining: 12.

50833.56630191948

In [33]:
X = df_dirty.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'EstimatedSalary'])
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=10)
cb_model = CatBoostRegressor()
cat_features = np.where(X.dtypes == "object")[0].tolist()

In [36]:
cb_model.fit(X_train, y_train, cat_features, verbose=0)
y_pred = cb_model.predict(X_test)
mean_absolute_error(y_test, y_pred)

50593.545603709994

🛠 Уменьшить MAE в модели `CatBoost` с помощью `GridSearchCV`

🛠 Применить классификатор `XGBoost`, `LightGBM` или `CatBoost` для предсказания параметра `Exited`. Рассчитать roc_auc на отложенной выборке.

In [None]:
# Ваш код здесь

In [3]:
from sklearn.metrics import roc_auc_score
df = pd.read_csv('../data/bank.csv') 
y=df['Exited']

# Ваш код здесь