## Gradient Boosting Machines (GBM)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale
from sklearn import model_selection

In [2]:
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])
y = df["Salary"] 
X_ = df.drop(["Salary", "League", "Division", "NewLeague"], axis= 1).astype("float64")
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis= 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state= 42)

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
gbm_model = GradientBoostingRegressor()
gbm_model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Tahmin

In [5]:
gbm_model = GradientBoostingRegressor()
gbm_model.fit(X_train, y_train)
y_pred = gbm_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

346.80660845142813

### Model Tuning

In [6]:
gbm_params = {"learning_rate": [0.001, 0.01, 0.1, 0.2],
            "max_depth": list(range(1,10)),
            "max_features": [3, 5, 8, 50, 100],
            "n_estimators":[200, 500, 1000, 2000],
             "subsample": [1, 0.5, 0.75]}

In [7]:
gbm = GradientBoostingRegressor()
gbm_cv_model = GridSearchCV(gbm, gbm_params, cv= 10, n_jobs= -1, verbose= 2)
gbm_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 2160 candidates, totalling 21600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 1436 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2633 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 3955 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 5730 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 7656 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 9954 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 12523 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 15422 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 18336 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 21566 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 21600 out of 21600 | elapsed: 16.2min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_...
                            