# Gradient Boosting Machines

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

In [2]:
# Suppress every Python warning for the rest of the session
# (presumably to keep the cell outputs uncluttered).
# NOTE(review): a blanket "ignore" also hides warnings that may signal real problems.
from warnings import filterwarnings
filterwarnings("ignore")

In [3]:
# Read the Hitters data and keep only the rows that have no missing values.
df = pd.read_csv("Hitters.csv").dropna()
# One-hot encode the three categorical columns.
categorical_cols = ["League", "Division", "NewLeague"]
dms = pd.get_dummies(df[categorical_cols])
# Target is Salary; the numeric predictors are every column that is neither the
# target nor one of the categorical columns, cast to float64.
y = df["Salary"]
X_ = df.drop(columns=["Salary"] + categorical_cols).astype("float64")
# Keep a single indicator column per categorical variable next to the numerics.
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis=1)
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### MODELING AND TESTING

In [12]:
# Baseline model: gradient boosting with every hyperparameter left at its
# default, fitted on the training split.
gbm = (
    GradientBoostingRegressor()
    .fit(X_train, y_train)
)

In [13]:
# List every attribute and method name exposed by the fitted estimator.
# NOTE(review): exploratory aid only; the full listing is long.
dir(gbm)

['_SUPPORTED_LOSS',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_initialized',
 '_check_n_features',
 '_check_params',
 '_clear_state',
 '_compute_partial_dependence_recursion',
 '_estimator_type',
 '_fit_stage',
 '_fit_stages',
 '_get_param_names',
 '_get_tags',
 '_init_state',
 '_is_initialized',
 '_make_estimator',
 '_more_tags',
 '_raw_predict',
 '_raw_predict_init',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_resize_state',
 '_rng',
 '_staged_raw_predict',
 '_validate_data',
 '_va

In [14]:
# Predict salaries for the held-out rows with the baseline model.
# The fitted estimator is bound to the lowercase name `gbm`; Python names are
# case-sensitive, so `GBM` raises NameError when the notebook runs on a fresh kernel.
y_pred = gbm.predict(X_test)

In [15]:
# Root-mean-squared error of the current predictions on the held-out set
# (the square root puts the error back in the units of the target).
baseline_mse = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(baseline_mse)
RMSE

351.95848484996344

In [16]:
# R^2 of the current predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.42776629295781665

### MODEL TUNING

In [17]:
# IPython help lookup (trailing `?`): shows the constructor signature and docstring.
# NOTE(review): interactive-only syntax; it is not valid in a plain Python script.
GradientBoostingRegressor?

[1;31mInit signature:[0m
[0mGradientBoostingRegressor[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mloss[0m[1;33m=[0m[1;34m'ls'[0m[1;33m,[0m[1;33m
[0m    [0mlearning_rate[0m[1;33m=[0m[1;36m0.1[0m[1;33m,[0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [0msubsample[0m[1;33m=[0m[1;36m1.0[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'friedman_mse'[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;36m3[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_split[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minit[0m[1;33m=[0m[1;32m

In [23]:
# Candidate values for the grid search, one list per hyperparameter.
# The losses other than 'ls' are included as robust alternatives, i.e. they are
# more resistant to outlying observations.
# NOTE(review): newer scikit-learn releases spell 'ls'/'lad' as
# 'squared_error'/'absolute_error' — confirm against the installed version.
gbm_params = dict(
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 5, 8],
    n_estimators=[100, 200, 500],
    subsample=[1, 0.5, 0.8],
    loss=["ls", "lad", "quantile"],
)

In [24]:
# Exhaustive search over gbm_params with 10-fold cross-validation on the
# training split; n_jobs=-1 requests all available workers and verbose=2
# prints progress while the fits run.
gbm_cv_model = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

Fitting 10 folds for each of 243 candidates, totalling 2430 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 19.4min
[Parallel(n_jobs=-1)]: Done 2430 out of 2430 | elapsed: 27.0min finished


In [25]:
# Hyperparameter combination that the grid search ranked best.
gbm_cv_model.best_params_

{'learning_rate': 0.1,
 'loss': 'lad',
 'max_depth': 8,
 'n_estimators': 100,
 'subsample': 0.5}

In [37]:
# Refit on the training split with the hyperparameters chosen by the grid search.
# Unpacking best_params_ forwards every searched parameter, so this cell cannot
# drift out of sync with the grid (the previous per-key .get() calls would
# silently pass None for a missing key and ignore any newly added one).
# A fixed seed is supplied as a fallback because the grid contains subsample
# values below 1, which makes the fit stochastic; a random_state selected by
# the search, if one is ever added to the grid, takes precedence.
tuned_params = {"random_state": 42, **gbm_cv_model.best_params_}
gbm_tuned = GradientBoostingRegressor(**tuned_params).fit(X_train, y_train)

In [38]:
# Predict the held-out rows with the tuned model.
# NOTE(review): this overwrites the baseline model's y_pred, so the metric cells
# below now describe the tuned model.
y_pred = gbm_tuned.predict(X_test)

In [39]:
# Root-mean-squared error of the current predictions on the held-out set.
tuned_mse = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(tuned_mse)
RMSE

330.5786054831101

In [33]:
# R^2 of the current predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.4883723768080882