## eXtreme Gradient Boosting (XGBoost)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale
from sklearn import model_selection

In [2]:
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])
y = df["Salary"] 
X_ = df.drop(["Salary", "League", "Division", "NewLeague"], axis= 1).astype("float64")
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis= 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state= 42)

In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.1.1-py3-none-win_amd64.whl (54.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.1.1


In [4]:
import xgboost as xgb

In [5]:
DM_train = xgb.DMatrix(data= X_train, label= y_train)
DM_test = xgb.DMatrix(data= X_test, label= y_test)

In [10]:
from xgboost import XGBRegressor

In [11]:
xgb_model = XGBRegressor().fit(X_train, y_train)

### Tahmin

In [13]:
y_pred = xgb_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

355.4651481224188

### Model Tuning

In [24]:
xgb_model

# booster='gbtree'
# colsample_bynode=1 --> ağaç başına değişkenlerden alınacak örnek oranı
# learning_rate=0.300000012 --> overfitin'i engellemek için kullanarız 0-1 arasında olur. Boyutu tahmin süresi ve başarıyı değiştirir.
# max_depth=6 --> karmaşıklık parametresi overfiting'i engeller.
# n_estimators=100

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             objective='reg:squarederror', random_state=None, reg_alpha=None,
             reg_lambda=None, scale_pos_weight=None, subsample=None,
             tree_method=None, validate_parameters=None, verbosity=None)

In [25]:
gbm_grid = {"colsample_bynode": [0.4, 0.5, 0.6, 0.9, 1],
           "n_estimators": [100, 200, 500, 1000],
           "max_depth": [2, 3, 4, 5, 6, 7],
           "learning_rate": [0.1, 0.01, 0.5]}

In [26]:
xgb_model = XGBRegressor()

xgb_cv_model = GridSearchCV(xgb_model, param_grid= gbm_grid, cv= 10, n_jobs= -1, verbose= 2)

xgb_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 360 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 228 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 634 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 1930 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 2820 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed:  1.6min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estim...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=-1,
             param_grid={

In [27]:
xgb_cv_model.best_params_

{'colsample_bynode': 0.4,
 'learning_rate': 0.5,
 'max_depth': 2,
 'n_estimators': 200}

In [37]:
xgb_tuned = XGBRegressor(colsample_bynode= 0.9, learning_rate= 0.1, max_depth= 2, n_estimators= 1000)
xgb_tuned.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=0.9, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [38]:
y_pred = xgb_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

346.7456908437493