# XGBoost

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[K     |████████████████████████████████| 193.6 MB 81 kB/s s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.7.4


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=42)

In [5]:
DM_train = xgb.DMatrix(data = X_train, label = y_train)
DM_test = xgb.DMatrix(data = X_test, label = y_test)

In [6]:
xgb_model = XGBRegressor().fit(X_train, y_train)

## Tahmin

In [7]:
y_pred = xgb_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

355.46515176059927

## Model Tuning

In [8]:
xgb_model

In [9]:
xgb_grid = {
     'colsample_bytree': [0.4, 0.5,0.6,0.9,1],
     'n_estimators':[100, 200, 500, 1000],
     'max_depth': [2,3,4,5,6],
     'learning_rate': [0.1, 0.01, 0.5]
}


In [10]:
xgb = XGBRegressor()

xgb_cv = GridSearchCV(xgb,
                      param_grid = xgb_grid,
                      cv = 10,
                      n_jobs = -1,
                      verbose = 2)
xgb_cv.fit(X_train, y_train)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=500; total time=   0.2s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=2, n_estimators=1000; total time=   0.5s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   0.3s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=1000; total time=   0.4s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=500; total time=   0.2s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END colsample_bytree=0.4, le

In [11]:
xgb_cv.best_params_

{'colsample_bytree': 0.6,
 'learning_rate': 0.1,
 'max_depth': 2,
 'n_estimators': 1000}

In [12]:
xgb_tuned = XGBRegressor(colsample_bytree = 0.9,
                         learning_rate = 0.01,
                         max_depth = 5,
                         n_estimators = 1000)

xgb_tuned = xgb_tuned.fit(X_train,y_train)

In [13]:
y_pred = xgb_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

357.18507083923674