# eXtreme Gradient Boosting - XGBoost

XGBoost, GBM'in hız ve tahmin performansını arttırmak üzere optimize edilmiş; ölçeklenebilir ve farklı platformlara entegre edilebilir halidir.

* R, Python, Hadoop, Scala, Julia ile kullanılabilir.
* Hızlıdır.
* Tahmin başarısı yüksektir.
* Birçok kaggle yarışmasında başarısını kanıtlamıştır.

Hadoop, Scala dünyasına entegre edildiğinde daha performanslı çalışır. (Hadoop cache bellek kullanır)

In [12]:
# bilgilerin gözükmesi için
from sklearn import set_config
set_config(print_changed_only=False)

# XGBoost - Model & Tahmin

In [1]:
import pandas as pd
# veri setimizi import edelim
diabetes = pd.read_csv("diabetes.csv")

In [2]:
df = diabetes.copy()
df = df.dropna()
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                   random_state=42)

In [4]:
from xgboost import XGBClassifier

In [5]:
# model kurulumu
xgb_model = XGBClassifier().fit(X_train, y_train)

In [6]:
# y test tahmin değerleri
y_pred = xgb_model.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score
# doğruluk oranı
accuracy_score(y_test, y_pred)

0.7359307359307359

# XGBoost - Model Tuning

In [9]:
xgb_model

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)

In [10]:
?xgb_model

[0;31mType:[0m        XGBClassifier
[0;31mString form:[0m
XGBClassifier(base_score=None, booster=None, callbacks=None,
           colsample_bylevel=None <...> obs=None, num_parallel_tree=None,
           objective='binary:logistic', predictor=None, ...)
[0;31mFile:[0m        /opt/anaconda3/lib/python3.9/site-packages/xgboost/sklearn.py
[0;31mDocstring:[0m  
Implementation of the scikit-learn API for XGBoost classification.


Parameters
----------

    n_estimators : int
        Number of boosting rounds.

    max_depth :  Optional[int]
        Maximum tree depth for base learners.
    max_leaves :
        Maximum number of leaves; 0 indicates no limit.
    max_bin :
        If using histogram-based algorithm, maximum number of bins per feature
    grow_policy :
        Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
        depth-wise. 1: favor splitting at nodes with highest loss change.
    learning_rate : Optional[float]
        Boosting learn

In [13]:
# denenecek olan değerler
xgb_params ={
    "n_estimators": [100, 500, 1000, 2000],
    "subsample": [0.6, 0.8, 1.0],
    "max_depth": [3,4,5,6],
    "learning_rate": [0.1, 0.01, 0.02, 0.05],
    "min_samples_split": [2,5,10]
}

In [14]:
# model nesnesi oluşturuldu
xgb = XGBClassifier()

In [15]:
# cv nesnesi
from sklearn.model_selection import GridSearchCV
xgb_cv_model = GridSearchCV(xgb, xgb_params, cv=10, n_jobs=-1, verbose=2)

In [16]:
# cv model kuruldu
xgb_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.6; total time=   0.0s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.0s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.1s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=100, subsample=1.0; total time=   0.0s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=500, subsample=0.6; total time=   0.2s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=

GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                   

In [17]:
# bulunan en iyi parametrelere bakalım
xgb_cv_model.best_params_

{'learning_rate': 0.02,
 'max_depth': 3,
 'min_samples_split': 2,
 'n_estimators': 100,
 'subsample': 0.6}

In [18]:
# final modelimizi optimum parametrelerle oluşturalım
xgb = XGBClassifier(learning_rate = 0.02,
                   max_depth=3,
                   min_samples_split=2,
                   n_estimators=100,
                   subsample=0.6)

In [19]:
# modeli kurdum
xgb_tuned = xgb.fit(X_train, y_train)

Parameters: { "min_samples_split" } are not used.

Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.05, max_depth=5, min_samples_split=5, n_estimators=500, subsample=1.0; total time=   0.2s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.05, max_depth=5, min_samples_split=5, n_estimators=1000, subsample=0.6; total time=   0.5s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.05, max_depth=5, min_samples_split=5, n_estimators=1000, subsample=0.8; total time=   0.5s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.05, max_depth=5, min_samples_split=5, n_estimators=1000, subsample=1.0; total time=   0.5s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.05, max_depth=5, min_samples_split=5, n_estimators=2000, subsample=0.6; total time=   0.8s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.05, max_depth=5, min_samples_split=5, n

In [20]:
# tahmin edilen y test değerleri
y_pred = xgb_tuned.predict(X_test)

In [21]:
# doğruluk oranı
accuracy_score(y_test, y_pred)

0.7575757575757576