# XGBoost

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
diabetes = pd.read_csv("diabetes.csv")
df = diabetes.copy()
df = df.dropna()
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
y = df["Outcome"]
X = df.drop(['Outcome'], axis=1)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30,
                                                    random_state=42)

In [4]:
xgb_model = XGBClassifier().fit(X_train, y_train)

In [5]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7359307359307359

## Model Tuning

In [6]:
xgb_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_samples_split": [2,5,10]}

In [7]:
xgb = XGBClassifier()

xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 10, n_jobs = -1, verbose = 2)

In [8]:
xgb_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.6; total time=   0.1s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.1s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=100, subsample=1.0; total time=   0.1s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=500, subsample=0.6; total time=   0.4s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=500, subsample=1.0; total time=   0.3s
Parameters: { "min_samples_split" } are not used.

[CV] END learning_rate=0.1, max_depth=3, min_samples_split=

In [9]:
xgb_cv_model.best_params_

{'learning_rate': 0.02,
 'max_depth': 3,
 'min_samples_split': 2,
 'n_estimators': 100,
 'subsample': 0.6}

In [10]:
xgb = XGBClassifier(learning_rate = 0.02,
                    max_depth = 3,
                    min_samples_split = 2,
                    n_estimators = 100,
                    subsample = 0.6)

In [11]:
xgb_tuned =  xgb.fit(X_train,y_train)

Parameters: { "min_samples_split" } are not used.



In [12]:
y_pred = xgb_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

0.7575757575757576