In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import datasets
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('pima-indians-diabetes.csv')
df.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure (mm Hg),Triceps skinfold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age,Is Diabetic
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.isna().sum()

Number of times pregnant                          0
Plasma glucose concentration                      0
Diastolic blood pressure (mm Hg)                  0
Triceps skinfold thickness (mm)                   0
2-Hour serum insulin (mu U/ml)                    0
Body mass index (weight in kg/(height in m)^2)    0
Diabetes pedigree function                        0
Age                                               0
Is Diabetic                                       0
dtype: int64

In [4]:
cols = ['Plasma glucose concentration',
       'Diastolic blood pressure (mm Hg)', 'Triceps skinfold thickness (mm)',
       '2-Hour serum insulin (mu U/ml)',
       'Body mass index (weight in kg/(height in m)^2)',
       'Diabetes pedigree function', 'Age']

In [6]:
# as mentioned in the data description, the missing values have been replaced by zeroes. So, we are replacing zeroes with nan
for col in cols:
    df[col]=df[col].replace(0, np.nan)

In [8]:
df.isna().sum()

Number of times pregnant                            0
Plasma glucose concentration                        5
Diastolic blood pressure (mm Hg)                   35
Triceps skinfold thickness (mm)                   227
2-Hour serum insulin (mu U/ml)                    374
Body mass index (weight in kg/(height in m)^2)     11
Diabetes pedigree function                          0
Age                                                 0
Is Diabetic                                         0
dtype: int64

In [11]:
# imputing the missing values
df['Plasma glucose concentration']=df['Plasma glucose concentration'].fillna(df['Plasma glucose concentration'].mode()[0])
df['Diastolic blood pressure (mm Hg)']=df['Diastolic blood pressure (mm Hg)'].fillna(df['Diastolic blood pressure (mm Hg)'].mode()[0])
df['Triceps skinfold thickness (mm)']=df['Triceps skinfold thickness (mm)'].fillna(df['Triceps skinfold thickness (mm)'].mean())
df['2-Hour serum insulin (mu U/ml)']=df['2-Hour serum insulin (mu U/ml)'].fillna(df['2-Hour serum insulin (mu U/ml)'].mean())
df['Body mass index (weight in kg/(height in m)^2)']=df['Body mass index (weight in kg/(height in m)^2)'].fillna(df['Body mass index (weight in kg/(height in m)^2)'].mean())

In [12]:
df.isna().sum()

Number of times pregnant                          0
Plasma glucose concentration                      0
Diastolic blood pressure (mm Hg)                  0
Triceps skinfold thickness (mm)                   0
2-Hour serum insulin (mu U/ml)                    0
Body mass index (weight in kg/(height in m)^2)    0
Diabetes pedigree function                        0
Age                                               0
Is Diabetic                                       0
dtype: int64

In [13]:
x = df.drop(labels = "Is Diabetic", axis = 1)
y = df['Is Diabetic']

In [14]:
x.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure (mm Hg),Triceps skinfold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33


In [15]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Is Diabetic, dtype: int64

In [17]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(x)

In [19]:
scaled_data[5]

array([ 3.42980797e-01, -1.81782624e-01,  1.40874882e-01,  8.08793628e-16,
       -3.34507888e-16, -9.98077308e-01, -8.18078579e-01, -2.75759658e-01])

In [22]:
from sklearn.model_selection import train_test_split
train_x, test_x,train_y, test_y = train_test_split(scaled_data, y, test_size = 0.3, random_state = 42) 

In [23]:
model = XGBClassifier(objective = "binary:logistic")
model.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [24]:
y_pred = model.predict(train_x)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(train_y,predictions)
accuracy

0.9050279329608939

In [25]:
# cheking initial test accuracy
y_pred = model.predict(test_x)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(test_y,predictions)
accuracy

0.7402597402597403

In [30]:
y_pred

array([1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [31]:
predictions

[1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0]

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
param_grid={
   
    ' learning_rate':[1,0.5,0.1,0.01,0.001],
    'max_depth': [3,5,10,20],
    'n_estimators':[10,50,100,200]
    
}

In [34]:
grid= GridSearchCV(XGBClassifier(objective='binary:logistic'),param_grid, verbose=3)

In [35]:
grid.fit(train_x,train_y)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV]  learning_rate=1, max_depth=3, n_estimators=10 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]   learning_rate=1, max_depth=3, n_estimators=10, score=0.778, total=   0.7s
[CV]  learning_rate=1, max_depth=3, n_estimators=10 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=10, score=0.731, total=   0.0s
[CV]  learning_rate=1, max_depth=3, n_estimators=10 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=10, score=0.785, total=   0.0s
[CV]  learning_rate=1, max_depth=3, n_estimators=10 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=10, score=0.692, total=   0.0s
[CV]  learning_rate=1, max_depth=3, n_estimators=10 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=10, score=0.729, total=   0.0s
[CV]  learning_rate=1, max_depth=3, n_estimators=50 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV]   learning_rate=1, max_depth=3, n_estimators=50, score=0.815, total=   0.1s
[CV]  learning_rate=1, max_depth=3, n_estimators=50 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=50, score=0.731, total=   0.1s
[CV]  learning_rate=1, max_depth=3, n_estimators=50 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=50, score=0.776, total=   0.0s
[CV]  learning_rate=1, max_depth=3, n_estimators=50 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=50, score=0.729, total=   0.1s
[CV]  learning_rate=1, max_depth=3, n_estimators=50 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=50, score=0.785, total=   0.1s
[CV]  learning_rate=1, max_depth=3, n_estimators=100 .................
[CV]   learning_rate=1, max_depth=3, n_estimators=100, score=0.815, total=   0.2s
[CV]  learning_rate=1, max_depth=3, n_estimators=100 .................
[CV]   learning_rate=1, max_depth=3, n_estimators=100, score=0.722, total=   0.2s
[CV] 

[CV]   learning_rate=1, max_depth=10, n_estimators=200, score=0.748, total=   0.7s
[CV]  learning_rate=1, max_depth=20, n_estimators=10 .................
[CV]   learning_rate=1, max_depth=20, n_estimators=10, score=0.787, total=   0.1s
[CV]  learning_rate=1, max_depth=20, n_estimators=10 .................
[CV]   learning_rate=1, max_depth=20, n_estimators=10, score=0.778, total=   0.0s
[CV]  learning_rate=1, max_depth=20, n_estimators=10 .................
[CV]   learning_rate=1, max_depth=20, n_estimators=10, score=0.766, total=   0.0s
[CV]  learning_rate=1, max_depth=20, n_estimators=10 .................
[CV]   learning_rate=1, max_depth=20, n_estimators=10, score=0.720, total=   0.0s
[CV]  learning_rate=1, max_depth=20, n_estimators=10 .................
[CV]   learning_rate=1, max_depth=20, n_estimators=10, score=0.738, total=   0.1s
[CV]  learning_rate=1, max_depth=20, n_estimators=50 .................
[CV]   learning_rate=1, max_depth=20, n_estimators=50, score=0.778, total=   0.2s

[CV]   learning_rate=0.5, max_depth=5, n_estimators=100, score=0.720, total=   0.3s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=100 ...............
[CV]   learning_rate=0.5, max_depth=5, n_estimators=100, score=0.748, total=   0.3s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=200 ...............
[CV]   learning_rate=0.5, max_depth=5, n_estimators=200, score=0.769, total=   0.5s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=200 ...............
[CV]   learning_rate=0.5, max_depth=5, n_estimators=200, score=0.759, total=   0.5s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=200 ...............
[CV]   learning_rate=0.5, max_depth=5, n_estimators=200, score=0.776, total=   0.3s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=200 ...............
[CV]   learning_rate=0.5, max_depth=5, n_estimators=200, score=0.710, total=   0.3s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=200 ...............
[CV]   learning_rate=0.5, max_depth=5, n_estimators=200, score=0.729, 

[CV]   learning_rate=0.1, max_depth=3, n_estimators=50, score=0.731, total=   0.1s
[CV]  learning_rate=0.1, max_depth=3, n_estimators=50 ................
[CV]   learning_rate=0.1, max_depth=3, n_estimators=50, score=0.776, total=   0.1s
[CV]  learning_rate=0.1, max_depth=3, n_estimators=50 ................
[CV]   learning_rate=0.1, max_depth=3, n_estimators=50, score=0.729, total=   0.1s
[CV]  learning_rate=0.1, max_depth=3, n_estimators=50 ................
[CV]   learning_rate=0.1, max_depth=3, n_estimators=50, score=0.785, total=   0.1s
[CV]  learning_rate=0.1, max_depth=3, n_estimators=100 ...............
[CV]   learning_rate=0.1, max_depth=3, n_estimators=100, score=0.815, total=   0.2s
[CV]  learning_rate=0.1, max_depth=3, n_estimators=100 ...............
[CV]   learning_rate=0.1, max_depth=3, n_estimators=100, score=0.722, total=   0.1s
[CV]  learning_rate=0.1, max_depth=3, n_estimators=100 ...............
[CV]   learning_rate=0.1, max_depth=3, n_estimators=100, score=0.776, tota

[CV]   learning_rate=0.1, max_depth=10, n_estimators=200, score=0.748, total=   0.7s
[CV]  learning_rate=0.1, max_depth=20, n_estimators=10 ...............
[CV]   learning_rate=0.1, max_depth=20, n_estimators=10, score=0.787, total=   0.0s
[CV]  learning_rate=0.1, max_depth=20, n_estimators=10 ...............
[CV]   learning_rate=0.1, max_depth=20, n_estimators=10, score=0.778, total=   0.0s
[CV]  learning_rate=0.1, max_depth=20, n_estimators=10 ...............
[CV]   learning_rate=0.1, max_depth=20, n_estimators=10, score=0.766, total=   0.0s
[CV]  learning_rate=0.1, max_depth=20, n_estimators=10 ...............
[CV]   learning_rate=0.1, max_depth=20, n_estimators=10, score=0.720, total=   0.0s
[CV]  learning_rate=0.1, max_depth=20, n_estimators=10 ...............
[CV]   learning_rate=0.1, max_depth=20, n_estimators=10, score=0.738, total=   0.1s
[CV]  learning_rate=0.1, max_depth=20, n_estimators=50 ...............
[CV]   learning_rate=0.1, max_depth=20, n_estimators=50, score=0.778,

[CV]   learning_rate=0.01, max_depth=5, n_estimators=100, score=0.776, total=   0.2s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=100, score=0.720, total=   0.3s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=100, score=0.748, total=   0.3s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=200 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=200, score=0.769, total=   0.7s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=200 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=200, score=0.759, total=   0.6s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=200 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=200, score=0.776, total=   0.5s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=200 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=200, score=

[CV]   learning_rate=0.001, max_depth=3, n_estimators=50, score=0.815, total=   0.1s
[CV]  learning_rate=0.001, max_depth=3, n_estimators=50 ..............
[CV]   learning_rate=0.001, max_depth=3, n_estimators=50, score=0.731, total=   0.1s
[CV]  learning_rate=0.001, max_depth=3, n_estimators=50 ..............
[CV]   learning_rate=0.001, max_depth=3, n_estimators=50, score=0.776, total=   0.1s
[CV]  learning_rate=0.001, max_depth=3, n_estimators=50 ..............
[CV]   learning_rate=0.001, max_depth=3, n_estimators=50, score=0.729, total=   0.1s
[CV]  learning_rate=0.001, max_depth=3, n_estimators=50 ..............
[CV]   learning_rate=0.001, max_depth=3, n_estimators=50, score=0.785, total=   0.1s
[CV]  learning_rate=0.001, max_depth=3, n_estimators=100 .............
[CV]   learning_rate=0.001, max_depth=3, n_estimators=100, score=0.815, total=   0.1s
[CV]  learning_rate=0.001, max_depth=3, n_estimators=100 .............
[CV]   learning_rate=0.001, max_depth=3, n_estimators=100, scor

[CV]   learning_rate=0.001, max_depth=10, n_estimators=200, score=0.738, total=   0.6s
[CV]  learning_rate=0.001, max_depth=10, n_estimators=200 ............
[CV]   learning_rate=0.001, max_depth=10, n_estimators=200, score=0.748, total=   0.6s
[CV]  learning_rate=0.001, max_depth=20, n_estimators=10 .............
[CV]   learning_rate=0.001, max_depth=20, n_estimators=10, score=0.787, total=   0.0s
[CV]  learning_rate=0.001, max_depth=20, n_estimators=10 .............
[CV]   learning_rate=0.001, max_depth=20, n_estimators=10, score=0.778, total=   0.0s
[CV]  learning_rate=0.001, max_depth=20, n_estimators=10 .............
[CV]   learning_rate=0.001, max_depth=20, n_estimators=10, score=0.766, total=   0.0s
[CV]  learning_rate=0.001, max_depth=20, n_estimators=10 .............
[CV]   learning_rate=0.001, max_depth=20, n_estimators=10, score=0.720, total=   0.0s
[CV]  learning_rate=0.001, max_depth=20, n_estimators=10 .............
[CV]   learning_rate=0.001, max_depth=20, n_estimators=1

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  1.6min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={' learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
                         'max_depth': [3, 5, 10, 20],
                         'n_estimators': [10, 50,

In [36]:
grid.best_params_

{' learning_rate': 1, 'max_depth': 5, 'n_estimators': 50}

In [43]:
new_model = XGBClassifier(learning_rate = 1, max_depth = 5, n_estimators = 50)
new_model.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1,
              max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
              n_estimators=50, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

In [39]:
y_pred_new = new_model.predict(test_x)
predictions_new = [round(value) for value in y_pred_new]
accuracy_new = accuracy_score(test_y,predictions_new)
accuracy_new

0.7445887445887446

In [41]:
# Trying a random prediction
d=scaler.transform([[6,148,72,35,80,33.6,0.627,50]])
pred=new_model.predict(d)
print('This data belongs to class :',pred[0])

This data belongs to class : 1
