In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [3]:
RANDOM_STATE = 1

In [4]:
df = pd.read_csv("diabetes_prediction_dataset.csv")

In [5]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [6]:
cols = list(df.columns)
cols

['gender',
 'age',
 'hypertension',
 'heart_disease',
 'smoking_history',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level',
 'diabetes']

In [7]:
cat_features = ['gender', 'smoking_history']

In [8]:
# one-hot encoding
df = pd.get_dummies(data = df, prefix = cat_features, columns = cat_features)
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,0,1,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,0,1,0,0,0,0


In [9]:
print(df.columns)

Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes', 'gender_Female', 'gender_Male',
       'gender_Other', 'smoking_history_No Info', 'smoking_history_current',
       'smoking_history_ever', 'smoking_history_former',
       'smoking_history_never', 'smoking_history_not current'],
      dtype='object')


In [10]:
features = [x for x in df.columns if x not in ['diabetes']]
print(features)

['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'gender_Female', 'gender_Male', 'gender_Other', 'smoking_history_No Info', 'smoking_history_current', 'smoking_history_ever', 'smoking_history_former', 'smoking_history_never', 'smoking_history_not current']


In [11]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df['diabetes'], train_size = 0.8, random_state = RANDOM_STATE)

In [12]:
sum(y_train)/len(y_train)

0.0849875

## Decision Tree Classifier

In [13]:
model = DecisionTreeClassifier(max_depth = 10, random_state = RANDOM_STATE)
model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10, random_state=1)

In [14]:
pred_train = model.predict(X_train)
print(accuracy_score(y_train, pred_train))

0.9724375


In [15]:
pred_test = model.predict(X_test)
print(accuracy_score(y_test, pred_test))

0.97115


## Random Forest

In [16]:
model2 = RandomForestClassifier(n_estimators = 100, max_depth = 13, random_state = RANDOM_STATE)
model2.fit(X_train,y_train)

RandomForestClassifier(max_depth=13, random_state=1)

In [17]:
pred_train = model2.predict(X_train)
print(accuracy_score(y_train, pred_train))

0.9740625


In [18]:
pred_test = model2.predict(X_test)
print(accuracy_score(y_test, pred_test))

0.9724


## XGBoost

In [19]:
# splitting training set into fitting set and evaluation test
n = int(len(X_train)*0.8)
X_train_fit, X_train_eval, y_train_fit, y_train_eval = X_train[:n], X_train[n:], y_train[:n], y_train[n:]

In [20]:
model3 = XGBClassifier(n_estimators = 700, learning_rate = 0.05, random_state = RANDOM_STATE)
model3.fit(X_train_fit, y_train_fit, eval_set = [(X_train_eval, y_train_eval)], early_stopping_rounds = 10)

[0]	validation_0-logloss:0.64909
[1]	validation_0-logloss:0.60923
[2]	validation_0-logloss:0.57299
[3]	validation_0-logloss:0.53989
[4]	validation_0-logloss:0.50962
[5]	validation_0-logloss:0.48182
[6]	validation_0-logloss:0.45622
[7]	validation_0-logloss:0.43260
[8]	validation_0-logloss:0.41072
[9]	validation_0-logloss:0.39047
[10]	validation_0-logloss:0.37169
[11]	validation_0-logloss:0.35422
[12]	validation_0-logloss:0.33796
[13]	validation_0-logloss:0.32281
[14]	validation_0-logloss:0.30868
[15]	validation_0-logloss:0.29548




[16]	validation_0-logloss:0.28314
[17]	validation_0-logloss:0.27159
[18]	validation_0-logloss:0.26079
[19]	validation_0-logloss:0.25066
[20]	validation_0-logloss:0.24114
[21]	validation_0-logloss:0.23221
[22]	validation_0-logloss:0.22383
[23]	validation_0-logloss:0.21595
[24]	validation_0-logloss:0.20852
[25]	validation_0-logloss:0.20156
[26]	validation_0-logloss:0.19501
[27]	validation_0-logloss:0.18881
[28]	validation_0-logloss:0.18300
[29]	validation_0-logloss:0.17751
[30]	validation_0-logloss:0.17234
[31]	validation_0-logloss:0.16747
[32]	validation_0-logloss:0.16286
[33]	validation_0-logloss:0.15851
[34]	validation_0-logloss:0.15441
[35]	validation_0-logloss:0.15055
[36]	validation_0-logloss:0.14689
[37]	validation_0-logloss:0.14343
[38]	validation_0-logloss:0.14015
[39]	validation_0-logloss:0.13707
[40]	validation_0-logloss:0.13417
[41]	validation_0-logloss:0.13141
[42]	validation_0-logloss:0.12880
[43]	validation_0-logloss:0.12634
[44]	validation_0-logloss:0.12398
[45]	validatio

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=700, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=1, ...)

In [21]:
pred_train = model3.predict(X_train)
print(accuracy_score(y_train, pred_train))

0.9727375


In [22]:
pred_test = model3.predict(X_test)
print(accuracy_score(y_test, pred_test))

0.97255
