In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('diabetes_prediction_dataset.csv')

In [3]:
df.shape

(100000, 9)

In [4]:
from sklearn.metrics import confusion_matrix
def confusion(model):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    return cm

In [5]:
display(df.head())

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [6]:
display(df.info())
display(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


None

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [7]:
from sklearn.model_selection import train_test_split
training_column = 'diabetes'
drop_columns = ['']
X = df.drop(columns=[training_column])
y = df[training_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Preprocessing

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

transformer = ColumnTransformer(transformers=[
    ('num', StandardScaler(), ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level']),
    ('cat', OneHotEncoder(), ['gender', 'smoking_history'])
])  

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline(steps=[
    ('transformer', transformer),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])

In [24]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier

pos = (y_train==1).sum(); neg = (y_train==0).sum()
pos_weight = neg / pos
param_grid = [
    {
        'classifier': [RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10]

    },
    {
        "classifier": [xgb.XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",
        random_state=42,
        scale_pos_weight=pos_weight,
    )],
        "classifier__n_estimators": [300, 600],
        "classifier__max_depth": [3, 5, 7],
        "classifier__learning_rate": [0.03, 0.1],
        "classifier__subsample": [0.7, 1.0],
    },
    {
        'classifier': [HistGradientBoostingClassifier(class_weight='balanced', random_state=42)],
        'classifier__max_iter': [100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__learning_rate': [0.03, 0.1]
    }
]
grid_search_recall = GridSearchCV(pipeline, param_grid, cv=5, scoring='recall', n_jobs=-1)
grid_search_recall.fit(X_train, y_train)



0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"[{'classifier': [RandomForestC...ndom_state=42)], 'classifier__max_depth': [None, 10, ...], 'classifier__min_samples_split': [2, 5, ...], 'classifier__n_estimators': [50, 100, ...]}, {'classifier': [XGBClassifier...ree=None, ...)], 'classifier__learning_rate': [0.03, 0.1], 'classifier__max_depth': [3, 5, ...], 'classifier__n_estimators': [300, 600], ...}, ...]"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [25]:
display(grid_search_recall.best_params_)

{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               feature_weights=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=None,
               n_jobs=None, num_parallel_tree=None, ...),
 'classifier__learning_rate': 0.03,
 'classifier__max_depth': 3,
 'classifier__n_estimators': 300,
 'classifier__subsample': 0.7}

In [26]:
display(grid_search_recall.best_score_)

np.float64(0.9243221159108371)

In [27]:
confusion(grid_search_recall)

array([[16457,  1835],
       [  128,  1580]])

In [28]:
grid_search_recall.cv_results_['mean_test_score']

array([0.6877199 , 0.68683711, 0.68713123, 0.7196688 , 0.71834343,
       0.71701849, 0.76369244, 0.76310226, 0.76501705, 0.8944354 ,
       0.89811544, 0.90194276, 0.89929277, 0.90061847, 0.90150104,
       0.8992931 , 0.90032359, 0.90400418, 0.7108352 , 0.70847901,
       0.71068727, 0.73954437, 0.73704177, 0.73660038, 0.7803299 ,
       0.78062337, 0.78209634, 0.92432212, 0.92093608, 0.92108379,
       0.92167224, 0.92182006, 0.92373388, 0.90842147, 0.91416315,
       0.91136643, 0.91696084, 0.87455755, 0.89502428, 0.91858033,
       0.91946409, 0.91003998, 0.91328004, 0.89193247, 0.90091248,
       0.85394593, 0.86763875, 0.83185982, 0.85026491, 0.77105253,
       0.78901514, 0.91534124, 0.90974619, 0.91813807, 0.91298603,
       0.91534124, 0.91092428, 0.90680295, 0.90680295, 0.91004128,
       0.91004128, 0.90724456, 0.90724456])