<a href="https://colab.research.google.com/github/TonyQ-Lab/supervised-learning/blob/main/diabetics-prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preparation

In [14]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("iammustafatz/diabetes-prediction-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'diabetes-prediction-dataset' dataset.
Path to dataset files: /kaggle/input/diabetes-prediction-dataset


In [1]:
# Import dependencies
import os
import pandas as pd
import numpy as np

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Loading and checking data

In [15]:
diabetic_df = pd.read_csv(os.path.join(path, "diabetes_prediction_dataset.csv"))

In [16]:
diabetic_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [17]:
diabetic_df = pd.get_dummies(diabetic_df)
diabetic_df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,True,False,False,False,False,False,False,True,False
1,54.0,0,0,27.32,6.6,80,0,True,False,False,True,False,False,False,False,False
2,28.0,0,0,27.32,5.7,158,0,False,True,False,False,False,False,False,True,False
3,36.0,0,0,23.45,5.0,155,0,True,False,False,False,True,False,False,False,False
4,76.0,1,1,20.14,4.8,155,0,False,True,False,False,True,False,False,False,False


In [18]:
X = diabetic_df.drop("diabetes", axis=1)
y = diabetic_df["diabetes"].values

In [19]:
# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Training and validating

In [20]:
# Define pipeline
steps = [
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression())
]
pipeline = Pipeline(steps)

In [21]:
params = {
    "logreg__C": np.linspace(0.1, 2, 10)
}
kf = KFold(n_splits=5)
grid = GridSearchCV(pipeline, param_grid=params, scoring="accuracy", cv=kf)

In [22]:
grid.fit(X_train, y_train)

In [23]:
print("Best params: ", grid.best_params_)
print("Best score: ", grid.best_score_)

Best params:  {'logreg__C': np.float64(1.788888888888889)}
Best score:  0.9605875000000001


In [25]:
# Evaluation
y_pred = grid.predict(X_test)

In [26]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.959

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     18292
           1       0.86      0.62      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000


Confusion Matrix:
 [[18126   166]
 [  654  1054]]
