In [39]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer


## Data loading

In [40]:
df = pd.read_csv("diabetes.csv")

In [41]:
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [42]:
df.describe

<bound method NDFrame.describe of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627  

In [43]:
print("Data Shape: ", df.shape)

Data Shape:  (768, 9)


## Data preprossesing

In [44]:
## Handle missing value
cols_with_zeros = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

## Feature engineering
df["BMI Categori"] = np.where(df["BMI"] < 25 , 0, 1)
df["glu_bmi"] = df["Glucose"] * df["BMI"]

#outlier detection and correction
for col in df.columns:
    if col != "Outcome":
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 -Q1
        lower = Q1 - 1.5*IQR
        upper = Q3 + 1.5*IQR
        df[col] = np.clip(df[col], lower, upper)


df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI Categori,glu_bmi
0,6.0,148.0,72.0,35.0,,33.6,0.627,50.0,1,1,4972.8
1,1.0,85.0,66.0,29.0,,26.6,0.351,31.0,0,1,2261.0
2,8.0,183.0,64.0,,,23.3,0.672,32.0,1,1,4263.9
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0,1,2500.9
4,0.0,137.0,40.0,35.0,168.0,43.1,1.200,33.0,1,1,5904.7
...,...,...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0,1,3322.9
764,2.0,122.0,70.0,27.0,,36.8,0.340,27.0,0,1,4489.6
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0,1,3170.2
766,1.0,126.0,60.0,,,30.1,0.349,47.0,1,1,3792.6


## Split feature and target

In [45]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]


## Train test split 

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size= 0.2, random_state= 42
)


In [47]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features)
    ]
)

## Model selection

In [48]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(kernel="rbf")
}

In [49]:
results = {}
print("Cross-Validation Results:")

for name, model in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    cv_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring="roc_auc"
    )

    results[name] = cv_scores.mean()
    print(f"{name}: ROC-AUC = {cv_scores.mean():}")

Cross-Validation Results:
LogisticRegression: ROC-AUC = 0.8380595887508033
RandomForest: ROC-AUC = 0.8199179945449325
GradientBoosting: ROC-AUC = 0.8213116087663892
SVM: ROC-AUC = 0.8371581012537085


In [50]:
best_model_name = max(results, key=results.get)
print("Best Model Selected:", best_model_name)

Best Model Selected: LogisticRegression


## Pipeline creation

In [51]:


pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", models[best_model_name])
])


## Model selection:
### Logistic Regression is selected as the best model for the dataset based on cross-validated ROC-AUC performance, stability, and interpretability.
*   It achieves the highest or comparable cross-validated ROC-AUC.
*   Generalizes well on small clinical datasets.
*   Avoids overfitting through regularization
*   Provides interpretable results, and offers reliable probability estimates-making it specially suitable for medical diagnosis problems.

In [52]:
#train model
pipeline.fit(X_train, y_train)

## Cross validation

In [64]:
cv_scores = cross_val_score(
    pipeline, X_train, y_train, cv = 5, scoring="roc_auc"
)

print("CV_Mean: ", cv_scores.mean())
print("CV_std: ", cv_scores.std())

CV_Mean:  0.8380595887508033
CV_std:  0.017617232404370065


## Hyperparameter tuning

In [54]:
param_grid = {
    "LogisticRegression": {
        "model__C": [0.01, 0.1, 1, 10]
    },
    "RandomForest": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [None, 5, 10],
    },
    "GradientBoosting": {
        "model__n_estimators": [100, 200],
        "model__learning_rate": [0.05, 0.1],
        "model__max_depth": [3, 5]
    },
    "SVM": {
        "model__C": [0.1, 1, 10],
        "model__gamma": ["scale", "auto"]
    }
}

In [60]:

# param_grid = {
#     "model__n_estimators": [50, 100, 200],   
#     "model__max_depth": [None, 5, 10],       
#     "model__min_samples_split": [2, 5],     
#     "model__min_samples_leaf": [1, 2]       
# }

grid = GridSearchCV(
    pipeline,
    param_grid[best_model_name],
    cv=5,
    scoring="roc_auc",
    n_jobs=-1
)

grid.fit(X_train, y_train)


In [61]:
print("Best Param: ", grid.best_params_)
print("Best Score:", grid.best_score_)

Best Param:  {'model__C': 0.1}
Best Score: 0.8386401125193114


In [62]:
# best model selection
best_model = grid.best_estimator_


In [63]:
y_pred = best_model.predict(X_test)

print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

Test accuracy: 0.7597402597402597
Confusion matrix: [[82 17]
 [20 35]]
Classification report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.82        99
           1       0.67      0.64      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.74       154
weighted avg       0.76      0.76      0.76       154

