#1. Load Required Libraries

We begin by importing all necessary Python libraries for data manipulation, visualization, model building, and evaluation.

In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,roc_curve,confusion_matrix,classification_report
import joblib
import warnings
warnings.filterwarnings("ignore")

#2. Load Dataset

We load the dataset using pandas and perform initial data inspection using .info() and .describe() to understand the structure, types, and statistical summary.

In [5]:
data_path="data/diabetes.csv"
df=pd.read_csv(data_path)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


#3. Replace Zeroes with Column Means

In some datasets (e.g. Pima Indians Diabetes), zeroes in certain features are invalid and treated as missing values. We replace them with the column mean.


In [9]:
cols_to_replace=["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
for col in cols_to_replace:
    df[col]=df[col].replace(0,df[col].mean())

#4. Prepare Features and Labels

We separate the dataset into features (X) and the target/label (y).


In [11]:
X=df.drop("Outcome",axis=1)
y=df["Outcome"]

#5. Train-Test Split and Feature Standardization

We split the data into training and testing sets using train_test_split, and then standardize the feature values using StandardScaler.

In [13]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [14]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

#6. Define Models

We define a dictionary of machine learning models to evaluate. These models can include classifiers like Logistic Regression, Random Forest, Support Vector Machine, etc.


In [16]:
models={
    "Logistic Regression":LogisticRegression(max_iter=1000),
    "Random Forest":RandomForestClassifier(random_state=42),
    "Gradient Boosting":GradientBoostingClassifier(random_state=42),
    "SVM": SVC(probability=True,random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss",use_label_encoder=False,random_state=42)
}



#7. Train, Predict and Evaluate

Each model is trained on the training set, predictions are made on the test set, and performance is evaluated using metrics like accuracy, F1-score, and ROC AUC.

In [18]:
results = []
best_score = 0
best_model = None
best_y_test = None
best_y_pred = None
best_name = ""

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Get predicted probabilities for ROC-AUC
    y_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    
    # Save the results for this model
    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1-score": f1,
        "ROC-AUC-Score": roc
    })
    
    # Update best model if accuracy improved
    if acc > best_score:
        best_score = acc
        best_model = model
        best_y_test = y_test
        best_y_pred = y_pred
        best_name = name



#8. Plot Confusion Matrix for Best Model

We visualize the confusion matrix for the best-performing model to better understand its classification performance (true positives, false positives, etc.).

In [20]:
cm = confusion_matrix(best_y_test, best_y_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title(f"Confusion Matrix - {best_name}")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()

# Save plot to results folder with the model name
plt.savefig(f"results/confusion_matrix_{best_name.lower().replace(' ', '_')}.png")
plt.close()


#9.Model Comparison Table
The performance of all models is stored and compared in a DataFrame. This allows for easy comparison based on accuracy, F1-score, and ROC-AUC.

In [22]:
results_df=pd.DataFrame(results).sort_values(by="Accuracy",ascending=False)
print("\n Model Comparison:\n",results_df)


 Model Comparison:
                  Model  Accuracy  F1-score  ROC-AUC-Score
4              XGBoost  0.759740  0.647619       0.819259
2    Gradient Boosting  0.753247  0.620000       0.830926
1        Random Forest  0.740260  0.600000       0.820926
3                  SVM  0.740260  0.600000       0.799074
0  Logistic Regression  0.701299  0.530612       0.814444


#10.Plot ROC Curves

To compare models visually in terms of classification thresholds, we plot their ROC curves.

In [24]:
plt.figure(figsize=(8,6))
for name,model in models.items():
    y_proba=model.predict_proba(X_test)[:,1]
    fpr,tpr,_=roc_curve(y_test,y_proba)
    plt.plot(fpr,tpr,label=f"{name}(AUC={roc_auc_score(y_test,y_proba):.2f}")

plt.plot([0,1],[0,1],"k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-Curves-Model Comparison")
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("results/roc_curves_comparison.png")
plt.close()

#Save Best Model

We save the best model to a .pkl file using joblib, so it can be reused later without retraining.

In [26]:
best_model_name=results_df.iloc[0]["Model"]
best_model=models[best_model_name]
joblib.dump(best_model,f"results/{best_model_name.replace('','_').lower()}_best_model.pkl")
print(f"\nBest model ({best_model_name}) saved to results/ folder.")


Best model (XGBoost) saved to results/ folder.
