In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
import joblib

In [27]:
# Read the raw dataset from the notebook's working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [28]:
def categorize_bmi(bmi):
    """Map a numeric BMI value to a weight-status label.

    The ranges are contiguous and half-open, so every real number lands in
    exactly one bucket:

        bmi < 18.5          -> "Underweight"
        18.5 <= bmi < 25    -> "Normal weight"
        25 <= bmi < 30      -> "Overweight"
        bmi >= 30           -> "Obese"

    Args:
        bmi: Body-mass index as an int or float.

    Returns:
        One of "Underweight", "Normal weight", "Overweight", "Obese".

    Note:
        NaN compares false against every threshold and therefore falls
        through to "Obese"; clean missing values before calling this.
    """
    if bmi < 18.5:
        return "Underweight"
    # Cut-offs are 25 and 30 (not 24.9 / 29.9) so that values such as 24.95
    # or 29.95 do not slip between two ranges and end up in the final branch.
    if bmi < 25:
        return "Normal weight"
    if bmi < 30:
        return "Overweight"
    return "Obese"

# Derive the categorical BMI feature by passing each BMI value through
# categorize_bmi; the result is stored as a new column on `data`.
data["BMI_category"] = data["BMI"].map(categorize_bmi)

In [29]:
# Hold out 20% of the rows for validation. Stratifying on "Outcome" keeps the
# class ratio the same in both partitions; the fixed seed makes the split
# repeatable.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data["Outcome"],
    random_state=42,
)

In [30]:
# Column groups routed to each preprocessing step.
numeric_features = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
categorical_features = ["BMI_category"]

scaler = StandardScaler()
# handle_unknown="ignore": a BMI category that never appeared in the training
# split is encoded as an all-zero row at transform time instead of raising,
# so validation rows and single-sample predictions cannot crash the encoder.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

# Standardise the numeric columns and one-hot encode the categorical one.
# Columns not listed in either group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer([
    ("num", scaler, numeric_features),
    ("cat", one_hot_encoder, categorical_features)
])

In [31]:
# Targets: the raw "Outcome" column of each partition as a NumPy array.
y_train = train_data["Outcome"].values
y_val = val_data["Outcome"].values

# Features: everything except the target. The preprocessor is fitted on the
# training rows only and then reused, already fitted, on the validation rows.
X_train = preprocessor.fit_transform(train_data.drop("Outcome", axis=1))
X_val = preprocessor.transform(val_data.drop("Outcome", axis=1))


In [32]:
# Try several neighbour counts for KNN, score each fitted model on the
# validation split, and remember the one with the highest F1 (first wins ties).
best_knn_f1 = 0
best_knn_model = None
best_knn_acc = 0
for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    acc = accuracy_score(y_val, y_pred)
    print(f"KNN (k={k}) - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    # The `is None` guard ensures a model is always retained, even when every
    # candidate scores F1 == 0; otherwise best_knn_model would stay None and
    # the later save/predict cells would operate on None.
    if best_knn_model is None or f1 > best_knn_f1:
        best_knn_f1 = f1
        best_knn_model = knn
        best_knn_acc = acc

KNN (k=3) - Accuracy: 0.7013, F1 Score: 0.5577
KNN (k=5) - Accuracy: 0.7273, F1 Score: 0.5882
KNN (k=7) - Accuracy: 0.7273, F1 Score: 0.5800


In [33]:
# Try several depth limits for the decision tree, score each fitted model on
# the validation split, and remember the one with the highest F1 (first wins
# ties). random_state is fixed so the fitted trees are repeatable.
best_dt_f1 = 0
best_dt_model = None
best_dt_acc = 0
for depth in [3, 5, 7]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    acc = accuracy_score(y_val, y_pred)
    print(f"Decision Tree (max_depth={depth}) - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    # The `is None` guard ensures a model is always retained, even when every
    # candidate scores F1 == 0; otherwise best_dt_model would stay None and
    # the later save/predict cells would operate on None.
    if best_dt_model is None or f1 > best_dt_f1:
        best_dt_f1 = f1
        best_dt_model = dt
        best_dt_acc = acc

Decision Tree (max_depth=3) - Accuracy: 0.6948, F1 Score: 0.3733
Decision Tree (max_depth=5) - Accuracy: 0.7922, F1 Score: 0.7037
Decision Tree (max_depth=7) - Accuracy: 0.7662, F1 Score: 0.5909


In [34]:
# Keep whichever tuned model reached the higher validation F1; on a tie the
# decision tree is chosen.
best_model = best_knn_model if best_knn_f1 > best_dt_f1 else best_dt_model

# Save preprocessing and model.
# ColumnTransformer fits *clones* of the estimators passed to it, so the
# top-level `scaler` and `one_hot_encoder` objects are still unfitted after
# preprocessor.fit_transform(). The fitted copies are exposed through
# `preprocessor.named_transformers_`, keyed by the step names "num" and "cat";
# those are the objects worth persisting.
joblib.dump(preprocessor.named_transformers_["num"], "scaler.pkl")
joblib.dump(preprocessor.named_transformers_["cat"], "one_hot_encoder.pkl")
joblib.dump(best_model, "best_model.pkl")

['best_model.pkl']

In [35]:
def predict_sample(sample):
    """Predict the class for one raw sample.

    Args:
        sample: Mapping of column name to value for a single row. It must
            contain "BMI" and every column selected by the notebook-level
            `preprocessor`; any "BMI_category" entry is overwritten with the
            value derived from "BMI".

    Returns:
        The first (and only) element of the model's prediction array.

    Notes:
        The model is read from "best_model.pkl" on every call. Feature
        transformation uses the fitted `preprocessor` from the notebook
        namespace, so the cell that fits it must have been run first.
    """
    # Only the model needs to come from disk: the transform below goes through
    # the in-memory `preprocessor`, so loading scaler.pkl / one_hot_encoder.pkl
    # here would be dead I/O on every call.
    model = joblib.load("best_model.pkl")

    # Process input: one-row frame plus the derived BMI category column.
    sample_df = pd.DataFrame([sample])
    sample_df["BMI_category"] = sample_df["BMI"].apply(categorize_bmi)

    transformed_sample = preprocessor.transform(sample_df)

    # Predict
    prediction = model.predict(transformed_sample)
    return prediction[0]

In [36]:
# Smoke-test predict_sample on five validation rows (fixed seed, so the same
# rows are drawn each run) with the target column removed.
sample_data = val_data.sample(5, random_state=42).drop(columns=["Outcome"])
for _row_index, sample_row in sample_data.iterrows():
    predicted_class = predict_sample(sample_row.to_dict())
    print(f"Predicted Class: {predicted_class}")


Predicted Class: 1
Predicted Class: 1
Predicted Class: 1
Predicted Class: 0
Predicted Class: 0
