In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import joblib
import warnings
warnings.filterwarnings("ignore")

# ===========================
# 1. Load Dataset
# ===========================
# The CSV path is relative, so it is resolved against the current working
# directory at run time.
DATA_PATH = "diabetes_data.csv"

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# ===========================
# 2. Encode Categorical Variables
# ===========================
print("Encoding categorical columns...")

# Label -> integer code for every text feature column.
binary_map = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}
# Label -> integer code for the target column.
target_map = {'Positive': 1, 'Negative': 0}


def _encode_labels(series, mapping):
    """Map the labels in *series* to integer codes using *mapping*.

    Args:
        series: Column of text labels to encode.
        mapping: Dict from label to integer code.

    Returns:
        A new Series holding the mapped codes. Entries that were already
        missing in *series* stay missing.

    Raises:
        ValueError: If *series* contains a label that has no entry in
            *mapping*. ``Series.map`` would otherwise turn such labels
            into NaN without any signal.
    """
    encoded = series.map(mapping)
    # Present before mapping but NaN afterwards => the label had no entry
    # in the mapping (typo, different casing, stray whitespace, ...).
    unknown = series[encoded.isna() & series.notna()].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Column {series.name!r} contains labels not covered by the "
            f"encoding map: {sorted(map(str, unknown))}"
        )
    return encoded


for col in df.columns:
    if col == 'class':
        # The target has its own mapping and is handled below.
        continue
    column = df[col]
    # Text columns may be stored as object dtype or as pandas' dedicated
    # string dtype depending on the pandas version/configuration; accept both.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        df[col] = _encode_labels(column, binary_map)

# Encode target: Positive = 1, Negative = 0
df['class'] = _encode_labels(df['class'], target_map)

# ===========================
# 3. Separate Features and Target
# ===========================
# 'class' is the label column; every other column is used as a feature.
y = df['class']
X = df.drop('class', axis=1)

# ===========================
# 4. Train-Test Split
# ===========================
# 20% of the rows are held out for testing. The split is stratified on y and
# uses a fixed random_state so repeated runs produce the same partition.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)

print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# ===========================
# 5. Scale Only 'Age' Column
# ===========================
print("Scaling the Age column...")
scaler = StandardScaler()

# X_train / X_test are row selections of X. Assigning a column into them
# directly is the pattern pandas reports with SettingWithCopyWarning (hidden
# in this script by the global warnings filter). DataFrame.assign returns a
# new frame with 'Age' replaced in its original position, so nothing is
# written into a selection.
# The scaler is fitted on the training rows only and then re-used, unchanged,
# for the test rows.
X_train = X_train.assign(Age=np.ravel(scaler.fit_transform(X_train[['Age']])))
X_test = X_test.assign(Age=np.ravel(scaler.transform(X_test[['Age']])))

# Save the scaler
joblib.dump(scaler, "scaler.pkl")
print("Scaler saved as scaler.pkl")

# ===========================
# 6. Define Models
# ===========================
# Candidate classifiers keyed by the display name used in the results table.
# Estimators that are given a random_state all share the same seed.
SEED = 42
seeded = {"random_state": SEED}

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(**seeded),
    "Gradient Boosting": GradientBoostingClassifier(**seeded),
    "Decision Tree": DecisionTreeClassifier(**seeded),
    "KNN": KNeighborsClassifier(),
    # probability=True is what makes predict_proba available on SVC.
    "SVM": SVC(probability=True, **seeded),
    "Naive Bayes": GaussianNB(),
}

# ===========================
# 7. Train & Evaluate Models
# ===========================
# Fit every candidate on the training split and score it on the test split.
# One dict per model is collected and turned into a comparison table.
results = []
print("Training models...")

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC needs a continuous score for the positive class. Prefer the
    # class-1 probability; fall back to decision_function margins for
    # estimators that expose no predict_proba.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = model.decision_function(X_test)
    else:
        y_prob = None

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # NaN (not the string "N/A") marks a missing score so the "ROC AUC"
    # column stays numeric and can be sorted or aggregated.
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan

    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:\n")
print(results_df)

# ===========================
# 8. Select Best Model
# ===========================
# Order the comparison table by F1 score, highest first; the first row names
# the winner, and the fitted estimator is looked up by that name.
ranked = results_df.sort_values(by="F1 Score", ascending=False)
best_model_name = ranked['Model'].iloc[0]
best_model = models[best_model_name]

print(f"\nBest Model Selected: {best_model_name}")

# ===========================
# 9. Save Best Model
# ===========================
# Persist the selected estimator with joblib.
joblib.dump(best_model, "best_diabetes_model.pkl")
print("Best model saved as best_diabetes_model.pkl")
