In [24]:

# Requires: cleaned_heart_disease.csv and selected_features.txt

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
import joblib

#  Load cleaned dataset
DATA_CLEAN = "cleaned_heart_disease.csv"
assert os.path.exists(DATA_CLEAN), "Run preprocessing first."
df = pd.read_csv(DATA_CLEAN)

target_col = df.columns[-1]

# One-hot encode categorical columns so names match selected_features.txt
df = pd.get_dummies(df, drop_first=False)

#Load selected features
from google.colab import files
uploaded = files.upload()

sel_path = "selected_features.txt"


X = df[selected]
y = df[target_col]

#  Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#  Train-test split (stratify for class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline models
models = {
    "LogisticRegression": LogisticRegression(max_iter=5000, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42)
}

results = []
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    try:
        y_proba = model.predict_proba(X_test)[:, 1]
    except:
        y_proba = model.decision_function(X_test)

    metrics = {
        "model": name,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "auc": roc_auc_score(y_test, y_proba)
    }
    results.append(metrics)

    # Save classification report
    with open(f"{name}_classification_report.txt", "w") as f:
        f.write(classification_report(y_test, y_pred))

    # Save model
    joblib.dump(model, f"{name}_baseline.pkl")

# Save results
res_df = pd.DataFrame(results).sort_values(by="auc", ascending=False)
res_df.to_csv("evaluation_baseline_models.csv", index=False)
print(" Baseline evaluation:\n", res_df)


Saving selected_features .txt to selected_features  (2).txt
Training LogisticRegression...
Training DecisionTree...
Training RandomForest...
Training SVM...
✅ Baseline evaluation:
                 model  accuracy  precision    recall        f1       auc
1        DecisionTree  1.000000   1.000000  1.000000  1.000000  1.000000
2        RandomForest  1.000000   1.000000  1.000000  1.000000  1.000000
3                 SVM  0.892683   0.884615  0.841463  0.862500  0.957466
0  LogisticRegression  0.824390   0.780488  0.780488  0.780488  0.878743
