In [4]:
import pandas as pd
import os
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np

# Load cleaned dataset
DATA_CLEAN = "cleaned_heart_disease.csv"
assert os.path.exists(DATA_CLEAN), f"{DATA_CLEAN} not found."
df = pd.read_csv(DATA_CLEAN)
target_col = df.columns[-1]


#Load selected features
from google.colab import files
uploaded = files.upload()  # upload selected_features.txt

# Load selected features
sel_path = "selected_features.txt"
if os.path.exists(sel_path):
    with open(sel_path, "r") as f:
        selected = [line.strip() for line in f if line.strip()]
    print("Using selected features:", selected)
else:
    selected = df.columns[:-1].tolist()




X = df[selected]
y = df[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling (needed for SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1) GridSearch for RandomForest
rf = RandomForestClassifier(random_state=42)
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5]
}
gs_rf = GridSearchCV(rf, param_grid, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
gs_rf.fit(X_train, y_train)

print("Best RF params:", gs_rf.best_params_)
best_rf = gs_rf.best_estimator_
joblib.dump(gs_rf, "rf_gridsearch.pkl")
joblib.dump(best_rf, "rf_best_model.pkl")

y_proba_rf = best_rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, y_proba_rf)
print("RF test AUC:", auc_rf)
with open("rf_best_report.txt", "w") as f:
    f.write(classification_report(y_test, best_rf.predict(X_test)))

#  2) RandomizedSearch for SVM
svm = SVC(probability=True, random_state=42)
param_dist = {
    "C": [0.1, 1, 10, 100],
    "gamma": ['scale', 'auto', 0.01, 0.1, 1],
    "kernel": ['rbf', 'poly']
}
rs_svm = RandomizedSearchCV(svm, param_dist, n_iter=10, cv=StratifiedKFold(5),
                            scoring='roc_auc', n_jobs=-1, random_state=42)
rs_svm.fit(X_train_scaled, y_train)

print("Best SVM params:", rs_svm.best_params_)
best_svm = rs_svm.best_estimator_
joblib.dump(rs_svm, "svm_randomsearch.pkl")
joblib.dump(best_svm, "svm_best_model.pkl")

y_proba_svm = best_svm.predict_proba(X_test_scaled)[:, 1]
auc_svm = roc_auc_score(y_test, y_proba_svm)
print("SVM test AUC:", auc_svm)
with open("svm_best_report.txt", "w") as f:
    f.write(classification_report(y_test, best_svm.predict(X_test_scaled)))

# 3) Select Final Model
if auc_rf >= auc_svm:
    final_model = best_rf
    final_auc = auc_rf
    chosen = "RandomForest"
else:
    final_model = best_svm
    final_auc = auc_svm
    chosen = "SVM"

print(f"Chosen final model: {chosen} with AUC {final_auc:.4f}")
joblib.dump(final_model, "final_model.pkl")

# Save evaluation summary
with open("evaluation_metrics.txt", "w") as f:
    f.write(f"RF best params: {gs_rf.best_params_}\n")
    f.write(f"RF test AUC: {auc_rf}\n")
    f.write(f"SVM best params: {rs_svm.best_params_}\n")
    f.write(f"SVM test AUC: {auc_svm}\n")
    f.write(f"Chosen final model: {chosen} with AUC {final_auc}\n")

print("Evaluation summary saved to evaluation_metrics.txt")


Saving selected_features .txt to selected_features  (3).txt
Best RF params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
RF test AUC: 1.0
Best SVM params: {'kernel': 'rbf', 'gamma': 0.1, 'C': 10}
SVM test AUC: 1.0
Chosen final model: RandomForest with AUC 1.0000
Evaluation summary saved to evaluation_metrics.txt
