In [8]:
%pip install pandas numpy scikit-learn joblib


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


In [15]:
# 2. Load Reduced Dataset
df = pd.read_csv("D:/Omar/Projects/Ai/Sprints/Heart_Disease_Project/data/reduced_heart.csv")

X = df.drop("target", axis=1)
y = (df["target"] > 0).astype(int)  

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)


Train shape: (242, 13)  Test shape: (61, 13)


In [16]:
# 3. Baseline Models (untuned)
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42)
}

print("Baseline Performance:\n")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.3f}")


Baseline Performance:

Logistic Regression Accuracy: 0.885
Decision Tree Accuracy: 0.787
Random Forest Accuracy: 0.836
SVM Accuracy: 0.852


In [17]:
# 4. GridSearchCV - Random Forest
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)

print("Best Params (GridSearchCV):", grid.best_params_)
print("Best Score (GridSearchCV):", grid.best_score_)

best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test)
print("Test Accuracy (GridSearchCV):", accuracy_score(y_test, y_pred))


Best Params (GridSearchCV): {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Best Score (GridSearchCV): 0.8303571428571429
Test Accuracy (GridSearchCV): 0.8524590163934426


In [18]:
# 5. RandomizedSearchCV - SVM
param_dist = {
    "C": [0.1, 1, 10, 100],
    "gamma": ["scale", "auto"],
    "kernel": ["linear", "rbf"]
}

rand = RandomizedSearchCV(SVC(probability=True, random_state=42), param_dist, cv=5, n_iter=5, scoring="accuracy", random_state=42)
rand.fit(X_train, y_train)

print("Best Params (RandomizedSearchCV):", rand.best_params_)
print("Best Score (RandomizedSearchCV):", rand.best_score_)

best_svm = rand.best_estimator_
y_pred = best_svm.predict(X_test)
print("Test Accuracy (RandomizedSearchCV):", accuracy_score(y_test, y_pred))


Best Params (RandomizedSearchCV): {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}
Best Score (RandomizedSearchCV): 0.8220238095238097
Test Accuracy (RandomizedSearchCV): 0.8688524590163934


In [None]:
# 6. Compare Tuned Models
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

print("\nBest Tuned Models:")

final_models = {
    "Random Forest (GridSearchCV)": best_rf,
    "SVM (RandomizedSearchCV)": best_svm
}

best_model_name = None
best_model_score = 0
best_model = None

for name, model in final_models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.3f}")
    print(classification_report(y_test, y_pred))
    
    if acc > best_model_score:
        best_model_score = acc
        best_model_name = name
        best_model = model

print(f"\n Best Model: {best_model_name} with accuracy {best_model_score:.3f}")

# Wrap best model in a pipeline (with scaling)
best_model_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", best_model)
])
best_model_pipeline.fit(X_train, y_train)

# Save best model
joblib.dump(best_model_pipeline, "final_model.pkl")
print("✔️ Best model saved as final_model.pkl")



Best Tuned Models:
Random Forest (GridSearchCV) Accuracy: 0.787
              precision    recall  f1-score   support

           0       0.75      0.91      0.82        33
           1       0.86      0.64      0.73        28

    accuracy                           0.79        61
   macro avg       0.80      0.78      0.78        61
weighted avg       0.80      0.79      0.78        61

SVM (RandomizedSearchCV) Accuracy: 0.672
              precision    recall  f1-score   support

           0       1.00      0.39      0.57        33
           1       0.58      1.00      0.74        28

    accuracy                           0.67        61
   macro avg       0.79      0.70      0.65        61
weighted avg       0.81      0.67      0.64        61


 Best Model: Random Forest (GridSearchCV) with accuracy 0.787




✔️ Best model saved as best_model.pkl
