In [1]:

import pandas as pd

# Location of the pre-processed heart disease CSV, kept in one place so it is easy to change.
DATA_PATH = "heart_augmented_2000_processed.csv"

# Read the whole file into the DataFrame that the rest of the notebook works from.
df = pd.read_csv(DATA_PATH)

In [4]:
from sklearn.model_selection import train_test_split

# The label is the 'target' column; every remaining column is used as a predictor.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import numpy as np

# Candidate optimisation algorithms to compare.
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

# One score per solver. Sized from the list so adding or removing a solver
# cannot overrun the array. The name is kept from the earlier version of this
# cell, but the values are now mean 5-fold cross-validated accuracies measured
# on the TRAINING split only.
test_score = np.zeros(len(solver))

# Choose the solver with cross-validation on the training data. Picking it by
# scoring on X_test would leak the held-out set into model selection and make
# the accuracy reported below optimistically biased.
for i, n in enumerate(solver):
    candidate = LogisticRegression(solver=n, max_iter=1000, random_state=42)
    test_score[i] = cross_val_score(
        candidate, X_train, y_train, cv=5, scoring='accuracy'
    ).mean()

# nanargmax ignores a solver whose folds failed to fit (scored as NaN) and
# returns the first index on ties.
best_solver = solver[int(np.nanargmax(test_score))]

# Fit the selected configuration on the full training split. random_state makes
# the solvers that shuffle the data (liblinear, sag, saga) reproducible.
lr = LogisticRegression(solver=best_solver, max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# The test set is used exactly once, for the final unbiased estimate.
lr_pred = lr.predict(X_test)

print(f'Best Solver: {best_solver}')
print(f'Logistic Regression Score: {accuracy_score(y_test, lr_pred)}')




Best Solver: liblinear
Logistic Regression Score: 0.8264840182648402




Support Vector Machine (SVM)

In [6]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Kernel name -> mean cross-validated weighted F1 (filled in below).
kernels = {'linear': 0, 'poly': 0, 'rbf': 0, 'sigmoid': 0}

# Score every kernel with 5-fold cross-validation on the training split only.
# Choosing the kernel by its score on X_test would leak the held-out set into
# model selection and inflate the F1 reported at the end of this cell.
for i in kernels:
    kernels[i] = cross_val_score(
        SVC(kernel=i), X_train, y_train, cv=5, scoring='f1_weighted'
    ).mean()

# Kernel with the highest mean CV score (the first one wins on ties).
best = max(kernels, key=kernels.get)

# Fit the selected kernel on the full training split.
svm = SVC(kernel=best)
svm.fit(X_train, y_train)

# The test set is used exactly once, for the final estimate.
svm_pred = svm.predict(X_test)

# Display final F1 score
print(f'SVM F1 Score (kernel={best}): {f1_score(y_test, svm_pred, average="weighted")}')


SVM F1 Score (kernel=linear): 0.7990700012567551


Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Search space for the tree. Note that the seed ('random_state') is searched
# alongside the structural hyperparameters.
param_grid = {
    'max_depth': list(range(3, 9)),
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4],
    'random_state': [0, 42],
}

# Base estimator; 'balanced' reweights classes inversely to their frequency.
dtree = DecisionTreeClassifier(class_weight='balanced')

# Exhaustive search over the grid, scored with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Rebuild a tree from the winning combination and fit it on the whole training split.
Ctree = DecisionTreeClassifier(class_weight='balanced', **grid_search.best_params_)
Ctree.fit(X_train, y_train)

# Score the tuned tree on the held-out test set.
dtc_pred = Ctree.predict(X_test)
dtc_accuracy = accuracy_score(y_test, dtc_pred)
print("Decision Tree's Accuracy:", dtc_accuracy)


Decision Tree's Accuracy: 0.9041095890410958


Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import time

# Start timing (perf_counter is monotonic, so the elapsed time cannot be
# distorted by system clock adjustments).
start = time.perf_counter()

# Base model. The fixed seed makes the bootstrap samples and feature
# sub-sampling repeatable, so the search result and the final model can be
# reproduced with Restart & Run All.
rfc = RandomForestClassifier(random_state=42)

# Deliberately small grid to keep the search fast.
param_grid = {
    'n_estimators': [100, 200],       # number of trees
    'max_features': ['sqrt', 'log2'], # features considered at each split
    'max_depth': [5, 10, 15],         # cap on tree depth
    'max_leaf_nodes': [10, 20, 30]    # cap on leaves per tree
}

# 5-fold cross-validated grid search on the training split, using all CPU cores.
grid_search = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already re-trained the winning
# configuration on the whole training split. Using that estimator avoids a
# redundant fit and guarantees the evaluated model is the one the search
# selected, not a new forest built from a different random seed.
best_rf = grid_search.best_estimator_
rf_pred = best_rf.predict(X_test)

# Print results
print(f"\n✅ Best Parameters: {grid_search.best_params_}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred):.4f}")
print(f"⏱️ Time Taken: {time.perf_counter() - start:.2f} seconds")


Fitting 5 folds for each of 36 candidates, totalling 180 fits

✅ Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': 30, 'n_estimators': 100}
Random Forest Accuracy: 0.9087
⏱️ Time Taken: 17.84 seconds


Saving Models in Pickle Files

In [11]:
import pickle

# Output file for each fitted estimator, in the order they are written.
models_to_save = {
    "logistic_regression_model.pkl": lr,
    "svm_model.pkl": svm,
    "decision_tree_model.pkl": Ctree,
    "random_forest_model.pkl": best_rf,
}

# Serialise each model to its own file; the context manager closes the file
# even if pickling fails.
for filename, model in models_to_save.items():
    with open(filename, "wb") as file:
        pickle.dump(model, file)

print("✅ All models saved successfully as pickle files!")


✅ All models saved successfully as pickle files!


In [12]:
import pickle

from sklearn.metrics import accuracy_score, f1_score

# Test-set predictions produced by the model cells above, keyed by display name.
test_predictions = {
    "Logistic Regression": lr_pred,
    "Support Vector Machine": svm_pred,
    "Decision Tree": dtc_pred,
    "Random Forest": rf_pred,
}

# Compute the metrics from the actual predictions instead of typing them in by
# hand, so the saved file always matches the models pickled by this notebook.
# F1 uses the same "weighted" averaging as the SVM cell. Values are cast to
# plain Python floats and rounded to 4 decimals so the pickle does not depend
# on NumPy scalar types.
model_metrics = {
    name: {
        "accuracy": round(float(accuracy_score(y_test, pred)), 4),
        "f1": round(float(f1_score(y_test, pred, average="weighted")), 4),
    }
    for name, pred in test_predictions.items()
}

with open("model_metrics.pkl", "wb") as f:
    pickle.dump(model_metrics, f)

# Show what was saved.
model_metrics
