## Model Training

#### Import Required Packages

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb


#### Import the CSV Data as Pandas DataFrame

In [83]:
# Read the heart dataset CSV into a DataFrame.
DATA_PATH = 'data/heart.csv'
df = pd.read_csv(DATA_PATH)

#### Show Top 5 Records

In [84]:
# Preview the first five rows (five is also head()'s default).
df.head(n=5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


#### Data preprocessing

In [85]:
# Feature matrix: every column except the 'output' target, as a NumPy array.
feature_frame = df.drop(columns=['output'])
X = feature_frame.values
X.shape

(303, 13)

In [86]:
# Target vector: the 'output' column as a NumPy array.
target_column = df.loc[:, 'output']
y = target_column.values
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### Separate Dataset into Train and Test

In [87]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((242, 13), (61, 13), (242,), (61,))

#### Feature Scaling

In [88]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Create an Evaluation Function That Returns All Metrics After Model Training

In [89]:
def evaluate_model(y_true, y_pred):
    """Compute classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, confusion_matrix)``,
        each computed by the matching scikit-learn metric function with its
        default arguments.
    """
    scalar_scorers = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [scorer(y_true, y_pred) for scorer in scalar_scorers]
    return (*scores, confusion_matrix(y_true, y_pred))

In [90]:
# Candidate classifiers keyed by display name. The tuning loop iterates this
# dict, so insertion order is the order in which models are tuned.
RANDOM_STATE = 42  # shared seed for every estimator that is given one below

models = {
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "XGBoost": xgb.XGBClassifier(random_state=RANDOM_STATE),
    "Support Vector Machine (SVM)": SVC(),
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Neural Network (MLP)": MLPClassifier(
        tol=1e-3, max_iter=500, random_state=RANDOM_STATE
    ),
    # The boosting algorithm is pinned explicitly rather than left to the default.
    "AdaBoost": AdaBoostClassifier(algorithm='SAMME', random_state=RANDOM_STATE),
    "Extra Trees": ExtraTreesClassifier(random_state=RANDOM_STATE),
}

In [91]:
# Hyperparameter search spaces, keyed by the same display names as `models`.
# A model whose grid is empty is fitted as-is by the loop below, with no search.

# Grid shared by the two forest-style ensembles (Random Forest, Extra Trees).
_forest_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Settings common to both gradient-boosting implementations.
_boosting_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.05],
    'max_depth': [3, 5, 7],
}

param_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
    },
    "Decision Tree": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy'],
    },
    "Random Forest": dict(_forest_grid),
    "Gradient Boosting": {**_boosting_grid, 'min_samples_split': [2, 5, 10]},
    "XGBoost": {**_boosting_grid, 'subsample': [0.6, 0.8, 1.0]},
    "Support Vector Machine (SVM)": {
        'C': [0.1, 1, 3, 10, 100],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': ['scale', 'auto'],
    },
    "K-Nearest Neighbors (KNN)": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
    "Naive Bayes": {},  # empty grid: no search is run for this model
    "Neural Network (MLP)": {
        'hidden_layer_sizes': [(50,), (100,)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'learning_rate': ['constant', 'adaptive'],
    },
    "AdaBoost": {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 1, 10],
    },
    "Extra Trees": dict(_forest_grid),
}

# Tune every model that has a non-empty grid with an exhaustive 5-fold
# cross-validated grid search scored on accuracy; fit the others as they are.
best_estimators = {}
for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    param_grid = param_grids.get(model_name, {})

    if not param_grid:
        # Nothing to search over: fit the model with its current settings.
        model.fit(X_train, y_train)
        best_estimators[model_name] = model
        continue

    search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    search.fit(X_train, y_train)
    best_estimators[model_name] = search.best_estimator_
    print(f"Best parameters for {model_name}: {search.best_params_}\n")

Tuning Logistic Regression...


Best parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}

Tuning Decision Tree...
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2}

Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}

Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 300}

Tuning XGBoost...
Best parameters for XGBoost: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}

Tuning Support Vector Machine (SVM)...
Best parameters for Support Vector Machine (SVM): {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}

Tuning K-Nearest Neighbors (KNN)...
Best parameters for K-Nearest Neighbors (KNN): {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}

Tuning Naive Bayes...
Tuning Neural Net

In [92]:
# Dictionary to store evaluation results, keyed by model name and then by
# split ("Train" / "Test").
results = {}

# ANSI escape codes used to print each model name in bold.
BOLD = '\033[1m'
RESET = '\033[0m'

# Names for the values returned by evaluate_model(), in its return order.
METRIC_NAMES = ("Accuracy", "Precision", "Recall", "F1 Score", "Confusion Matrix")


def _print_split_report(split_label, metrics):
    """Print one split's scalar metrics (four decimals) and its confusion matrix.

    Args:
        split_label: Text inserted into the heading, e.g. "Training" or "Test".
        metrics: Mapping from each name in METRIC_NAMES to its value.
    """
    print(f"Model performance for {split_label} set")
    for metric_name in METRIC_NAMES[:-1]:
        print(f"- {metric_name}: {metrics[metric_name]:.4f}")
    print("- Confusion Matrix:\n", metrics["Confusion Matrix"])


# Evaluate each tuned estimator on both splits.
# Every estimator in best_estimators was already fitted on X_train by the
# tuning loop (GridSearchCV refits best_estimator_ on the data passed to
# fit(); models without a grid are fitted directly), so it is not fitted a
# second time here.
for model_name, model in best_estimators.items():
    print(f"{BOLD}{model_name}:{RESET}")

    # Make predictions on both train and test sets
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Pair each returned value with its metric name
    train_metrics = dict(zip(METRIC_NAMES, evaluate_model(y_train, y_train_pred)))
    test_metrics = dict(zip(METRIC_NAMES, evaluate_model(y_test, y_test_pred)))

    # Store results
    results[model_name] = {"Train": train_metrics, "Test": test_metrics}

    # Print results for each model
    _print_split_report("Training", train_metrics)
    print('----------------------------------')
    _print_split_report("Test", test_metrics)

    print("="*35)
    print('\n')

[1mLogistic Regression:[0m
Model performance for Training set
- Accuracy: 0.8223
- Precision: 0.8039
- Recall: 0.9044
- F1 Score: 0.8512
- Confusion Matrix:
 [[ 76  30]
 [ 13 123]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8689
- Precision: 0.8000
- Recall: 0.9655
- F1 Score: 0.8750
- Confusion Matrix:
 [[25  7]
 [ 1 28]]


[1mDecision Tree:[0m
Model performance for Training set
- Accuracy: 0.9339
- Precision: 0.9412
- Recall: 0.9412
- F1 Score: 0.9412
- Confusion Matrix:
 [[ 98   8]
 [  8 128]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8197
- Precision: 0.7500
- Recall: 0.9310
- F1 Score: 0.8308
- Confusion Matrix:
 [[23  9]
 [ 2 27]]


[1mRandom Forest:[0m
Model performance for Training set
- Accuracy: 0.9339
- Precision: 0.9348
- Recall: 0.9485
- F1 Score: 0.9416
- Confusion Matrix:
 [[ 97   9]
 [  7 129]]
----------------------------------
Model performance for Test set
- Accuracy: 0.9016
- Precision:

### Results

In [93]:
# Gather the scalar test-set metrics for every model. The frame is built with
# one column per model and transposed for display (one row per model).
SUMMARY_METRICS = ["Accuracy", "Precision", "Recall", "F1 Score"]

test_scores = {
    model_label: {metric: scores["Test"][metric] for metric in SUMMARY_METRICS}
    for model_label, scores in results.items()
}
results_df = pd.DataFrame(test_scores)

print("\nSummary of Test Set Performance:")
display(results_df.T)


Summary of Test Set Performance:


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.868852,0.8,0.965517,0.875
Decision Tree,0.819672,0.75,0.931034,0.830769
Random Forest,0.901639,0.828571,1.0,0.90625
Gradient Boosting,0.885246,0.823529,0.965517,0.888889
XGBoost,0.868852,0.783784,1.0,0.878788
Support Vector Machine (SVM),0.868852,0.783784,1.0,0.878788
K-Nearest Neighbors (KNN),0.868852,0.783784,1.0,0.878788
Naive Bayes,0.885246,0.866667,0.896552,0.881356
Neural Network (MLP),0.901639,0.848485,0.965517,0.903226
AdaBoost,0.918033,0.875,0.965517,0.918033
