# Baseline Models and optimization steps for raw PTB data
 (Logistic Regression, KNN, Naive Bayes, Random Forest)

The decision tree model for the PTB data is applied in a separate notebook (run: modeling_ptb_decision_tree.ipynb).


Input file: raw data

ptbdb_test_clean.csv
ptbdb_train_clean.csv

Output:
accuracy and classification reports of each model

In [None]:
import sys
import os

# Start from empty paths; the branch below fills them in for the detected runtime.
data_path, model_output_path = '', ''

# Choose the data / model folders depending on whether the notebook runs
# locally or on Google Colab.
if 'google.colab' not in sys.modules:
    print("Running on local environment")

    # Show the directory that the relative paths below are resolved against.
    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/processed/'
    model_output_path = '../models/'

else:
    print("Running on Google Colab")
    # If a library is missing on the Colab runtime, install it with pip first
    # (scikit-learn, pandas, numpy, imbalanced-learn).

    # Mount Google Drive so the CSV files stored there can be read.
    from google.colab import drive
    drive.mount('/content/drive')
    # Google Drive folders holding the input CSV files and the model outputs.
    data_path = '/content/drive/MyDrive/Heartbeat_Project_me/preprocessed_data/'
    model_output_path = '/content/drive/MyDrive/Heartbeat_Project_me/model_output/'

Running on Google Colab
Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_curve, auc, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Estimators used further down; imported up front instead of between the models.
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Input CSV files (noted by the authors as already standard-scaled).
RawFiles = {
    'test': data_path + 'ptbdb_test_clean.csv',
    'train': data_path + 'ptbdb_train_clean.csv',
}

# Destination CSV files for the baseline and the tuned-model comparison tables.
OutputFiles = {
    'model': model_output_path + 'baseline_models_PTB_raw.csv',
    'Optimization': model_output_path + 'optimization_baseline_models_PTB_raw.csv',
}

# Index the dict directly so a wrong key fails immediately instead of passing None to read_csv.
train = pd.read_csv(RawFiles['train'], sep=',', header=0)
test = pd.read_csv(RawFiles['test'], sep=',', header=0)

# The 'target' column is the label; every other column is a feature.
y_train = train['target']
X_train = train.drop('target', axis=1)

y_test = test['target']
X_test = test.drop('target', axis=1)

# Baseline model 1: Logistic Regression
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Logistic Regression Report:\n", classification_report(y_test, y_pred_log_reg))

# Baseline model 2: KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn_model = knn_model.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn_model))
print("KNN Report:\n", classification_report(y_test, y_pred_knn_model))

# Baseline model 3: Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Report:\n", classification_report(y_test, y_pred_nb))

# Baseline model 4: Random Forest
# A fixed random_state makes the bootstrap samples and feature draws repeatable,
# so the reported scores are the same on every "Restart & Run All".
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))

Logistic Regression Accuracy: 0.8131226382686362
Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.72      0.57      0.64       838
           1       0.84      0.91      0.87      2073

    accuracy                           0.81      2911
   macro avg       0.78      0.74      0.76      2911
weighted avg       0.81      0.81      0.81      2911

KNN Accuracy: 0.932669185846788
KNN Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89       838
           1       0.96      0.94      0.95      2073

    accuracy                           0.93      2911
   macro avg       0.91      0.92      0.92      2911
weighted avg       0.93      0.93      0.93      2911

Naive Bayes Accuracy: 0.602885606320852
Naive Bayes Report:
               precision    recall  f1-score   support

           0       0.41      0.81      0.54       838
           1       0.87      0.52      0.65      2073

In [None]:
# Additional baseline: Decision Tree
# (not one of the four numbered baselines; "Baseline model 2" is KNN)
# A fixed random_state makes tie-breaking between equally good splits repeatable,
# so the tree and its scores do not change from run to run.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train, y_train)
y_pred_dec_tree = dec_tree.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dec_tree))
print("Decision Tree Report:\n", classification_report(y_test, y_pred_dec_tree))

Decision Tree Accuracy: 0.9223634489866025
Decision Tree Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       838
           1       0.95      0.94      0.95      2073

    accuracy                           0.92      2911
   macro avg       0.90      0.91      0.91      2911
weighted avg       0.92      0.92      0.92      2911



In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Function to evaluate models and store results
def evaluate_model(model, X_test, y_test, model_name, results):
    """Score a fitted model on the test set, print a summary and record the metrics.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : test features passed to ``model.predict``.
    y_test : true labels the predictions are compared against.
    model_name : str
        Label used in the printed output and as the key in ``results``.
    results : dict
        Updated in place: ``results[model_name]`` is set to a dict holding the
        accuracy and the weighted-average precision, recall and f1-score.

    Returns
    -------
    None
    """
    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)

    # Dict form of the report; only its 'weighted avg' row is stored.
    weighted_avg = classification_report(y_test, predictions, output_dict=True)['weighted avg']

    print(f"{model_name}:")
    print(f"Accuracy: {acc}")
    # Text form of the same report, for the full per-class breakdown.
    text_report = classification_report(y_test, predictions)
    print(f"Classification Report:\n {text_report}")

    summary = {'accuracy': acc}
    for metric in ('precision', 'recall', 'f1-score'):
        summary[metric] = weighted_avg[metric]
    results[model_name] = summary

# Dictionary to store results
results = {}

# Score every fitted baseline on the test set, in the order the models were trained.
for _label, _fitted in (
    ("Logistic Regression", log_reg),  # Baseline model 1
    ("KNN", knn_model),                # Baseline model 2
    ("Naive Bayes", nb),               # Baseline model 3
    ("Random Forest", rf),             # Baseline model 4
):
    evaluate_model(_fitted, X_test, y_test, _label, results)

# Transposed so that each row is a model and each column a metric.
results_df = pd.DataFrame(results).transpose()
print("\nComparison of Baseline Models:")
print(results_df)



Logistic Regression:
Accuracy: 0.7877018206801787
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.83      0.69       814
           1       0.92      0.77      0.84      2097

    accuracy                           0.79      2911
   macro avg       0.75      0.80      0.76      2911
weighted avg       0.83      0.79      0.80      2911

KNN:
Accuracy: 0.908278941944349
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.95      0.85       814
           1       0.98      0.89      0.93      2097

    accuracy                           0.91      2911
   macro avg       0.88      0.92      0.89      2911
weighted avg       0.92      0.91      0.91      2911

Naive Bayes:
Accuracy: 0.6214359326691858
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.83      0.55       814
           1       0.89      0.54      0

In [None]:
from sklearn.model_selection import GridSearchCV
# Fresh dictionary for the tuned models (the baseline table was already built from the old one).
results = {}

# Hyperparameter tuning for Logistic Regression
# max_iter is raised above the default because the solver stopped on its
# iteration limit with the default setting (ConvergenceWarning).
param_grid_log_reg = {'C': [0.1, 1, 10, 100]}
grid_log_reg = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_log_reg, cv=3, n_jobs=-1)
grid_log_reg.fit(X_train, y_train)
best_log_reg = grid_log_reg.best_estimator_
print(f"Best parameters for Logistic Regression: {grid_log_reg.best_params_}")
evaluate_model(best_log_reg, X_test, y_test, "Tuned Logistic Regression", results)


# Hyperparameter tuning for KNN
# 'minkowski' is not listed: with the default p=2 it is the same distance as
# 'euclidean', so it would only repeat candidates that are already in the grid.
param_grid_knn = {
    'n_neighbors': [3, 5, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=3, n_jobs=-1)
grid_knn.fit(X_train, y_train)
best_KNN = grid_knn.best_estimator_
print(f"Best parameters for KNN: {grid_knn.best_params_}")
evaluate_model(best_KNN, X_test, y_test, "Tuned KNN", results)


# Support Vector Machine tuning is intentionally not run in this notebook.

# Hyperparameter tuning for Naive Bayes
param_grid_nb = {'var_smoothing': [1e-9, 1e-8, 1e-7]}
grid_nb = GridSearchCV(GaussianNB(), param_grid_nb, cv=3, n_jobs=-1)
grid_nb.fit(X_train, y_train)
best_nb = grid_nb.best_estimator_
# The message names the model that was actually tuned (it used to say "Logistic Regression").
print(f"Best parameters for Naive Bayes: {grid_nb.best_params_}")
evaluate_model(best_nb, X_test, y_test, "Tuned Naive Bayes", results)

# Hyperparameter tuning for Random Forest
# A fixed random_state keeps the forests identical between runs, so the selected
# parameters and the reported scores are reproducible.
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, n_jobs=-1)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
# The message names the model that was actually tuned (it used to say "Logistic Regression").
print(f"Best parameters for Random Forest: {grid_rf.best_params_}")
evaluate_model(best_rf, X_test, y_test, "Tuned Random Forest", results)


# Create a DataFrame to display the results: one row per tuned model, one column per metric.
results_df_grid = pd.DataFrame(results).T
print("\nComparison of Hyperparameter Tuned Models:")
print(results_df_grid)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters for Logistic Regression: {'C': 100}
Tuned Logistic Regression:
Accuracy: 0.7842665750601168
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.83      0.68       814
           1       0.92      0.77      0.84      2097

    accuracy                           0.78      2911
   macro avg       0.75      0.80      0.76      2911
weighted avg       0.82      0.78      0.79      2911



  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Tuned KNN:
Accuracy: 0.95499828237719
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92       814
           1       0.99      0.95      0.97      2097

    accuracy                           0.95      2911
   macro avg       0.93      0.96      0.95      2911
weighted avg       0.96      0.95      0.96      2911

Best parameters for Logistic Regression: {'var_smoothing': 1e-09}
Tuned Naive Bayes:
Accuracy: 0.6214359326691858
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.83      0.55       814
           1       0.89      0.54      0.67      2097

    accuracy                           0.62      2911
   macro avg       0.65      0.68      0.61      2911
weighted avg       0.76      0.62      0.64      2911

Best parameters for Logistic Regression: {'max_depth': 3

In [None]:
# Save both comparison tables as CSV files.
# The model names are the DataFrame index, so the index is written out (as a
# 'model' column); with index=False the rows could not be told apart in the file.
for _frame, _key in ((results_df, 'model'), (results_df_grid, 'Optimization')):
    _target = OutputFiles[_key]

    # Create the output folder if it does not exist yet, so to_csv cannot fail on a missing directory.
    _folder = os.path.dirname(_target)
    if _folder:
        os.makedirs(_folder, exist_ok=True)

    _frame.to_csv(_target, index=True, index_label='model')
    print(f"DataFrame saved as CSV file at: {_target}")

DataFrame saved as CSV file at: /content/drive/MyDrive/Heartbeat_Project_me/model_output/baseline_models_PTB.csv
DataFrame saved as CSV file at: /content/drive/MyDrive/Heartbeat_Project_me/model_output/optimization_baseline_models_PTB.csv


In [None]:
from datetime import datetime

# Print the wall-clock time at which this cell (the last one) was executed.
finished_at = datetime.now()
print("Current time:", finished_at)

Current time: 2024-11-06 14:08:57.926304
