### Baseline Models and optimization steps for MIT data
 (Logistic Regression, Decision Tree, Naive Bayes, Random Forest)

In a separate notebook we have applied a KNN model to the MIT data (run the notebook: modeling_mit_knn.ipynb).

As we concluded before, for MIT data, we apply the following preprocessing:
resampling: Oversampling
rescaling: MinMax Scaler

If you don't have the input files listed below, run the notebook preprocessing_mit_minmax_oversampling.ipynb to generate them.

Input file: (The preprocessed data)

mitbih_train_clean_minmax_oversampling.csv
mitbih_test_clean_minmax_oversampling.csv

Output:
accuracy and classification reports of each model

In [None]:
import sys
import os

# Folder holding the preprocessed CSV files and folder receiving the result tables.
# Both are filled in below depending on where the notebook runs.
data_path = ''
model_output_path = ''

# Check whether the environment is Google Colab: the `google.colab` package is
# only present in sys.modules there.
if 'google.colab' not in sys.modules:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    # Local paths are relative to the current working directory printed above.
    data_path, model_output_path = '../data/processed/', '../models/'

else:
    print("Running on Google Colab")
    # Required libraries: scikit-learn, pandas, numpy, imbalanced-learn
    # (install them with pip in a separate cell if they are missing).

    # Mount Google Drive so the CSV files stored there become readable.
    from google.colab import drive
    drive.mount('/content/drive')
    # Set the paths where the CSV files are stored in your Google Drive.
    data_path = '/content/drive/MyDrive/Heartbeat_Project_me/preprocessed_data/'
    model_output_path = '/content/drive/MyDrive/Heartbeat_Project_me/model_output/'

Running on Google Colab
Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Input CSV files (train / test split), located under `data_path`.
# NOTE(review): the notebook header lists the *_clean_minmax_oversampling.csv files
# as input, but these paths point at the *_clean.csv files - confirm which is intended.
RawFiles = {
    'train': f'{data_path}mitbih_train_clean.csv',
    'test': f'{data_path}mitbih_test_clean.csv',
}

# Result tables written at the end of the notebook, located under `model_output_path`:
# 'model' holds the baseline comparison, 'Optimization' the tuned-model comparison.
OutputFiles = {
    'model': f'{model_output_path}baseline_models_mit_raw.csv',
    'Optimization': f'{model_output_path}optimization_baseline_models_mit_raw.csv',
}


def _read_split(split_name):
    """Read the CSV registered in RawFiles under `split_name` ('train' or 'test')."""
    return pd.read_csv(RawFiles.get(split_name), sep=',', header=0)


train = _read_split('train')
test = _read_split('test')

# The 'target' column holds the class label; every other column is a feature.
X_train, y_train = train.drop(columns='target'), train['target']
X_test, y_test = test.drop(columns='target'), test['target']

# Baseline model 1: Logistic Regression
# max_iter is raised to 500 iterations for the solver; fit() returns the
# estimator itself, so construction and training can be chained.
log_reg = LogisticRegression(max_iter=500).fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print(
    "Logistic Regression Accuracy:",
    accuracy_score(y_test, y_pred_log_reg),
)
print(
    "Logistic Regression Report:\n",
    classification_report(y_test, y_pred_log_reg),
)

# Baseline model 2: Decision Tree
# A fixed random_state makes the fitted tree (and therefore the reported
# metrics) reproducible across kernel restarts; without it the tree can
# differ from run to run.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train, y_train)
y_pred_dec_tree = dec_tree.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dec_tree))
print("Decision Tree Report:\n", classification_report(y_test, y_pred_dec_tree))



from sklearn.naive_bayes import GaussianNB

# Baseline model 4: Gaussian Naive Bayes (default settings).
# fit() returns the estimator itself, so construction and training are chained.
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print(
    "Naive Bayes Accuracy:",
    accuracy_score(y_test, y_pred_nb),
)
print(
    "Naive Bayes Report:\n",
    classification_report(y_test, y_pred_nb),
)

from sklearn.ensemble import RandomForestClassifier

# Baseline model 5: Random Forest
# random_state pins the bootstrap samples and feature sub-sampling so the
# reported metrics are reproducible; n_jobs=-1 trains the trees on all
# available cores (same setting the grid-search cell uses) without
# affecting the fitted model.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))

Logistic Regression Accuracy: 0.9114080063103924
Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95     18118
           1       0.73      0.27      0.39      2166

    accuracy                           0.91     20284
   macro avg       0.83      0.63      0.67     20284
weighted avg       0.90      0.91      0.89     20284

Decision Tree Accuracy: 0.9631236442516269
Decision Tree Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     18118
           1       0.84      0.81      0.82      2166

    accuracy                           0.96     20284
   macro avg       0.91      0.90      0.90     20284
weighted avg       0.96      0.96      0.96     20284

Naive Bayes Accuracy: 0.8454939854072175
Naive Bayes Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91     18118
           1       0.31      0.

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Function to evaluate models and store results
def evaluate_model(model, X_test, y_test, model_name, results):
    """Evaluate a fitted classifier on the test data and record its metrics.

    Prints the accuracy and the text classification report, then stores the
    accuracy together with the weighted-average precision, recall and
    f1-score in ``results`` under the key ``model_name``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Test features passed to ``model.predict``.
    y_test : array-like
        True labels for ``X_test``.
    model_name : str
        Label used in the printed output and as the key in ``results``.
    results : dict
        Mutated in place: ``results[model_name]`` is set to the metrics dict
        (an existing entry with the same name is overwritten).

    Returns
    -------
    dict
        The metrics entry stored in ``results[model_name]`` with the keys
        'accuracy', 'precision', 'recall' and 'f1-score'.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # The report is needed twice: as text for display and as a dict for the
    # weighted-average figures.
    report_text = classification_report(y_test, y_pred)
    weighted_avg = classification_report(y_test, y_pred, output_dict=True)['weighted avg']

    print(f"{model_name}:")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n {report_text}")

    metrics = {
        'accuracy': accuracy,
        'precision': weighted_avg['precision'],
        'recall': weighted_avg['recall'],
        'f1-score': weighted_avg['f1-score'],
    }
    results[model_name] = metrics
    return metrics

# Collects one metrics entry per model, keyed by the model name.
results = {}

# Fitted baseline models in evaluation order.
# (Baseline model 3, Support Vector Machine, is not evaluated in this notebook.)
baseline_models = {
    "Logistic Regression": log_reg,  # Baseline model 1
    "Decision Tree": dec_tree,       # Baseline model 2
    "Naive Bayes": nb,               # Baseline model 4
    "Random Forest": rf,             # Baseline model 5
}
for baseline_name, baseline_model in baseline_models.items():
    evaluate_model(baseline_model, X_test, y_test, baseline_name, results)

# Transpose so that each row is a model and each column a metric.
results_df = pd.DataFrame(results).T
print("\nComparison of Baseline Models:")
print(results_df)



Logistic Regression:
Accuracy: 0.9114080063103924
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95     18118
           1       0.73      0.27      0.39      2166

    accuracy                           0.91     20284
   macro avg       0.83      0.63      0.67     20284
weighted avg       0.90      0.91      0.89     20284

Decision Tree:
Accuracy: 0.9631236442516269
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     18118
           1       0.84      0.81      0.82      2166

    accuracy                           0.96     20284
   macro avg       0.91      0.90      0.90     20284
weighted avg       0.96      0.96      0.96     20284

Naive Bayes:
Accuracy: 0.8454939854072175
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91     18118
           1       0.31      

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Additional baseline: K-Nearest Neighbours with k = 5.
# fit() returns the estimator itself, so construction and training are chained.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn_model = knn_model.predict(X_test)
print(
    "KNN Accuracy:",
    accuracy_score(y_test, y_pred_knn_model),
)
print(
    "KNN Report:\n",
    classification_report(y_test, y_pred_knn_model),
)

KNN Accuracy: 0.9789489252612897
KNN Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99     18118
           1       0.95      0.85      0.90      2166

    accuracy                           0.98     20284
   macro avg       0.97      0.92      0.94     20284
weighted avg       0.98      0.98      0.98     20284



In [None]:
from sklearn.model_selection import GridSearchCV
# Metrics of the tuned models, keyed by "Tuned <model name>".
results = {}


def _tune_and_evaluate(name, estimator, param_grid):
    """Grid-search ``estimator`` over ``param_grid`` and evaluate the best model.

    Runs a 3-fold cross-validated GridSearchCV on the notebook-level
    ``X_train`` / ``y_train``, prints the best parameters under the correct
    model name, and records the test-set metrics of the refitted best
    estimator in the notebook-level ``results`` dict as ``"Tuned <name>"``.

    Parameters
    ----------
    name : str
        Human-readable model name used in the printed output and result key.
    estimator : sklearn estimator
        Unfitted estimator to tune.
    param_grid : dict
        Parameter grid handed to GridSearchCV.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_estimator_``, ``best_params_``, ...).
    """
    search = GridSearchCV(estimator, param_grid, cv=3, n_jobs=-1)
    search.fit(X_train, y_train)
    print(f"Best parameters for {name}: {search.best_params_}")
    evaluate_model(search.best_estimator_, X_test, y_test, f"Tuned {name}", results)
    return search


# Hyperparameter tuning for Logistic Regression
# max_iter=500 matches the baseline model; the default iteration limit triggers
# "TOTAL NO. of ITERATIONS REACHED LIMIT" convergence warnings on this data.
param_grid_log_reg = {'C': [0.1, 1, 10, 100]}
grid_log_reg = _tune_and_evaluate(
    "Logistic Regression", LogisticRegression(max_iter=500), param_grid_log_reg
)
best_log_reg = grid_log_reg.best_estimator_

# Hyperparameter tuning for Decision Tree
# random_state keeps the search result reproducible across runs.
param_grid_dec_tree = {'max_depth': [3, 5, 7, 10]}
grid_dec_tree = _tune_and_evaluate(
    "Decision Tree", DecisionTreeClassifier(random_state=42), param_grid_dec_tree
)
best_dec_tree = grid_dec_tree.best_estimator_

# NOTE: Support Vector Machine tuning is not run in this notebook.

# Hyperparameter tuning for Naive Bayes
param_grid_nb = {'var_smoothing': [1e-9, 1e-8, 1e-7]}
grid_nb = _tune_and_evaluate("Naive Bayes", GaussianNB(), param_grid_nb)
best_nb = grid_nb.best_estimator_

# Hyperparameter tuning for Random Forest
# random_state keeps the search result reproducible across runs.
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}
grid_rf = _tune_and_evaluate(
    "Random Forest", RandomForestClassifier(random_state=42), param_grid_rf
)
best_rf = grid_rf.best_estimator_

# Create a DataFrame to display the results (one row per tuned model).
results_df_grid = pd.DataFrame(results).T
print("\nComparison of Hyperparameter Tuned Models:")
print(results_df_grid)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters for Logistic Regression: {'C': 100}
Tuned Logistic Regression:
Accuracy: 0.8067935318477618
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.81      0.88     18118
           1       0.33      0.77      0.46      2166

    accuracy                           0.81     20284
   macro avg       0.65      0.79      0.67     20284
weighted avg       0.90      0.81      0.84     20284

Best parameters for Logistic Regression: {'max_depth': 10}
Tuned Decision Tree:
Accuracy: 0.9413330704003156
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.97     18118
           1       0.69      0.83      0.75      2166

    accuracy                           0.94     20284
   macro avg       0.83      0.89      0.86     20284
weighted avg       0.95      0.94      0.94     20284

Best parameters for Logistic Regression: {'var_smoothing': 1e-09}
Tuned Naive Bay

In [None]:
# Save both comparison tables as CSV files.
# The model names live in the DataFrame index (the result dicts were
# transposed), so the index has to be written out; with index=False the rows
# of the saved file could no longer be attributed to a model.
for frame, output_key in ((results_df, 'model'), (results_df_grid, 'Optimization')):
    output_file = OutputFiles[output_key]
    output_dir = os.path.dirname(output_file)
    if output_dir:
        # Create the target folder on first use instead of failing in to_csv.
        os.makedirs(output_dir, exist_ok=True)
    frame.to_csv(output_file, index_label='model')
    print(f"DataFrame saved as CSV file at: {output_file}")

DataFrame saved as CSV file at: /content/drive/MyDrive/Heartbeat_Project_me/model_output/baseline_models_mit.csv
DataFrame saved as CSV file at: /content/drive/MyDrive/Heartbeat_Project_me/model_output/optimization_baseline_models_mit.csv


In [None]:
from datetime import datetime
# Print the wall-clock time at which this cell was executed.
print(f"Current time: {datetime.now()}")

Current time: 2024-11-06 13:38:41.831722
