# MIT bagging_boosting models
  

Input: the cleaned raw data with a dummy-coded target variable, because these models need neither scaling nor oversampling (run `preprocessing_mit_clean.ipynb` before running this notebook):  
mitbih_test_clean.csv   
mitbih_train_clean.csv

Output:
accuracy and classification reports of each model


In [None]:
import sys
import os

data_path = ''
model_output_path = ''
# check if the enviorment is Google Colab

if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    !pip install dask[dataframe]
    # Install required libraries
    # !pip install scikit-learn -q
    # !pip install pandas -q
    # !pip install numpy -q
    # !pip install imbalanced-learn -q

    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive.
    data_path = '/content/drive/MyDrive/Heartbeat_Project_me/preprocessed_data/'
    model_output_path = '/content/drive/MyDrive/Heartbeat_Project_me/model_output/'

else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/processed/'
    model_output_path = '../models/'

Running on Google Colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np


# Input CSV files (written by the preprocessing notebook), keyed by split.
RawFiles = dict(
    train=data_path + 'mitbih_train_clean.csv',
    test=data_path + 'mitbih_test_clean.csv',
)

# Destination of the per-model metrics table.
OutputFiles = dict(
    model=model_output_path + 'bagging_bossting_models_mit.csv',
)

# Load both splits; the first row of each file holds the column names.
train = pd.read_csv(RawFiles['train'], sep=',', header=0)
test = pd.read_csv(RawFiles['test'], sep=',', header=0)

# Separate the label column ('target') from the feature columns.
X_train, y_train = train.drop(columns='target'), train['target']
X_test, y_test = test.drop(columns='target'), test['target']

In [None]:


# Define ensemble models to evaluate.
# Notes:
# - "Balanced Random Forest" is a regular RandomForestClassifier with
#   class_weight='balanced'; the key is kept because it is written to the
#   results CSV as the model name.
# - Only the random forest and LightGBM use class weighting; Gradient Boosting
#   and XGBoost are trained on the unweighted class distribution.
# - XGBoost: `use_label_encoder` is dropped because the installed version
#   ignores it and warns on every fit; `eval_metric` is 'logloss' because the
#   target is binary ('mlogloss' is the multi-class variant).
# - LightGBM: verbose=-1 silences the per-fit [Info] log lines.
ensemble_models = {
    "Balanced Random Forest": RandomForestClassifier(n_estimators=50, max_depth=5, class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=50, max_depth=3, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=50, max_depth=3, class_weight='balanced', random_state=42, verbose=-1)
}

# Evaluate ensemble models without scaling or resampling
def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or an indexable array."""
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


def evaluate_models(X, y, models, n_splits=5, shuffle=False, random_state=None):
    """Cross-validate each model with stratified k-fold and collect its metrics.

    Parameters
    ----------
    X : pandas.DataFrame or array
        Feature matrix. Rows are selected by position, so any index is accepted.
    y : pandas.Series or array
        Target labels aligned row-by-row with ``X``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place; after the call each one holds the
        fit of the last fold.
    n_splits : int, default 5
        Number of cross-validation folds.
    shuffle : bool, default False
        Forwarded to ``StratifiedKFold``.
    random_state : int or None, default None
        Forwarded to ``StratifiedKFold``.

    Returns
    -------
    dict
        ``{model_name: {'accuracy', 'precision', 'recall', 'f1-score',
        'detailed report'}}``. The four scores are means over the folds of the
        accuracy and of the weighted-average precision/recall/F1;
        ``'detailed report'`` is the classification report of the first fold only.
    """
    results = {}
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    for model_name, model in models.items():
        accuracy_scores = []
        detailed_reports = []

        print(f"Model: {model_name}", end="\n\n")

        for train_index, test_index in skf.split(X, y):
            # split() yields positional indices, so rows must be selected by
            # position (label-based .loc is only correct for a default index).
            X_train_, y_train_ = _take_rows(X, train_index), _take_rows(y, train_index)
            X_test_, y_test_ = _take_rows(X, test_index), _take_rows(y, test_index)

            model.fit(X_train_, y_train_)
            y_pred_ = model.predict(X_test_)

            accuracy_scores.append(accuracy_score(y_test_, y_pred_))
            detailed_reports.append(
                classification_report(y_test_, y_pred_, output_dict=True)
            )

        # Per-fold weighted-average F1, kept as a list for the progress print-out.
        f_scores = [rep['weighted avg']['f1-score'] for rep in detailed_reports]

        mean_f1_score = np.mean(f_scores)
        mean_accuracy = np.mean(accuracy_scores)
        avg_precision = np.mean([rep['weighted avg']['precision'] for rep in detailed_reports])
        avg_recall = np.mean([rep['weighted avg']['recall'] for rep in detailed_reports])

        print("The scores: ", end="\n\n")
        print([round(f, 2) for f in f_scores], end="\n\n")
        print('F1-Score mean=%.5f' % (mean_f1_score), end="\n\n")

        # Collecting detailed performance metrics for each model
        results[model_name] = {
            'accuracy': mean_accuracy,
            'precision': avg_precision,
            'recall': avg_recall,
            'f1-score': mean_f1_score,
            'detailed report': detailed_reports[0]  # Storing one detailed report as an example
        }

    return results

# Apply evaluation without resampling
results_ensemble = evaluate_models(X_train, y_train, ensemble_models)

# Prepare data for CSV: one row per model.
# The four overall scores are the cross-validation means returned by
# evaluate_models; every per-class / macro / weighted column is read from the
# single 'detailed report' it returns (the report of the first fold only).
REPORT_SECTIONS = {
    'Class 0': '0',
    'Class 1': '1',
    'Macro Avg': 'macro avg',
    'Weighted Avg': 'weighted avg',
}
REPORT_METRICS = {
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1-score',
}

rows = []

for model_name, metrics in results_ensemble.items():
    report = metrics['detailed report']
    row = {
        'Model': model_name,
        'Accuracy': metrics['accuracy'],
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
        'F1-Score': metrics['f1-score'],
    }
    # Adds 'Class 0 Precision', 'Class 0 Recall', ..., 'Weighted Avg F1-Score'.
    for section_label, report_key in REPORT_SECTIONS.items():
        for metric_label, metric_key in REPORT_METRICS.items():
            row[f'{section_label} {metric_label}'] = report[report_key][metric_key]
    # F1 of the positive class (same value as 'Class 1 F1-Score').
    row['binary F1-Score'] = report['1']['f1-score']
    rows.append(row)

# Convert to DataFrame and save to CSV
results_df = pd.DataFrame(rows)
# Create the output directory first so to_csv does not fail on a fresh checkout.
output_dir = os.path.dirname(OutputFiles['model'])
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
results_df.to_csv(OutputFiles['model'], index=False)
print(f"DataFrame saved as CSV file at: {OutputFiles['model']}")

# Summarize the results
for model_name, metrics in results_ensemble.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']:.5f}")
    print(f"Precision: {metrics['precision']:.5f}")
    print(f"Recall: {metrics['recall']:.5f}")
    print(f"F1-Score: {metrics['f1-score']:.5f}")
    print(f"Detailed Classification Report: {metrics['detailed report']}")
    print("\n")


Model: Balanced Random Forest

The scores: 

[0.89, 0.92, 0.93, 0.92, 0.9]

F1-Score mean=0.91429

Model: Gradient Boosting

The scores: 

[0.87, 0.96, 0.97, 0.96, 0.93]

F1-Score mean=0.93801

Model: XGBoost



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



The scores: 

[0.92, 0.97, 0.98, 0.97, 0.95]

F1-Score mean=0.95755

Model: LightGBM

[LightGBM] [Info] Number of positive: 6922, number of negative: 57976
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47429
[LightGBM] [Info] Number of data points in the train set: 64898, number of used features: 187
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 6921, number of negative: 57977
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066275 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47560
[LightGBM] [Info] Number o

In [None]:
# Class distribution of the training target; the counts are rendered as this cell's output.
y_train.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,72471
1,8652


In [None]:
from datetime import datetime
# Print the current date and time (a timestamp of when this cell ran, not an elapsed duration)
print("Current time:", datetime.now())