# training-A.ipynb
1. This notebook trains three models on *windowed-feature* data derived from the MIT-BIH Arrhythmia Database and from additional ECG recordings stored as .bin files
2. Classes/labels: N, S, V, F, with or without Q
3. Only a single lead from MIT-BIH (MLII) is used

## **LIBRARY IMPORTS**

In [None]:
# Import Libraries
import os

# TensorFlow flags.
# These must be in the environment BEFORE tensorflow (or anything that pulls it
# in, such as keras / keras_tuner) is imported; setting them afterwards is too
# late to silence the start-up logging or to switch off the oneDNN optimizations.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import cudf
import joblib
import pywt
import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import wfdb  # For reading MIT-BIH data
import keras_tuner as kt
import seaborn as sns
import tensorflow as tf
import neurokit2 as nk

# Scikit-learn and Imbalanced-learn imports
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve,
    auc,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score
)
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import entropy
from collections import Counter
from scipy.signal import find_peaks, resample, butter, filtfilt, iirnotch, spectrogram
from sklearn.utils import class_weight
from glob import glob

# Model imports
from sklearn.svm import SVC
from imblearn.ensemble import BalancedRandomForestClassifier
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, Activation, MaxPooling1D, Dropout, Add, GlobalAveragePooling1D, Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from cuml.ensemble import RandomForestClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.utils.class_weight import compute_class_weight

# Additional setups
# Smoke test for the RAPIDS stack: building and printing a cuDF Series fails
# loudly here if the GPU dataframe library is not usable.
print(cudf.Series([1, 2, 3]))

# Checking GPU
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    print(f"TensorFlow has detected {len(gpu_devices)} GPU(s):")
    for device in gpu_devices:
        print(f"- {device}")
else:
    print("TensorFlow did not detect any GPUs. Training will run on the CPU.")


## **DATA PREPARATION**

### DATA PREPARATION FUNCTIONS
1. Database: MIT-BIH Arrhythmia Database, plus additional ECG data in .bin format and from a heartbeat simulator
2. Preparation: *Windowed Features* of RR Intervals

In [None]:
# Beat-annotation symbol -> class index, assembled one class at a time
label_map = {
    **dict.fromkeys('NLRej', 0),  # Normal beats (N)
    **dict.fromkeys('VE', 1),     # Ventricular ectopic beats (VEB)
    **dict.fromkeys('SAaJ', 2),   # Supraventricular ectopic beats (SVEB)
    **dict.fromkeys('F', 3),      # Fusion beats (F)
}

# Split the data into model inputs (X) and targets (y)
def create_windowed_features(rr_intervals, labels, window_size):
    """Build sliding windows of consecutive RR intervals.

    Every window holds ``window_size`` consecutive RR intervals and is labelled
    with the class of the beat that closes the window (its last element).

    Args:
        rr_intervals: 1-D sequence of RR intervals.
        labels: 1-D sequence with one label per RR interval.
        window_size (int): Number of RR intervals per window; must be >= 1.

    Returns:
        tuple: ``(X, y)`` where ``X`` has shape ``(n_windows, window_size)`` and
        ``y`` has shape ``(n_windows,)``, with
        ``n_windows = len(rr_intervals) - window_size + 1``. When the input is
        shorter than one window, ``X`` has shape ``(0, window_size)`` so that it
        can still be concatenated with (or reshaped like) non-empty results.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    rr_intervals = np.asarray(rr_intervals)
    # "+ 1" keeps the last complete window, which ends on the final RR interval.
    n_windows = len(rr_intervals) - window_size + 1
    if n_windows <= 0:
        empty_X = np.empty((0, window_size), dtype=rr_intervals.dtype)
        empty_y = np.empty((0,), dtype=np.asarray(labels).dtype)
        return empty_X, empty_y

    X = [rr_intervals[start:start + window_size] for start in range(n_windows)]
    # The label follows the beat at the end of the window
    y = [labels[start + window_size - 1] for start in range(n_windows)]
    return np.array(X), np.array(y)

In [None]:
def load_mitbih_data(record_names, db_path):
    """
    Loads ECG signals and annotations from MIT-BIH data.

    The MLII lead is selected by name, because it is not stored as the first
    channel in every record. Records without an MLII lead fall back to
    channel 0.

    Args:
        record_names (list): List of record names (e.g., ['100', '101']).
        db_path (str): Directory that contains the record files.

    Return:
        tuple: (signals, annotations)
               - signals (list): One 1-D numpy array of raw ECG samples per record.
               - annotations (list): One wfdb annotation object per record
                 (read from the record's 'atr' file).
    """
    signals, annotations = [], []
    for record in record_names:
        record_path = os.path.join(db_path, str(record))

        # Look the lead up in the header instead of assuming a channel order.
        lead_names = wfdb.rdheader(record_path).sig_name or []
        channel = lead_names.index('MLII') if 'MLII' in lead_names else 0

        signal = wfdb.rdrecord(record_path, channels=[channel]).p_signal.flatten()
        annotation = wfdb.rdann(record_path, 'atr')
        signals.append(signal)
        annotations.append(annotation)
    return signals, annotations

# Annotation symbols that mark an actual heartbeat (PhysioNet beat annotation
# codes). Everything else (rhythm changes '+', noise '~', ...) is a non-beat
# marker and must not be used as an RR-interval endpoint.
_BEAT_SYMBOLS = frozenset('NLRBAaJSVrFejnE/fQ?')


# Read annotations and labels from each loaded record
def extract_rr_intervals_and_labels(annotations, symbol_to_label=None):
    """Extracts RR intervals and corresponding heartbeat labels.

    An RR interval is the time between a beat and the *previous beat* of the
    same record. Non-beat annotations are skipped so they never shorten an
    interval. Beats whose symbol has no class in the mapping are not emitted,
    but still serve as the reference for the following beat.

    Args:
        annotations: Iterable of annotation objects exposing ``sample``
            (annotation positions in samples), ``symbol`` (one symbol per
            annotation) and ``fs`` (sampling frequency of the record).
        symbol_to_label (dict, optional): Mapping from beat symbol to class
            index. Defaults to the module-level ``label_map``.

    Return:
        tuple: (rr_intervals, labels) as numpy arrays of equal length; RR
        intervals are in seconds.
    """
    if symbol_to_label is None:
        symbol_to_label = label_map

    all_rr, labels = [], []
    for ann in annotations:
        previous_beat = None  # reset per record: no interval across records
        for sample, symbol in zip(ann.sample, ann.symbol):
            is_beat = symbol in _BEAT_SYMBOLS or symbol in symbol_to_label
            if not is_beat:
                continue
            if previous_beat is not None and symbol in symbol_to_label:
                # Use the record-specific sampling frequency
                all_rr.append((sample - previous_beat) / ann.fs)
                labels.append(symbol_to_label[symbol])
            previous_beat = sample
    return np.array(all_rr), np.array(labels)

In [None]:
def load_ecg_from_bin(file_path, dtype=np.int16):
    """
    Reads a raw ECG recording stored as a flat binary file.

    Args:
        file_path (str): Location of the .bin file.
        dtype (numpy.dtype): Sample type used when decoding the file.

    Return:
        numpy.ndarray: The decoded samples, or None when the file could not
        be read (the error is printed, not raised).
    """
    try:
        # Decode the whole file as a flat array of `dtype` samples
        samples = np.fromfile(file_path, dtype=dtype)
        print(f"Successfully read {len(samples)} samples from {file_path}")
    except IOError as read_error:
        print(f"Error reading file: {read_error}")
        samples = None
    return samples

def detect_r_peaks(signal, fs):
    """
    Detects R-peaks from an ECG signal.

    A sample counts as an R-peak when it is a local maximum of at least half
    the signal's maximum value and lies at least 0.4 s away from neighbouring
    peaks. Both thresholds are heuristics and may need adjusting for a given
    recording.

    Args:
        signal (numpy.ndarray): The raw ECG signal.
        fs (int): The sampling frequency of the signal.

    Return:
        numpy.ndarray: An array containing the locations (indices) of the
        detected R-peaks; empty when the signal is empty.
    """
    print("\n--- Step 1: Detecting R-Peaks ---")
    signal = np.asarray(signal)
    if signal.size == 0:
        # np.max() would raise on an empty array; there is nothing to detect.
        print("Detected 0 R-peaks.")
        return np.empty(0, dtype=np.intp)

    # The 'height' and 'distance' parameters can be adjusted for your signal
    height_threshold = np.max(signal) * 0.5
    # Minimum distance between beats; find_peaks requires a distance >= 1
    distance_threshold = max(1, fs * 0.4)

    r_peaks, _ = find_peaks(signal, height=height_threshold, distance=distance_threshold)

    print(f"Detected {len(r_peaks)} R-peaks.")
    return r_peaks

def extract_rr_and_apply_label_ecg_bin(r_peaks, fs, record_label):
    """
    Turns the R-peaks of one recording into RR intervals that all share a
    single, record-wide label.

    Args:
        r_peaks (numpy.ndarray): R-peak positions (sample indices) of one recording.
        fs (int): Sampling frequency of that recording.
        record_label (any): Label (e.g., string or integer) given to every
                            RR interval of the recording.

    Return:
        tuple: (rr_intervals, labels)
               - rr_intervals (numpy.ndarray): RR intervals in seconds.
               - labels (numpy.ndarray): `record_label` repeated once per RR interval.
               Both arrays are empty when fewer than two R-peaks are given.
    """
    print("\n--- Step 2: RR Extraction and Labeling for the Record ---")

    # One interval needs two peaks; bail out early otherwise
    enough_peaks = len(r_peaks) >= 2
    if not enough_peaks:
        print("Warning: Not enough R-peaks to calculate RR intervals.")
        return np.array([]), np.array([])

    # Peak-to-peak distance in samples, converted to seconds
    rr_seconds = np.diff(r_peaks) / fs

    # One copy of the record-wide label per interval
    record_labels = np.full(shape=len(rr_seconds), fill_value=record_label)
    return rr_seconds, record_labels

### DATA PREPARATION EXECUTION
1. For .bin data, the program first detects R-peaks, computes RR intervals from those R-peaks, and then creates windowed features in which each window holds multiple RR intervals
2. For MIT-BIH data, the program only reads the labels and annotations of the ECG data and then creates windowed features in which each window holds multiple RR intervals

In [None]:
if __name__ == '__main__':
    # --- 0. INITIAL PARAMETERS FOR .BIN AND MIT-BIH DATA PREPARATION ---
    mitbih_dir = '../data/raw/MIT-BIH/mit-bih-arrhythmia-database-1.0.0/mit-bih-arrhythmia-database-1.0.0/'
    window_size = 10

    # DS1 is used for training
    ds1 = ['101', '106', '108', '109', '112', '114', '115', '116', '118', '119',
           '122', '124', '201', '203', '205', '207', '208', '209', '215', '220',
           '223', '230']
    # DS2 is used for evaluation
    ds2 = ['100', '103', '105', '111', '113', '117', '121', '123', '200', '202',
           '210', '212', '213', '214', '219', '221', '222', '228', '231', '232',
           '233', '234']

    FS_CUSTOM = 500  # IMPORTANT: Adjust to the sampling frequency of your .bin data
    custom_file_paths = {
        'Arrhythmia': '../data/raw/Arrhythmia/ECG_WAVE.bin',
        'Normal': '../data/raw/Normal/ecg_normal.bin'
    }
    custom_file_labels = {'Arrhythmia': 2, 'Normal': 0}

    def _window_each_recording(recordings, size):
        """Window every (rr_intervals, labels) pair on its own and stack the results.

        Windowing one recording at a time guarantees that no window mixes RR
        intervals from two different recordings. Recordings too short to fill
        a single window contribute nothing.
        """
        X_parts, y_parts = [], []
        for rr_rec, labels_rec in recordings:
            X_rec, y_rec = create_windowed_features(rr_rec, labels_rec, size)
            if len(X_rec) > 0:
                X_parts.append(X_rec)
                y_parts.append(y_rec)
        if not X_parts:
            return np.empty((0, size)), np.empty((0,), dtype=int)
        return np.concatenate(X_parts, axis=0), np.concatenate(y_parts, axis=0)

    print("="*60)
    print("🚀 STARTING DATASET PREPARATION PROCESS 🚀")
    print("="*60)

    # --- 1. PROCESS TRAINING DATA (ds1) ---
    print("\n--- [Step 1/5] Processing Training Data (ds1) ---")
    signals_train, annotations_train = load_mitbih_data(ds1, mitbih_dir)
    # Flat RR/label series over all records (kept for inspection)
    rr_train, labels_train = extract_rr_intervals_and_labels(annotations_train)
    # Windows are built record by record so they never span two patients
    X_train, y_train = _window_each_recording(
        (extract_rr_intervals_and_labels([ann]) for ann in annotations_train),
        window_size
    )
    print(f"Raw training data ready: X_train={X_train.shape}, y_train={y_train.shape}")

    # --- 2. PROCESS TESTING DATA (Combination of ds2 and .bin) ---
    print("\n--- [Step 2/5] Processing Testing Data ---")

    # Part A: Process testing data from MIT-BIH (ds2)
    print("\nProcessing testing part 1 (ds2)...")
    signals_test_mitbih, annotations_test_mitbih = load_mitbih_data(ds2, mitbih_dir)
    rr_test_mitbih, labels_test_mitbih = extract_rr_intervals_and_labels(annotations_test_mitbih)
    X_test_mitbih, y_test_mitbih = _window_each_recording(
        (extract_rr_intervals_and_labels([ann]) for ann in annotations_test_mitbih),
        window_size
    )
    print(f"MIT-BIH testing data ready: X_test_mitbih={X_test_mitbih.shape}, y_test_mitbih={y_test_mitbih.shape}")

    # Part B: Process testing data from .bin files
    print("\nProcessing testing part 2 (.bin)...")
    all_rr_custom = []
    all_labels_custom = []
    for category, path in custom_file_paths.items():
        signal_custom = load_ecg_from_bin(path)
        if signal_custom is not None:
            r_peaks_custom = detect_r_peaks(signal_custom, fs=FS_CUSTOM)
            rr_intervals_c, labels_c = extract_rr_and_apply_label_ecg_bin(
                r_peaks_custom, fs=FS_CUSTOM, record_label=custom_file_labels[category]
            )
            all_rr_custom.append(rr_intervals_c)
            all_labels_custom.append(labels_c)

    if all_rr_custom:
        rr_test_custom = np.concatenate(all_rr_custom)
        labels_test_custom = np.concatenate(all_labels_custom)
    else:
        # np.concatenate() raises on an empty list; carry on with MIT-BIH only
        print("Warning: no .bin recording could be loaded; testing uses MIT-BIH data only.")
        rr_test_custom = np.empty(0)
        labels_test_custom = np.empty(0, dtype=int)
    # Each .bin file is windowed on its own so windows never straddle two files
    X_test_custom, y_test_custom = _window_each_recording(
        zip(all_rr_custom, all_labels_custom), window_size
    )

    # --- 3. SCALING & COMBINING TESTING DATA ---
    print("\n--- [Step 3/5] Scaling and Finalizing Data ---")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    print("Scaler trained on training data.")

    # Apply the scaler to all parts of the testing data
    X_test_mitbih_scaled = scaler.transform(X_test_mitbih)
    # StandardScaler rejects arrays with zero rows, so skip it when there is no .bin data
    X_test_custom_scaled = scaler.transform(X_test_custom) if len(X_test_custom) else X_test_custom
    print("Scaler applied to all testing data.")

    # Combine all scaled testing data
    X_test_final = np.concatenate((X_test_mitbih_scaled, X_test_custom_scaled), axis=0)
    y_test_final = np.concatenate((y_test_mitbih, y_test_custom), axis=0)
    print(f"Final testing data combined: X_test_final={X_test_final.shape}, y_test_final={y_test_final.shape}")

    # --- 4. TRAINING SET SPLIT & OVERSAMPLING (SMOTE) ---
    print("\n--- [Step 4/5] Finalizing Training Data (Split & SMOTE) ---")
    print("Creating validation set from training data (80/20)...")
    X_train_fold, X_val, y_train_fold, y_val = train_test_split(
        X_train_scaled, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    print("Applying SMOTE only to the training fold...")
    print("Training class distribution before SMOTE:", Counter(y_train_fold))
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_fold, y_train_fold)
    print("Training class distribution after SMOTE:", Counter(y_train_resampled))

    # --- 5. FINAL DATA PREPARATION FOR MODELS ---
    print("\n--- [Step 5/5] Preparing Final Datasets for Models ---")

    # Define output_dim based on the number of unique classes in the original training data
    output_dim = len(np.unique(y_train))

    # One-hot encode labels for Keras
    y_train_encoded = to_categorical(y_train_resampled, num_classes=output_dim)
    y_val_encoded = to_categorical(y_val, num_classes=output_dim)
    y_test_final_encoded = to_categorical(y_test_final, num_classes=output_dim)

    # 🧠 Data for MLP
    X_train_mlp, y_train_mlp = X_train_resampled, y_train_encoded
    X_val_mlp, y_val_mlp = X_val, y_val_encoded
    X_test_mlp, y_test_mlp = X_test_final, y_test_final_encoded

    # ⚡ Data for 1D-CNN (adds a trailing channel axis: samples x steps x 1)
    X_train_cnn = X_train_mlp.reshape((X_train_mlp.shape[0], X_train_mlp.shape[1], 1))
    X_val_cnn = X_val_mlp.reshape((X_val_mlp.shape[0], X_val_mlp.shape[1], 1))
    X_test_cnn = X_test_mlp.reshape((X_test_mlp.shape[0], X_test_mlp.shape[1], 1))
    y_train_cnn, y_val_cnn, y_test_cnn = y_train_mlp, y_val_mlp, y_test_mlp

    # 📊 Data for RandomForest (integer labels, not one-hot)
    X_train_rf, y_train_rf = X_train_resampled, y_train_resampled
    X_val_rf, y_val_rf = X_val, y_val
    X_test_rf, y_test_rf = X_test_final, y_test_final

    # --- FINAL RESULTS ---
    print("\n" + "="*60)
    print("✅ DATA PREPARATION COMPLETE ✅")
    print("The following variables are ready for training and evaluation:")
    print("="*60)

    print("\n--- For MLP ---")
    print(f"  Training:   X_train_mlp: {X_train_mlp.shape}, y_train_mlp: {y_train_mlp.shape}")
    print(f"  Validation: X_val_mlp: {X_val_mlp.shape}, y_val_mlp: {y_val_mlp.shape}")
    print(f"  Testing:    X_test_mlp: {X_test_mlp.shape}, y_test_mlp: {y_test_mlp.shape}")

    print("\n--- For 1D-CNN ---")
    print(f"  Training:   X_train_cnn: {X_train_cnn.shape}, y_train_cnn: {y_train_cnn.shape}")
    print(f"  Validation: X_val_cnn: {X_val_cnn.shape}, y_val_cnn: {y_val_cnn.shape}")
    print(f"  Testing:    X_test_cnn: {X_test_cnn.shape}, y_test_cnn: {y_test_cnn.shape}")

    print("\n--- For RandomForest ---")
    print(f"  Training:   X_train_rf: {X_train_rf.shape}, y_train_rf: {y_train_rf.shape}")
    print(f"  Validation: X_val_rf: {X_val_rf.shape}, y_val_rf: {y_val_rf.shape}")
    print(f"  Testing:    X_test_rf: {X_test_rf.shape}, y_test_rf: {y_test_rf.shape}")

## **MACHINE LEARNING MODEL TRAINING & SAVING**

### MACHINE LEARNING MODEL FUNCTIONS

In [None]:
def create_mlp_model(input_dim, output_dim):
    """Creates and compiles a Keras MLP classifier.

    Architecture: two 512-unit ReLU layers (followed by dropout of 0.1 and 0.4
    respectively) and a softmax output layer.

    Args:
        input_dim (int): Number of features per sample.
        output_dim (int): Number of classes (size of the softmax layer).

    Returns:
        A compiled Keras ``Sequential`` model that expects one-hot labels.
    """
    model = Sequential([
        # An explicit Input layer replaces the deprecated `input_dim=` layer argument
        Input(shape=(input_dim,)),
        Dense(512, activation='relu'),
        Dropout(0.1),
        Dense(512, activation='relu'),
        Dropout(0.4),
        Dense(output_dim, activation='softmax') # Softmax for multi-class classification
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='categorical_crossentropy', # Suitable for one-hot labels
        metrics=[
            'accuracy',
            # tf.keras.metrics.Precision(name='precision'),
            # tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.F1Score(average='weighted', name='f1_score'),
            tf.keras.metrics.SpecificityAtSensitivity(0.9, name='specificity')
        ]
    )
    return model
# 1st CNN model
def create_cnn_model(input_shape, output_dim):
    """Creates and compiles a Keras 1D-CNN model.

    Architecture: Conv1D(512, k=6) -> Dropout -> MaxPool(2) -> Conv1D(512, k=3)
    -> Dropout -> MaxPool(2) -> Flatten -> Dense(512) -> Dropout -> softmax.

    Both convolutions use the default 'valid' padding, so the sequence shrinks
    at every stage and at least 13 time steps are needed for the last pooling
    layer to still produce an output.

    Args:
        input_shape (tuple): Per-sample shape ``(steps, features)``,
            e.g. ``(187, 1)``.
        output_dim (int): Number of classes (size of the softmax layer).

    Returns:
        A compiled Keras ``Sequential`` model that expects one-hot labels.

    Raises:
        ValueError: If the sequence length is known and shorter than 13 steps.
    """
    min_steps = 13  # 13 -> conv(6): 8 -> pool: 4 -> conv(3): 2 -> pool: 1
    steps = input_shape[0]
    if steps is not None and steps < min_steps:
        raise ValueError(
            f"create_cnn_model needs at least {min_steps} time steps, got {steps}"
        )

    model = Sequential([
        # An explicit Input layer replaces the deprecated `input_shape=` layer argument
        Input(shape=input_shape),
        Conv1D(filters=512, kernel_size=6, activation='relu'),
        Dropout(0.1),
        MaxPooling1D(pool_size=2),

        Conv1D(filters=512, kernel_size=3, activation='relu'),
        Dropout(0.2),
        MaxPooling1D(pool_size=2),

        Flatten(),

        Dense(512, activation='relu'),
        Dropout(0.4),

        Dense(output_dim, activation='softmax')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='categorical_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.F1Score(average='weighted', name='f1_score'),
            tf.keras.metrics.SpecificityAtSensitivity(0.9, name='specificity')
        ]
    )
    return model
# 1D-CNN optimized based on paper
def create_cnn_model_optimized(input_shape, output_dim, hp=None):
    """
    Creates and compiles the optimized 1D-CNN model.

    Three hyperparameters are configurable: 'conv4_filters', 'dense_units' and
    'learning_rate'. They are read from ``hp.values`` when ``hp`` is given;
    any of them that is missing there (or all of them when ``hp`` is None)
    falls back to the defaults below. This function does not register a search
    space on ``hp`` itself.

    Args:
        input_shape (tuple): Per-sample shape ``(steps, features)``.
        output_dim (int): Number of classes (size of the softmax layer).
        hp: Optional object with a dict-like ``values`` attribute
            (e.g. ``keras_tuner.HyperParameters``).

    Returns:
        A compiled Keras functional ``Model`` that expects one-hot labels.
    """
    # Defaults used when not tuning, or when `hp` does not carry a value
    settings = {
        'conv4_filters': 100,
        'dense_units': 256,
        'learning_rate': 0.0001,
    }
    if hp is not None:
        settings.update(hp.values)

    inputs = Input(shape=input_shape)
    # NOTE(review): the "_freezed" layers are only *named* that way; nothing in
    # this function sets trainable=False, so they are trained like the rest.
    x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='valid', name='conv1d_1_freezed')(inputs)
    x = MaxPooling1D(pool_size=2, name='maxpool1d_1_freezed')(x)
    x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', name='conv1d_2_freezed')(x)
    x = MaxPooling1D(pool_size=2, name='maxpool1d_2_freezed')(x)
    x = Conv1D(filters=4, kernel_size=3, activation='relu', padding='same', name='conv1d_3_freezed')(x)
    x = MaxPooling1D(pool_size=2, name='maxpool1d_3_freezed')(x)
    # ===============================================
    #           Trainable Layers
    # ===============================================
    x = Conv1D(filters=settings['conv4_filters'], kernel_size=3, activation='relu', padding='same', name='conv1d_4_trainable')(x)
    x = Flatten(name='flatten_layer')(x)
    x = Dense(units=settings['dense_units'], activation='relu', name='dense_1_trainable')(x)
    outputs = Dense(units=output_dim, activation='softmax', name='output_layer_trainable')(x)
    model = Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer=Adam(learning_rate=settings['learning_rate']),
        loss='categorical_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.F1Score(average='weighted', name='f1_score'),
            tf.keras.metrics.SpecificityAtSensitivity(0.9, name='specificity')
        ]
    )
    return model

# 1D-ResNet
# def create_cnn_model_optimized(input_shape, output_dim, hp=None):
#     """
#     Creates and compiles an optimized 1D-CNN model.
#     If 'hp' is provided, it builds a tunable model for KerasTuner.
#     Otherwise, it builds a model with default hyperparameters.
#     """
#     # Define a default hyperparameter object if none is passed
#     if hp is None:
#         hp = kt.HyperParameters()
#         # Set default values for when not tuning
#         hp.values['initial_filters'] = 384
#         hp.values['res_block_1_filters'] = 384
#         hp.values['res_block_2_filters'] = 384
#         hp.values['kernel_size_initial'] = 7
#         hp.values['kernel_size_res'] = 5
#         hp.values['dropout_1'] = 0.1
#         hp.values['dropout_2'] = 0.3
#         hp.values['dense_units'] = 512
#         hp.values['dense_dropout'] = 0.5
#         hp.values['learning_rate'] = 0.0001

#     def residual_block(x, filters, kernel_size):
#         y = Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
#         y = BatchNormalization()(y)
#         y = Activation('relu')(y)
#         y = Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(y)
#         y = BatchNormalization()(y)
#         shortcut = Conv1D(filters=filters, kernel_size=1, padding='same')(x) if x.shape[-1] != filters else x
#         res_output = Add()([shortcut, y])
#         return Activation('relu')(res_output)

#     inputs = Input(shape=input_shape)
#     x = Conv1D(filters=hp.values['initial_filters'], kernel_size=hp.values['kernel_size_initial'], padding='same')(inputs)
#     x = BatchNormalization()(x)
#     x = Activation('relu')(x)
#     x = MaxPooling1D(pool_size=2)(x)
#     x = residual_block(x, filters=hp.values['res_block_1_filters'], kernel_size=hp.values['kernel_size_res'])
#     x = MaxPooling1D(pool_size=2)(x)
#     x = Dropout(hp.values['dropout_1'])(x)
#     x = residual_block(x, filters=hp.values['res_block_2_filters'], kernel_size=hp.values['kernel_size_res'])
#     x = MaxPooling1D(pool_size=2)(x)
#     x = Dropout(hp.values['dropout_2'])(x)
#     x = GlobalAveragePooling1D()(x)
#     x = Dense(hp.values['dense_units'], activation='relu')(x)
#     x = Dropout(hp.values['dense_dropout'])(x)
#     outputs = Dense(output_dim, activation='softmax')(x)
    
#     model = Model(inputs=inputs, outputs=outputs)
    
#     model.compile(
#         optimizer=Adam(learning_rate=hp.values['learning_rate']),
#         loss='categorical_crossentropy',
#         metrics=[
#             # 'accuracy',
#             tf.keras.metrics.Precision(name='precision'),
#             tf.keras.metrics.Recall(name='recall'),
#             # Note: F1Score might require a different setup in some TF versions.
#             # If it causes issues, consider a custom callback to calculate it.
#             tf.keras.metrics.F1Score(average='weighted', name='f1_score'),
#             tf.keras.metrics.SpecificityAtSensitivity(0.9, name='specificity')
#         ]
#     )
#     return model

# Factory for the RandomForest model
def create_rf_model():
    """Builds a fresh, unfitted RandomForestClassifier (the class imported from cuml.ensemble)."""
    # Fixed forest configuration; the seed keeps runs repeatable
    forest_params = {
        'n_estimators': 200,
        'max_depth': 30,
        'random_state': 42,
    }
    return RandomForestClassifier(**forest_params)

### MACHINE LEARNING MODEL TRAINING EXECUTION

In [None]:
# Training Multiple Models
input_shape_cnn = (X_train_cnn.shape[1], X_train_cnn.shape[2])
input_dim = X_train_mlp.shape[1]
output_dim = y_train_mlp.shape[1]

# Saving/exporting models
output_dir = '../models'
os.makedirs(output_dir, exist_ok=True)

# --- MANUAL CLASS WEIGHT CALCULATION ---
print("--- Manually Calculating Class Weights for Cost-Sensitive Learning ---")
# Ensure y_train_fold contains single integer labels (e.g., [0, 1, 2, 0, ...])
# not one-hot encoded vectors.
# NOTE(review): the weights come from the pre-SMOTE distribution (y_train_fold)
# but are applied while fitting on the SMOTE-balanced training set, so minority
# classes are boosted twice -- confirm this is intended.
if y_train_fold.ndim > 1 and y_train_fold.shape[1] > 1:
    y_labels = np.argmax(y_train_fold, axis=1)
else:
    y_labels = y_train_fold.flatten() # Ensure it's a 1D array
# Count the number of samples in each class.
class_counts = np.bincount(y_labels)
total_samples = len(y_labels)
num_classes = len(np.unique(y_labels))
# Calculate weight for each class using the formula:
# weight = total_samples / (num_classes * count_for_that_class)
class_weights = total_samples / (num_classes * class_counts)
# Create the dictionary required by Keras and Scikit-learn.
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}
print("Manually Calculated Weights:", class_weights_dict)

models = {
    "1D-CNN": create_cnn_model_optimized(input_shape_cnn, output_dim),
    "RandomForest": create_rf_model(),
    "MLP": create_mlp_model(input_dim, output_dim)
}

# Dictionary to store the final results
results = {}

# --- TRAINING AND EVALUATING EACH MODEL ---

for name, model in models.items():
    print(f"\n{'='*20} TRAINING MODEL: {name} {'='*20}")

    # 🧠 Training
    if name == "1D-CNN":
        model.fit(
            X_train_cnn, y_train_cnn,
            epochs=150,
            batch_size=100,
            verbose=1, # Set to 0 to keep output clean
            validation_data=(X_val_cnn, y_val_cnn),
            class_weight=class_weights_dict
        )
    elif name == "MLP":
        model.fit(
            X_train_mlp, y_train_mlp,
            epochs=150,
            batch_size=100,
            verbose=1,
            validation_data=(X_val_mlp, y_val_mlp),
            class_weight=class_weights_dict
        )
    else: # 📊 RandomForest
        model.fit(X_train_rf, y_train_rf)

    # ⚡ Prediction on the Test Set
    print(f"Evaluating model {name}...")
    if name in ["MLP", "1D-CNN"]:
        y_pred_raw = model.predict(X_test_mlp if name == "MLP" else X_test_cnn)
        y_pred = np.argmax(y_pred_raw, axis=1)
    else: # RandomForest
        y_pred = model.predict(X_test_rf)

    # Store prediction results and ground truth for final evaluation
    results[name] = {'y_pred': y_pred, 'y_true': y_test_final}

# --- PRINT ALL RESULTS SIMULTANEOUSLY ---
class_names = ['Normal (N)', 'Ventricular (V)', 'Supraventricular (S)', 'Fusion (F)']
# Pin the label set so the report rows and matrix axes always line up with
# class_names, even when a class is absent from y_true / y_pred.
class_ids = list(range(len(class_names)))

print(f"\n{'='*25} FINAL EVALUATION RESULTS {'='*25}")

for name, result_data in results.items():
    y_true = result_data['y_true']
    y_pred = result_data['y_pred']

    print(f"\n\n{'~'*15} REPORT FOR MODEL: {name} {'~'*15}")

    # --- Classification Report ---
    print("\nClassification Report:")
    report = classification_report(y_true, y_pred, labels=class_ids,
                                   target_names=class_names, zero_division=0)
    print(report)

    # --- Confusion Matrix Visualization ---
    print("Confusion Matrix:")
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix for {name}', fontsize=16)
    plt.ylabel('True Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.show()

    # SAVING MODEL AFTER TRAINING
    # Look the model up by name: the `model` variable left over from the
    # training loop still points at the last model trained, not at this one.
    model = models[name]
    print(f"--- Saving model: {name} ---")
    if name in ["1D-CNN", "MLP"]:
        # TensorFlow/Keras models
        model_path = os.path.join(output_dir, f"model_{name.lower()}_saved")
        model.export(model_path) # Saving models
        print(f"✅ Model {name} has been saved on: {model_path}")
    else: #RandomForest/other scikit-learn models
        model_path = os.path.join(output_dir, f"model_{name.lower()}.joblib")
        joblib.dump(model, model_path) # Saving models
        print(f"✅ Model {name} has been saved on: {model_path}")

In [None]:
# -- Ten-Fold Cross Validation --
# ConfusionMatrixDisplay is used in the summary below but is not part of the
# notebook's top-level sklearn.metrics import list.
from sklearn.metrics import ConfusionMatrixDisplay

# --- [Step 1] Combine Pre-processed Data for Cross-Validation ---
print("--- Combining pre-processed training and validation sets for CV ---")

# NOTE(review): X_train_rf / y_train_rf are the SMOTE-resampled training data,
# so synthetic samples end up in the validation folds and the CV scores are
# likely optimistic. Resampling inside each fold (training part only) would
# avoid that -- confirm whether the current setup is intended.

# Combine the feature sets for MLP/RandomForest
X_cv_features = np.concatenate((X_train_rf, X_val_rf), axis=0)

# Combine the raw/reshaped data for CNN
X_cv_cnn = np.concatenate((X_train_cnn, X_val_cnn), axis=0)

# Combine the 1D integer labels. StratifiedKFold needs this format.
y_cv_labels = np.concatenate((y_train_rf, y_val_rf), axis=0)

print(f"Total data for Cross-Validation (Features): {X_cv_features.shape}")
print(f"Total data for Cross-Validation (CNN): {X_cv_cnn.shape}")
print(f"Total labels for Cross-Validation: {y_cv_labels.shape}")

# --- [Step 2] 10-Fold Cross-Validation Training and Evaluation ---
print("\n" + "="*60)
print("--- Starting 10-Fold CV on Pre-Balanced Data ---")
print("="*60)

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
unique_classes = np.unique(y_cv_labels)
output_dim = len(unique_classes)
class_names = ['Normal (N)', 'Ventricular (V)', 'Supraventricular (S)', 'Fusion (F)']

# Training budgets, gathered in one place.
# NOTE(review): the final models are trained for far fewer epochs than the
# models scored during CV, so the CV metrics do not describe the saved models.
CV_MLP_EPOCHS, CV_MLP_BATCH = 100, 200
CV_CNN_EPOCHS, CV_CNN_BATCH = 150, 100
FINAL_EPOCHS, FINAL_BATCH = 10, 128

results = {
    'RandomForest': {'accuracy': [], 'recall_macro': [], 'f1_macro': [], 'y_true': [], 'y_pred': []},
    'MLP': {'accuracy': [], 'recall_macro': [], 'f1_macro': [], 'y_true': [], 'y_pred': []},
    '1D-CNN': {'accuracy': [], 'recall_macro': [], 'f1_macro': [], 'y_true': [], 'y_pred': []}
}


def _record_fold(model_name, y_true_fold, y_pred_fold):
    """Append one fold's scores and raw predictions to `results[model_name]`."""
    bucket = results[model_name]
    bucket['accuracy'].append(accuracy_score(y_true_fold, y_pred_fold))
    bucket['recall_macro'].append(recall_score(y_true_fold, y_pred_fold, average='macro', zero_division=0))
    bucket['f1_macro'].append(f1_score(y_true_fold, y_pred_fold, average='macro', zero_division=0))
    bucket['y_true'].extend(y_true_fold)
    bucket['y_pred'].extend(y_pred_fold)


for fold, (train_index, val_index) in enumerate(skf.split(X_cv_features, y_cv_labels)):
    print(f"\n--- FOLD {fold+1}/{n_splits} ---")

    X_train_feat_fold, X_val_feat_fold = X_cv_features[train_index], X_cv_features[val_index]
    # Fold-local name: reusing `y_train_fold` would overwrite the pre-SMOTE
    # training labels that the class-weight computation relies on.
    y_train_cv_fold, y_val_fold = y_cv_labels[train_index], y_cv_labels[val_index]
    X_train_cnn_fold, X_val_cnn_fold = X_cv_cnn[train_index], X_cv_cnn[val_index]
    # Keras models need one-hot targets; shared by the MLP and the CNN
    y_train_cat_fold = to_categorical(y_train_cv_fold, num_classes=output_dim)

    # --- RandomForest Training & Evaluation ---
    print("\nTraining RandomForest...")
    rf_model = create_rf_model()
    rf_model.fit(X_train_feat_fold, y_train_cv_fold)
    y_pred_rf = rf_model.predict(X_val_feat_fold)
    _record_fold('RandomForest', y_val_fold, y_pred_rf)
    print(f"RandomForest Fold {fold+1} Accuracy: {results['RandomForest']['accuracy'][-1]:.4f}")

    # --- MLP Training & Evaluation ---
    print("\nTraining MLP...")
    mlp_model = create_mlp_model(input_dim=X_train_feat_fold.shape[1], output_dim=output_dim)
    mlp_model.fit(X_train_feat_fold, y_train_cat_fold,
                  epochs=CV_MLP_EPOCHS, batch_size=CV_MLP_BATCH, verbose=0)
    y_pred_mlp_prob = mlp_model.predict(X_val_feat_fold)
    y_pred_mlp = np.argmax(y_pred_mlp_prob, axis=1)
    _record_fold('MLP', y_val_fold, y_pred_mlp)
    print(f"MLP Fold {fold+1} Accuracy: {results['MLP']['accuracy'][-1]:.4f}")

    # --- 1D-CNN Training & Evaluation ---
    print("\nTraining 1D-CNN...")
    cnn_input_shape = (X_train_cnn_fold.shape[1], 1)
    cnn_model = create_cnn_model_optimized(input_shape=cnn_input_shape, output_dim=output_dim)
    cnn_model.fit(X_train_cnn_fold, y_train_cat_fold,
                  epochs=CV_CNN_EPOCHS, batch_size=CV_CNN_BATCH, verbose=0)
    y_pred_cnn_prob = cnn_model.predict(X_val_cnn_fold)
    y_pred_cnn = np.argmax(y_pred_cnn_prob, axis=1)
    _record_fold('1D-CNN', y_val_fold, y_pred_cnn)
    print(f"1D-CNN Fold {fold+1} Accuracy: {results['1D-CNN']['accuracy'][-1]:.4f}")

# --- [Step 3] Final Model Training and Saving ---
print("\n" + "="*60)
print("--- Training and Saving Final Models on All Data ---")
print("="*60)

output_dir = '../models'
os.makedirs(output_dir, exist_ok=True)

# 1. Train and Save RandomForest
print("\nTraining final RandomForest model...")
final_rf_model = create_rf_model()
final_rf_model.fit(X_cv_features, y_cv_labels)
rf_path = os.path.join(output_dir, "final_randomforest_model.joblib")
joblib.dump(final_rf_model, rf_path)
print(f"✅ RandomForest model saved to: {rf_path}")

# Prepare labels for Keras models
y_cv_keras = to_categorical(y_cv_labels, num_classes=output_dim)

# 2. Train and Save MLP
print("\nTraining final MLP model...")
final_mlp_model = create_mlp_model(input_dim=X_cv_features.shape[1], output_dim=output_dim)
final_mlp_model.fit(X_cv_features, y_cv_keras, epochs=FINAL_EPOCHS, batch_size=FINAL_BATCH, verbose=0)
mlp_path = os.path.join(output_dir, "final_mlp_model.keras")
final_mlp_model.save(mlp_path)
print(f"✅ MLP model saved to: {mlp_path}")

# 3. Train and Save 1D-CNN
print("\nTraining final 1D-CNN model...")
cnn_input_shape = (X_cv_cnn.shape[1], 1)
final_cnn_model = create_cnn_model_optimized(input_shape=cnn_input_shape, output_dim=output_dim)
final_cnn_model.fit(X_cv_cnn, y_cv_keras, epochs=FINAL_EPOCHS, batch_size=FINAL_BATCH, verbose=0)
cnn_path = os.path.join(output_dir, "final_1d_cnn_model.keras")
final_cnn_model.save(cnn_path)
print(f"✅ 1D-CNN model saved to: {cnn_path}")


# --- [Step 4] Final Results Summary ---
print("\n" + "="*60)
print("--- CROSS-VALIDATION SUMMARY ---")
print("="*60)

for model_name, model_results in results.items():
    print(f"\n--- {model_name} ---")
    for metric in ['accuracy', 'recall_macro', 'f1_macro']:
        avg_metric = np.mean(model_results[metric])
        std_metric = np.std(model_results[metric])
        print(f"Average {metric.replace('_', ' ').title()}: {avg_metric:.4f} (+/- {std_metric:.4f})")

    print("\nAggregated Classification Report:")
    # Same explicit label set as the confusion matrix below, so rows and
    # target_names stay aligned even if a class never shows up in a prediction.
    aggregated_report = classification_report(
        model_results['y_true'],
        model_results['y_pred'],
        labels=unique_classes,
        target_names=class_names,
        zero_division=0
    )
    print(aggregated_report)

    print("Aggregated Confusion Matrix:")
    cm = confusion_matrix(model_results['y_true'], model_results['y_pred'], labels=unique_classes)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'{model_name} - Aggregated Confusion Matrix')
    plt.show()