# Load Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import time
import joblib
import json
import csv

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from collections import Counter
from pprint import pprint
%matplotlib inline

# Preprocessing
from sklearn.preprocessing import (
    StandardScaler,
    scale
    )
from sklearn.model_selection import (
    KFold,
    StratifiedKFold
    ) 
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    roc_curve,
    roc_auc_score, 
    precision_recall_curve,
    auc,
    precision_score, 
    recall_score, 
    f1_score
    )

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn import metrics
from sklearn.feature_selection import SelectPercentile, f_classif

from xgboost import XGBClassifier


# Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import Callback

# Imbalance learning
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from functions_model import get_NN_model, history_loss_acc, evaluate_matrix, ROC_PR_curve

2025-09-18 03:44:09.734402: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-18 03:44:09.925600: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-18 03:44:10.028094: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-18 03:44:10.035518: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-18 03:44:10.132332: I tensorflow/core/platform/cpu_feature_guar

# Load Data

In [2]:
# List of models to evaluate
models_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "SVM": SVC(kernel='rbf', probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "NB": GaussianNB(),
    "NN": MLPClassifier(hidden_layer_sizes=(100,)),
    "GB": GradientBoostingClassifier(),
    "Keras_NN": None,
}

In [3]:
list_datasets = [
    # ['fsdkaggle'],    # 2% cough Counter({0: 1570, 1: 30})
    # # ['virufy'],       # 100% cough Counter({1: 121})
    # ['esc50'],        # 2% cough Counter({0: 1960, 1: 40})
    # ['coughvid'],     # 30% cough Counter({1: 19777, 0: 10267})
    # ['coswara'],      # 25% cough Counter({0: 18914, 1: 5408})
    ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy'], 
      
]

In [5]:
for segment_length in [0.1, 0.2, 0.3, 0.5, 0.7, 1]:
    df_results = []
    for datasets in list_datasets:
        datasets.sort()
        print('')
        print('#'*60)
        print(', '.join(datasets))
        print(f'Window length: {segment_length}')
        print('#'*60)
        
        dataset_str = '_'.join(datasets)
        
        ############################################################
        # Load data
        ############################################################
        df_all_combined = pd.DataFrame()
        for dataset in datasets:    
            # Load onset data
            df = pd.read_csv(f'Results_Onset/Features/ML/data_extracted_{dataset}_{segment_length}s_onset_label.csv')
            df_all_combined = pd.concat([df_all_combined, df], axis=0)
        df_all_combined = df_all_combined.reset_index(drop=True)
        df_all_combined = df_all_combined.fillna(0)
        
        ############################################################
        # Get label distribution
        ############################################################
        df_all_combined = df_all_combined[df_all_combined['mean_amplitude'] > 0.005].reset_index(drop=True)
        df_all_combined = df_all_combined.sample(frac=1).groupby('label_onset').head(1000).reset_index(drop=True)
        list_labels = df_all_combined['label_onset'].tolist()
        count_labels = dict(Counter(list_labels))
        pprint(count_labels)
        
        ############################################################
        # Get features and labels
        ############################################################
        y = df_all_combined['label_onset'].tolist()
        X = df_all_combined.drop(columns=['label_onset'])
        
        ############################################################
        # Loop different models
        ############################################################
        for model_name, model_selected in models_dict.items():

            path_model_save = f'Results_Onset/Model_Onset/{dataset_str}/{model_name}_{segment_length}s/'
            
            if not os.path.exists(path_model_save):
                os.makedirs(path_model_save)
            
            print('')
            print('#'*50)
            print(' '.join(datasets), '-', model_name, '-', segment_length)
            print('#'*50)
        
            ############################################################
            # Performance Store
            ############################################################
            list_cm = 0
            list_roc_auc, list_pr_auc = [], []
            list_pre, list_rec, list_f1 = [], [], []
            list_acc, list_spe, list_sen = [], [], []
            
            ############################################################
            # K-fold Cross Validation model evaluation
            ############################################################
            kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            fold_idx = 1
            for train_ids, test_ids in kfold.split(X, y):
                ############################################################
                # Get dataset split
                ############################################################
                df_train = X.loc[train_ids]
                y_train = np.array(y)[train_ids]
    
                df_test = X.loc[test_ids]
                y_test = np.array(y)[test_ids]
    
                ############################################################
                # Drop useless columns
                ############################################################
                def drop_columns(df):
                    columns = [
                        'dataset', 'filename', 'filepath', 'age', 'gender', 'status',
                        'duration', 'duration_segment', 'sample_frequency',
                        'mean', 'variance', 'std_dev', 'skewness', 'kurtosis',
                        'median', 'range_val', 'iqr', 'mean_amplitude',
                        'label',
                        ]
                    for col in columns:
                        if col in df.columns:
                            df = df.drop([col], axis=1)
    
                    # print(df.columns)
                    
                    df = np.array(df)
                    return df
    
                X_train = drop_columns(df_train)
                X_test = drop_columns(df_test)
    
                ############################################################
                # Scaling
                ############################################################
                scaler = StandardScaler()
    
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)
    
                # Save the scaler to a file
                scaler_filename = f"{path_model_save}scaler__{fold_idx}.joblib"
                joblib.dump(scaler, scaler_filename)
    
                ############################################################
                # Oversampling (if required)
                ############################################################
                if datasets in [['fsdkaggle'], ['esc50']]:
                    try:
                        oversample = SMOTE(sampling_strategy=0.5, k_neighbors=5)
                        X_train, y_train = oversample.fit_resample(X_train, y_train)
                    except:
                        oversample = SMOTE(sampling_strategy=0.5, k_neighbors=3)
                        X_train, y_train = oversample.fit_resample(X_train, y_train)
                        
                ############################################################
                # Create Model
                ############################################################
                if model_name != 'Keras_NN':
                    feature_selection = SelectPercentile(score_func=f_classif, percentile=10)
                    # Create a pipeline with feature selection, scaling, and the model
                    model = Pipeline([
                        ('feature_selection', feature_selection),
                        # ('scaling', scaler),
                        ('classification', model_selected)
                    ])
                    
                    model.fit(X_train, y_train)
    
                    predictions = model.predict_proba(X_test)
                    y_predict = []
                    for i in range(len(predictions)):
                        predict = np.argmax(predictions[i])
                        y_predict.append(predict)
            
                else:
                    model = get_NN_model(len(X_train[0]))
                    # model.summary()
    
                    batch_size = 16
                    early_stopping_patience = 10
    
                    # Add early stopping
                    my_callbacks = [
                        tf.keras.callbacks.ModelCheckpoint(
                            filepath=path_model_save + 'Checkpoints/model_{epoch:02d}_' + f'{fold_idx}.keras', 
                            save_freq='epoch', 
                            save_best_only=True
                            ),
                        tf.keras.callbacks.EarlyStopping(
                            monitor="val_loss", 
                            patience=early_stopping_patience, 
                            restore_best_weights=True
                            )
                    ]
    
                    # Fit Model
                    history = model.fit(
                        X_train, y_train,
                        epochs=100,
                        batch_size=batch_size,
                        callbacks=my_callbacks,
                        validation_split=0.15,
                        verbose=0,
                        )
    
                    history_loss_acc(history)
            
                    test_loss, test_acc = model.evaluate(X_test, y_test)
                    print('Test Accuracy: ', round(test_acc, 3))
    
                    predictions = model.predict(X_test)
                    y_predict = []
                    for i in range(len(predictions)):
                        predict = np.argmax(predictions[i])
                        y_predict.append(predict)
                            
                ############################################################
                # Append predictions to df_test and save
                ############################################################
                df_test['true'] = y_test
                df_test['pred'] = y_predict
                
                path_results_save = f'Results_Onset/Results_Onset/{dataset_str}/{model_name}_{segment_length}s/'
                if not os.path.exists(path_results_save):
                    os.makedirs(path_results_save)
                
                df_test.to_csv(f'{path_results_save}Fold_{fold_idx}.csv', index=False)
                
                ############################################################
                # Get evaluation metrics
                ############################################################
                acc = accuracy_score(y_test, y_predict)
                cm = evaluate_matrix(y_test, y_predict)
                roc_auc, pr_auc = ROC_PR_curve(y_test, predictions)
                pre = precision_score(y_test, y_predict)
                rec = recall_score(y_test, y_predict)
                f1 = f1_score(y_test, y_predict)
                tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
                spe = tn / (tn + fp)
                sen = rec
                
                ############################################################
                # Append results
                ############################################################
                list_acc.append(acc)
                list_cm = list_cm + cm
                list_roc_auc.append(roc_auc)
                list_pr_auc.append(pr_auc)
                list_pre.append(pre)
                list_rec.append(rec)
                list_f1.append(f1)
                list_spe.append(spe)
                list_sen.append(sen)
    
                print(f"Fold {fold_idx} - F1: {round(f1, 3)}")
                
                ############################################################
                # Save model
                ############################################################
                if model_name == 'Keras_NN':
                    # Serialize model to JSON
                    model_json = model.to_json()
                    with open(f"{path_model_save}model_{fold_idx}.json", "w") as json_file:
                        json_file.write(model_json)
                    
                    model.save_weights(f"{path_model_save}model_{fold_idx}.weights.h5")
                    model.save(f"{path_model_save}model_{fold_idx}.h5")
                    # loaded_model = get_model()
                    # loaded_model.load_weights('Results/Model/kfold.h5')
                    
                else:
                    model_filename = f"{path_model_save}model_{fold_idx}.joblib"
                    joblib.dump(model, model_filename)
    
                ############################################################
                # Save results
                ############################################################
                results = [
                    ', '.join(datasets), count_labels,
                    segment_length,
                    model_name, fold_idx,
                    acc, sen, spe, pre, rec, f1, roc_auc, pr_auc, cm]
                df_results.append(results)
    
                # To the next fold
                fold_idx = fold_idx + 1
                
            results = [
                ', '.join(datasets),  count_labels,
                segment_length,
                model_name, 'Avg',
                np.mean(list_acc),
                np.mean(list_sen),
                np.mean(list_spe),
                np.mean(list_pre),
                np.mean(list_rec),
                np.mean(list_f1),
                np.mean(list_roc_auc),
                np.mean(list_pr_auc),
                list_cm]
            df_results.append(results)
    
            print(classification_report(y_test, y_predict))
            print(list_cm)
            print(f'ROC AUC: {round(np.mean(list_roc_auc), 3)}')
            print(f'PR AUC: {round(np.mean(list_pr_auc), 3)}')
            print(f'F1: {round(np.mean(list_f1), 3)}')
    
    columns = ['dataset', 'label_count', 'window_length',
               'model', 'fold', 
               'acc', 'sen', 'spe', 'pre', 'rec', 'f1', 'auc', 'auprc', 'cm']    
    df_results = pd.DataFrame(df_results, columns = columns)
    df_results.to_csv(f'Results_Onset/Model_Onset/results_prediction_{segment_length}s_onset.csv', index=False)

print('#'*60)
print('DONE')
print('#'*60)


############################################################
coswara, coughvid, esc50, fsdkaggle, virufy
Window length: 1
############################################################
{0: 1000, 1: 1000}

##################################################
coswara coughvid esc50 fsdkaggle virufy - LR - 1
##################################################
Fold 1 - F1: 0.812
Fold 2 - F1: 0.82
Fold 3 - F1: 0.786
Fold 4 - F1: 0.819
Fold 5 - F1: 0.837
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       200
           1       0.82      0.86      0.84       200

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

[[767 233]
 [152 848]]
ROC AUC: 0.869
PR AUC: 0.815
F1: 0.815

##################################################
coswara coughvid esc50 fsdkaggle virufy - DT - 1
##################################################
Fold 1 - F



Fold 1 - F1: 0.859
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 975us/step - accuracy: 0.7998 - loss: 0.6809
Test Accuracy:  0.8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step




Fold 2 - F1: 0.809
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 913us/step - accuracy: 0.8194 - loss: 1.0191
Test Accuracy:  0.837
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step




Fold 3 - F1: 0.853
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 888us/step - accuracy: 0.8388 - loss: 0.7172
Test Accuracy:  0.868
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step




Fold 4 - F1: 0.87
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 915us/step - accuracy: 0.8595 - loss: 0.7019
Test Accuracy:  0.87
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step




Fold 5 - F1: 0.874
              precision    recall  f1-score   support

           0       0.89      0.84      0.87       200
           1       0.85      0.90      0.87       200

    accuracy                           0.87       400
   macro avg       0.87      0.87      0.87       400
weighted avg       0.87      0.87      0.87       400

[[813 187]
 [118 882]]
ROC AUC: 0.922
PR AUC: 0.905
F1: 0.853
############################################################
DONE
############################################################


# Combine all

In [6]:
# Create an empty DataFrame to hold the combined data
combined_df = pd.DataFrame()

# Loop through each file and concatenate
for window_length in [0.1, 0.2, 0.3, 0.5, 0.7, 1]:
    df = pd.read_csv(f'Results_Onset/Model_Onset/results_prediction_{window_length}s_onset.csv')
    combined_df = pd.concat([combined_df, df], ignore_index=True)

# Display or save the result
print(combined_df)
combined_df.to_csv(f'Results_Onset/Model_Onset/results_prediction_All_onset.csv', index=False)

                                         dataset         label_count  \
0    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
1    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
2    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
3    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
4    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
..                                           ...                 ...   
319  coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
320  coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
321  coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
322  coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
323  coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   

     window_length     model fold     acc    sen    spe       pre    rec  \
0              0.1        LR    1  0.7600  0.715  0.805  0.