# Load Libraries

In [1]:
import tensorflow as tf

# Sanity check: report how many GPUs TensorFlow can see before any training
# starts. Uses the stable `tf.config.list_physical_devices` entry point rather
# than the older `tf.config.experimental` alias.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

2025-09-18 03:28:44.162362: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-18 03:28:44.225833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-18 03:28:44.242749: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-18 03:28:44.247781: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-18 03:28:44.260293: I tensorflow/core/platform/cpu_feature_guar

Num GPUs Available:  0


2025-09-18 03:28:54.523253: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import time
import joblib
import json
import csv

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from collections import Counter
from pprint import pprint
%matplotlib inline

# Preprocessing
from sklearn.preprocessing import (
    LabelEncoder, 
    StandardScaler,
    MinMaxScaler,
    scale
    )
from sklearn.model_selection import (
    KFold,
    StratifiedKFold
    ) 
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    roc_curve,
    roc_auc_score, 
    precision_recall_curve,
    auc,
    precision_score, 
    recall_score, 
    f1_score
    )

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn import metrics
from sklearn.feature_selection import SelectPercentile, f_classif

# Keras
import tensorflow.keras as keras
from tensorflow.keras import models
from tensorflow.keras import layers

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import Callback

# Imbalance learning
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from functions_model import get_NN_model, history_loss_acc, evaluate_matrix, ROC_PR_curve

# Load Data

In [3]:
# List of models to evaluate.
#
# Maps the short model name (used in result paths and result tables) to an
# unfitted estimator. "Keras_NN" maps to None because that network is not a
# scikit-learn estimator; the training cell builds it with get_NN_model().
#
# Every estimator with a stochastic component gets an explicit seed so that
# repeated runs of the notebook are comparable. KNN and NB take no
# random_state, and LR's default solver does not use one.
RANDOM_STATE = 42

models_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RF": RandomForestClassifier(random_state=RANDOM_STATE),
    # probability=True is required because the training cell calls predict_proba().
    "SVM": SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "NB": GaussianNB(),
    "NN": MLPClassifier(hidden_layer_sizes=(100,), random_state=RANDOM_STATE),
    "GB": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "Keras_NN": None,
}

In [4]:
# Overlap setting of the feature windows; it is only used to build the
# feature/result file names (the `_{overlap}` suffix).
overlap = 0

# Each entry is one experiment: the group of datasets whose feature files are
# concatenated before training. Only the combined run is enabled.
#
# Single-dataset runs, currently disabled, with the label counts recorded for
# each of them:
#     ['fsdkaggle']   2% cough    Counter({0: 1570, 1: 30})
#     ['virufy']      100% cough  Counter({1: 121})
#     ['esc50']       2% cough    Counter({0: 1960, 1: 40})
#     ['coughvid']    30% cough   Counter({1: 19777, 0: 10267})
#     ['coswara']     25% cough   Counter({0: 18914, 1: 5408})
# NOTE(review): for coughvid the stated percentage does not match the counts
# (label 1 is about 66% of the rows) -- confirm which label means "cough" there.
list_datasets = [
    [
        'coswara',
        'coughvid',
        'esc50',
        'fsdkaggle',
        'virufy',
    ],
]

In [5]:
############################################################
# Experiment constants
############################################################
SEED = 42            # shared by the balancing shuffle, SMOTE and the CV splitter
N_PER_CLASS = 1000   # rows kept per label after shuffling
N_SPLITS = 5         # number of stratified CV folds

# Columns removed before the data is handed to a model.
NON_FEATURE_COLUMNS = [
    'dataset', 'filename', 'filepath', 'age', 'gender', 'status',
    'duration', 'duration_segment', 'sample_frequency',
    'mean', 'variance', 'std_dev', 'skewness', 'kurtosis',
    'median', 'range_val', 'iqr', 'mean_amplitude',
]

# Column layout of the per-window-length results CSV.
RESULT_COLUMNS = [
    'dataset', 'label_count', 'window_length', 'overlap',
    'model', 'fold',
    'acc', 'sen', 'spe', 'pre', 'rec', 'f1', 'auc', 'auprc', 'cm',
]


def drop_columns(df):
    """Return `df` as a NumPy array without the non-feature columns.

    Any name in NON_FEATURE_COLUMNS that is not present in `df` is ignored.
    The input frame itself is not modified.
    """
    return np.array(df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore'))


# For every window length: load and balance the data, run 5-fold CV for every
# model in models_dict, save scaler/model/predictions per fold, and write one
# results CSV per window length.
for window_length in [1, 5, 10]:
    result_rows = []
    for datasets in list_datasets:
        # Work on a sorted copy so the shared list_datasets config is not
        # reordered as a side effect of running this cell.
        datasets = sorted(datasets)
        print('')
        print('#'*60)
        print(', '.join(datasets))
        print(f'Window Length: {window_length}')
        print('#'*60)

        dataset_str = '_'.join(datasets)

        ############################################################
        # Load data
        ############################################################
        # Read every feature file first and concatenate once.
        feature_frames = [
            pd.read_csv(f'Results/Features/data_{dataset}_features_{window_length}s_{overlap}.csv')
            for dataset in datasets
        ]
        df_all_combined = pd.concat(feature_frames, axis=0, ignore_index=True)
        df_all_combined = df_all_combined.fillna(0)

        ############################################################
        # Get label distribution
        ############################################################
        # Distribution before balancing.
        pprint(dict(Counter(df_all_combined['label'].tolist())))

        # Balance classes: shuffle, then keep at most N_PER_CLASS rows per
        # label. The shuffle is seeded so the selected subset is reproducible.
        df_all_combined = (
            df_all_combined
            .sample(frac=1, random_state=SEED)
            .groupby('label')
            .head(N_PER_CLASS)
            .reset_index(drop=True)
        )
        # Distribution after balancing; also stored in every result row.
        count_labels = dict(Counter(df_all_combined['label'].tolist()))
        pprint(count_labels)

        ############################################################
        # Get features and labels
        ############################################################
        y = df_all_combined['label'].to_numpy()
        X = df_all_combined.drop(columns=['label'])

        ############################################################
        # Loop different models
        ############################################################
        # NOTE: the estimator instance from models_dict is reused (refit) in
        # every fold; each fitted state is dumped to disk right after its fold.
        for model_name, model_selected in models_dict.items():
            print('')
            print('#'*50)
            print(' '.join(datasets), '-', model_name, '-', window_length)
            print('#'*50)

            path_model_save = f'Results/Model/{dataset_str}/{model_name}_{window_length}s/'
            os.makedirs(path_model_save, exist_ok=True)

            path_results_save = f'Results/Results/{dataset_str}/{model_name}_{window_length}s/'
            os.makedirs(path_results_save, exist_ok=True)

            ############################################################
            # Performance Store
            ############################################################
            list_cm = 0  # running sum of the per-fold confusion matrices
            list_roc_auc, list_pr_auc = [], []
            list_pre, list_rec, list_f1 = [], [], []
            list_acc, list_spe, list_sen = [], [], []

            ############################################################
            # K-fold Cross Validation model evaluation
            ############################################################
            # NOTE(review): rows are split individually. If several rows come
            # from the same recording they can end up in both the train and the
            # test fold -- confirm whether a grouped split (e.g. by 'filename')
            # is needed.
            kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
            for fold_idx, (train_ids, test_ids) in enumerate(kfold.split(X, y), start=1):

                ############################################################
                # Get dataset split
                ############################################################
                # X has a fresh RangeIndex, so positional indexing matches the
                # fold indices returned by the splitter.
                df_train = X.iloc[train_ids]
                y_train = y[train_ids]

                # Explicit copy: prediction columns are added to it below.
                df_test = X.iloc[test_ids].copy()
                y_test = y[test_ids]

                ############################################################
                # Drop useless columns
                ############################################################
                X_train = drop_columns(df_train)
                X_test = drop_columns(df_test)

                ############################################################
                # Scaling
                ############################################################
                # Fit on the training fold only, then apply to the test fold.
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                # Save the scaler to a file
                scaler_filename = f"{path_model_save}scaler_{fold_idx}.joblib"
                joblib.dump(scaler, scaler_filename)

                ############################################################
                # Oversampling (if required)
                ############################################################
                # Only the single-dataset fsdkaggle / esc50 runs are oversampled.
                if datasets in [['fsdkaggle'], ['esc50']]:
                    try:
                        oversample = SMOTE(sampling_strategy=0.5, k_neighbors=5, random_state=SEED)
                        X_train, y_train = oversample.fit_resample(X_train, y_train)
                    except ValueError:
                        # Too few minority samples for 5 neighbours: retry with 3.
                        oversample = SMOTE(sampling_strategy=0.5, k_neighbors=3, random_state=SEED)
                        X_train, y_train = oversample.fit_resample(X_train, y_train)

                ############################################################
                # Create Model
                ############################################################
                if model_name != 'Keras_NN':
                    # Pipeline: keep the top 10% of features by ANOVA F-score,
                    # then classify. Scaling was already applied above.
                    feature_selection = SelectPercentile(score_func=f_classif, percentile=10)
                    model = Pipeline([
                        ('feature_selection', feature_selection),
                        ('classification', model_selected)
                    ])

                    model.fit(X_train, y_train)

                    # Class probabilities are kept for the ROC/PR curves; the
                    # hard prediction is the most probable class.
                    predictions = model.predict_proba(X_test)
                    y_predict = list(np.argmax(predictions, axis=1))

                else:
                    model = get_NN_model(len(X_train[0]))

                    batch_size = 16
                    early_stopping_patience = 10

                    # Checkpoint the best epochs and stop early on val_loss.
                    my_callbacks = [
                        tf.keras.callbacks.ModelCheckpoint(
                            filepath=path_model_save + 'Checkpoints/model_{epoch:02d}_' + f'{fold_idx}.keras',
                            save_freq='epoch',
                            save_best_only=True
                            ),
                        tf.keras.callbacks.EarlyStopping(
                            monitor="val_loss",
                            patience=early_stopping_patience,
                            restore_best_weights=True
                            )
                    ]

                    # Fit Model
                    history = model.fit(
                        X_train, y_train,
                        epochs=100,
                        batch_size=batch_size,
                        callbacks=my_callbacks,
                        validation_split=0.15,
                        verbose=0,
                        )

                    history_loss_acc(history)

                    test_loss, test_acc = model.evaluate(X_test, y_test)
                    # Plain colon: '\:' is an invalid escape sequence and the
                    # backslash was being printed literally.
                    print('Test Accuracy: ', round(test_acc, 3))

                    predictions = model.predict(X_test)
                    y_predict = list(np.argmax(predictions, axis=1))

                ############################################################
                # Append predictions to df_test and save
                ############################################################
                df_test['true'] = y_test
                df_test['pred'] = y_predict
                df_test.to_csv(f'{path_results_save}Fold_{fold_idx}.csv', index=False)

                ############################################################
                # Get evaluation metrics
                ############################################################
                acc = accuracy_score(y_test, y_predict)
                cm = evaluate_matrix(y_test, y_predict)
                roc_auc, pr_auc = ROC_PR_curve(y_test, predictions)
                pre = precision_score(y_test, y_predict)
                rec = recall_score(y_test, y_predict)
                f1 = f1_score(y_test, y_predict)
                tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
                spe = tn / (tn + fp)  # specificity
                sen = rec             # sensitivity is the recall of class 1

                ############################################################
                # Append results
                ############################################################
                list_acc.append(acc)
                list_cm = list_cm + cm
                list_roc_auc.append(roc_auc)
                list_pr_auc.append(pr_auc)
                list_pre.append(pre)
                list_rec.append(rec)
                list_f1.append(f1)
                list_spe.append(spe)
                list_sen.append(sen)

                print(f"Fold {fold_idx} - F1: {round(f1, 3)}")

                ############################################################
                # Save model
                ############################################################
                if model_name == 'Keras_NN':
                    # Architecture as JSON, weights only, and the full model.
                    model_json = model.to_json()
                    with open(f"{path_model_save}model_{fold_idx}.json", "w") as json_file:
                        json_file.write(model_json)

                    model.save_weights(f"{path_model_save}model_{fold_idx}.weights.h5")
                    model.save(f"{path_model_save}model_{fold_idx}.h5")
                else:
                    model_filename = f"{path_model_save}model_{fold_idx}.joblib"
                    joblib.dump(model, model_filename)

                ############################################################
                # Save results
                ############################################################
                result_rows.append([
                    ', '.join(datasets), count_labels,
                    window_length, overlap,
                    model_name, fold_idx,
                    acc, sen, spe, pre, rec, f1, roc_auc, pr_auc, cm])

            # Summary row: mean of every metric over the folds, plus the summed
            # confusion matrix.
            result_rows.append([
                ', '.join(datasets), count_labels,
                window_length, overlap,
                model_name, 'Avg',
                np.mean(list_acc),
                np.mean(list_sen),
                np.mean(list_spe),
                np.mean(list_pre),
                np.mean(list_rec),
                np.mean(list_f1),
                np.mean(list_roc_auc),
                np.mean(list_pr_auc),
                list_cm])

            print(list_cm)
            print(f'ROC AUC: {round(np.mean(list_roc_auc), 3)}')
            print(f'PR AUC: {round(np.mean(list_pr_auc), 3)}')
            print(f'F1: {round(np.mean(list_f1), 3)}')

    # One CSV per window length, holding every dataset group and model.
    df_results = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
    df_results.to_csv(f'Results/Model/results_prediction_{window_length}s_{overlap}.csv', index=False)

print('#'*60)
print('DONE')
print('#'*60)


############################################################
coswara, coughvid, esc50, fsdkaggle, virufy
Window Length: 1
############################################################
{0: 32118, 1: 16689}
{0: 1000, 1: 1000}

##################################################
coswara coughvid esc50 fsdkaggle virufy - LR - 1
##################################################
Fold 1 - F1: 0.69
Fold 2 - F1: 0.685
Fold 3 - F1: 0.693
Fold 4 - F1: 0.71
Fold 5 - F1: 0.724
[[680 320]
 [288 712]]
ROC AUC: 0.761
PR AUC: 0.733
F1: 0.701

##################################################
coswara coughvid esc50 fsdkaggle virufy - DT - 1
##################################################
Fold 1 - F1: 0.624
Fold 2 - F1: 0.642
Fold 3 - F1: 0.631
Fold 4 - F1: 0.678
Fold 5 - F1: 0.649
[[585 415]
 [327 673]]
ROC AUC: 0.629
PR AUC: 0.717
F1: 0.645

##################################################
coswara coughvid esc50 fsdkaggle virufy - RF - 1
##################################################
Fold 1 -



Fold 1 - F1: 0.709
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7245 - loss: 0.8963  
Test Accuracy\:  0.717
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Fold 2 - F1: 0.71




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 891us/step - accuracy: 0.7445 - loss: 0.7878
Test Accuracy\:  0.725
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Fold 3 - F1: 0.719




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7624 - loss: 0.7237  
Test Accuracy\:  0.752
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Fold 4 - F1: 0.739




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7648 - loss: 0.8298  
Test Accuracy\:  0.757
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Fold 5 - F1: 0.76




[[764 236]
 [293 707]]
ROC AUC: 0.826
PR AUC: 0.809
F1: 0.727

############################################################
coswara, coughvid, esc50, fsdkaggle, virufy
Window Length: 5
############################################################
{0: 7399, 1: 4127}
{0: 1000, 1: 1000}

##################################################
coswara coughvid esc50 fsdkaggle virufy - LR - 5
##################################################
Fold 1 - F1: 0.782
Fold 2 - F1: 0.766
Fold 3 - F1: 0.787
Fold 4 - F1: 0.819
Fold 5 - F1: 0.794
[[723 277]
 [167 833]]
ROC AUC: 0.83
PR AUC: 0.809
F1: 0.789

##################################################
coswara coughvid esc50 fsdkaggle virufy - DT - 5
##################################################
Fold 1 - F1: 0.681
Fold 2 - F1: 0.704
Fold 3 - F1: 0.748
Fold 4 - F1: 0.76
Fold 5 - F1: 0.714
[[718 282]
 [276 724]]
ROC AUC: 0.722
PR AUC: 0.791
F1: 0.722

##################################################
coswara coughvid esc50 fsdkaggle virufy - RF - 5



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 954us/step - accuracy: 0.8190 - loss: 0.6900
Test Accuracy\:  0.832
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step




Fold 2 - F1: 0.832
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 919us/step - accuracy: 0.8256 - loss: 0.6673
Test Accuracy\:  0.815
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Fold 3 - F1: 0.813




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 976us/step - accuracy: 0.8174 - loss: 0.5486
Test Accuracy\:  0.835
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Fold 4 - F1: 0.841




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8497 - loss: 0.5674  
Test Accuracy\:  0.832
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step




Fold 5 - F1: 0.823
[[842 158]
 [189 811]]
ROC AUC: 0.902
PR AUC: 0.905
F1: 0.823

############################################################
coswara, coughvid, esc50, fsdkaggle, virufy
Window Length: 10
############################################################
{0: 4906, 1: 2573}
{0: 1000, 1: 1000}

##################################################
coswara coughvid esc50 fsdkaggle virufy - LR - 10
##################################################
Fold 1 - F1: 0.788
Fold 2 - F1: 0.806
Fold 3 - F1: 0.811
Fold 4 - F1: 0.813
Fold 5 - F1: 0.798
[[718 282]
 [140 860]]
ROC AUC: 0.846
PR AUC: 0.799
F1: 0.803

##################################################
coswara coughvid esc50 fsdkaggle virufy - DT - 10
##################################################
Fold 1 - F1: 0.754
Fold 2 - F1: 0.771
Fold 3 - F1: 0.757
Fold 4 - F1: 0.768
Fold 5 - F1: 0.726
[[740 260]
 [236 764]]
ROC AUC: 0.754
PR AUC: 0.815
F1: 0.755

##################################################
coswara coughvid esc50 f



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 918us/step - accuracy: 0.8283 - loss: 0.5575
Test Accuracy\:  0.845
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step




Fold 2 - F1: 0.847
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.8572 - loss: 0.5694
Test Accuracy\:  0.848
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
Fold 3 - F1: 0.845




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 864us/step - accuracy: 0.8451 - loss: 0.6337
Test Accuracy\:  0.853
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Fold 4 - F1: 0.854




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8636 - loss: 0.5090 
Test Accuracy\:  0.875
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 74ms/step




Fold 5 - F1: 0.88
[[855 145]
 [151 849]]
ROC AUC: 0.926
PR AUC: 0.922
F1: 0.851
############################################################
DONE
############################################################


# Combine all

In [7]:
# Merge the per-window-length result tables written by the training cell into
# a single table. The frames are read first and concatenated once, instead of
# growing a DataFrame inside the loop. `overlap` comes from the configuration
# cell and selects the matching files.
window_result_frames = [
    pd.read_csv(f'Results/Model/results_prediction_{window_length}s_{overlap}.csv')
    for window_length in [1, 5, 10]
]
combined_df = pd.concat(window_result_frames, ignore_index=True)

# Display and save the result
print(combined_df)
combined_df.to_csv('Results/Model/results_prediction_All.csv', index=False)

                                         dataset         label_count  \
0    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
1    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
2    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
3    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
4    coswara, coughvid, esc50, fsdkaggle, virufy  {0: 1000, 1: 1000}   
..                                           ...                 ...   
157  coswara, coughvid, esc50, fsdkaggle, virufy  {1: 1000, 0: 1000}   
158  coswara, coughvid, esc50, fsdkaggle, virufy  {1: 1000, 0: 1000}   
159  coswara, coughvid, esc50, fsdkaggle, virufy  {1: 1000, 0: 1000}   
160  coswara, coughvid, esc50, fsdkaggle, virufy  {1: 1000, 0: 1000}   
161  coswara, coughvid, esc50, fsdkaggle, virufy  {1: 1000, 0: 1000}   

     window_length  overlap     model fold     acc    sen    spe       pre  \
0                1        0        LR    1  0.6925  0.685