# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import time
import joblib
import json
import csv

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from collections import Counter
from pprint import pprint
%matplotlib inline

# Preprocessing
from sklearn.preprocessing import (
    StandardScaler,
    scale
    )
from sklearn.model_selection import ( 
    KFold,
    StratifiedKFold
    ) 
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    roc_curve,
    roc_auc_score, 
    precision_recall_curve,
    auc,
    precision_score, 
    recall_score, 
    f1_score
    )

from sklearn import metrics

# Keras
import tensorflow.keras as keras
from tensorflow.keras import models
from tensorflow.keras import layers

import tensorflow as tf
from tensorflow.keras.callbacks import Callback

from functions.functions_model import get_CNN_model, history_loss_acc, evaluate_matrix, ROC_PR_curve

# Load Data, Train and Evaluate the CNN (5-fold cross-validation)

In [None]:
# Train and evaluate the CNN with stratified 5-fold cross-validation.
#
# For every window length and every dataset combination in `list_datasets`
# this cell:
#   1. loads the pre-computed feature CSVs and concatenates them,
#   2. filters by `mean_amplitude` and keeps at most 1000 rows per label,
#   3. runs 5-fold stratified cross-validation (scale -> reshape -> fit CNN),
#   4. saves the scaler, the model, per-fold predictions and the metrics.
#
# Each entry of `list_datasets` is one combination of dataset names that is
# merged and evaluated together. The commented-out entries record the label
# distribution of the individual datasets.
list_datasets = [
    # ['fsdkaggle'],    # 2% cough Counter({0: 1570, 1: 30})
    # # # ['virufy'],       # 100% cough Counter({1: 121})
    # ['esc50'],        # 2% cough Counter({0: 1960, 1: 40})
    # ['coughvid'],     # 30% cough Counter({1: 19777, 0: 10267})
    # ['coswara'],      # 25% cough Counter({0: 18914, 1: 5408})
    ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy'],
]

# Overlap tag that is part of the feature/result file names.
overlap = 0

# Second dimension of the reshaped feature matrix for each window length (s).
# NOTE(review): presumably the number of time frames per segment -- confirm
# against the feature-extraction notebook.
dimension_dictionary = {
    1: 22,
    5: 27,
    10: 27,
}

# First dimension of the reshaped feature matrix.
# NOTE(review): presumably the number of frequency (mel) bins -- confirm.
dim_first = 128

# Training hyper-parameters (identical for every fold).
batch_size = 16
early_stopping_patience = 10


def drop_columns(df):
    """Return the feature values of *df* as a NumPy array.

    Metadata columns that are not model inputs are removed first. Columns
    from the list that are absent in *df* are silently skipped, so the
    function works on frames with any subset of them.

    Args:
        df: DataFrame holding feature columns plus optional metadata columns.

    Returns:
        ``np.ndarray`` built from the remaining columns, in their original
        order.
    """
    columns = [
        'dataset', 'filename', 'filepath', 'age', 'gender', 'status',
        'duration', 'duration_segment', 'sample_frequency', 'mean_amplitude',
        'segment_shape',
    ]
    # errors='ignore' drops only the labels that actually exist.
    return np.array(df.drop(columns=columns, errors='ignore'))


for window_length in [1, 5, 10]:

    # One row per fold plus one 'Avg' row per dataset combination.
    results_rows = []
    for datasets in list_datasets:
        # Sorted copy: gives a canonical name without mutating list_datasets.
        datasets = sorted(datasets)
        print('')
        print('#'*60)
        print(', '.join(datasets))
        print('Window length:', window_length)
        print('#'*60)

        dataset_str = '_'.join(datasets)

        path_model_save = f'Results/Model_CNN/{dataset_str}/CNN_{window_length}s/'
        os.makedirs(path_model_save, exist_ok=True)

        ############################################################
        # Load data
        ############################################################
        # Read every dataset first and concatenate once instead of growing
        # a DataFrame inside the loop.
        df_all_combined = pd.concat(
            [
                pd.read_csv(f'Results/Features_CNN/data_{dataset}_features_{window_length}s_{overlap}.csv')
                for dataset in datasets
            ],
            axis=0,
        ).reset_index(drop=True)

        # Missing values are replaced by 0.
        df_all_combined = df_all_combined.fillna(0)

        ############################################################
        # Filter, subsample and get label distribution
        ############################################################
        # Keep only segments whose mean amplitude exceeds 0.005.
        df_all_combined = df_all_combined[df_all_combined['mean_amplitude'] > 0.005].reset_index(drop=True)
        # Shuffle, then keep at most 1000 rows per label.
        # NOTE(review): the shuffle is unseeded, so the subsample differs
        # between runs even though the k-fold split below is seeded.
        df_all_combined = df_all_combined.sample(frac=1).groupby('label').head(1000).reset_index(drop=True)
        list_labels = df_all_combined['label'].tolist()
        count_labels = dict(Counter(list_labels))
        pprint(count_labels)

        ############################################################
        # Get features and labels
        ############################################################
        y = df_all_combined['label'].tolist()
        X = df_all_combined.drop(columns=['label'])
        # Array view of the labels so folds can be selected by position.
        y_array = np.array(y)

        ############################################################
        # Performance Store
        ############################################################
        # Running sum of the per-fold confusion matrices; starts at 0 so the
        # first `+ cm` simply yields cm.
        list_cm = 0
        list_roc_auc, list_pr_auc = [], []
        list_pre, list_rec, list_f1 = [], [], []
        list_acc, list_spe, list_sen = [], [], []

        ############################################################
        # K-fold Cross Validation model evaluation
        ############################################################
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        for fold_idx, (train_ids, test_ids) in enumerate(kfold.split(X, y), start=1):

            ############################################################
            # Get dataset split
            ############################################################
            # X has a fresh RangeIndex, so the positional ids returned by
            # the splitter are valid labels for .loc.
            df_train = X.loc[train_ids]
            y_train = y_array[train_ids]

            # Explicit copy: prediction columns are added to df_test below.
            df_test = X.loc[test_ids].copy()
            y_test = y_array[test_ids]

            ############################################################
            # Drop useless columns
            ############################################################
            X_train = drop_columns(df_train)
            X_test = drop_columns(df_test)

            ############################################################
            # Scaling
            ############################################################
            # Fit on the training fold only, then apply to the test fold.
            scaler = StandardScaler()

            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Save the scaler to a file
            scaler_filename = f"{path_model_save}scaler_pipeline_CNN_{fold_idx}.joblib"
            joblib.dump(scaler, scaler_filename)

            ############################################################
            # Oversampling (if required)
            ############################################################
            # NOTE(review): SMOTE is not imported anywhere in the visible
            # file (presumably imblearn.over_sampling.SMOTE). This branch is
            # unreachable with the current list_datasets, but enabling
            # ['fsdkaggle'] or ['esc50'] will raise NameError until the
            # import is added.
            if datasets in [['fsdkaggle'], ['esc50']]:
                try:
                    oversample = SMOTE(sampling_strategy=0.5, k_neighbors=5)
                    X_train, y_train = oversample.fit_resample(X_train, y_train)
                except ValueError:
                    # Retry with fewer neighbours (presumably for folds with
                    # too few minority samples -- confirm). Only ValueError
                    # is retried so unrelated errors surface directly.
                    oversample = SMOTE(sampling_strategy=0.5, k_neighbors=3)
                    X_train, y_train = oversample.fit_resample(X_train, y_train)

            ############################################################
            # Reshaping
            ############################################################
            dim_second = dimension_dictionary[window_length]
            input_shape = (dim_first, dim_second, 1)
            X_train = X_train.reshape((len(X_train), dim_first, dim_second))
            X_test = X_test.reshape((len(X_test), dim_first, dim_second))

            # Add one more axis for CNN
            X_train = X_train[..., np.newaxis]
            X_test = X_test[..., np.newaxis]

            ############################################################
            # Create Model
            ############################################################
            model = get_CNN_model(input_shape)

            # Checkpoint the best model per epoch and stop early once
            # val_loss has not improved for `early_stopping_patience` epochs.
            my_callbacks = [
                tf.keras.callbacks.ModelCheckpoint(
                    filepath=path_model_save + 'Checkpoints/model_{epoch:02d}_' + f'CNN_{fold_idx}.keras',
                    save_freq='epoch',
                    save_best_only=True
                    ),
                tf.keras.callbacks.EarlyStopping(
                    monitor="val_loss",
                    patience=early_stopping_patience,
                    restore_best_weights=True
                    )
            ]

            # Fit Model
            history = model.fit(
                X_train, y_train,
                epochs=100,
                batch_size=batch_size,
                callbacks=my_callbacks,
                validation_split=0.15,
                verbose=0,
                )

            history_loss_acc(history)

            test_loss, test_acc = model.evaluate(X_test, y_test)
            print('Test Accuracy: ', round(test_acc, 3))

            # Predicted class = index of the largest output per sample.
            predictions = model.predict(X_test)
            y_predict = [np.argmax(row) for row in predictions]

            ############################################################
            # Append predictions to df_test and save
            ############################################################
            df_test['true'] = y_test
            df_test['pred'] = y_predict

            path_results_save = f'Results/Results CNN/{dataset_str}/CNN_{window_length}s/'
            os.makedirs(path_results_save, exist_ok=True)

            df_test.to_csv(f'{path_results_save}Fold_{fold_idx}.csv', index=False)

            ############################################################
            # Get evaluation metrics
            ############################################################
            acc = accuracy_score(y_test, y_predict)
            cm = evaluate_matrix(y_test, y_predict)
            roc_auc, pr_auc = ROC_PR_curve(y_test, predictions)
            pre = precision_score(y_test, y_predict)
            rec = recall_score(y_test, y_predict)
            f1 = f1_score(y_test, y_predict)
            tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
            spe = tn / (tn + fp)
            # Sensitivity is the same quantity as recall.
            sen = rec

            ############################################################
            # Append results
            ############################################################
            list_acc.append(acc)
            list_cm = list_cm + cm
            list_roc_auc.append(roc_auc)
            list_pr_auc.append(pr_auc)
            list_pre.append(pre)
            list_rec.append(rec)
            list_f1.append(f1)
            list_spe.append(spe)
            list_sen.append(sen)

            print(f"Fold {fold_idx} - F1: {round(f1, 3)}")

            ############################################################
            # Save model
            ############################################################
            # Serialize model to JSON
            model_json = model.to_json()
            with open(f"{path_model_save}model_{fold_idx}.json", "w") as json_file:
                json_file.write(model_json)

            # Weights only, plus the full model in a single file.
            model.save_weights(f"{path_model_save}model_{fold_idx}.weights.h5")
            model.save(f"{path_model_save}model_{fold_idx}.h5")

            ############################################################
            # Save results
            ############################################################
            results = [
                ', '.join(datasets), count_labels,
                window_length, overlap,
                'CNN', fold_idx,
                acc, sen, spe, pre, rec, f1, roc_auc, pr_auc, cm]
            results_rows.append(results)

        # Summary row: mean of every metric over the folds and the summed
        # confusion matrix.
        results = [
            ', '.join(datasets),  count_labels,
            window_length, overlap,
            'CNN', 'Avg',
            np.mean(list_acc),
            np.mean(list_sen),
            np.mean(list_spe),
            np.mean(list_pre),
            np.mean(list_rec),
            np.mean(list_f1),
            np.mean(list_roc_auc),
            np.mean(list_pr_auc),
            list_cm]
        results_rows.append(results)

        print(list_cm)
        print(f'ROC AUC: {np.mean(list_roc_auc)}')
        print(f'PR AUC: {np.mean(list_pr_auc)}')
        print(f'F1: {np.mean(list_f1)}')

    # One results file per window length.
    columns = ['dataset', 'label_count', 'window_length', 'overlap',
               'model', 'fold',
               'acc', 'sen', 'spe', 'pre', 'rec', 'f1', 'auc', 'auprc', 'cm']
    df_results = pd.DataFrame(results_rows, columns=columns)
    df_results.to_csv(f'Results/Model_CNN/results_prediction_CNN_{window_length}s_{overlap}.csv', index=False)

print('#'*60)
print('DONE')
print('#'*60)

# Combine the per-window-length result files

In [None]:
# Merge the per-window-length result CSVs written by the training cell into
# one summary file.
# NOTE: relies on `overlap` still being defined from the training cell.
window_lengths = [1, 5, 10]

# Read every file first and concatenate once; growing a DataFrame inside a
# loop re-copies all previously loaded rows on each iteration.
result_frames = [
    pd.read_csv(f'Results/Model_CNN/results_prediction_CNN_{window_length}s_{overlap}.csv')
    for window_length in window_lengths
]
combined_df = pd.concat(result_frames, ignore_index=True)

# Display and save the result
print(combined_df)
combined_df.to_csv('Results/Model_CNN/results_prediction_CNN_All.csv', index=False)