# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import ast
import os
import librosa

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
%matplotlib inline

from functions.functions_features import extract_features

# Define Feature Column Names

In [None]:
# Names of the per-segment feature columns written to the output CSVs.
# The order matters: these names are paired positionally with the values
# returned by `extract_features`, so the two must stay in sync.
columns_features = [
    # Time-domain statistics
    'mean', 'variance', 'std_dev', 'max_value', 'min_value', 'rms',
    'skewness', 'kurtosis', 'median', 'range_val', 'iqr',
    'zcr', 'energy', 'rmse', 'entropy',
    # Spectral descriptors
    'spectral_centroid', 'spectral_bandwidth', 'spectral_contrast',
    'spectral_flatness', 'spectral_rolloff', 'chroma_stft',
]

# MFCC: two columns (mean, then std) for each of the 20 coefficients,
# interleaved per coefficient.
columns_features += [
    column_name
    for coeff_idx in range(1, 21)
    for column_name in (f'mfcc_mean_{coeff_idx}_mean', f'mfcc_{coeff_idx}_std')
]

# Configuration

In [None]:
# Datasets to process. Each name is substituted into the annotation and
# output CSV file names used by the cells below.
list_dataset_name = [
    'coswara',
    'coughvid',
    'esc50',
    'fsdkaggle',
    'virufy',
]

# Overlap between consecutive segments, in seconds (it is multiplied by the
# sample rate where it is used).
overlap = 0  # To avoid overfitting when doing kfold

# Create the output directory for the extracted features.
# exist_ok=True makes this idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('Results_Onset/Features/ML', exist_ok=True)

# Extract ML features

In [None]:
RANDOM_SEED = 42             # fixed seed so the per-label subsample is reproducible
MAX_FILES_PER_LABEL = 1000   # cap on recordings kept per audio-level `label`
MAX_SEGMENTS_PER_CLASS = 5   # cap on segments kept per onset class (0 / 1) in one recording

# Metadata columns that precede the extracted features in every output row.
META_COLUMNS = [
    'dataset', 'filename', 'filepath', 'label', 'age', 'gender', 'status',
    'mean_amplitude', 'label_onset',
]


def _extract_file_rows(row, segment_length):
    """Build the feature rows for one recording.

    Parameters
    ----------
    row : namedtuple
        One row of the annotation frame (from ``DataFrame.itertuples``). It
        must expose ``filepath``, ``dataset``, ``filename``, ``label``, ``age``,
        ``gender``, ``status`` and ``label_onset`` (a sequence of 0/1 labels,
        one per segment).
    segment_length : float
        Segment duration in seconds.

    Returns
    -------
    list of list
        One entry per kept segment: the metadata values (in ``META_COLUMNS``
        order) followed by the values returned by ``extract_features``.

    Notes
    -----
    At most ``MAX_SEGMENTS_PER_CLASS`` segments are kept for each onset class.
    Segments are laid out back-to-back (segment ``j`` starts at
    ``j * segment_samples``); the notebook-level ``overlap`` is not applied.
    """
    y, sr = librosa.load(row.filepath)  # librosa defaults (mono=True)
    segment_samples = int(segment_length * sr)

    rows = []
    counter = {0: 0, 1: 0}
    for j, onset in enumerate(row.label_onset):
        # Nothing more can be kept once both classes hit their cap.
        if all(count >= MAX_SEGMENTS_PER_CLASS for count in counter.values()):
            break
        if counter[onset] >= MAX_SEGMENTS_PER_CLASS:
            continue

        start_sample = j * segment_samples
        segment = y[start_sample:start_sample + segment_samples]

        # Zero-pad a trailing segment that runs past the end of the audio so
        # every segment handed to extract_features has the same length.
        if len(segment) < segment_samples:
            padding = np.zeros(segment_samples - len(segment))
            segment = np.concatenate((segment, padding))

        mean_amplitude = np.mean(np.abs(segment))
        features = extract_features(segment, sr)

        rows.append(
            [row.dataset, row.filename, row.filepath, row.label, row.age,
             row.gender, row.status, mean_amplitude, onset] + features
        )
        counter[onset] += 1

    return rows


for segment_length in [0.1, 0.2, 0.3, 0.5, 0.7, 1]:
    for dataset_name in list_dataset_name:
        print('\n', dataset_name, segment_length)

        # Cached result: skip before doing any reading or parsing work.
        path_save = f'Results_Onset/Features/ML/data_extracted_{dataset_name}_{segment_length}s_onset_label.csv'
        if os.path.exists(path_save):
            continue

        df_all = pd.read_csv(f'Results_Onset/Data_Onset/Annotation/data_summary_{dataset_name}_{segment_length}s_onset_label.csv')
        # The onset labels are stored as a stringified list in the CSV.
        df_all['label_onset'] = df_all['label_onset'].apply(ast.literal_eval)

        # Shuffle, then keep at most MAX_FILES_PER_LABEL recordings per label.
        # The seed makes the selected subset identical across runs.
        df_all = (
            df_all.sample(frac=1, random_state=RANDOM_SEED)
            .groupby('label')
            .head(MAX_FILES_PER_LABEL)
            .reset_index(drop=True)
        )

        # Collect plain rows and build the frame once; concatenating a
        # DataFrame per file inside the loop is quadratic in the row count.
        rows_all = []
        for row in tqdm(df_all.itertuples(index=False), total=len(df_all)):
            rows_all.extend(_extract_file_rows(row, segment_length))

        results_all = pd.DataFrame(rows_all, columns=META_COLUMNS + columns_features)
        results_all.to_csv(path_save, index=False)
        print(results_all.shape)

# Plot histogram

In [None]:
# For each window length, compare the distribution of per-segment mean
# absolute amplitude across three groups, as percentage histograms on
# shared bins and shared axes.
for segment_length in [0.1, 0.2, 0.3, 0.5, 0.7, 1]:
    print(f'Window length: {segment_length}')

    # Load every dataset's extracted features and stack them in one go.
    df_all = pd.concat(
        [
            pd.read_csv(f'Results_Onset/Features/ML/data_extracted_{dataset_name}_{segment_length}s_onset_label.csv')
            for dataset_name in list_dataset_name
        ],
        ignore_index=True,
    )

    is_cough_audio = df_all['label'] == 1
    is_cough_segment = df_all['label_onset'] == 1

    # (printed name, panel title, bar colour, row mask) for each panel.
    panels = [
        ('x_non_cough_non_audio', '(a) Non-Cough (from Non-cough audio)', 'green',
         (df_all['label'] == 0) & (df_all['label_onset'] == 0)),
        ('x_non_cough_cough_audio', '(b) Non-Cough (from Cough audio)', 'red',
         is_cough_audio & (df_all['label_onset'] == 0)),
        ('x_cough_cough_audio', '(c) Cough (from Cough audio)', 'blue',
         is_cough_audio & is_cough_segment),
    ]
    groups = [df_all.loc[mask, 'mean_amplitude'].tolist() for _, _, _, mask in panels]

    for (group_name, _, _, _), values in zip(panels, groups):
        print(f'{group_name}: {len(values)}')

    # Common bin edges so the three panels are directly comparable.
    all_values = [value for values in groups for value in values]
    _, bins = np.histogram(all_values, bins=20)

    # Three side-by-side panels sharing both axes.
    fig, axes = plt.subplots(1, 3, figsize=(12, 3), sharex=True, sharey=True)

    for ax, (_, panel_title, color, _), values in zip(axes, panels, groups):
        # Weight each sample so bar heights are percentages of the group.
        # An empty group is skipped: 100 / 0 would raise ZeroDivisionError.
        if values:
            ax.hist(values, bins=bins, color=color, edgecolor='black',
                    weights=np.full(len(values), 100 / len(values)))
        ax.set_title(panel_title)
        # The plotted quantity is the `mean_amplitude` column, not RMS.
        ax.set_xlabel('Mean absolute amplitude')
        ax.grid(True)

    axes[0].set_ylabel('Frequency (%)')

    # Figure-level title; plt.title() would overwrite the last panel's title.
    fig.suptitle(f'Normalized mean-amplitude histogram - Window length: {segment_length}')
    fig.tight_layout()
    plt.show()