# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import time
import json
import csv
import librosa
import librosa.display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from collections import Counter
from pprint import pprint
%matplotlib inline

from functions.functions_features import split_audio, mean_variance_normalize, process_row, pad_array, process_CNN_row

# Load data

In [None]:
# Load the combined data table from disk; the following cell tallies its
# 'dataset' column.
data_all_path = 'Results/Data/data_all.csv'
df_all = pd.read_csv(data_all_path)

# Bare last expression so the notebook renders the frame.
df_all

In [None]:
# Count how many rows of the loaded table belong to each dataset.
dataset_per_row = df_all['dataset'].tolist()
Counter(dataset_per_row)

# Extract Features

In [None]:
# Header of the per-segment feature table. The list is applied positionally
# when the extraction cell wraps the rows returned by process_row in a
# DataFrame, so the order here is significant.
_column_groups = (
    # Source of the recording
    ('dataset', 'filepath', 'filename'),
    # Per-recording annotations
    ('age', 'gender', 'label', 'status', 'duration'),
    # Segment / signal description
    ('duration_segment', 'sample_frequency', 'mean_amplitude'),
    # Summary statistics
    ('mean', 'variance', 'std_dev', 'max_value', 'min_value', 'rms'),
    ('skewness', 'kurtosis', 'median', 'range_val', 'iqr'),
    ('zcr', 'energy', 'rmse', 'entropy'),
    # Spectral descriptors
    ('spectral_centroid', 'spectral_bandwidth', 'spectral_contrast'),
    ('spectral_flatness', 'spectral_rolloff', 'chroma_stft'),
)
columns = [name for group in _column_groups for name in group]

# Two columns (mean, std) for each of the 20 MFCC indices, interleaved.
# NOTE(review): the mean column is spelled 'mfcc_mean_{i}_mean' while the std
# column is 'mfcc_{i}_std'. The spelling is kept as-is because it becomes the
# header of the saved CSV files -- confirm before renaming.
columns.extend(
    name
    for mfcc_index in range(1, 21)
    for name in (f'mfcc_mean_{mfcc_index}_mean', f'mfcc_{mfcc_index}_std')
)

In [None]:
# Names of the datasets to process. Each name is interpolated into the
# per-dataset input and output file paths used by the cells below.
list_dataset_name = [
    'coswara',
    'coughvid',
    'esc50',
    'fsdkaggle',
    'virufy',
    ]

# Create the output folder for the feature tables. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs('Results/Features', exist_ok=True)

# Segment overlap handed to process_row; it is also part of the output file name.
overlap = 0  # To avoid overfitting when doing kfold

In [None]:
# Build one feature CSV per (segment length, dataset) pair.
#
# An existing output file acts as a cache: the pair is skipped before any
# input is read, so a full re-run of the notebook only computes what is missing.

# Fixed seed for the shuffle below, so the same rows are selected on every run
# and for every segment length.
RANDOM_STATE = 42

for segment_length in [1, 5, 10]:
    for dataset_name in list_dataset_name:
        print('\n', dataset_name, segment_length)

        path_save = f'Results/Features/data_{dataset_name}_features_{segment_length}s_{overlap}.csv'
        if os.path.exists(path_save):
            continue

        # Use a dedicated name so the notebook-level df_all is not overwritten.
        df_summary = pd.read_csv(f'Results/Data/data_summary_{dataset_name}.csv')

        # Shuffle the rows, then keep at most 1000 rows per label.
        df_summary = (
            df_summary
            .sample(frac=1, random_state=RANDOM_STATE)
            .groupby('label')
            .head(1000)
            .reset_index(drop=True)
        )

        # process_row returns an iterable of feature rows for the i-th entry;
        # collect them all and build the table once.
        results_all = []
        for i in tqdm(range(len(df_summary))):
            results_all.extend(process_row(i, df_summary, segment_length, overlap))

        results_all_df = pd.DataFrame(results_all, columns=columns)
        results_all_df.to_csv(path_save, index=False)
        print(results_all_df.shape)

# Analyze features

In [None]:
# Combine the per-dataset feature tables for one segment length, save the
# combined table plus a per-dataset duration summary, and leave a numeric-only
# frame in df_all_combined for the cells below.
segment_length = 10

feature_frames = []  # cleaned per-dataset feature tables, concatenated once below
duration_rows = []   # one [dataset, median, mean, min, max] row per dataset

for dataset_name in list_dataset_name:
    ############################################################
    # Load data
    ############################################################
    print(dataset_name)
    df_all = pd.read_csv(f'Results/Features/data_{dataset_name}_features_{segment_length}s_{overlap}.csv')

    # NOTE(review): these statistics run over every row of the feature file.
    # If a recording contributes several segment rows, its duration is counted
    # once per segment -- confirm this weighting is intended.
    list_duration = df_all['duration']
    duration_rows.append([
        dataset_name,
        np.median(list_duration), np.mean(list_duration),
        np.min(list_duration), np.max(list_duration),
    ])

    ############################################################
    # Clean data
    ############################################################
    df_all = df_all.drop(['filepath', 'age', 'gender', 'status'], axis=1)
    feature_frames.append(df_all)

# Single concat instead of growing a DataFrame inside the loop: linear rather
# than quadratic copying, and no concatenation with an empty seed frame.
df_all_combined = pd.concat(feature_frames, axis=0, ignore_index=True)
df_all_combined.to_csv('Results/Features/data_combined_features.csv', index=False)

# Separate name for this header so the feature schema stored in `columns`
# (used by the extraction cell) is not overwritten.
duration_columns = ['dataset', 'median', 'mean', 'min', 'max']
results_all = pd.DataFrame(duration_rows, columns=duration_columns)
results_all.to_csv('Results/Features/data_combined_duration.csv', index=False)

# Drop the identifier columns and replace missing values with 0 for the
# in-memory frame; the CSV written above still contains them.
df_all_combined = df_all_combined.drop(['dataset', 'filename'], axis=1)
df_all_combined = df_all_combined.fillna(0)
df_all_combined

In [None]:
# Histogram of the 'duration' column, with bar heights expressed as a
# percentage of all rows.
values = df_all_combined['duration']
bins = 10
list_range = (0, 40)  # x-range covered by the plotted bins

# Raw bin counts over the full data range. These are not used by the plot
# below, which bins over list_range instead.
counts, bin_edges = np.histogram(values, bins=bins)
total = counts.sum()

# Each row weighs 100 / N, so the bars read as percentages of all rows.
percent_weights = np.ones_like(values) * 100 / len(values)

fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(
    values,
    bins=bins,
    range=list_range,
    weights=percent_weights,
    edgecolor='black',
)

# Title and axis labels
ax.set_title('Histogram of Audio Durations')
ax.set_xlabel('Duration (s)')
ax.set_ylabel('Percentage (%)')

# Render the figure
plt.show()