# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import ast
import os
import time

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import Counter
from pydub import AudioSegment # sudo apt install ffmpeg
from pprint import pprint

from functions.functions_cough import get_cough, convert_events_to_seconds, label_generator, slice_audio, audiosegment_to_amplitudes

# Load data

In [None]:
# Read the combined metadata CSV into `df_all`, then leave the frame as the
# last expression so the notebook renders it as the cell output.
df_all = pd.read_csv(filepath_or_buffer='Results/Data/data_all.csv')
df_all

In [None]:
# Tally how many rows of `df_all` carry each value of the `dataset` column.
Counter(df_all.loc[:, 'dataset'].tolist())

# Configuration: dataset names and sample rates

In [None]:
# Dataset identifiers; each one is substituted into the annotation and output
# paths used by the extraction loop.
list_dataset_name = ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy']

# Sample rates in Hz.
# NOTE(review): `fs` is not referenced in the visible cells - confirm it is
# still needed.
fs = 22_050
# Passed to `slice_audio` as its third argument.
new_sample_rate = 16_000

# Extract for Whisper

In [None]:
# Slice every annotated recording into fixed-length segments, export a capped
# number of segments per onset label as WAV files, and index them in a CSV.
#
# For each (segment_length, dataset_name) pair this cell:
#   1. reads the onset-annotation CSV for that pair,
#   2. shuffles it and keeps at most MAX_FILES_PER_LABEL rows per `label`,
#   3. cuts each recording with `slice_audio` and exports at most
#      MAX_SLICES_PER_LABEL slices per onset label (0 and 1),
#   4. writes one row per exported slice to `path_save`.
# A pair whose `path_save` CSV already exists is skipped, so delete that file
# to force a re-extraction.

MAX_FILES_PER_LABEL = 1000   # recordings kept per value of the `label` column
MAX_SLICES_PER_LABEL = 5     # slices exported per onset label, per recording

# Column order of the per-slice index. NOTE: the 'label' column here holds the
# per-slice onset label, not the recording-level `label` of the annotation CSV.
columns = ['dataset',
           'filepath', 'filename',
           'filepathslice', 'filenameslice',
           'age', 'gender', 'label', 'status', 'mean_amplitude']

for segment_length in [0.1, 0.2, 0.3, 0.5, 0.7, 1]:
    for dataset_name in list_dataset_name:
        print('\n', dataset_name, segment_length)

        file_path_save = f"Results_Onset/Sliced_Wav_Onset/{dataset_name}_{segment_length}s/"
        os.makedirs(file_path_save, exist_ok=True)

        # NOTE(review): this rebinds the notebook-level `df_all` and parses the
        # annotation file even when the result is already cached below. Both
        # are kept as-is because other cells may rely on that state.
        df_all = pd.read_csv(f'Results_Onset/Data_Onset/Annotation/data_summary_{dataset_name}_{segment_length}s_onset_label.csv')
        df_all['label_onset'] = df_all['label_onset'].apply(ast.literal_eval)
        df_all['label_event'] = df_all['label_event'].apply(ast.literal_eval)
        # NOTE(review): the shuffle is unseeded, so the selected recordings
        # differ between runs - pass `random_state` if reproducibility matters.
        df_all = (
            df_all.sample(frac=1)
            .groupby('label')
            .head(MAX_FILES_PER_LABEL)
            .reset_index(drop=True)
        )

        path_save = f'Results_Onset/Sliced_Wav_Onset/dataset_{dataset_name}_{segment_length}s_onset.csv'
        if os.path.exists(path_save):
            continue

        slice_rows = []
        failures = []  # (filepath, exception) for recordings that raised

        for row in tqdm(df_all.itertuples(index=False), total=len(df_all)):
            try:
                file_name = row.filename
                label_onset = row.label_onset

                # Get slices of the audio.
                slices = slice_audio(row.filepath, segment_length, new_sample_rate)

                label_counter = {0: 0, 1: 0}
                # Labels are matched to slices by position; zip stops at the
                # shorter of the two sequences.
                for j, (audio_slice, onset) in enumerate(zip(slices, label_onset)):
                    if label_counter[onset] >= MAX_SLICES_PER_LABEL:
                        continue

                    file_name_save = f"{file_name}_{j}.wav"
                    audio_slice.export(file_path_save + file_name_save, format="wav")

                    mean = np.mean(np.abs(audiosegment_to_amplitudes(audio_slice)))

                    label_counter[onset] += 1
                    slice_rows.append([
                        row.dataset,
                        row.filepath, file_name,
                        file_path_save + file_name_save, file_name_save,
                        row.age, row.gender, onset, row.status, mean,
                    ])

                    # Nothing more can be exported for this recording once
                    # every label has reached its cap.
                    if min(label_counter.values()) >= MAX_SLICES_PER_LABEL:
                        break

            except Exception as error:
                # Best effort: one unreadable or mislabelled recording must
                # not abort the run, but it is recorded instead of being
                # silently dropped. Slices exported before the error are kept.
                failures.append((getattr(row, 'filepath', '<unknown>'), error))

        if failures:
            print(f'{len(failures)} of {len(df_all)} recordings raised an error and may be missing slices:')
            for failed_path, error in failures[:5]:
                print(f'  {failed_path}: {error!r}')

        df_results = pd.DataFrame(slice_rows, columns=columns)
        df_results.to_csv(path_save, index=False)
        print(df_results.shape)
        if df_results.empty:
            # `.loc[0]` would raise KeyError on an empty frame and stop the
            # remaining dataset/segment-length pairs from being processed.
            print(f'No slices were exported; delete {path_save} to retry.')
        else:
            print(df_results[['label']].loc[0])


In [None]:
# Display the slice index built for the last (dataset, segment length) pair
# that was actually processed. `df_results` is only assigned when a pair's
# output CSV did not already exist, so this raises NameError in a fresh kernel
# if every pair was skipped.
df_results