# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import time
import json
import csv

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import Counter
from pydub import AudioSegment # sudo apt install ffmpeg
from pprint import pprint

from functions.functions_cough import get_cough, convert_events_to_seconds, label_generator, slice_audio, audiosegment_to_amplitudes

# Main

In [2]:
# Configuration shared by the cells below.

# Datasets to process; each name selects Results/Data/data_summary_<name>.csv
# in the slicing cell.
list_dataset_name = [
    'coswara',
    'coughvid',
    'esc50',
    'fsdkaggle',
    'virufy',
]

new_sample_rate = 16000  # New sample rate in Hz

# Root folder for sliced audio and the per-dataset summary CSVs.
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('Results/Sliced_Wav', exist_ok=True)

In [3]:
# Slice every dataset into fixed-length windows and write one summary CSV per
# (dataset, window size) pair.
#
# For each window size and dataset this cell:
#   1. reads Results/Data/data_summary_<dataset>.csv,
#   2. shuffles it and keeps at most MAX_FILES_PER_LABEL rows per label,
#   3. cuts each audio file with slice_audio(...) and exports every slice as WAV,
#   4. records one row per slice (source metadata + mean absolute amplitude),
#   5. saves the rows to Results/Sliced_Wav/dataset_<dataset>_<window>s.csv.
RANDOM_SEED = 42            # fixed so the per-label subsample is reproducible
MAX_FILES_PER_LABEL = 1000  # cap on source files kept per label

columns = ['dataset',
           'filepath', 'filename',
           'filepathslice', 'filenameslice',
           'age', 'gender', 'label', 'status', 'mean_amplitude']

# window_size is passed straight to slice_audio; presumably seconds (the CSV
# name carries an "s" suffix) -- TODO confirm against functions_cough.
for window_size in [1, 5, 10]:
    for dataset_name in list_dataset_name:
        print(dataset_name, window_size)

        file_path_save = f"Results/Sliced_Wav/{dataset_name}_{window_size}/"
        os.makedirs(file_path_save, exist_ok=True)

        df_all = pd.read_csv(f'Results/Data/data_summary_{dataset_name}.csv')
        df_all = (
            df_all
            .sample(frac=1, random_state=RANDOM_SEED)
            .groupby('label')
            .head(MAX_FILES_PER_LABEL)
            .reset_index(drop=True)
        )

        rows = []      # one entry per exported slice
        failures = []  # (filepath, error) for every source file that raised

        for record in tqdm(df_all.to_dict('records')):
            try:
                data_set = record['dataset']
                file_path = record['filepath']
                file_name = record['filename']

                age = record['age']
                gender = record['gender']
                label = record['label']
                status = record['status']

                # Get slice of audio
                slices = slice_audio(file_path, window_size, new_sample_rate)

                # NOTE: `slice` shadows the builtin, but the name is kept on
                # purpose: a later cell inspects the last slice through it.
                for j, slice in enumerate(slices):
                    file_name_save = f"{file_name}_{j}.wav"
                    mean = np.mean(np.abs(audiosegment_to_amplitudes(slice)))
                    slice.export(file_path_save + file_name_save, format="wav")

                    rows.append([data_set,
                                 file_path, file_name,
                                 file_path_save + file_name_save, file_name_save,
                                 age, gender, label, status, mean])

            except Exception as error:
                # Best effort: one unreadable file must not abort the run, but
                # the failure is recorded instead of being silently dropped.
                failures.append((record.get('filepath'), repr(error)))

        if failures:
            print(f"{len(failures)} of {len(df_all)} files failed for "
                  f"{dataset_name} ({window_size}s); first failure: {failures[0]}")

        df_results = pd.DataFrame(rows, columns=columns)
        # Name the CSV after the loop's dataset_name, not the per-row `data_set`
        # value: that variable is undefined when no row could be read and stale
        # (previous dataset) when every file of this dataset failed.
        df_results.to_csv(f'Results/Sliced_Wav/dataset_{dataset_name}_{window_size}s.csv', index=False)


coswara 1


100%|██████████| 2000/2000 [01:15<00:00, 26.48it/s]


coughvid 1


100%|██████████| 2000/2000 [01:10<00:00, 28.21it/s]


esc50 1


100%|██████████| 1040/1040 [00:25<00:00, 41.22it/s]


fsdkaggle 1


100%|██████████| 1273/1273 [00:38<00:00, 32.87it/s]


virufy 1


100%|██████████| 121/121 [00:00<00:00, 171.95it/s]


coswara 5


100%|██████████| 2000/2000 [00:28<00:00, 69.16it/s] 


coughvid 5


100%|██████████| 2000/2000 [00:41<00:00, 47.76it/s]


esc50 5


100%|██████████| 1040/1040 [00:08<00:00, 119.68it/s]


fsdkaggle 5


100%|██████████| 1273/1273 [00:19<00:00, 64.09it/s]


virufy 5


100%|██████████| 121/121 [00:00<00:00, 153.78it/s]


coswara 10


100%|██████████| 2000/2000 [00:37<00:00, 53.92it/s] 


coughvid 10


100%|██████████| 2000/2000 [00:29<00:00, 67.78it/s]


esc50 10


100%|██████████| 1040/1040 [00:12<00:00, 81.69it/s]


fsdkaggle 10


100%|██████████| 1273/1273 [00:20<00:00, 61.02it/s]


virufy 10


100%|██████████| 121/121 [00:00<00:00, 222.41it/s]


In [None]:
# Preview only: df_results is rebuilt for every (dataset, window size) pair, so
# at this point it holds just the rows of the last combination processed.
df_results.head()

In [5]:
# Debug check: show the amplitude array of one slice.
# `slice` here is the loop variable left bound by the slicing cell above (the
# most recent slice it iterated over), so this cell only works after that cell
# has run; on a fresh kernel `slice` is the Python builtin instead.
audiosegment_to_amplitudes(slice)

array([-992.,  125., 8106., ...,    0.,    0.,    0.], dtype=float32)