# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import ast
import os
import time

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import Counter
from pydub import AudioSegment # sudo apt install ffmpeg
from pprint import pprint

from functions.functions_cough import get_cough, convert_events_to_seconds, label_generator, slice_audio, audiosegment_to_amplitudes

# Load data

In [3]:
# Read the combined metadata table (one row per audio file, all datasets)
# and show it as the cell output.
DATA_ALL_CSV = 'Results/Data/data_all.csv'
df_all = pd.read_csv(DATA_ALL_CSV)
df_all

Unnamed: 0,dataset,filepath,filename,age,gender,label,status,prob
0,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_cough-shallow,28.0,male,1,healthy,
1,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_cough-heavy,28.0,male,1,healthy,
2,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_breathing-shallow,28.0,male,0,healthy,
3,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-a,28.0,male,0,healthy,
4,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-o,28.0,male,0,healthy,
...,...,...,...,...,...,...,...,...
72335,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-4.wav,37.0,male,1,negative,
72336,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-1.wav,37.0,male,1,negative,
72337,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-5.wav,24.0,female,1,negative,
72338,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-0.wav,24.0,female,1,negative,


In [4]:
# Number of rows contributed by each source dataset.
# Counter consumes the column's values directly; no intermediate list is needed.
Counter(df_all['dataset'])

Counter({'coughvid': 34434,
         'coswara': 24712,
         'fsdkaggle': 11073,
         'esc50': 2000,
         'virufy': 121})

# Configuration: dataset names and sample rates

In [5]:
# Datasets to process; each name is substituted into the annotation and output
# paths built by the slicing loop.
list_dataset_name = ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy']

# NOTE(review): presumably the sample rate (Hz) of the source recordings --
# not referenced by any cell visible here; confirm before removing.
fs = 22050

# Target sample rate in Hz, handed to slice_audio() when cutting the recordings.
new_sample_rate = 16_000

# Extract onset-labelled audio slices for Whisper

In [6]:
# Export onset-labelled audio slices and one summary CSV per
# (segment_length, dataset_name) pair.
#
# For each pair this cell:
#   1. skips the work (and just loads the summary) if the summary CSV already exists;
#   2. reads the onset annotation CSV and parses the list-valued columns;
#   3. shuffles the recordings and keeps at most MAX_FILES_PER_LABEL per file-level label;
#   4. cuts each recording with slice_audio() and exports at most
#      MAX_SLICES_PER_LABEL slices per onset label (0 / 1) as WAV files;
#   5. writes one row per exported slice to the summary CSV.
#
# `df_results` always ends up holding the summary of the last pair handled,
# whether it was freshly computed or loaded from the cached CSV.
# The annotation table is kept in `df_annotations` so the `df_all` frame loaded
# at the top of the notebook is not overwritten.

RANDOM_SEED = 42             # fixed shuffle seed so the sampled subset is reproducible
MAX_FILES_PER_LABEL = 1000   # recordings kept per file-level label
MAX_SLICES_PER_LABEL = 5     # slices exported per onset label and recording

RESULT_COLUMNS = ['dataset',
                  'filepath', 'filename',
                  'filepathslice', 'filenameslice',
                  'age', 'gender', 'label', 'status', 'mean_amplitude']

for segment_length in [0.1, 0.2, 0.3, 0.5, 0.7, 1]:
    for dataset_name in list_dataset_name:
        print('\n', dataset_name, segment_length)

        file_path_save = f"Results_Onset/Sliced_Wav_Onset/{dataset_name}_{segment_length}s/"
        # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
        os.makedirs(file_path_save, exist_ok=True)

        path_save = f'Results_Onset/Sliced_Wav_Onset/dataset_{dataset_name}_{segment_length}s_onset.csv'

        # Cached: reuse the stored summary instead of re-parsing annotations and
        # re-exporting audio. Loading it keeps `df_results` defined on a fresh kernel.
        if os.path.exists(path_save):
            df_results = pd.read_csv(path_save)
            print(f'cached summary loaded: {df_results.shape}')
            continue

        df_annotations = pd.read_csv(f'Results_Onset/Data_Onset/Annotation/data_summary_{dataset_name}_{segment_length}s_onset_label.csv')
        # The list-valued columns are stored as their string repr; parse them back.
        for list_column in ('label_onset', 'label_event'):
            df_annotations[list_column] = df_annotations[list_column].apply(ast.literal_eval)
        df_annotations = (
            df_annotations
            .sample(frac=1, random_state=RANDOM_SEED)
            .groupby('label')
            .head(MAX_FILES_PER_LABEL)
            .reset_index(drop=True)
        )

        slice_rows = []
        failures = []  # (filepath, repr(error)) for recordings that raised while being processed

        for record in tqdm(df_annotations.itertuples(index=False), total=len(df_annotations)):
            try:
                label_onset = record.label_onset

                # Get slices of audio
                slices = slice_audio(record.filepath, segment_length, new_sample_rate)

                label_counter = {0: 0, 1: 0}
                for j, audio_slice in enumerate(slices):
                    # Raises IndexError if there are more slices than onset labels;
                    # rows already collected for this recording are kept and the
                    # error is recorded below.
                    onset = label_onset[j]
                    if label_counter[onset] >= MAX_SLICES_PER_LABEL:
                        continue

                    file_name_save = f"{record.filename}_{j}.wav"
                    slice_path = file_path_save + file_name_save
                    audio_slice.export(slice_path, format="wav")

                    mean_amplitude = np.mean(np.abs(audiosegment_to_amplitudes(audio_slice)))

                    label_counter[onset] += 1
                    slice_rows.append([record.dataset,
                                       record.filepath, record.filename,
                                       slice_path, file_name_save,
                                       record.age, record.gender, onset, record.status,
                                       mean_amplitude])

            except Exception as error:
                # Best effort: one bad recording must not abort the whole run,
                # but it is recorded so the failure is visible afterwards.
                failures.append((record.filepath, repr(error)))

        df_results = pd.DataFrame(slice_rows, columns=RESULT_COLUMNS)
        df_results.to_csv(path_save, index=False)
        print(df_results.shape)
        # .loc[0] raises KeyError on an empty frame, so only print when there are rows.
        if not df_results.empty:
            print(df_results[['label']].loc[0])
        if failures:
            print(f'{len(failures)} recording(s) raised an error while slicing; first: {failures[0]}')



 coswara 0.1


100%|██████████| 2000/2000 [01:48<00:00, 18.47it/s]


(14589, 10)
label    0
Name: 0, dtype: int64

 coughvid 0.1


100%|██████████| 2000/2000 [02:33<00:00, 13.07it/s]


(14160, 10)
label    0
Name: 0, dtype: int64

 esc50 0.1


100%|██████████| 1040/1040 [00:34<00:00, 30.58it/s]


(5400, 10)
label    0
Name: 0, dtype: int64

 fsdkaggle 0.1


100%|██████████| 1273/1273 [00:49<00:00, 25.87it/s]


(7403, 10)
label    0
Name: 0, dtype: int64

 virufy 0.1


100%|██████████| 121/121 [00:05<00:00, 23.23it/s]


(1097, 10)
label    1
Name: 0, dtype: int64

 coswara 0.2


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


(14507, 10)
label    0
Name: 0, dtype: int64

 coughvid 0.2


100%|██████████| 2000/2000 [01:40<00:00, 19.88it/s]


(14101, 10)
label    0
Name: 0, dtype: int64

 esc50 0.2


100%|██████████| 1040/1040 [00:33<00:00, 30.93it/s]


(5389, 10)
label    0
Name: 0, dtype: int64

 fsdkaggle 0.2


100%|██████████| 1273/1273 [00:44<00:00, 28.40it/s]


(6955, 10)
label    0
Name: 0, dtype: int64

 virufy 0.2


100%|██████████| 121/121 [00:04<00:00, 25.75it/s]


(897, 10)
label    1
Name: 0, dtype: int64

 coswara 0.3


100%|██████████| 2000/2000 [01:31<00:00, 21.80it/s]


(14293, 10)
label    0
Name: 0, dtype: int64

 coughvid 0.3


100%|██████████| 2000/2000 [01:36<00:00, 20.66it/s]


(13921, 10)
label    1
Name: 0, dtype: int64

 esc50 0.3


100%|██████████| 1040/1040 [00:31<00:00, 32.58it/s]


(5371, 10)
label    0
Name: 0, dtype: int64

 fsdkaggle 0.3


100%|██████████| 1273/1273 [00:41<00:00, 30.60it/s]


(6497, 10)
label    1
Name: 0, dtype: int64

 virufy 0.3


100%|██████████| 121/121 [00:03<00:00, 36.50it/s]


(726, 10)
label    1
Name: 0, dtype: int64

 coswara 0.5


100%|██████████| 2000/2000 [01:27<00:00, 22.78it/s]


(13308, 10)
label    0
Name: 0, dtype: int64

 coughvid 0.5


100%|██████████| 2000/2000 [01:30<00:00, 22.07it/s]


(13151, 10)
label    0
Name: 0, dtype: int64

 esc50 0.5


100%|██████████| 1040/1040 [00:34<00:00, 29.78it/s]


(5319, 10)
label    0
Name: 0, dtype: int64

 fsdkaggle 0.5


100%|██████████| 1273/1273 [00:41<00:00, 30.81it/s]


(5768, 10)
label    0
Name: 0, dtype: int64

 virufy 0.5


100%|██████████| 121/121 [00:02<00:00, 46.17it/s]


(484, 10)
label    1
Name: 0, dtype: int64

 coswara 0.7


100%|██████████| 2000/2000 [01:36<00:00, 20.66it/s]


(12013, 10)
label    0
Name: 0, dtype: int64

 coughvid 0.7


100%|██████████| 2000/2000 [01:31<00:00, 21.97it/s]


(12628, 10)
label    0
Name: 0, dtype: int64

 esc50 0.7


100%|██████████| 1040/1040 [00:34<00:00, 29.94it/s]


(5293, 10)
label    0
Name: 0, dtype: int64

 fsdkaggle 0.7


100%|██████████| 1273/1273 [00:47<00:00, 26.78it/s]


(5347, 10)
label    0
Name: 0, dtype: int64

 virufy 0.7


100%|██████████| 121/121 [00:01<00:00, 63.98it/s]


(363, 10)
label    1
Name: 0, dtype: int64

 coswara 1


100%|██████████| 2000/2000 [01:25<00:00, 23.45it/s]


(10311, 10)
label    0
Name: 0, dtype: int64

 coughvid 1


100%|██████████| 2000/2000 [01:19<00:00, 25.19it/s]


(11387, 10)
label    1
Name: 0, dtype: int64

 esc50 1


100%|██████████| 1040/1040 [00:26<00:00, 39.70it/s]


(5200, 10)
label    0
Name: 0, dtype: int64

 fsdkaggle 1


100%|██████████| 1273/1273 [00:57<00:00, 21.95it/s]


(4719, 10)
label    1
Name: 0, dtype: int64

 virufy 1


100%|██████████| 121/121 [00:01<00:00, 91.29it/s]

(242, 10)
label    1
Name: 0, dtype: int64





In [7]:
# Inspect the slice summary left behind by the extraction loop. `df_results` is
# rebuilt for every (segment_length, dataset_name) pair, so only the table of the
# last pair processed is shown here.
df_results

Unnamed: 0,dataset,filepath,filename,filepathslice,filenameslice,age,gender,label,status,mean_amplitude
0,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-095-cough-m-53-15.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/neg-0...,neg-0422-095-cough-m-53-15.wav_0.wav,53,male,1,negative,987.720276
1,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-095-cough-m-53-15.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/neg-0...,neg-0422-095-cough-m-53-15.wav_1.wav,53,male,0,negative,0.000000
2,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-083-cough-m-53-13.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/neg-0...,neg-0421-083-cough-m-53-13.wav_0.wav,53,male,1,negative,1145.416626
3,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-083-cough-m-53-13.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/neg-0...,neg-0421-083-cough-m-53-13.wav_1.wav,53,male,0,negative,0.000000
4,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-088-cough-f-66-6.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/neg-0...,neg-0421-088-cough-f-66-6.wav_0.wav,66,female,1,negative,1780.218872
...,...,...,...,...,...,...,...,...,...,...
237,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-1.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/neg-0...,neg-0422-098-cough-f-24-1.wav_1.wav,24,female,0,negative,0.000000
238,virufy,Dataset/virufy-data/clinical/segmented/pos/pos...,pos-0421-087-cough-f-40-1.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/pos-0...,pos-0421-087-cough-f-40-1.wav_0.wav,40,female,1,positive,1699.121704
239,virufy,Dataset/virufy-data/clinical/segmented/pos/pos...,pos-0421-087-cough-f-40-1.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/pos-0...,pos-0421-087-cough-f-40-1.wav_1.wav,40,female,0,positive,0.000000
240,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-089-cough-f-20-2.wav,Results_Onset/Sliced_Wav_Onset/virufy_1s/neg-0...,neg-0421-089-cough-f-20-2.wav_0.wav,20,female,1,negative,1038.009277
