# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import ast
import os
import time
import csv
import librosa
import librosa.display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import Audio, display
from tqdm import tqdm
from collections import Counter
from pprint import pprint
%matplotlib inline

from functions.functions_cough import (
    get_cough, 
    convert_events_to_seconds, 
    label_generator
    )

# Extract Labels for ML/CNN

In [2]:
# Datasets whose summary CSVs are annotated below, in processing order.
list_dataset_name = ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy']

# ML/CNN: no overlap between segments, to avoid overfitting when doing k-fold.
overlap = 0

# Whisper: new sample rate, in Hz.
new_sample_rate = 16_000

In [3]:
# Build onset annotations for every (segment length, dataset) pair and cache
# each result as a CSV under Results_Onset/Data_Onset/Annotation.
#
# For each pair, the dataset summary CSV is shuffled and capped per `label`
# value, every audio file is run through get_cough -> convert_events_to_seconds
# -> label_generator, and two columns are appended before saving:
#   label_onset : `labels` from label_generator (all zeros when label == 0)
#   label_event : the converted cough events (empty list when label == 0)
# Rows whose audio could not be processed get empty lists in both columns.
# Rows whose label is neither 0 nor 1 keep the '' placeholder.
# NOTE(review): label == 1 presumably marks cough recordings and 0 non-cough;
# confirm against the data_summary files.

annotation_dir = 'Results_Onset/Data_Onset/Annotation'
os.makedirs(annotation_dir, exist_ok=True)  # no-op when the folder already exists

max_files_per_label = 1000  # cap on rows kept per label value
sample_seed = 42            # fixed seed: reproducible, and the same subset for every segment length

for segment_length in [0.1, 0.2, 0.3, 0.5, 0.7, 1]:
    for dataset_name in list_dataset_name:
        print(dataset_name, segment_length)

        path_save = f'{annotation_dir}/data_summary_{dataset_name}_{segment_length}s_onset_label.csv'

        # Annotation is expensive; an existing CSV is treated as a cache hit.
        # Checked before loading anything so cached pairs cost nothing.
        if os.path.exists(path_save):
            continue

        df_all = pd.read_csv(f'Results/Data/data_summary_{dataset_name}.csv')
        # df_all = df_all[df_all['label']==1].reset_index(drop=True) # Include non cough directly
        df_all = (
            df_all.sample(frac=1, random_state=sample_seed)
            .groupby('label')
            .head(max_files_per_label)
            .reset_index(drop=True)
        )
        total_len = len(df_all)

        # Results are collected in plain object arrays and attached to the frame
        # once at the end. The previous `df_all[col][i] = value` form is chained
        # assignment: it can write to a temporary copy (and always does under
        # pandas Copy-on-Write), silently leaving the columns empty.
        label_onset = np.full(total_len, '', dtype=object)
        label_event = np.full(total_len, '', dtype=object)
        failed_files = []

        rows = zip(df_all['filepath'], df_all['label'])
        for i, (filepath, label) in enumerate(tqdm(rows, total=total_len)):
            try:
                (y, sr) = librosa.load(filepath)  # mono=True
                duration = librosa.get_duration(y=y, sr=sr)

                (
                    cough_events,
                    silent_events,
                    hop_length,
                    energy) = get_cough(y, segment_length, sr)

                cough_events_pp = convert_events_to_seconds(cough_events, segment_length, hop_length, sr)
                time_intervals, labels = label_generator(cough_events_pp, duration, segment_length)

                if label == 1:
                    label_onset[i] = labels
                    label_event[i] = cough_events_pp
                elif label == 0:
                    # Same number of segments as `labels`, but every segment is negative.
                    label_onset[i] = [0] * len(labels)
                    label_event[i] = []

            except Exception as error:
                # Best effort: one unreadable file must not abort the whole run,
                # but keep the error so the failure is visible afterwards.
                failed_files.append((filepath, error))
                label_onset[i] = []
                label_event[i] = []

        if failed_files:
            first_path, first_error = failed_files[0]
            print(
                f'{len(failed_files)} of {total_len} files could not be annotated; '
                f'first failure: {first_path}: {first_error!r}'
            )

        df_all['label_onset'] = label_onset
        df_all['label_event'] = label_event

        df_all.to_csv(path_save, index=False)
        print(df_all.shape)
        if total_len:  # .loc[0] would raise on an empty summary
            print(df_all[['label_onset', 'label_event']].loc[0])

coswara 0.1
coughvid 0.1
esc50 0.1
fsdkaggle 0.1
virufy 0.1
coswara 0.2
coughvid 0.2
esc50 0.2
fsdkaggle 0.2
virufy 0.2
coswara 0.3
coughvid 0.3
esc50 0.3
fsdkaggle 0.3
virufy 0.3
coswara 0.5
coughvid 0.5
esc50 0.5
fsdkaggle 0.5
virufy 0.5
coswara 0.7
coughvid 0.7
esc50 0.7
fsdkaggle 0.7
virufy 0.7
coswara 1
coughvid 1
esc50 1
fsdkaggle 1
virufy 1
