# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import shutil

import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter
from functions.functions_utility import convert_to_wav, file_mapping, custom_round

In [None]:
# Create the folder that receives the per-dataset summary CSVs.
# exist_ok=True replaces the check-then-create pattern, so the cell is safe
# to re-run and cannot fail if the folder appears between check and create.
os.makedirs('Results/Data', exist_ok=True)

# Virufy Data

Download raw data at:
- https://github.com/virufy/virufy-data/tree/main/clinical

In [None]:
# Build the Virufy summary table: the segmented .mp3 clips are converted to
# .wav, matched to the rows of labels.csv, and saved as one row per clip.
df_virufy = pd.read_csv('Dataset/virufy-data/clinical/labels.csv')
df_virufy['file_indicator'] = df_virufy['cough_filename'].apply(lambda x: x.replace('.mp3', ''))

# Load data file path
list_folder_path = [
    'Dataset/virufy-data/clinical/segmented/neg',
    'Dataset/virufy-data/clinical/segmented/pos',
    ]

list_all_files = []
for folder_path in list_folder_path:
    for filename in tqdm(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Only regular files that really end in .mp3 are converted; a suffix
        # test avoids matching '.mp3' somewhere in the middle of a path.
        if not (os.path.isfile(file_path) and filename.endswith('.mp3')):
            continue

        # Converted clip lives next to the source; conversion is skipped when
        # the .wav already exists so the cell can be re-run cheaply.
        file_path_new = os.path.splitext(file_path)[0] + '.wav'
        if not os.path.exists(file_path_new):
            convert_to_wav(file_path, file_path_new)

        list_all_files.append(file_path_new)

# file_mapping comes from functions_utility; the explode below implies it
# returns a list-like of matching paths per indicator -- TODO confirm there.
df_virufy['filepath'] = df_virufy['file_indicator'].apply(lambda x: file_mapping(x, list_all_files))
df_virufy = df_virufy.drop(['cough_filename', 'file_indicator'], axis=1)
df_virufy = df_virufy.explode('filepath').reset_index(drop=True)

# Preprocess columns
# filepath points at the converted .wav clip, so the extension is removed with
# splitext (stripping '.mp3' here never matched and left '.wav' in the name).
df_virufy['filename'] = df_virufy['filepath'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
# Every Virufy clip gets label 1 (the value the other cells use for cough).
df_virufy['label'] = 1
df_virufy['status'] = df_virufy['corona_test']
df_virufy.fillna('', inplace=True)
df_virufy['dataset'] = 'virufy'

# Drop columns
df_virufy = df_virufy.drop(
    [
        'date', 'corona_test',
        'medical_history', 'smoker',
        'patient_reported_symptoms',
     ],
    axis=1)

# Sort columns and save data
df_virufy = df_virufy[['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status']]
df_virufy.to_csv('Results/Data/data_summary_virufy.csv', index=False)
df_virufy

# Coswara Data

Download raw data at:
- https://github.com/iiscleap/Coswara-Data
- In a Linux terminal, run extract_data.py to produce the Extracted_data folder from all the other folders
- Delete all the other folders afterwards, because they are very large

In [None]:
# Coswara summary table: one row per audio clip found under Extracted_data,
# joined to the participant rows of combined_data.csv.
df_coswara = pd.read_csv('Dataset/Coswara-Data/combined_data.csv')

folder_path = 'Dataset/Coswara-Data/Extracted_data'

# Load data file path
# Keep paths that contain '.wav' and whose file name does not start with a dot.
list_all_files = []
for parent, _subdirs, names in tqdm(os.walk(folder_path)):
    for name in names:
        candidate = os.path.join(parent, name)
        if '.wav' not in candidate or name[0] == '.':
            continue
        list_all_files.append(candidate)

# Each id receives whatever file_mapping returns for it; explode then turns
# that into one row per element.
df_coswara['filepath'] = df_coswara['id'].apply(
    lambda participant_id: file_mapping(participant_id, list_all_files)
)
df_coswara = df_coswara.explode('filepath').reset_index(drop=True)


def _coswara_clip_name(path):
    """Return the last two path components joined by '_' with '.wav' removed."""
    tail = path.split('/')[-2:]
    return '_'.join(tail).replace('.wav', '')


# Preprocess columns
df_coswara = df_coswara.assign(
    # 1 when the path mentions 'cough', otherwise 0
    label=df_coswara['filepath'].apply(lambda path: int('cough' in path)),
    gender=df_coswara['g'],
    age=df_coswara['a'],
    status=df_coswara['covid_status'],
    dataset='coswara',
    filename=df_coswara['filepath'].apply(_coswara_clip_name),
)
df_coswara = df_coswara.fillna('')

# Drop columns
columns_to_drop = [
    'id',
    'a', 'covid_status', 'record_date', 'ep', 'g', 'l_c', 'l_l',
    'l_s', 'rU', 'smoker', 'cold', 'ht', 'diabetes', 'cough', 'ctDate',
    'ctScan', 'ctScore', 'diarrhoea', 'fever', 'loss_of_smell', 'mp',
    'testType', 'test_date', 'test_status', 'um', 'vacc', 'bd',
    'others_resp', 'ftg', 'st', 'ihd', 'asthma', 'others_preexist', 'cld',
    'pneumonia',
]
df_coswara = df_coswara.drop(columns=columns_to_drop)

# Sort columns and save data
summary_columns = ['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status']
df_coswara = df_coswara[summary_columns]
df_coswara.to_csv('Results/Data/data_summary_coswara.csv', index=False)
df_coswara

# CoughVid

Download raw data at:
- https://zenodo.org/records/7024894

Do not run this cell anymore: the coughvid_20211012 folder was deleted to save space.


In [None]:
# Build the CoughVid summary table: every recording is copied/converted to
# .wav under Dataset/coughvid and combined with the fields of its .json file.
folder_path = 'Dataset/coughvid_20211012'
output_folder = 'Dataset/coughvid'

# The converted clips are written here, so the folder must exist before the
# first copy/convert call.
os.makedirs(output_folder, exist_ok=True)

print('Search Folders')
list_all_files = []
for root, dirs, files in tqdm(os.walk(folder_path)):
    for filename in files:
        file_path = os.path.join(root, filename)
        if ':Zone.Identifier' not in file_path:
            list_all_files.append(file_path)

# Get indicator
print('Load Indicators')
# The indicator is the file name up to its first dot. Sorting makes the row
# order of the saved CSV reproducible (set iteration order is not).
list_indicator_unique = sorted({os.path.basename(x).split('.')[0] for x in list_all_files})
list_all_files = [x for x in list_all_files if '.json' not in x]

# Index the non-JSON files by indicator once, instead of scanning the whole
# file list for every indicator.
files_by_indicator = {}
for x in list_all_files:
    files_by_indicator.setdefault(os.path.basename(x).split('.')[0], []).append(x)

# Load data file path
print('Load Data')
df_coughvid = []
for file_id in tqdm(list_indicator_unique):
    check_audio = files_by_indicator.get(file_id, [])

    # Only indicators with exactly one non-JSON file are processed.
    if len(check_audio) != 1:
        continue

    file_json = f'{folder_path}/{file_id}.json'

    try:
        audio_name = os.path.basename(check_audio[0])
        stem, extension = os.path.splitext(audio_name)
        file_audio = f'{folder_path}/{audio_name}'

        # .webm/.ogg recordings are stored as .wav; other names are kept as-is.
        if extension in ('.webm', '.ogg'):
            file_audio_new = f'{output_folder}/{stem}.wav'
        else:
            file_audio_new = f'{output_folder}/{audio_name}'

        if not os.path.exists(file_audio_new):
            # Convert audio that is not in .wav format to .wav
            if extension == '.wav':
                shutil.copyfile(file_audio, file_audio_new)
            elif extension in ('.webm', '.ogg'):
                convert_to_wav(file_audio, file_audio_new)
            else:
                # Unknown format: report the file, nothing is written for it
                print(file_audio)

        # Open and read the JSON file
        with open(file_json, 'r', encoding='utf-8') as file:
            data = json.load(file)
        data['filepath'] = file_audio_new
        df_coughvid.append(data)

    except Exception as error:
        # Best effort: a recording that fails (e.g. missing .json, failed
        # conversion) is reported and skipped so the rest still loads.
        print(error)

# Preprocess columns
df_coughvid = pd.DataFrame(df_coughvid)
# custom_round comes from functions_utility; presumably it turns the detector
# score into a 0/1 label -- TODO confirm there.
df_coughvid['label'] = df_coughvid['cough_detected'].apply(lambda x: custom_round(float(x)))
df_coughvid['prob'] = df_coughvid['cough_detected']
df_coughvid['dataset'] = 'coughvid'
# filepath points at the converted .wav clip, so the extension is removed with
# splitext (stripping '.webm' here never matched and left '.wav' in the name).
df_coughvid['filename'] = df_coughvid['filepath'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
df_coughvid.fillna('', inplace=True)

# Drop columns
df_coughvid = df_coughvid.drop(
    [
        'datetime', 'latitude', 'longitude',
        'respiratory_condition', 'fever_muscle_pain',
        'expert_labels_1', 'expert_labels_2',
        'expert_labels_3', 'expert_labels_4',
    ],
    axis=1)

# Sort columns and save data
df_coughvid = df_coughvid[['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'prob', 'status']]
df_coughvid.to_csv('Results/Data/data_summary_coughvid.csv', index=False)
df_coughvid

# ESC-50

Download raw data at:
- https://github.com/karolpiczak/ESC-50?tab=readme-ov-file#download

In [None]:
# ESC-50 summary table: one row per clip listed in esc50.csv, labelled 1 for
# the 'coughing' category and 0 for everything else.
folder_path = 'Dataset/ESC-50-master/ESC-50-master/audio'

# Every path below the audio folder, minus ':Zone.Identifier' entries
walked_paths = (
    os.path.join(parent, name)
    for parent, _subdirs, names in os.walk(folder_path)
    for name in names
)
list_all_files = [path for path in walked_paths if ':Zone.Identifier' not in path]

# Load data file path
df_esc = pd.read_csv('Dataset/ESC-50-master/ESC-50-master/meta/esc50.csv')
df_esc['file_indicator'] = df_esc['filename']


def _first_esc_match(indicator):
    """Return the first element file_mapping yields for this indicator."""
    return file_mapping(indicator, list_all_files)[0]


df_esc['filepath'] = df_esc['file_indicator'].apply(_first_esc_match)

# Preprocess columns
df_esc['label'] = (df_esc['category'] == 'coughing').astype(int)
# ESC-50 provides no demographic or status information: leave those blank
for blank_column in ('age', 'gender', 'status'):
    df_esc[blank_column] = ''
df_esc['dataset'] = 'esc50'
df_esc['filename'] = df_esc['filepath'].apply(
    lambda path: path.split('/')[-1].replace('.wav', '')
)

# Drop columns
df_esc = df_esc.drop(columns=['fold'])

# Sort columns
summary_columns = ['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status']
df_esc = df_esc[summary_columns]

# Save data
df_esc.to_csv('Results/Data/data_summary_esc50.csv', index=False)
df_esc

# FSD Kaggle 2018 Audio Train/Test

Download raw data at:
- https://zenodo.org/records/2552860#.XwscUud7kaE

In [None]:
# FSDKaggle2018 summary table: train and test metadata are stacked, each row
# is matched to its audio file, and 'Cough' clips get label 1 (others 0).
list_all_files = []

# Same walk for both audio folders (test first, then train), skipping
# ':Zone.Identifier' entries.
for folder_path in [
    'Dataset/FSDKaggle2018/FSDKaggle2018.audio_test',
    'Dataset/FSDKaggle2018/FSDKaggle2018.audio_train',
]:
    for root, dirs, files in tqdm(os.walk(folder_path)):
        for filename in files:
            file_path = os.path.join(root, filename)

            if ':Zone.Identifier' not in file_path:
                list_all_files.append(file_path)

# Load data file path
df_train = pd.read_csv('Dataset/FSDKaggle2018/FSDKaggle2018.meta/FSDKaggle2018.meta/train_post_competition.csv')
df_train = df_train.drop(['manually_verified', 'freesound_id', 'license'], axis=1)

df_test = pd.read_csv('Dataset/FSDKaggle2018/FSDKaggle2018.meta/FSDKaggle2018.meta/test_post_competition_scoring_clips.csv')
df_test = df_test.drop(['usage', 'freesound_id', 'license'], axis=1)

# Concatenate the two frames directly: seeding the concat with an empty
# DataFrame relies on pandas' deprecated handling of empty entries.
df_fsdkaggle = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

df_fsdkaggle['file_indicator'] = df_fsdkaggle['fname']
# First element of whatever file_mapping (functions_utility) returns for the
# file name; raises IndexError when nothing is returned for a row.
df_fsdkaggle['filepath'] = df_fsdkaggle['file_indicator'].apply(lambda x: file_mapping(x, list_all_files)[0])

# Preprocess columns
df_fsdkaggle['label'] = df_fsdkaggle['label'].apply(lambda x: 1 if x=='Cough' else 0)
df_fsdkaggle['age'] = ''
df_fsdkaggle['gender'] = ''
df_fsdkaggle['status'] = ''
df_fsdkaggle['dataset'] = 'fsdkaggle'
df_fsdkaggle['filename'] = df_fsdkaggle['filepath'].apply(lambda x: x.split('/')[-1].replace('.wav', ''))

# Sort columns
df_fsdkaggle = df_fsdkaggle[['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status']]

# Save data
df_fsdkaggle.to_csv('Results/Data/data_summary_fsdkaggle.csv', index=False)
df_fsdkaggle

# Gather all data

In [None]:
# Combine the per-dataset summary CSVs into one table and print the label
# distribution of each dataset along the way.
list_dataset_name = ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy']

# Collect the frames and concatenate once at the end: growing a DataFrame
# inside the loop re-copies all previous rows on every iteration, and seeding
# it with an empty DataFrame relies on deprecated pandas concat behaviour.
list_df = []
for dataset_name in list_dataset_name:
    print(dataset_name)
    df = pd.read_csv(f'Results/Data/data_summary_{dataset_name}.csv')
    labels = Counter(df['label'].tolist())
    print(labels)
    list_df.append(df)
    print('')

# Columns missing from a dataset (e.g. 'prob', written only by the CoughVid
# cell) are filled with NaN by concat.
df_all = pd.concat(list_df, axis=0).reset_index(drop=True)
df_all.to_csv('Results/Data/data_all.csv', index=False)

In [None]:
# Display the combined table built from all dataset summaries
df_all