# Load Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import shutil

import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter
from functions.functions_utility import convert_to_wav, file_mapping, custom_round

In [2]:
if not os.path.exists('Results/Data'):
    os.makedirs('Results/Data')

# Virufy Data

Download raw data at:
- https://github.com/virufy/virufy-data/tree/main/clinical

In [3]:
df_virufy = pd.read_csv('Dataset/virufy-data/clinical/labels.csv')
df_virufy['file_indicator'] = df_virufy['cough_filename'].apply(lambda x: x.replace('.mp3', ''))

# Load data file path
list_folder_path = [
    'Dataset/virufy-data/clinical/segmented/neg',
    'Dataset/virufy-data/clinical/segmented/pos',
    ]

list_all_files = []
for folder_path in list_folder_path:
    for filename in tqdm(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            # print(file_path)
            if '.mp3' in file_path:
            
                file_path_new = file_path.replace('.mp3', '.wav')
                if os.path.exists(file_path_new)==False:
                    convert_to_wav(file_path, file_path_new)
                    
                list_all_files.append(file_path_new)
                
            
df_virufy['filepath'] = df_virufy['file_indicator'].apply(lambda x: file_mapping(x, list_all_files))
df_virufy = df_virufy.drop(['cough_filename', 'file_indicator'], axis=1)
df_virufy = df_virufy.explode('filepath').reset_index(drop=True)

# Preprocess columns
df_virufy['filename'] = df_virufy['filepath'].apply(lambda x: x.split('/')[-1].replace('.mp3', ''))
df_virufy['label'] = 1
df_virufy['status'] = df_virufy['corona_test']
df_virufy.fillna('', inplace=True)
df_virufy['dataset'] = 'virufy'

# Drop columns
df_virufy = df_virufy.drop(
    [
        'date', 'corona_test', 
        'medical_history', 'smoker', 
        'patient_reported_symptoms',
     ],
    axis=1)

# Sort columns
df_virufy = df_virufy[['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status']]

# Save data
df_virufy.to_csv('Results/Data/data_summary_virufy.csv', index=False)
df_virufy

  0%|          | 0/146 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Unnamed: 0,dataset,filepath,filename,age,gender,label,status
0,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-083-cough-m-53-10.wav,53,male,1,negative
1,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-083-cough-m-53-12.wav,53,male,1,negative
2,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-083-cough-m-53-9.wav,53,male,1,negative
3,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-083-cough-m-53-0.wav,53,male,1,negative
4,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0421-083-cough-m-53-13.wav,53,male,1,negative
...,...,...,...,...,...,...,...
116,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-4.wav,37,male,1,negative
117,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-1.wav,37,male,1,negative
118,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-5.wav,24,female,1,negative
119,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-0.wav,24,female,1,negative


# Coswara Data

Download raw data at:
- https://github.com/iiscleap/Coswara-Data
- Go linux terminal and run extract_data.py to get Extracted_data from all the other folders
- Delete all the other folders because they are super massive

In [4]:
df_coswara = pd.read_csv('Dataset/Coswara-Data/combined_data.csv')

folder_path = 'Dataset/Coswara-Data/Extracted_data'

# Load data file path
list_all_files = []
for root, dirs, files in tqdm(os.walk(folder_path)):
    for filename in files:
        file_path = os.path.join(root, filename)
        if ('.wav' in file_path) and (filename[0] != '.'):
            list_all_files.append(file_path)
            # print(file_path)
            # unique_indicator = file_path.split('/')[-2]
        
            
df_coswara['filepath'] = df_coswara['id'].apply(lambda x: file_mapping(x, list_all_files))
df_coswara = df_coswara.explode('filepath').reset_index(drop=True)

# Preprocess columns
df_coswara['label'] = df_coswara['filepath'].apply(lambda x: 1 if 'cough' in x else 0)
df_coswara['gender'] = df_coswara['g']
df_coswara['age'] = df_coswara['a']
df_coswara['status'] = df_coswara['covid_status']
df_coswara['dataset'] = 'coswara'
df_coswara['filename'] = df_coswara['filepath'].apply(lambda x: '_'.join(x.split('/')[-2:]).replace('.wav', ''))
df_coswara.fillna('', inplace=True)

# Drop columns
df_coswara = df_coswara.drop(
    [
        'id',
        'a', 'covid_status', 'record_date', 'ep', 'g', 'l_c', 'l_l',
        'l_s', 'rU', 'smoker', 'cold', 'ht', 'diabetes', 'cough', 'ctDate',
        'ctScan', 'ctScore', 'diarrhoea', 'fever', 'loss_of_smell', 'mp',
        'testType', 'test_date', 'test_status', 'um', 'vacc', 'bd',
        'others_resp', 'ftg', 'st', 'ihd', 'asthma', 'others_preexist', 'cld',
        'pneumonia',
    ],
    axis=1)

# Sort columns
df_coswara = df_coswara[['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status']]

# Save data
df_coswara.to_csv('Results/Data/data_summary_coswara.csv', index=False)
df_coswara

0it [00:00, ?it/s]

Unnamed: 0,dataset,filepath,filename,age,gender,label,status
0,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_cough-shallow,28,male,1,healthy
1,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_cough-heavy,28,male,1,healthy
2,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_breathing-shallow,28,male,0,healthy
3,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-a,28,male,0,healthy
4,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-o,28,male,0,healthy
...,...,...,...,...,...,...,...
24707,coswara,Dataset/Coswara-Data/Extracted_data/20210714/i...,iYwmYc9CdlSuzqGwIlXNWI6eFpm1_vowel-o,78,female,0,positive_mild
24708,coswara,Dataset/Coswara-Data/Extracted_data/20210714/i...,iYwmYc9CdlSuzqGwIlXNWI6eFpm1_counting-normal,78,female,0,positive_mild
24709,coswara,Dataset/Coswara-Data/Extracted_data/20210714/i...,iYwmYc9CdlSuzqGwIlXNWI6eFpm1_vowel-e,78,female,0,positive_mild
24710,coswara,Dataset/Coswara-Data/Extracted_data/20210714/i...,iYwmYc9CdlSuzqGwIlXNWI6eFpm1_counting-fast,78,female,0,positive_mild


# CoughVid

Download raw data at:
- https://zenodo.org/records/7024894

Do not run this anymore, deleted coughvid_20211012 file to save space


In [5]:
folder_path = 'Dataset/coughvid_20211012'

print('Search Folders')
list_all_files = []
for root, dirs, files in tqdm(os.walk(folder_path)):
    for filename in files:
        file_path = os.path.join(root, filename)
        if ':Zone.Identifier' not in file_path:
            list_all_files.append(file_path)
            # print(file_path)
            
# Get indicator
print('Load Indicators')
list_indicator_unique = list_all_files.copy()
list_indicator_unique = [x.split('/')[-1].split('.')[0] for x in list_indicator_unique]
list_indicator_unique = list(set(list_indicator_unique))
list_all_files = [x for x in list_all_files if '.json' not in x]


# Load data file path
print('Load Data')
df_coughvid = []
for id in tqdm(list_indicator_unique):
    # file_audio = f'Dataset/coughvid_20211012/{id}.webm'
    
    check_audio = [x for x in list_all_files if id in x]
    file_json = f'Dataset/coughvid_20211012/{id}.json'
    
    if len(check_audio) == 1:
        
        try:
            check_audio = check_audio[0].split('/')[-1]
            file_audio = f'Dataset/coughvid_20211012/{check_audio}'
            file_audio_new  = f'Dataset/coughvid/{check_audio}'
            file_audio_new = file_audio_new.replace('.webm', '.wav').replace('.ogg', '.wav')

            if os.path.exists(file_audio_new)==False:
                # Convert audio that is not in .wav format to .wav
                if '.wav' in file_audio:
                    shutil.copyfile(file_audio, file_audio_new)

                elif '.webm' in file_audio:
                    convert_to_wav(file_audio, file_audio_new)

                elif '.ogg' in file_audio:
                    convert_to_wav(file_audio, file_audio_new)

                else:
                    print(file_audio)

            # Open and read the JSON file
            with open(file_json, 'r') as file:
                data = json.load(file)
                data['filepath'] = file_audio_new
                df_coughvid.append(data)
                
        except Exception as error:
            print(error)
        
df_coughvid = pd.DataFrame(df_coughvid)

# Preprocess columns
df_coughvid['label'] = df_coughvid['cough_detected'].apply(lambda x: custom_round(float(x)))
df_coughvid['prob'] = df_coughvid['cough_detected']
df_coughvid['dataset'] = 'coughvid'
df_coughvid['filename'] = df_coughvid['filepath'].apply(lambda x: x.split('/')[-1].replace('.webm', ''))
df_coughvid.fillna('', inplace=True)

# Drop columns
df_coughvid = df_coughvid.drop(
    [
        'datetime', 'latitude', 'longitude', 
        'respiratory_condition', 'fever_muscle_pain',
        'expert_labels_1', 'expert_labels_2',
        'expert_labels_3', 'expert_labels_4',
    ], 
    axis=1)

# Sort columns
df_coughvid = df_coughvid[['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'prob', 'status']]

# Save data
df_coughvid.to_csv('Results/Data/data_summary_coughvid.csv', index=False)
df_coughvid

Search Folders


0it [00:00, ?it/s]

Load Indicators
Load Data


  0%|          | 0/34435 [00:00<?, ?it/s]

Dataset/coughvid_20211012/metadata_compiled.csv
[Errno 2] No such file or directory: 'Dataset/coughvid_20211012/metadata_compiled.json'


Unnamed: 0,dataset,filepath,filename,age,gender,label,prob,status
0,coughvid,Dataset/coughvid/10686cb3-2b6b-44e1-bc33-db015...,10686cb3-2b6b-44e1-bc33-db0157bab007.wav,,,0,0.1059,
1,coughvid,Dataset/coughvid/659d3e54-2b28-405e-8edb-f6c2a...,659d3e54-2b28-405e-8edb-f6c2ac5e2262.wav,18,male,1,0.8584,healthy
2,coughvid,Dataset/coughvid/3cb7b8de-3b4a-451c-92ba-bc19d...,3cb7b8de-3b4a-451c-92ba-bc19dc26ce8d.wav,46,male,1,0.9955,healthy
3,coughvid,Dataset/coughvid/87879c37-5870-4773-9025-930cb...,87879c37-5870-4773-9025-930cb3ad4688.wav,40,male,1,0.6534,healthy
4,coughvid,Dataset/coughvid/84ea74d0-4da4-4806-9c88-34555...,84ea74d0-4da4-4806-9c88-34555b4a8abe.wav,10,male,0,0.2386,healthy
...,...,...,...,...,...,...,...,...
34429,coughvid,Dataset/coughvid/bb5bb9ca-524f-464e-90cf-e4513...,bb5bb9ca-524f-464e-90cf-e45132ca4af4.wav,23,female,1,0.9314,healthy
34430,coughvid,Dataset/coughvid/631a182d-5702-49bf-a5f3-2f15f...,631a182d-5702-49bf-a5f3-2f15fe985a24.wav,47,male,1,0.9531,healthy
34431,coughvid,Dataset/coughvid/0f327c17-f9f8-490f-9540-4e410...,0f327c17-f9f8-490f-9540-4e410a0a5c6c.wav,,,0,0.1219,
34432,coughvid,Dataset/coughvid/1f7c73dd-3c1e-484b-b810-07e33...,1f7c73dd-3c1e-484b-b810-07e33f97f15e.wav,51,male,1,0.8280,healthy


# ESC-50

Download raw data at:
- https://github.com/karolpiczak/ESC-50?tab=readme-ov-file#download

In [6]:
folder_path = 'Dataset/ESC-50-master/ESC-50-master/audio'

list_all_files = []
for root, dirs, files in os.walk(folder_path):
    for filename in files:
        file_path = os.path.join(root, filename)
        if ':Zone.Identifier' not in file_path:
            list_all_files.append(file_path)
            # print(file_path)

# Load data file path
df_esc = pd.read_csv('Dataset/ESC-50-master/ESC-50-master/meta/esc50.csv')
df_esc['file_indicator'] = df_esc['filename']

df_esc['filepath'] = df_esc['file_indicator'].apply(lambda x: file_mapping(x, list_all_files)[0])

# Preprocess columns
df_esc['label'] = df_esc['category'].apply(lambda x: 1 if x=='coughing' else 0)
df_esc['age'] = ''
df_esc['gender'] = ''
df_esc['status'] = ''
df_esc['dataset'] = 'esc50'
df_esc['filename'] = df_esc['filepath'].apply(lambda x: x.split('/')[-1].replace('.wav', ''))

# Drop columns
df_esc = df_esc.drop(
    [
        'fold', 
    ], 
    axis=1)

# Sort columns
df_esc = df_esc[['dataset', 'filepath', 'filename', 'age', 'gender', 'label',  'status']]

# Save data
df_esc.to_csv('Results/Data/data_summary_esc50.csv', index=False)
df_esc

Unnamed: 0,dataset,filepath,filename,age,gender,label,status
0,esc50,Dataset/ESC-50-master/ESC-50-master/audio/1-10...,1-100032-A-0,,,0,
1,esc50,Dataset/ESC-50-master/ESC-50-master/audio/1-10...,1-100038-A-14,,,0,
2,esc50,Dataset/ESC-50-master/ESC-50-master/audio/1-10...,1-100210-A-36,,,0,
3,esc50,Dataset/ESC-50-master/ESC-50-master/audio/1-10...,1-100210-B-36,,,0,
4,esc50,Dataset/ESC-50-master/ESC-50-master/audio/1-10...,1-101296-A-19,,,0,
...,...,...,...,...,...,...,...
1995,esc50,Dataset/ESC-50-master/ESC-50-master/audio/5-26...,5-263831-B-6,,,0,
1996,esc50,Dataset/ESC-50-master/ESC-50-master/audio/5-26...,5-263902-A-36,,,0,
1997,esc50,Dataset/ESC-50-master/ESC-50-master/audio/5-51...,5-51149-A-25,,,0,
1998,esc50,Dataset/ESC-50-master/ESC-50-master/audio/5-61...,5-61635-A-8,,,0,


# FSD Kaggle 2018 Audio Train/Test

Download raw data at:
- https://zenodo.org/records/2552860#.XwscUud7kaE

In [7]:
list_all_files = []

folder_path = 'Dataset/FSDKaggle2018/FSDKaggle2018.audio_test'
for root, dirs, files in tqdm(os.walk(folder_path)):
    for filename in files:
        file_path = os.path.join(root, filename)

        if ':Zone.Identifier' not in file_path:
            list_all_files.append(file_path)
            # print(file_path)

folder_path = 'Dataset/FSDKaggle2018/FSDKaggle2018.audio_train'
for root, dirs, files in tqdm(os.walk(folder_path)):
    for filename in files:
        file_path = os.path.join(root, filename)

        if ':Zone.Identifier' not in file_path:
            list_all_files.append(file_path)
            # print(file_path)

# Load data file path
df_fsdkaggle = pd.DataFrame()

df_train = pd.read_csv('Dataset/FSDKaggle2018/FSDKaggle2018.meta/FSDKaggle2018.meta/train_post_competition.csv')
df_train = df_train.drop(['manually_verified', 'freesound_id', 'license'], axis=1)

df_test = pd.read_csv('Dataset/FSDKaggle2018/FSDKaggle2018.meta/FSDKaggle2018.meta/test_post_competition_scoring_clips.csv')
df_test = df_test.drop(['usage', 'freesound_id', 'license'], axis=1)

df_fsdkaggle = pd.concat([df_fsdkaggle, df_train, df_test], axis=0).reset_index(drop=True)

df_fsdkaggle['file_indicator'] = df_fsdkaggle['fname']
df_fsdkaggle['filepath'] = df_fsdkaggle['file_indicator'].apply(lambda x: file_mapping(x, list_all_files)[0])

# Preprocess columns
df_fsdkaggle['label'] = df_fsdkaggle['label'].apply(lambda x: 1 if x=='Cough' else 0)
df_fsdkaggle['age'] = ''
df_fsdkaggle['gender'] = ''
df_fsdkaggle['status'] = ''
df_fsdkaggle['dataset'] = 'fsdkaggle'
df_fsdkaggle['filename'] = df_fsdkaggle['filepath'].apply(lambda x: x.split('/')[-1].replace('.wav', ''))

# Sort columns
df_fsdkaggle = df_fsdkaggle[['dataset', 'filepath', 'filename', 'age', 'gender', 'label', 'status']]

# Save data
df_fsdkaggle.to_csv('Results/Data/data_summary_fsdkaggle.csv', index=False)
df_fsdkaggle

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Unnamed: 0,dataset,filepath,filename,age,gender,label,status
0,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_trai...,00044347,,,0,
1,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_trai...,001ca53d,,,0,
2,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_trai...,002d256b,,,0,
3,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_trai...,0033e230,,,0,
4,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_trai...,00353774,,,0,
...,...,...,...,...,...,...,...
11068,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_test...,ff96680f,,,0,
11069,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_test...,ffa69cfc,,,0,
11070,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_test...,ffaca82d,,,0,
11071,fsdkaggle,Dataset/FSDKaggle2018/FSDKaggle2018.audio_test...,ffb6eb52,,,0,


# Gather all data

In [8]:
df_all = pd.DataFrame()


list_dataset_name = ['coswara', 'coughvid', 'esc50', 'fsdkaggle', 'virufy']

for dataset_name in list_dataset_name:
    print(dataset_name)
    df = pd.read_csv(f'Results/Data/data_summary_{dataset_name}.csv')
    labels = Counter(df['label'].tolist())
    print(labels)
    df_all = pd.concat([df_all, df], axis=0)
    print('')

df_all = df_all.reset_index(drop=True)
df_all.to_csv('Results/Data/data_all.csv', index=False)

coswara
Counter({0: 19221, 1: 5491})

coughvid
Counter({1: 22703, 0: 11731})

esc50
Counter({0: 1960, 1: 40})

fsdkaggle
Counter({0: 10800, 1: 273})

virufy
Counter({1: 121})



In [9]:
df_all

Unnamed: 0,dataset,filepath,filename,age,gender,label,status,prob
0,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_cough-shallow,28.0,male,1,healthy,
1,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_cough-heavy,28.0,male,1,healthy,
2,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_breathing-shallow,28.0,male,0,healthy,
3,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-a,28.0,male,0,healthy,
4,coswara,Dataset/Coswara-Data/Extracted_data/20200424/i...,iV3Db6t1T8b7c5HQY2TwxIhjbzD3_vowel-o,28.0,male,0,healthy,
...,...,...,...,...,...,...,...,...
72335,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-4.wav,37.0,male,1,negative,
72336,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-097-cough-m-37-1.wav,37.0,male,1,negative,
72337,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-5.wav,24.0,female,1,negative,
72338,virufy,Dataset/virufy-data/clinical/segmented/neg/neg...,neg-0422-098-cough-f-24-0.wav,24.0,female,1,negative,
