In [None]:
from collections import defaultdict
from glob import glob
from os import listdir, symlink, makedirs
from os.path import join, exists, splitext, basename, dirname
from shutil import copyfile

import librosa
import pandas as pd
from praatio import tgio
from tqdm import tqdm

from cac.utils.io import load_json, write_txt

In [None]:
# Root directory of the dataset on this machine: the raw inputs are read from
# its `raw/` sub-folder and every output is written under `processed/`.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_root = '/data/coswara-15-03-21/'

In [None]:
# src and destination directories
load_dir = join(data_root, 'raw')
save_root = join(data_root, 'processed')

In [None]:
# create the output root up front; exist_ok makes re-running this cell harmless
makedirs(save_root, exist_ok=True)

In [None]:
# raw inputs: the audio tree and the folder holding the metadata files
load_audio_dir, load_annotation_dir = [
    join(load_dir, sub) for sub in ('audio', 'annotations')
]

In [None]:
# flat output folder that will hold one symlink per raw recording
save_audio_dir = join(save_root, 'audio')
# exist_ok lets the cell be re-run without failing on the existing folder
makedirs(save_audio_dir, exist_ok=True)

### Create symlinks to the original .wav files

In [None]:
# Raw audio is laid out as <load_audio_dir>/<date>/<user>/: the first glob
# lists the per-date folders, the second collects every user folder inside them.
# glob() returns entries in arbitrary order, so both levels are sorted to keep
# the order in which user folders are visited the same from run to run.
date_dirs = sorted(glob(join(load_audio_dir, '*')))

user_dirs = []
for date in date_dirs:
    user_dirs.extend(sorted(glob(join(date, '*'))))

In [None]:
# sanity check: (number of date folders, number of user folders) found above
len(date_dirs), len(user_dirs)

In [None]:
# Expose every raw recording under a flat, unique name in `save_audio_dir`:
# '<date>-<user>-<original file name>', implemented as a symlink to the raw file.
# NOTE: links created by earlier runs (which repeated the user id instead of
# the date in the prefix) are not cleaned up here.
for user_dir in tqdm(user_dirs):
    user_name = basename(user_dir)
    # user folders live at <load_audio_dir>/<date>/<user>, so the date is the
    # name of the parent folder; basename(basename(...)) only repeated the
    # user id and the date never made it into the link name
    date = basename(dirname(user_dir))

    user_audio_files = glob(join(user_dir, '*.wav'))
    for audio_file in user_audio_files:
        file_name = basename(audio_file)
        dest = join(save_audio_dir, '{}-{}-{}'.format(date, user_name, file_name))
        # skip links that are already in place so the cell can be re-run
        if not exists(dest):
            symlink(audio_file, dest)

### Create attributes and annotation files

In [None]:
# Per-user metadata table. It is joined to the per-file annotation on its `id`
# column further down. The path is built from `load_annotation_dir` instead of
# re-spelling the 'annotations' sub-folder; the resulting path is unchanged.
attributes = pd.read_csv(join(load_annotation_dir, 'combined_data.csv'))

In [None]:
# Legend shipped alongside the metadata CSV — presumably explains its coded
# column values; TODO confirm. It is loaded from `load_annotation_dir` (same
# path as before) and is not referenced again in the cells shown here.
labels_legend = load_json(join(load_annotation_dir, 'csv_labels_legend.json'))

In [None]:
# display the metadata table for a visual check
# NOTE(review): per-participant metadata — clear this output before sharing the notebook
attributes

In [None]:
# Build three parallel lists — file name, user id, label — with one entry per
# recording of non-zero duration. The label is the file name without its
# extension (e.g. 'cough-heavy.wav' -> 'cough-heavy').
# NOTE(review): `files` holds the bare file name, not the prefixed symlink name
# created in `save_audio_dir` — confirm which one downstream loaders expect.
files, users, labels = [], [], []

for user_dir in tqdm(user_dirs):
    user_name = basename(user_dir)
    # glob() order is arbitrary; sorting keeps each user's rows in a stable
    # order between runs, so the saved CSVs are reproducible
    user_audio_files = sorted(glob(join(user_dir, '*.wav')))
    for audio_file in user_audio_files:
        # NOTE(review): newer librosa releases name this argument `path` —
        # confirm against the installed version before upgrading
        duration = librosa.get_duration(filename=audio_file)
        # zero-length recordings are left out of the annotation entirely
        if duration == 0:
            continue

        file_name = basename(audio_file)
        labels.append(splitext(file_name)[0])
        files.append(file_name)
        users.append(user_name)

In [None]:
# one row per kept recording: file name, owning user id, and its label
annotation = pd.DataFrame(dict(file=files, id=users, label=labels))

In [None]:
# attach each user's metadata to every one of their recordings; this is an
# inner join (the pandas default), so rows whose `id` has no match on the
# other side are dropped
full_attributes = annotation.merge(attributes, how='inner', on='id')

In [None]:
# save the merged per-file attributes (annotation + user metadata) as
# attributes.csv under save_root; index=False omits the row index column
attributes_save_path = join(save_root, 'attributes.csv')
full_attributes.to_csv(attributes_save_path, index=False)

In [None]:
# from here on the label column is called `classification`
annotation = annotation.rename({'label': 'classification'}, axis='columns')

In [None]:
# Store each file's label as a one-element list in `classification`.
# Only bare values are wrapped, so re-running this cell leaves already-wrapped
# entries alone instead of nesting them one level deeper ([['x']]).
annotation['classification'] = annotation['classification'].apply(
    lambda x: x if isinstance(x, list) else [x]
)

In [None]:
# preview the first rows of the final annotation table
annotation.head()

In [None]:
# spot-check: list the labels recorded for one specific user id
annotation.loc[annotation['id'] == 'Na6w7stX7ocNYZPQW9MQAiM6mrw2', 'classification'].values

In [None]:
# save the per-file annotation table as annotation.csv under save_root
# NOTE(review): `classification` holds Python lists at this point, which a CSV
# stores as text (e.g. "['cough-heavy']") — readers presumably need to parse
# the column back into lists; confirm against the consumer.
annotation_save_path = join(save_root, 'annotation.csv')
annotation.to_csv(annotation_save_path, index=False)

In [None]:
# Human-readable description saved next to the CSVs. The label list used to
# start with `[''` and end with `'']` (stray doubled quotes); those are fixed
# here, every other character of the emitted text is unchanged.
description = (
    "Annotation columns: \n "
    "`classification`: valid labels = ['cough-heavy', 'counting-fast', 'counting-normal', "
    " 'cough-shallow','vowel-o', 'breathing-deep', 'vowel-e', 'vowel-a','breathing-shallow'] \n "
    "Voice samples collected include breathing sounds (fast and slow), cough sounds (deep and shallow), "
    "phonation of sustained vowels (/a/ as in made, /i/,/o/), and counting numbers at slow and fast pace. "
    "Metadata information collected includes the participant's age, gender, location (country, state/ province), "
    "current health status (healthy/ exposed/ cured/ infected) and the presence of comorbidities "
    "(pre-existing medical conditions).\n"
)

In [None]:
# hand the description text and the target path (description.txt under
# save_root) to the project's write_txt helper — presumably writes the text
# file; the helper lives in cac.utils.io and is not shown here
write_txt(description, join(save_root, 'description.txt'))