The goal of the data-cleaning notebook for `wiai-crowdsourced` is three-fold:

1. Create symlinks in `processed/audio/*.wav` that point to the original `.wav` files under `raw/audio/`
2. Use `CaC_work_sheet.csv` to create `processed/annotation.csv`
3. Use `CaC_work_sheet.csv` to create `processed/attributes.csv`

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from os import makedirs, symlink, rmdir, listdir
from os.path import join, dirname, exists, isdir, basename, splitext
from shutil import rmtree
import math
from collections import defaultdict
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import librosa
from librosa import get_duration
import scipy.io.wavfile as wav
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from termcolor import colored

from cac.utils.io import save_yml
from cac.utils.pandas import apply_filters
from cac.utils.file import get_audio_type, get_unique_id

In [None]:
# Root directory of the dataset on disk.
data_root = '/data/wiai-crowdsourced/'

# Source tree (read-only input) and its audio sub-directory.
load_root = join(data_root, 'raw')
load_audio_dir = join(load_root, 'audio')

# Destination tree (generated output) and its audio sub-directory.
save_root = join(data_root, 'processed')
save_audio_dir = join(save_root, 'audio')

# Make sure both destination directories exist before anything is written.
makedirs(save_root, exist_ok=True)
makedirs(save_audio_dir, exist_ok=True)

In [None]:
# Load the anonymized work sheet; note that it is read from inside the raw
# audio directory (`load_audio_dir`), not from `load_root`.
data_sheet = pd.read_csv(join(load_audio_dir, 'CaC_work_sheet-mar15-anonymized.csv'))

In [None]:
# (number of rows, number of columns) in the work sheet
data_sheet.shape

#### Important check: find out files that are unreadable via `EOFError` (can't be discovered by `librosa`)

In [None]:
# Names of all entries directly under the raw audio directory. `listdir` is
# non-recursive and returns sub-directories and non-audio files as well.
# NOTE(review): the rest of this notebook expects the .wav files to live in
# <user>/<timestamp>/ sub-directories, so this listing presumably holds the
# user directories (and the work sheet CSV) rather than audio files -- confirm.
files = listdir(load_audio_dir)

In [None]:
# number of entries found in the raw audio directory
len(files)

In [None]:
# peek at the first entry name
files[0]

In [None]:
# Try to decode every entry in `files`; anything librosa cannot load is
# recorded in `invalid_files` (entry names, not full paths).
# NOTE(review): `files` comes from a non-recursive `listdir`, so any
# sub-directory or non-audio file in it will also end up here -- confirm
# this is the intended set of inputs.
invalid_files = []

for file in tqdm(files, desc='Checking valid files'):
    # Build the path from `load_audio_dir` instead of repeating the literal root.
    fpath = join(load_audio_dir, file)
    try:
        # Only the success/failure of decoding matters; the audio is discarded.
        librosa.load(fpath)
    except Exception:
        # `Exception` rather than a bare `except` so that Ctrl-C still
        # interrupts this long scan instead of being counted as a bad file.
        invalid_files.append(file)

In [None]:
# number of entries that could not be loaded
len(invalid_files)

#### Generate symlinks from `raw` <- `processed`

In [None]:
# Flatten raw/audio/<user>/<timestamp>/<name>.wav into
# processed/audio/<user>_<timestamp>_<name>.wav by creating symlinks.
# `files`, `timestamps` and `users` are parallel lists with one entry per
# .wav file found (file IDs are stored without the .wav extension).
user_dirs = [f for f in glob(join(load_audio_dir, '*')) if isdir(f)]
files = []
timestamps = []
users = []

for user_dir in tqdm(user_dirs):
    user = basename(user_dir)

    # strongly dependent on structure: exactly one directory level
    # (the timestamp) sits between the user directory and the .wav file
    for user_file in glob(join(user_dir, '*/*.wav')):
        timestamp = basename(dirname(user_file))

        users.append(user)
        timestamps.append(timestamp)

        save_filename = '_'.join([user, timestamp, basename(user_file)])
        save_path = join(save_audio_dir, save_filename)

        # ignore .wav
        files.append(splitext(save_filename)[0])

        try:
            symlink(user_file, save_path)
        except FileExistsError:
            # Link (or file) already present from an earlier run. Catching the
            # error instead of pre-checking with `exists` also covers dangling
            # symlinks, for which `exists` is False and `symlink` would raise.
            pass

In [None]:
# columns available in the work sheet
data_sheet.columns

#### Creating `attributes.csv` and `annotation.csv`

In [None]:
# Maps each recording key to its sound label. The key is the file-name
# suffix used to locate `<user>_<timestamp>_<key>.wav`; the value becomes
# the first element of that file's classification label. The three cough
# recordings share the single label 'cough'.
sound_labels = {
    'breathing': 'breathing',
    'cough_1': 'cough',
    'cough_2': 'cough',
    'cough_3': 'cough',
    'audio_1_to_10': 'audio_1_to_10'
}

In [None]:
# Work-sheet columns that are copied into each file's `unsupervised`
# label dict (and whose missing values are filled before use).
unsup_label_keys = [
    'sex',
    'user_id',
    'age',
    'country',
    'mask_used',
    'symptoms',
    'browser',
    'device',
    'test_type',
    'test_location',
    'respiratory_conditions'
]

In [None]:
# Replace missing values in the attribute columns with the literal string 'NA'
# (this modifies `data_sheet` in place, so later cells see the filled values).
data_sheet[unsup_label_keys] = data_sheet[unsup_label_keys].fillna('NA')

In [None]:
# Build four parallel lists with one entry per audio file that exists in
# `save_audio_dir`: file path, classification labels, unsupervised
# attributes and the user the recording belongs to.
files = []
unsup_labels = []
clf_labels = []
users = []

# `iterrows` avoids assuming the sheet has a default 0..n-1 index;
# `total` keeps the progress bar length, since iterrows has no len().
for _, row in tqdm(data_sheet.iterrows(), total=len(data_sheet),
                   desc="Iterating over all patients"):

    # the last two path components of `recording_dir` are <user>/<timestamp>
    recording_dir = row['recording_dir']
    dir_parts = recording_dir.split('/')[-2:]
    user_timestamp = '_'.join(dir_parts)
    # Take the user from the path components. Splitting `user_timestamp` on
    # '/' (as was done before) never splits anything, because it was just
    # joined with '_', and therefore yielded '<user>_<timestamp>'.
    user = dir_parts[0]
    disease_status = row['disease_status']

    for key, value in sound_labels.items():
        file = join(save_audio_dir, '_'.join([user_timestamp, f'{key}.wav']))

        # only annotate recordings that are actually present on disk
        if not exists(file):
            continue

        # 'room_sound' carries no disease label; every other sound does.
        # (No such key is in `sound_labels` as defined in this notebook, so
        # this only matters if that mapping is extended.)
        if key != 'room_sound':
            clf_label = [value, disease_status]
        else:
            clf_label = [value]

        # a fresh dict per file, so entries never share a mutable object
        unsup_label = dict(row[unsup_label_keys])
        unsup_label['dataset-name'] = 'wiai-crowdsourced'

        files.append(file)
        clf_labels.append(clf_label)
        unsup_labels.append(unsup_label)
        users.append(user)

In [None]:
# sanity check: the four parallel lists should all have the same length
len(files), len(users), len(clf_labels), len(unsup_labels)

In [None]:
# Assemble the annotation table: one row per audio file, with its
# classification labels, unsupervised attribute dict and user.
df = pd.DataFrame({'file': files, 'classification': clf_labels, 'unsupervised': unsup_labels, 'users': users})

In [None]:
# (number of annotated files, number of columns)
df.shape

In [None]:
# Derive two more columns from each file path using the project helpers.
# NOTE(review): the exact format of the returned id / audio type is defined
# in `cac.utils.file` and is not visible here -- confirm before relying on it.
df['id'] = df['file'].apply(get_unique_id)
df['audio_type'] = df['file'].apply(get_audio_type)

In [None]:
# inspect the first few annotation rows
df.head()

In [None]:
# inspect the last few annotation rows
df.tail()

In [None]:
# Write the per-file annotation table to <save_root>/annotation.csv,
# without the DataFrame index column.
annotation_save_path = join(save_root, 'annotation.csv')
df.to_csv(annotation_save_path, index=False)

In [None]:
# Write the work sheet (one row per recording session) to
# <save_root>/attributes.csv, without the DataFrame index column.
# A dedicated variable is used so that `annotation_save_path` keeps
# pointing at annotation.csv instead of being overwritten here.
attributes_save_path = join(save_root, 'attributes.csv')
data_sheet.to_csv(attributes_save_path, index=False)

Check the total duration of the dataset

In [None]:
# Measure the duration of every annotated file. Files whose duration cannot
# be read are collected in `failed_duration_files` instead of being dropped
# silently, so the number of problematic files can be inspected afterwards.
# NOTE(review): newer librosa releases rename the `filename=` keyword of
# `get_duration` to `path=` -- confirm against the installed version.
durations = []
failed_duration_files = []

for filepath in tqdm(files, desc='Durations'):
    if not exists(filepath):
        continue
    try:
        durations.append(get_duration(filename=filepath))
    except Exception:
        # `Exception` rather than a bare `except`, so Ctrl-C still stops the loop
        failed_duration_files.append(filepath)

In [None]:
# total duration (in seconds, as returned by librosa's get_duration) and
# number of files whose duration could be read
sum(durations), len(durations)

Note that about 26 files fail to load with librosa.