The goal of this data-cleaning notebook for `wiai-facility` is three-fold:

1. Create symlinks in `processed/audio/` that point to the raw recordings under `raw/audio/<user>/<timestamp>/*.wav`
2. Use the `CaC_work_sheet` CSV to create `processed/annotation.csv`
3. Use the `CaC_work_sheet` CSV to create `processed/attributes.csv`

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from os import makedirs, symlink, rmdir
from os.path import join, dirname, exists, isdir, basename, splitext
from shutil import rmtree
import math
from collections import defaultdict
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from librosa import get_duration
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from termcolor import colored

from cac.utils.io import save_yml
from cac.utils.pandas import apply_filters
from cac.utils.file import get_audio_type, get_unique_id

In [None]:
# root directory under which the dataset lives
data_root = '/data/wiai-facility/'

# raw inputs are read from `raw/`, cleaned outputs are written to `processed/`
load_root, save_root = join(data_root, 'raw'), join(data_root, 'processed')

# audio sub-directories of the source and destination roots
load_audio_dir = join(load_root, 'audio')
save_audio_dir = join(save_root, 'audio')

# make sure both destination directories exist before anything is written
makedirs(save_root, exist_ok=True)
makedirs(save_audio_dir, exist_ok=True)

In [None]:
# load the worksheet CSV, which is stored inside the raw audio directory
data_sheet = pd.read_csv(join(load_audio_dir, 'CaC_work_sheet_nov23-anonymized.csv'))

In [None]:
# (number of rows, number of columns) of the worksheet
data_sheet.shape

#### Generate symlinks in `processed/audio` that point to the files in `raw/audio`

In [None]:
from os.path import lexists

# Substring of the raw recording filename -> canonical name used for the
# symlink. Order matters: the first substring found in the filename wins,
# exactly like an if/elif chain.
raw_to_canonical = {
    'breathing': 'breathing',
    'cough_sound_recording_1': 'cough_1',
    'cough_sound_recording_2': 'cough_2',
    'cough_sound_recording_3': 'cough_3',
    'speech_recording': 'audio_1_to_10',
    'room_recording': 'room_sound',
    'aaaaa_recording': 'a_sound',
    'eeeee_recording': 'e_sound',
    'ooooo_recording': 'o_sound',
}

user_dirs = [f for f in glob(join(load_audio_dir, '*')) if isdir(f)]
files = []
timestamps = []
users = []

for user_dir in tqdm(user_dirs):
    # expected layout: <load_audio_dir>/<user>/<timestamp>/<recording>.wav
    for user_file in glob(join(user_dir, '*/*.wav')):
        raw_name = basename(user_file)
        filename = next(
            (name for pattern, name in raw_to_canonical.items()
             if pattern in raw_name),
            None,
        )
        if filename is None:
            # Fail loudly instead of dropping into a debugger: an unknown
            # recording type would otherwise be linked under its raw name
            # (ending up as `*.wav.wav`) and desynchronise the lists below.
            raise ValueError(f'Unrecognised recording type: {user_file}')

        # strongly dependent on structure: the two directories right above
        # the file are <user>/<timestamp>
        parent_dirs = user_file.split('/')[-3:-1]

        save_filename = '_'.join([*parent_dirs, filename + '.wav'])
        save_path = join(save_audio_dir, save_filename)

        # only record the file once it is known to be valid, so the three
        # lists always stay the same length
        users.append(basename(user_dir))
        timestamps.append(parent_dirs[-1])
        # ignore .wav
        files.append(splitext(save_filename)[0])

        # `lexists` is also True for a dangling symlink, for which `exists`
        # would report False and `symlink` would then raise FileExistsError
        if not lexists(save_path):
            symlink(user_file, save_path)

#### Creating `attributes.csv` and `annotation.csv`

In [None]:
# Maps the suffix of each processed audio filename to the sound label of that
# recording: the three cough takes collapse into a single 'cough' label, every
# other recording type is labelled with its own name.
sound_labels = {
    name: ('cough' if name.startswith('cough_') else name)
    for name in (
        'breathing',
        'cough_1', 'cough_2', 'cough_3',
        'audio_1_to_10',
        'room_sound',
        'a_sound', 'e_sound', 'o_sound',
    )
}

In [None]:
# Worksheet columns that are attached to every recording as its
# 'unsupervised' labels. The same list selects the columns whose missing
# values are filled in before the annotations are built.
unsup_label_keys = [
    # patient identity / demographics / enrolment site
    'enroll_patient_gender',
    'patient_id',
    'enroll_patient_age',
    'enroll_state',
    'enroll_facility',
    # history and risk factors
    'enroll_habits',
    'enroll_travel_history',
    'enroll_comorbidities',
    'enroll_contact_with_confirmed_covid_case',
    # symptoms and their duration
    'enroll_fever',
    'enroll_days_with_fever',
    'enroll_cough',
    'enroll_days_with_cough',
    'enroll_shortness_of_breath',
    'enroll_days_with_shortness_of_breath',
    # measurements taken at enrolment
    'enroll_patient_temperature',
    'enroll_patient_respiratory_rate',
    'enroll_cough_relief_measures',
    # test outcome (also used as the disease label for classification)
    'testresult_covid_test_result'
]

In [None]:
# Replace missing values in the unsupervised-label columns with the string
# 'NA'. This includes `testresult_covid_test_result`, so 'NA' can show up as a
# disease label in the classification labels built from this sheet.
# NOTE(review): pandas.read_csv parses the literal 'NA' as missing by default,
# so these cells come back as NaN when the saved CSVs are re-read unless
# keep_default_na=False is used — confirm downstream readers expect that.
data_sheet[unsup_label_keys] = data_sheet[unsup_label_keys].fillna('NA')

In [None]:
# Parallel lists, one entry per audio file that exists on disk:
#   files        - path of the symlinked recording in `save_audio_dir`
#   clf_labels   - [sound label] or [sound label, disease status]
#   unsup_labels - dict of patient metadata taken from the worksheet row
#   users        - id of the user the recording belongs to
files = []
unsup_labels = []
clf_labels = []
users = []

rows = tqdm(
    data_sheet.iterrows(),
    total=len(data_sheet),
    desc="Iterating over all patients",
)
for _, row in rows:
    # `audio_folder` is expected to end with `<user>/<timestamp>`; a trailing
    # slash would otherwise shift both components by one
    folder_parts = row['audio_folder'].rstrip('/').split('/')[-2:]
    user_timestamp = '_'.join(folder_parts)
    # The user id is the first of the two components. It must be taken
    # before joining: splitting the '_'-joined string on '/' never splits,
    # which made every (user, timestamp) pair look like a distinct user.
    user = folder_parts[0]
    disease_status = row['testresult_covid_test_result']

    # patient-level metadata is identical for every recording of this row,
    # so build it once and hand each recording its own copy
    row_unsup_label = dict(row[unsup_label_keys])
    row_unsup_label['dataset-name'] = 'wiai-facility'

    for key, value in sound_labels.items():
        file = join(save_audio_dir, f'{user_timestamp}_{key}.wav')
        if not exists(file):
            # this patient has no recording of this type
            continue

        # the room recording carries no disease status, only its sound label
        if key != 'room_sound':
            clf_label = [value, disease_status]
        else:
            clf_label = [value]

        files.append(file)
        clf_labels.append(clf_label)
        unsup_labels.append(dict(row_unsup_label))
        users.append(user)

In [None]:
# sanity check: the four parallel lists should all have the same length
len(files), len(users), len(clf_labels), len(unsup_labels)

In [None]:
# assemble the annotation table: one row per audio file
df = pd.DataFrame(
    {
        'file': files,
        'classification': clf_labels,
        'unsupervised': unsup_labels,
        'users': users,
    }
)

In [None]:
# (number of audio files, number of annotation columns)
df.shape

In [None]:
# Derive two extra columns from each file path using the project helpers.
# NOTE(review): presumably `get_unique_id` yields a per-file identifier and
# `get_audio_type` the recording type encoded in the filename — confirm
# against cac.utils.file.
df['id'] = df['file'].apply(get_unique_id)
df['audio_type'] = df['file'].apply(get_audio_type)

In [None]:
# eyeball the first few annotation rows
df.head()

In [None]:
# eyeball the last few annotation rows
df.tail()

In [None]:
# write the per-file annotation table to `<save_root>/annotation.csv`,
# leaving out the row index
annotation_save_path = join(save_root, 'annotation.csv')
df.to_csv(annotation_save_path, index=False)

In [None]:
# Write the worksheet itself (all columns, row index omitted) as the
# attributes table. A dedicated variable name is used so that the path of
# `annotation.csv` is not silently overwritten by the path of `attributes.csv`.
attributes_save_path = join(save_root, 'attributes.csv')
data_sheet.to_csv(attributes_save_path, index=False)

Check the total duration of the dataset

In [None]:
# Collect the duration of every audio file listed in `files`.
#
# `files` holds bare stems (no directory, no extension) right after the
# symlink cell, but full '.wav' paths once the annotation cell has run.
# Blindly appending '.wav' to the latter produced '...wav.wav' paths that
# never exist, so the total duration came out as 0. Both forms are handled
# here: `join` returns an absolute path unchanged and only prefixes bare stems.
# NOTE(review): newer librosa releases rename the `filename` keyword of
# `get_duration` to `path` — confirm against the installed version.
durations = []
for filename in tqdm(files, desc='Durations'):
    wav_name = filename if filename.endswith('.wav') else filename + '.wav'
    filepath = join(save_audio_dir, wav_name)
    if exists(filepath):
        durations.append(get_duration(filename=filepath))

In [None]:
# total duration over all files found above (librosa reports seconds)
sum(durations)

#### Junk code

In [None]:
from joblib import Parallel, delayed

In [None]:
# column name -> list of values; filled in place by the worker function and
# converted into a DataFrame afterwards
df = {column: [] for column in ('file', 'classification', 'users', 'start', 'end')}

In [None]:
import threading

# Guards the column updates of the shared `df` dict: the function is run from
# several threads at once, and the five `extend` calls of one worker must not
# interleave with those of another or the columns go out of alignment.
_df_lock = threading.Lock()


def update_df_by_user_files(index):
    """Append the recordings of one worksheet row to the shared `df` dict.

    For the row of `data_sheet` labelled `index`, every sound type in
    `sound_labels` whose file exists in `save_audio_dir` contributes one entry
    to each column of `df`: the file path, its classification label, the user
    id, a start of 0 and an end equal to the file's duration.

    Args:
        index: label of the row in `data_sheet` to process.

    Returns:
        None. The module-level `df` dict is extended in place.
    """
    row = data_sheet.loc[index]

    # `audio_folder` is expected to end with `<user>/<timestamp>`
    folder_parts = row['audio_folder'].rstrip('/').split('/')[-2:]
    user_timestamp = '_'.join(folder_parts)
    # Take the user id before joining: splitting the '_'-joined string on '/'
    # never splits and would return `<user>_<timestamp>` instead.
    user = folder_parts[0]
    disease_status = row['testresult_covid_test_result']

    user_files = []
    user_labels = []
    user_filesecs = []

    for key, value in sound_labels.items():
        file = join(save_audio_dir, f'{user_timestamp}_{key}.wav')
        if not exists(file):
            continue

        # the room recording carries no disease status, only its sound label
        label = [value] if key == 'room_sound' else [value, disease_status]

        user_files.append(file)
        user_labels.append(label)
        # reading the audio is the slow part, so it stays outside the lock
        user_filesecs.append(get_duration(filename=file))

    found = len(user_files)
    with _df_lock:
        df['file'].extend(user_files)
        df['classification'].extend(user_labels)
        df['end'].extend(user_filesecs)
        df['users'].extend([user] * found)
        df['start'].extend([0] * found)

In [None]:
# Process every worksheet row with 10 workers. `require='sharedmem'` makes
# joblib use a backend whose workers share memory with this process, which is
# what lets them append to the module-level `df` dict. The trailing `;` only
# suppresses the notebook echo of the returned list.
iterator = tqdm(range(len(data_sheet)), desc="Iterating over all patients")
Parallel(n_jobs=10, require='sharedmem')(delayed(update_df_by_user_files)(index) for index in iterator);

In [None]:
# turn the dict of equally long column lists into a DataFrame
df = pd.DataFrame(df)