In [None]:
from collections import defaultdict
from os.path import join, exists, splitext, basename, dirname, isdir
from os import listdir, symlink, makedirs
from shutil import copyfile
from praatio import tgio
from tqdm import tqdm
from glob import glob
import librosa
import pandas as pd
import scipy.io.wavfile as wav

from cac.utils.io import load_json, write_txt
from cac.utils.pandas import apply_antifilters

In [None]:
# directory where the data resides
data_root = '/data/flusense/'

In [None]:
# src and destination directories
load_dir = join(data_root, 'raw')
save_root = join(data_root, 'processed')

In [None]:
makedirs(save_root, exist_ok=True)

In [None]:
load_audio_dir = join(load_dir, 'audio')
load_annotation_dir = join(load_dir, 'annotations')

In [None]:
save_audio_dir = join(save_root, 'audio')
makedirs(save_audio_dir, exist_ok=True)

#### Important check: find files that cannot be read (e.g. they raise `EOFError`) and that `librosa` does not detect

In [None]:
files = listdir(load_audio_dir)

In [None]:
len(files)

In [None]:
files[0]

In [None]:
# names of the audio files that could not be read
invalid_files = []

for file in tqdm(files, desc='Checking valid files'):
    # build the path from `load_audio_dir` (the directory `files` was listed
    # from) instead of repeating the absolute path
    fpath = join(load_audio_dir, file)
    try:
        # only readability matters here; the decoded audio is discarded
        wav.read(fpath)
    except Exception:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) stops the loop instead of being recorded
        # as an unreadable file
        invalid_files.append(file)

In [None]:
len(invalid_files)

In [None]:
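# NOTE(review): filtering is disabled here, so invalid files stay in `files`;
# they are dropped from the dataframe further below instead.
# files = list(set(files) - set(invalid_files))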

In [None]:
# len(files)

### Create symlinks to the original .wav files

In [None]:
len(files)

In [None]:
def check_exists(fname, root="/data/flusense/FluSense-data/FluSense-audio/"):
    """Return whether `fname` exists inside `root`.

    Args:
        fname (str): file name, relative to `root`.
        root (str): directory to look in. Defaults to the path that was
            previously hard-coded, so calls with a single argument behave
            exactly as before.

    Returns:
        bool: True if `join(root, fname)` exists, False otherwise.
    """
    return exists(join(root, fname))

In [None]:
check_exists(files[0])

In [None]:
# keep only the file names for which `check_exists` returns True
files = list(filter(check_exists, files))

In [None]:
len(files)

In [None]:
# link every raw audio file into the processed audio directory; paths that
# already exist are skipped, so the cell can be run again
for file in tqdm(files, desc='Creating symlinks processed/ <- raw/'):
    link_path = join(save_audio_dir, file)
    if exists(link_path):
        continue
    symlink(join(load_audio_dir, file), link_path)

### Parsing the annotations

Annotation files are in `.TextGrid` format, so the `praatio` library is used to read them.

Steps for each file:

* Check that each annotation object satisfies `len(annotation.tierNameList) == 1`
* Extract the list of entries
* For each entry, add the label of that entry to the list of classification labels for that file and add each interval to the list of intervals for that file

Final goal is to obtain classification_targets and segmentation_targets for all the files

In [None]:
# per-file results, in the same order as `files`
classification_targets = []
segmentation_targets = []

for file in files:
    # swap only the extension for `.TextGrid`; `file.replace('wav', ...)` would
    # also rewrite any 'wav' occurring inside the stem of the file name
    annotation_path = join(load_annotation_dir, splitext(file)[0] + '.TextGrid')
    text_grid = tgio.openTextgrid(annotation_path)

    # ensure that only one name in the namelist
    assert len(text_grid.tierNameList) == 1, \
        f'{file}: expected exactly 1 tier, found {len(text_grid.tierNameList)}'
    t_name = text_grid.tierNameList[0]

    # label -> list of [start, end] intervals carrying that label in this file
    _segmentation_targets = defaultdict(list)

    # the tier holds a list of entries, each consisting of an interval and its label
    for entry in text_grid.tierDict[t_name].entryList:
        _segmentation_targets[entry.label].append([entry.start, entry.end])

    # the labels present in the file are exactly the keys collected above;
    # sorting them makes the saved annotation identical from run to run
    # (the order of `list(set(...))` is not)
    classification_targets.append(sorted(_segmentation_targets))
    segmentation_targets.append(dict(_segmentation_targets))

In [None]:
# each file gets a start of 0.0 and an end equal to the duration that librosa
# reports for the audio in the processed directory
starts = [0.0] * len(files)
ends = [
    librosa.get_duration(filename=join(save_audio_dir, fname))
    for fname in tqdm(files)
]

In [None]:
# drop the extension from every file name
# (this rebinds `files`: running the cell a second time would strip a further
# suffix from any name that still contains a dot)
files = [stem for stem, _ext in map(splitext, files)]

In [None]:
# one row per file: name, list of labels, per-label intervals, start and end
df = pd.DataFrame(
    {
        'file': files,
        'classification': classification_targets,
        'segmentation': segmentation_targets,
        'start': starts,
        'end': ends,
    }
)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
invalid_files

In [None]:
# drop invalid/unreadable files
# the extension is removed with `splitext`, the same way the `file` column was
# built, so that the names compare equal; `x.split('.wav')[0]` gives a different
# result for names with '.wav' inside the stem or with another extension
df = apply_antifilters(df, {'file': [splitext(x)[0] for x in invalid_files]})

In [None]:
df.shape

In [None]:
# write the annotation table as CSV under the processed root, without the index
annotation_save_path = join(save_root, 'annotation.csv')
df.to_csv(path_or_buf=annotation_save_path, index=False)

In [None]:
# human-readable description of the annotation columns (written to description.txt)
description = (
    "Annotation columns: \n "
    "`classification`: valid labels = [cough, sneeze, sniffle, throat-clearing,"
    " speech, etc (i.e everything else)] \n "
    "`segmentation`: {label: list of lists, each marking the start and end of the"
    " interval in which the label is occuring in the file}"
)

# store the description next to the processed data
description_path = join(save_root, 'description.txt')
with open(description_path, 'w') as handle:
    handle.write(description)