In [18]:
from cac.utils.io import read_yml, save_yml
import pandas as pd
import numpy as np
from os import makedirs
from os.path import join, dirname, basename, splitext
from librosa.core import get_duration
from tqdm import tqdm

In [9]:
# Dataset version tag. It is interpolated into the name of the version YAML
# that is read and into the name of the config that is written back out.
version = 'v9.8'

In [14]:
# Root directory of the dataset on disk.
data_root = '/data/wiai-facility'
# Directory onto which the rebuilt '<prefix>_audio_1_to_10.wav' file names are joined.
load_dir = join(data_root, 'processed/audio/')

In [10]:
# Load the split definition for `version`. Both paths are derived from
# `data_root` instead of repeating the literal root, so moving the dataset
# only requires editing one constant. This expects `data_root` to be the
# dataset root ('/data/wiai-facility'), not the 'processed' sub-directory.
data_config = read_yml(join(data_root, 'processed', 'versions', f'{version}.yml'))
# NOTE(review): `attributes` is not referenced by any other visible cell;
# confirm it is still needed before keeping this read.
attributes = pd.read_csv(join(data_root, 'processed', 'attributes.csv'))

In [11]:
def get_user_from_path(path):
    """Derive the user identifier from an audio file path.

    The directory and extension are stripped, everything from the first
    '_cough' onwards is cut off, and the last two '_'-separated tokens of
    what remains are dropped. The surviving tokens are re-joined with '_'.

    If '_cough' does not occur, ``str.find`` returns -1 and the slice removes
    the last character of the stem instead of cutting at a marker.

    Args:
        path: file path (or bare file name) of a recording.

    Returns:
        The user identifier; an empty string when fewer than three tokens
        precede the marker.
    """
    stem, _extension = splitext(basename(path))
    prefix = stem[:stem.find('_cough')]
    tokens = prefix.split('_')
    return '_'.join(tokens[:-2])

In [12]:
# Sections of the version config that are rebuilt, processed in this order.
keys = ['all', 'train', 'val', 'test']

In [19]:
# Build a reduced config: for every section in `keys`, keep one entry per user
# (the first row encountered for that user) and point it at the
# '<prefix>_audio_1_to_10.wav' file under `load_dir`, where <prefix> is the
# part of the original file name that precedes '_cough'.
new_config = dict()
for key in keys:
    print (f'Working on {key}')
    d = data_config[key]
    files = []
    labels = []
    starts = []
    ends = []
    # `users` keeps first-seen order; `seen_users` mirrors it as a set so the
    # per-row membership test is O(1) instead of a scan over the whole list.
    users = []
    seen_users = set()

    length = len(d['file'])
    for i in tqdm(range(length)):
        file = d['file'][i]
        user = get_user_from_path(file)

        # Only the first row of each user contributes an entry.
        if user in seen_users:
            continue
        seen_users.add(user)
        users.append(user)

        filename = splitext(basename(file))[0]
        split_index = filename.find('_cough')
        if split_index == -1:
            # str.find returns -1 when the marker is absent; slicing with it
            # would silently drop the last character and build a bogus path.
            raise ValueError(f"'_cough' marker not found in file name: {file}")
        path = join(load_dir, ''.join([filename[:split_index], '_audio_1_to_10.wav']))

        label = d['label'][i]
        # NOTE(review): `start` is copied from the original row while `end`
        # is replaced by the duration of the new file - confirm that carrying
        # the old start over is intended.
        start = d['start'][i]
        # NOTE(review): newer librosa releases rename this keyword to `path`;
        # confirm against the installed version before upgrading.
        end = get_duration(filename=path)

        files.append(path)
        labels.append(label)
        starts.append(start)
        ends.append(end)

    print (f'Number of files from {length} reduced to {len(ends)}')
    new_config[key] = {'end' : ends,
                       'file' : files,
                       'label' : labels,
                       'start' : starts}

  1%|          | 67/12780 [00:00<00:19, 659.38it/s]

Working on all


100%|██████████| 12780/12780 [01:18<00:00, 163.53it/s]
  1%|          | 112/10224 [00:00<00:09, 1093.63it/s]

Number of files from 12780 reduced to 4260
Working on train


100%|██████████| 10224/10224 [00:07<00:00, 1345.86it/s]
 14%|█▍        | 178/1278 [00:00<00:00, 1766.95it/s]

Number of files from 10224 reduced to 3408
Working on val


100%|██████████| 1278/1278 [00:00<00:00, 1812.07it/s]
 15%|█▌        | 193/1278 [00:00<00:00, 1917.63it/s]

Number of files from 1278 reduced to 426
Working on test


100%|██████████| 1278/1278 [00:00<00:00, 1848.60it/s]

Number of files from 1278 reduced to 426





In [20]:
# Write the rebuilt config into the 'versions' directory under a '_voice'
# suffix. The processed directory gets its own name so that `data_root` keeps
# pointing at the dataset root; rebinding it here made any re-run of the
# earlier path cells resolve under 'processed/processed/'.
processed_root = join(data_root, 'processed')
save_version = f'{version}_voice'
save_path = join(processed_root, 'versions', f'{save_version}.yml')

print (save_path)
save_yml(new_config, save_path)

/data/wiai-facility/processed/versions/v9.8_voice.yml
