In [None]:
import ast
from collections import defaultdict
from os import makedirs
from os.path import join, dirname

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from cac.utils.io import save_yml

In [None]:
# Root directory of the processed dataset. The CSV tables and audio paths are
# resolved relative to it, and the version file is written underneath it.
data_root = '/data/freesound-kaggle/processed/'

In [None]:
# Load the annotation table and the attributes table from the processed data directory.
annotation, attributes = (
    pd.read_csv(join(data_root, csv_name))
    for csv_name in ('annotation.csv', 'attributes.csv')
)

In [None]:
# Task configuration: maps each task name (also the annotation column that is read
# for it) to the labels that are kept for that task.
tasks = {'classification': {'valid_labels': ['cough']}}

In [None]:
# Preview the first rows of the annotation table.
annotation.head()

In [None]:
# Every column except the first one
# (assumed to be 'file' — TODO confirm against annotation.csv).
# NOTE(review): `columns` is not referenced again in the visible cells.
columns = annotation.columns[1:]

In [None]:
# The 'classification' column comes back from the CSV as text (presumably a
# stringified list of labels per row — confirm against annotation.csv); parse each
# cell back into a Python object.
# ast.literal_eval accepts only Python literals, so unlike eval() it cannot execute
# code embedded in the data file.
# Cells that are not strings are passed through unchanged, which makes re-running
# this cell a no-op instead of an error on already-parsed values.
annotation['classification'] = annotation['classification'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [None]:
# Path of each audio clip: <data_root>/audio/<file>.wav, one per annotation row.
files = annotation['file'].map(
    lambda stem: join(data_root, 'audio', stem + '.wav')
).tolist()

# 'start' and 'end' columns as arrays, aligned row-for-row with `files`.
starts, ends = annotation['start'].values, annotation['end'].values

In [None]:
# Build one {task: [labels]} dict per audio file, keeping only the labels that the
# task declares as valid.
labels = [dict() for _ in files]

for task, task_spec in tasks.items():
    valid_values = task_spec['valid_labels']

    for file_labels, annotated_values in zip(labels, annotation[task]):
        # kept labels follow the order of valid_values, not the annotation order
        file_labels[task] = [
            valid_value for valid_value in valid_values
            if valid_value in annotated_values
        ]

In [None]:
# One row per annotation: file path, per-task label dict, and the start/end values.
df = pd.DataFrame(dict(file=files, label=labels, start=starts, end=ends))

In [None]:
# Preview the first rows of the assembled frame.
df.head()

In [None]:
# Total number of rows before splitting.
len(df)

In [None]:
# Random train/val/test split over the row positions of df.
# Both calls use a fixed random_state so the split does not change between runs.
indices = list(range(len(df)))
# hold out 20% of the rows ...
train_indices, val_test_indices = train_test_split(indices, test_size=0.2, random_state=20)
# ... and divide the held-out rows evenly between validation and test
val_indices, test_indices = train_test_split(val_test_indices, test_size=0.5, random_state=20)

In [None]:
# Sizes of the train / val / test splits.
len(train_indices), len(val_indices), len(test_indices)

In [None]:
# Materialise each split as its own frame with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    df.loc[split_indices].reset_index(drop=True)
    for split_indices in (train_indices, val_indices, test_indices)
)

In [None]:
# Version tag for this split; the description is written to
# <data_root>/versions/<version>.yml
version = 'v1.0'
save_path = join(data_root, 'versions', '{}.yml'.format(version))

In [None]:
# Payload for the version file: the task config, a short description and, for the
# full set and each split, parallel lists of file paths and label dicts.
# Only 'file' and 'label' are stored; the 'start' and 'end' columns of the frames
# are not written.
description = {
    'tasks': tasks,
    'description': 'cough vs non-cough with random split',
}

for name, _df in (('all', df), ('train', df_train), ('val', df_val), ('test', df_test)):
    description[name] = {
        column: _df[column].values.tolist() for column in ('file', 'label')
    }

In [None]:
# Save the description: create the target directory if it does not exist yet
# (exist_ok=True keeps re-runs from failing), then write the file with the
# project helper save_yml (presumably serialises the dict to YAML — confirm).
makedirs(dirname(save_path), exist_ok=True)
save_yml(description, save_path)