Creates a dataset version for unsupervised learning tasks.

In [None]:
# IPython magics: load the `autoreload` extension and switch it to mode 2
# (per the IPython docs, mode 2 reloads imported modules before each execution)
%load_ext autoreload
%autoreload 2

In [None]:
from os import makedirs, symlink, rmdir
from os.path import join, dirname, exists, isdir, basename, splitext
from shutil import rmtree
import math
from collections import defaultdict
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from librosa import get_duration
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from termcolor import colored

from cac.utils.io import save_yml
from cac.utils.pandas import apply_filters

In [None]:
# single seed shared by NumPy's global RNG and the splits further down
random_state = 0
np.random.seed(seed=random_state)

In [None]:
# root folder of the dataset on disk
data_root = '/data/wiai-facility/'

# everything this notebook reads or writes lives under <data_root>/processed
save_root = join(data_root, 'processed')
save_audio_dir = join(save_root, 'audio')

# folder for dataset version files; created here if it does not exist yet
version_dir = join(save_root, 'versions')
makedirs(version_dir, exist_ok=True)

In [None]:
# read the two CSV tables stored directly under the processed-data folder
attributes_csv = join(save_root, 'attributes.csv')
annotation_csv = join(save_root, 'annotation.csv')

attributes = pd.read_csv(attributes_csv)
annotation = pd.read_csv(annotation_csv)

In [None]:
# display the (rows, columns) shape of both tables as a quick sanity check
annotation.shape, attributes.shape

#### Split patients into training, validation and test sets

In [None]:
# distinct values of the `id` column, in order of first appearance
# (the section heading treats each id as one patient)
unique_ids = annotation['id'].unique()
all_patients = list(unique_ids)

In [None]:
# number of distinct ids that will be distributed across the splits
len(all_patients)

In [None]:
# hold out 20% of the ids; the held-out part is divided again in the next step
train_ids, val_test_ids = train_test_split(
    all_patients,
    test_size=0.2,
    random_state=random_state,
)

In [None]:
# divide the held-out ids evenly between validation and test
val_ids, test_ids = train_test_split(
    val_test_ids,
    test_size=0.5,
    random_state=random_state,
)

In [None]:
# number of ids assigned to the train / validation / test splits
len(train_ids), len(val_ids), len(test_ids)

In [None]:
# annotation rows for the training ids
# (row selection is delegated to cac.utils.pandas.apply_filters; presumably an
# `id`-membership filter, confirm against that helper)
train_rows = apply_filters(annotation, {'id': train_ids}, reset_index=True)

# drop the columns not needed for the unsupervised version and expose the
# `unsupervised` column under the generic name `label`
unused_columns = ['classification', 'users', 'audio_type', 'id']
df_train = (
    train_rows
    .drop(columns=unused_columns)
    .rename(columns={'unsupervised': 'label'})
)

In [None]:
# annotation rows for the validation ids
# (row selection is delegated to cac.utils.pandas.apply_filters; presumably an
# `id`-membership filter, confirm against that helper)
val_rows = apply_filters(annotation, {'id': val_ids}, reset_index=True)

# drop the columns not needed for the unsupervised version and expose the
# `unsupervised` column under the generic name `label`
unused_columns = ['classification', 'users', 'audio_type', 'id']
df_val = (
    val_rows
    .drop(columns=unused_columns)
    .rename(columns={'unsupervised': 'label'})
)

In [None]:
# annotation rows for the test ids
# (row selection is delegated to cac.utils.pandas.apply_filters; presumably an
# `id`-membership filter, confirm against that helper)
test_rows = apply_filters(annotation, {'id': test_ids}, reset_index=True)

# drop the columns not needed for the unsupervised version and expose the
# `unsupervised` column under the generic name `label`
unused_columns = ['classification', 'users', 'audio_type', 'id']
df_test = (
    test_rows
    .drop(columns=unused_columns)
    .rename(columns={'unsupervised': 'label'})
)

In [None]:
# annotation rows for every id, processed the same way as the individual splits
# (row selection is delegated to cac.utils.pandas.apply_filters; presumably an
# `id`-membership filter, confirm against that helper)
all_rows = apply_filters(annotation, {'id': all_patients}, reset_index=True)

# drop the columns not needed for the unsupervised version and expose the
# `unsupervised` column under the generic name `label`
unused_columns = ['classification', 'users', 'audio_type', 'id']
df_all = (
    all_rows
    .drop(columns=unused_columns)
    .rename(columns={'unsupervised': 'label'})
)

In [None]:
# display the (rows, columns) shape of each split frame and of the full frame
df_train.shape, df_val.shape, df_test.shape, df_all.shape

In [None]:
# name of this dataset version; it also determines the YAML file name
version = 'default-unsupervised'

# the version file is written to <save_root>/versions/<version>.yml
version_filename = '{}.yml'.format(version)
save_path = join(save_root, 'versions', version_filename)

In [None]:
# top-level entry: a human-readable summary of what this version is
description = {
    'description': 'version for unsupervised task(s) with random split',
}

# one entry per split, each holding the `file` and `label` columns as plain lists
frames_by_split = {
    'all': df_all,
    'train': df_train,
    'val': df_val,
    'test': df_test,
}
for split_name, split_frame in frames_by_split.items():
    description[split_name] = {
        column: split_frame[column].values.tolist()
        for column in ('file', 'label')
    }

In [None]:
# make sure the destination folder exists, then write the description
# to `save_path` via the project's `save_yml` helper
output_dir = dirname(save_path)
makedirs(output_dir, exist_ok=True)
save_yml(description, save_path)