In [None]:
from ast import literal_eval
from collections import defaultdict
from os import makedirs
from os.path import join, dirname

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from cac.utils.io import save_yml, read_yml
from cac.utils.pandas import apply_filters

In [None]:
# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# also passed to both train_test_split calls further down.
random_state = 0
np.random.seed(random_state)

In [None]:
# Root folder of the processed FluSense data; the annotation CSV, the audio
# paths and the saved version file are all resolved relative to it.
data_root = '/data/flusense/processed/'

In [None]:
# Load the file-level annotation table (columns used below: `file`,
# `segmentation`, `classification`).
annotation = pd.read_csv(join(data_root, 'annotation.csv'))

In [None]:
# Peek at the first raw `segmentation` entry (presumably still a string at this
# point, since the parsing cell below converts non-dict values).
annotation['segmentation'][0]

In [None]:
# Peek at the first raw `classification` entry (presumably still a string at
# this point, since the parsing cell below converts non-list values).
annotation['classification'][0]

In [None]:
# Parse the stringified annotation columns back into Python containers.
# `literal_eval` only accepts Python literals (dicts, lists, numbers, strings...),
# so a malformed or malicious CSV cell cannot execute arbitrary code the way a
# bare `eval` would.
# The isinstance guards leave already-parsed values untouched, which keeps this
# cell safe to re-run.
annotation['segmentation'] = annotation['segmentation'].apply(
    lambda x: x if isinstance(x, dict) else literal_eval(x)
)
annotation['classification'] = annotation['classification'].apply(
    lambda x: x if isinstance(x, list) else literal_eval(x)
)

In [None]:
# Preview the table after the annotation columns have been parsed.
annotation.head()

In [None]:
def _to_wav_path(file_id):
    """Return the .wav path under `<data_root>/audio` for one file identifier."""
    return join(data_root, 'audio', file_id) + '.wav'


# NOTE: not idempotent -- running this cell a second time appends another
# '.wav' to the already-resolved paths, so run it once per kernel session.
annotation['file'] = annotation['file'].map(_to_wav_path)

In [None]:
# Preview the table again now that `file` holds full audio paths.
annotation.head()

In [None]:
# One integer id per annotated file; the split below is done on these ids, i.e.
# at file level rather than segment level.
indices = np.arange(len(annotation))

In [None]:
# Hold out 20% of the files for validation + test; the rest is the training split.
train_indices, val_test_indices = train_test_split(indices, test_size=0.2, random_state=random_state)

In [None]:
# Divide the held-out files evenly between validation and test.
val_indices, test_indices = train_test_split(val_test_indices, test_size=0.5, random_state=random_state)

In [None]:
# Number of files that ended up in each split.
len(train_indices), len(val_indices), len(test_indices)

In [None]:
# Materialise one file-level frame per split, each re-indexed from 0 so that
# downstream code can address rows by 0..len-1.
annotation_train, annotation_val, annotation_test = (
    annotation.loc[split_indices].reset_index(drop=True)
    for split_indices in (train_indices, val_indices, test_indices)
)

In [None]:
# Task name -> task configuration; the only task defined here is
# `classification`, whose single valid label is 'cough'.
tasks = dict(
    classification=dict(valid_labels=['cough']),
)

In [None]:
def convert_files_into_segments(df, min_duration=1e-2):
    """Explode file-level annotations into one row per labelled audio segment.

    Args:
        df (pd.DataFrame): one row per audio file, with a `file` column and a
            `segmentation` column holding a dict that maps each label name to
            a list of `(start, end)` pairs.
        min_duration (float): segments whose `end - start` is less than or
            equal to this value are dropped. Defaults to 1e-2, the threshold
            that was previously hard-coded.

    Returns:
        pd.DataFrame: columns `file`, `label`, `start`, `end`. `label` is
        `{'classification': ['cough']}` for segments listed under the 'cough'
        key and `{'classification': []}` for segments of any other label.
    """
    segmented_files = defaultdict(list)

    for row_idx in tqdm(range(len(df)), desc='Iterating over the dataset'):
        # Positional access, so the frame does not need a freshly reset index.
        row = df.iloc[row_idx]
        file = row['file']

        for label_name, segments in row['segmentation'].items():
            # Decide once per label and never rebind the loop variable. The
            # previous version overwrote `label` with a dict inside the segment
            # loop, so every cough segment after the first kept one in a file
            # failed the `== 'cough'` check and was stored as non-cough.
            is_cough = label_name == 'cough'

            for start, end in segments:
                if (end - start) <= min_duration:
                    continue

                segmented_files['file'].append(file)
                # Build a fresh dict per row so rows never share a mutable label.
                segmented_files['label'].append({'classification': ['cough'] if is_cough else []})
                segmented_files['start'].append(start)
                segmented_files['end'].append(end)

    return pd.DataFrame(segmented_files, columns=['file', 'label', 'start', 'end'])

In [None]:
# Segment-level training set, built from the training files only.
df_train = convert_files_into_segments(annotation_train)

In [None]:
# Segment-level validation set, built from the validation files only.
df_val = convert_files_into_segments(annotation_val)

In [None]:
# Segment-level test set, built from the test files only.
df_test = convert_files_into_segments(annotation_test)

In [None]:
# Label distribution of the validation segments; the dict labels are cast to
# strings before counting.
df_val.label.astype('str').value_counts()

In [None]:
# (rows, columns) of each segment-level split.
df_train.shape, df_val.shape, df_test.shape

In [None]:
# Class balance per split: one count plot per split, side by side.
# An explicit name -> frame mapping replaces the former eval('df_<mode>') lookup.
split_frames = {'train': df_train, 'val': df_val, 'test': df_test}
modes = list(split_frames)

fig, ax = plt.subplots(1, len(modes), figsize=(20, 7))

for i, mode in enumerate(modes):
    # Work on a copy so the dict-valued `label` column of the real frame is kept.
    _df = split_frames[mode].copy()
    _df['label'] = _df['label'].apply(lambda x: 'cough' if 'cough' in x['classification'] else 'non-cough')

    sns.countplot(x='label', data=_df, ax=ax[i])

    counts = _df['label'].value_counts()
    # .get(..., 0) keeps the title working when a split lacks one of the classes.
    ax[i].set_title(
        mode.upper() + '   [cough: {}  non-cough: {}]'.format(counts.get('cough', 0), counts.get('non-cough', 0))
    )
    # Same fixed y-range on every panel so the splits are visually comparable.
    ax[i].set_ylim(0, 10000)
    ax[i].grid()

plt.show()

In [None]:
# Name of this dataset version; its description is stored as
# <data_root>/versions/<version>.yml
version = 'segmented-v1.0'
save_path = join(data_root, 'versions', version + '.yml')

In [None]:
# Assemble the version description: task config, a free-text summary, and the
# segment columns of every split (plus 'all', the three splits concatenated).
description = {
    'tasks': tasks,
    'description': 'cough vs non-cough classification with split randomly done across files & files segmented within a split',
}

split_names = ['all', 'train', 'val', 'test']
split_frames = [pd.concat([df_train, df_val, df_test]), df_train, df_val, df_test]

for name, _df in zip(split_names, split_frames):
    # Plain Python lists per column, in the fixed order file/label/start/end.
    description[name] = {
        column: _df[column].values.tolist()
        for column in ['file', 'label', 'start', 'end']
    }

In [None]:
# Write the description to disk, creating the target folder first if it is missing.
versions_dir = dirname(save_path)
makedirs(versions_dir, exist_ok=True)
save_yml(description, save_path)