### Random split across patients

In [None]:
# IPython magics: load the autoreload extension and use mode 2 so edited
# modules are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
from os import makedirs
from os.path import join, dirname
from collections import defaultdict
from random import shuffle
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from librosa.core import get_duration

from cac.utils.io import save_yml
from cac.utils.pandas import apply_filters

In [None]:
# Root folder that every path in this notebook is built from (CSV inputs,
# audio files and the saved version file).
data_root = '/data/coswara-15-03-21/processed/'

In [None]:
# Load the two CSV tables that live directly under data_root.
annotation, attributes = (
    pd.read_csv(join(data_root, csv_name))
    for csv_name in ('annotation.csv', 'attributes.csv')
)

In [None]:
# Display the (rows, columns) shape of both tables.
annotation.shape, attributes.shape

In [None]:
# Number of distinct ids in the attributes table (presumably one id per patient).
len(attributes.id.unique())

#### Extract patients that have a COVID test result and rows with cough sounds

In [None]:
# Row count per distinct value of the 'covid_status' column.
attributes['covid_status'].value_counts()

In [None]:
# Row count per distinct value of the 'label' column.
attributes['label'].value_counts()

In [None]:
# NOTE(review): presumably disabled because the 'label' filter in this notebook
# matches the raw string form of each label (e.g. "['cough-heavy', 'positive']");
# parsing the column into lists would break that match. If parsing is ever
# re-enabled, prefer ast.literal_eval over eval.
# attributes['label'] = attributes['label'].apply(lambda x: eval(x))

In [None]:
# Row count per distinct value of the 'covid_label' column.
attributes['covid_label'].value_counts()

In [None]:
# Keep only cough recordings (shallow or heavy) whose COVID result is known.
# The 'label' column holds the string form of a list, so the filter values
# are spelled out as strings as well.
cough_covid_labels = [
    "['cough-shallow', 'positive']",
    "['cough-heavy', 'positive']",
    "['cough-shallow', 'negative']",
    "['cough-heavy', 'negative']",
]
selected_attribute_rows = apply_filters(attributes, filters={'label': cough_covid_labels})

In [None]:
# Display the (rows, columns) shape of the filtered attribute rows.
selected_attribute_rows.shape

In [None]:
# Pick the annotation rows that carry the same index labels as the selected
# attribute rows.
# NOTE(review): assumes annotation.csv and attributes.csv are row-aligned
# (same index <-> same recording) — confirm.
selected_annotation_rows = annotation.loc[selected_attribute_rows.index]

In [None]:
# Display the selected annotation rows for a visual check.
selected_annotation_rows

#### Creating a random split

In [None]:
# Distinct ids among the selected rows; the split below is made over these ids
# rather than over individual rows.
selected_ids = list(selected_attribute_rows.id.unique())

In [None]:
# Hold out 15% of the selected ids as the test set; random_state is fixed so
# the split is reproducible.
dev_ids, test_ids = train_test_split(selected_ids, test_size=0.15, random_state=0)

In [None]:
# Split the remaining (dev) ids into train and val: 18.75% of the dev ids go to
# validation, i.e. about 16% of all selected ids (0.1875 * 0.85).
train_ids, val_ids = train_test_split(dev_ids, test_size=0.1875, random_state=0)

In [None]:
# Display the number of ids in train / val / test and in total.
len(train_ids), len(val_ids), len(test_ids), len(selected_ids)

In [None]:
# Tag every selected attribute row with the split its id was assigned to.
selected_attribute_rows['set'] = None
for split_name, split_ids in (('train', train_ids), ('val', val_ids), ('test', test_ids)):
    in_split = selected_attribute_rows['id'].isin(split_ids)
    selected_attribute_rows.loc[in_split, 'set'] = split_name

In [None]:
from cac.utils.plot import plot_categorical_attribute

In [None]:
# Plot the 'set' column of the selected rows, with 'covid_label' as the hue.
plot_categorical_attribute(selected_attribute_rows, attribute='set', hue='covid_label', title='Dataset; Coswara | Version: default')

#### Create and save the data version

In [None]:
# Audio files are stored as <data_root>/audio/<date>-<id>-<file>; build one
# path per selected annotation row, keeping the row order.
audio_dir = join(data_root, 'audio')
name_parts = zip(
    selected_annotation_rows['date'],
    selected_annotation_rows['id'],
    selected_annotation_rows['file'],
)
files = [join(audio_dir, '{}-{}-{}'.format(*parts)) for parts in name_parts]

# Matching id for every entry in `files`.
ids = list(selected_annotation_rows['id'])

In [None]:
# Parse the stringified label lists with ast.literal_eval rather than eval:
# it only accepts Python literals, so a malformed or malicious CSV cell cannot
# execute arbitrary code.
from ast import literal_eval

# One label dict per selected annotation row: ['covid'] when the parsed
# 'classification' list contains 'positive', an empty list otherwise.
labels = [
    {'classification': ['covid']} if 'positive' in literal_eval(raw_label) else {'classification': []}
    for raw_label in selected_annotation_rows['classification']
]

In [None]:
# Every entry spans its whole file: it starts at 0.0 and ends at the file's
# duration as reported by get_duration (tqdm shows progress while reading).
ends = [get_duration(filename=audio_path) for audio_path in tqdm(files)]
starts = [0.0] * len(files)

In [None]:
# Assemble one row per audio file: its path, label dict, start/end bounds and id.
df = pd.DataFrame(
    {
        'file': files,
        'label': labels,
        'start': starts,
        'end': ends,
        'id': ids,
    }
)

In [None]:
# Build one frame per split by keeping the rows whose id belongs to that split;
# each frame gets a fresh index.
df_all, df_train, df_val, df_test = (
    apply_filters(df, {'id': split_ids}, reset_index=True)
    for split_ids in (selected_ids, train_ids, val_ids, test_ids)
)

In [None]:
# Display the (rows, columns) shape of the full frame and of each split.
df_all.shape, df_train.shape, df_val.shape, df_test.shape

In [None]:
# The version is written to <data_root>/versions/<version>.yml.
version = 'default'
version_file = '{}.yml'.format(version)
save_path = join(data_root, 'versions', version_file)

In [None]:
# Content of the version file: a free-text summary plus, for every split, the
# parallel lists of file paths, labels and start/end times.
description = dict()
# The labels built in this notebook are covid vs. not-covid on cough recordings
# only, so the summary says that (the previous text described a cough vs.
# non-cough task, which this version is not).
description['description'] = 'covid vs non-covid on cough sounds with random split'

# 'all' uses df_all (the frame restricted to the selected ids) so that it is
# built the same way as the train/val/test frames.
split_frames = {'all': df_all, 'train': df_train, 'val': df_val, 'test': df_test}
for name, _df in split_frames.items():
    description[name] = {
        column: _df[column].values.tolist()
        for column in ('file', 'label', 'start', 'end')
    }

In [None]:
# Write the version description to save_path, creating the parent folder
# first if it does not exist yet.
versions_dir = dirname(save_path)
makedirs(versions_dir, exist_ok=True)
save_yml(description, save_path)