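#### Creating mini-dataset

This notebook symlinks 100 annotated cough recordings into a mini audio release, converts them to spectrograms, and writes a yml file that lists them under every split.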

In [1]:
# Pick up edits to imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import ast
import os
from glob import glob
from os.path import join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from cac.data.audio import AudioItem
from cac.data.transforms import DataProcessor
from cac.utils.io import read_yml
from cac.utils.viz import plot_raw_audio_signal, plot_spectrogram_image

In [3]:
# Root of the full dataset; every processed recording is a .wav under processed/audio.
DATA_DIR = "/data/wiai-facility/"
# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# any "first N files" selection made from this list reproducible.
all_audio_files = sorted(glob(join(DATA_DIR, "processed", "audio", "*.wav")))

In [4]:
# Output roots: one for the linked raw audio, one for the derived spectrograms.
MINI_DATA_RELEASE_DIR = "/data/wiai-release-mini-audios/"
MINI_SPECTROGRAM_DATA_RELEASE_DIR = "/data/wiai-release-mini-spectrograms/"

# Create both roots up front; exist_ok keeps the cell safe to re-run.
for release_root in (MINI_DATA_RELEASE_DIR, MINI_SPECTROGRAM_DATA_RELEASE_DIR):
    os.makedirs(release_root, exist_ok=True)

### Add 100 raw audio files

In [5]:
# Folder inside the mini audio release that receives the raw audio links.
raw_audio_dir = os.path.join(MINI_DATA_RELEASE_DIR, 'raw', 'audio')
os.makedirs(raw_audio_dir, exist_ok=True)

In [6]:
# Annotation table; later cells read its `file` column and its second column.
annotations_csv_path = '/data/wiai-facility/processed/annotation_nov23.csv'
annotations = pd.read_csv(annotations_csv_path)

In [7]:
# Link (rather than copy) up to 100 annotated cough recordings into the mini release.
mini_raw_audio_dir = os.path.join(MINI_DATA_RELEASE_DIR, 'raw', 'audio')
# A set gives constant-time membership instead of rescanning the whole
# annotation column for every candidate file.
annotated_files = set(annotations.file.values)

i = 0  # number of selected recordings so far
for audio_file in tqdm(all_audio_files):
    src_path = audio_file
    dest_path = os.path.join(mini_raw_audio_dir, os.path.basename(audio_file))
    if 'cough' in src_path and audio_file in annotated_files:
        i += 1
        # os.symlink raises FileExistsError when the link is already there, so
        # skip creation on re-runs. lexists also detects dangling links.
        if not os.path.lexists(dest_path):
            os.symlink(src_path, dest_path)
    if i == 100:
        break

  0%|          | 290/83838 [00:00<02:29, 559.98it/s]


### Create spectrogram inputs from mini dataset

In [8]:
# Sorted so the spectrogram loop and the yml entries follow the same,
# filesystem-independent order on every run (glob order is arbitrary).
mini_dataset_audio_files = sorted(glob(join(MINI_DATA_RELEASE_DIR, "raw", "audio", "*.wav")))

In [9]:
# Sanity check: number of .wav links found in the mini release.
len(mini_dataset_audio_files)

100

In [10]:
# Ordered list of transform steps (name + constructor params) that is handed to
# DataProcessor to turn a raw signal into the array saved per recording.
# NOTE(review): Resample hard-codes a 44100 Hz input rate — confirm every
# recording in the dataset is sampled at that rate.
# NOTE(review): BackgroundNoise looks like a randomized augmentation and no seed
# is set in this notebook, so saved outputs may differ between runs — confirm.
transforms_cfg = [
    {"name": "ToTensor", "params": {"device": "cpu"}},
    {"name": "Resample", "params": {"orig_freq": 44100, "new_freq": 16000}},
    {
        "name": "BackgroundNoise",
        "params": {
            "dataset_config": [
                {"name": "esc-50", "version": "default", "mode": "all"},
            ],
            "min_noise_scale": 0.4,
            "max_noise_scale": 0.75,
        },
    },
    {"name": "Spectrogram", "params": {"n_fft": 512, "win_length": 512, "hop_length": 160}},
    {
        "name": "MelScale",
        "params": {"n_mels": 64, "sample_rate": 16000, "f_min": 125, "f_max": 7500},
    },
    {"name": "AmplitudeToDB", "params": {}},
    {"name": "ToNumpy", "params": {}},
]

In [11]:
# Build the callable pipeline from the transform config; it is applied to each
# loaded signal in the spectrogram export loop.
signal_transform = DataProcessor(transforms_cfg)

Loading items: 100%|██████████| 2000/2000 [00:00<00:00, 387984.27it/s]


In [12]:
# Destination folder for the per-recording .npy spectrogram files.
spec_save_path = os.path.join(MINI_SPECTROGRAM_DATA_RELEASE_DIR, 'processed', 'spectrograms')
os.makedirs(spec_save_path, exist_ok=True)

In [13]:
# Run every linked recording through signal_transform and store the result as .npy.
for mini_audio_file in tqdm(mini_dataset_audio_files):
    # Output name = base name of the recording up to its first dot.
    audio_file_name = os.path.basename(mini_audio_file).partition('.')[0]
    dest_path = os.path.join(spec_save_path, f'{audio_file_name}.npy')

    signal = AudioItem(path=mini_audio_file).load()["signal"]
    transformed_signal = signal_transform(signal)
    np.save(dest_path, transformed_signal)

100%|██████████| 100/100 [00:14<00:00,  5.54it/s]


### Create corresponding yml file

In [14]:
from cac.utils.io import read_yml
import librosa

In [15]:
# Keys stored for every split in the yml: ['end', 'file', 'label', 'start']
# Label format: {'classification': []} for 'Negative', otherwise {'classification': ['covid']}

In [16]:
# Mapping split name -> column dict; filled in once the per-file lists are built.
spec_input_yml = {}

In [17]:
# Split names written to the yml; the assembly cell gives each of them the same file lists.
splits = ['all', 'train', 'val', 'test']

In [18]:
# Parallel accumulators: index k of each list describes the k-th recording.
start_list = []
file_list = []
end_list = []
label_list = []

In [19]:
# Start from empty accumulators so that re-running this cell does not append a
# second copy of every entry to lists created in an earlier cell.
start_list, file_list, end_list, label_list = [], [], [], []

for mini_audio_file in tqdm(mini_dataset_audio_files):
    # Each entry covers the whole recording: from 0 s to the audio duration.
    # NOTE(review): newer librosa releases take `path=` instead of `filename=`
    # — confirm against the installed version.
    start = 0.
    end = librosa.get_duration(filename = mini_audio_file)

    # Annotations are looked up by the original (pre-symlink) location of the file.
    file_name = os.path.basename(mini_audio_file)
    path = os.path.join('/data/wiai-facility/processed/audio', file_name)
    matches = annotations[annotations.file == path]
    if matches.empty:
        raise IndexError(f'no annotation row found for {path}')

    # The second column holds a string that is parsed as a Python literal
    # (presumably the repr of a list — verify against the CSV). literal_eval
    # parses it without executing arbitrary code the way eval would.
    label_ = ast.literal_eval(matches.iloc[:, 1].values[0])[-1]
    # Only the last element decides the class: 'Negative' -> no classes,
    # anything else -> 'covid'.
    if label_ == 'Negative':
        label = {'classification': []}
    else:
        label = {'classification': ['covid']}

    # The yml must point at a spectrogram that was actually written, so fail
    # loudly instead of silently discarding the existence check.
    file_name_prep = file_name.split('.')[0]
    spec_file_path = os.path.join(spec_save_path, f'{file_name_prep}.npy')
    if not os.path.exists(spec_file_path):
        raise FileNotFoundError(f'spectrogram missing for {file_name}: {spec_file_path}')

    start_list.append(start)
    end_list.append(end)
    file_list.append(spec_file_path)
    label_list.append(label)

100%|██████████| 100/100 [00:00<00:00, 177.44it/s]


In [20]:
# Every split receives its own dict, but all of them reference the same four
# lists, so each split lists the full set of recordings.
split_columns = {
    'file': file_list,
    'label': label_list,
    'start': start_list,
    'end': end_list,
}
for split in splits:
    spec_input_yml[split] = dict(split_columns)

#### Save yml file

In [21]:
from cac.utils.io import save_yml

In [22]:
# Folder inside the spectrogram release where the split (version) yml is written.
split_file_path = join(MINI_SPECTROGRAM_DATA_RELEASE_DIR, 'processed', 'versions')
os.makedirs(split_file_path, exist_ok=True)

In [23]:
# Persist the split definition as default.yml inside the versions folder.
spec_split_file_name = join(split_file_path, 'default.yml')
save_yml(spec_input_yml, spec_split_file_name)

In [24]:
# Display where the split file was written.
spec_split_file_name

'/data/wiai-release-mini-spectrograms/processed/versions/default.yml'