## First, run this cell to set up paths and import dependencies

In [None]:
import os

from tqdm import tqdm

# Move one level up when the current working directory has no `notebooks`
# sub-directory. Presumably the notebook was launched from inside
# `notebooks/` and the `src.*` imports below need the repository root as the
# working directory -- TODO confirm.
if not os.path.exists(r"./notebooks"):
    %cd ..

from src.audio_processor import AudioProcessor
from src.audio_dataset_processor import DAPSDatasetProcessor
from src.data_processing import SOAAudioClips, save_mean_std, compute_mean_std_from_images
from src.dataset_analysis import duration_statistics
from src.config import VALID_ACCESS_LABELS, TRAIN_DIR, TEST_DIR, VAL_DIR, DATA_DIR, DATASET_DIR

# Make sure the dataset root and the train/validation/test directories exist
# before anything is written to them; existing directories are left alone.
for directory in (DATASET_DIR, TRAIN_DIR, VAL_DIR, TEST_DIR):
    os.makedirs(directory, exist_ok=True)

## 1. Split all allowed .wav files
We are using the [DAPS](https://zenodo.org/records/4660670) dataset. It contains several directories of .wav files in which 5 scripts are read by 20 speakers. The directories differ from each other in augmentation, which is labeled by `room` and `recording device`. In this cell we specify the allowed directories; their contents are discovered and split into 3 datasets (training, validation and test). The same script must not appear in more than one dataset - the `AudioDatasetProcessor` class takes care of that. Class balancing is done with a batch_sampler in the DataLoader by undersampling the majority class.

In [None]:
# DAPS sub-directories whose recordings are passed to the dataset processor.
allowed_directories = [
    "ipadflat_confroom1",
    "ipadflat_office1",
    "ipad_balcony1",
    "ipad_bedroom1",
    "ipad_confroom1",
    "ipad_confroom2",
    "ipad_livingroom1",
    "ipad_office1",
    "ipad_office2",
    "iphone_balcony1",
    "iphone_bedroom1",
    "iphone_livingroom1",
]

# Build the processor over DATA_DIR, compute its statistics, then fetch the
# three splits it produces.
dataset_processor = DAPSDatasetProcessor(
    DATA_DIR,
    VALID_ACCESS_LABELS,
    allowed_directories,
)
dataset_processor.compute_statistics()
train_set, validate_set, test_set = dataset_processor.get_datasets()

## 2. Duration statistics of .wav files
We first check the duration statistics of the full clips.

In [None]:
# Wrap each split in SOAAudioClips so the full-length clips can be inspected.
# Each variable name matches the split it holds: `soa_val_full_clips` wraps
# `validate_set` and `soa_test_full_clips` wraps `test_set`. The preprocessing
# cell writes `soa_val_full_clips` to VAL_DIR and `soa_test_full_clips` to
# TEST_DIR, so swapping the names here would store each split in the other
# split's directory.
soa_train_full_clips = SOAAudioClips(train_set)
soa_val_full_clips = SOAAudioClips(validate_set)
soa_test_full_clips = SOAAudioClips(test_set)

# Report the duration statistics of the full clips, one split at a time.
print("\nDataset Statistics:")
print("Training set:")
print(duration_statistics(soa_train_full_clips.clips))

print("Validation set:")
print(duration_statistics(soa_val_full_clips.clips))

print("Test set:")
print(duration_statistics(soa_test_full_clips.clips))

## 3. Split into clips of a few seconds
Now we split the full clips into subclips a few seconds long. The subclips are filtered so that only those in which `webrtcvad` detects speech in more than `0.5` of the recording are kept. We do this so that every input to the CNN has the same size.

After processing, we save the log-mel grayscale spectrograms into a separate directory for each dataset.

In [None]:
audio_processor = AudioProcessor()

# One entry per split: (header printed first, clips to process, output directory).
preprocessing_jobs = (
    ("Preprocessed Train Dataset:", soa_train_full_clips, TRAIN_DIR),
    ("\nPreprocessed Validation Dataset:", soa_val_full_clips, VAL_DIR),
    ("\nPreprocessed Test Dataset:", soa_test_full_clips, TEST_DIR),
)

# Process the splits in order and print the statistics returned for each one.
for header, full_clips, output_dir in preprocessing_jobs:
    print(header)
    duration_stats = audio_processor.process_audio_clips(full_clips, output_dir)
    print(duration_stats)

## 4. Mean and standard deviation of training dataset
It is important to normalize the input data for our neural networks. Normalization gives the inputs a better distribution, which should smooth the loss surface (so that it is easier to find the global minimum). We use only the training dataset to compute the mean and standard deviation, and save them to a JSON file next to the datasets' image directories.

The effect of normalization on an example CNN can be seen [here](https://wandb.ai/mytkom-warsaw-university-of-technology/iml/reports/Effect-of-normalization-input-for-TutorialCNN--VmlldzoxMDUxMTI1OQ?accessToken=s67utpfjryb4um1240bd56i51zo5oy2bj0gbaqqz79z3hnnabkub1rdhsamhwd2v).

In [None]:
# Destination of the scaling parameters, placed directly under DATASET_DIR.
scaling_params_path = f"{DATASET_DIR}/scaling_params.json"

# Compute the statistics from the images in TRAIN_DIR only, show them, then
# persist them.
mean, std = compute_mean_std_from_images(TRAIN_DIR)
print(f"Mean: {mean}, Standard deviation: {std}")
save_mean_std(mean, std, scaling_params_path)