This notebook creates datasets ready for training. It requires a .wav dataset, the .yaml file describing it, and a .csv file containing the annotations.

In [None]:
import datetime
import random

import numpy as np
import yaml
from tqdm import tqdm
import csv
from pathlib import Path

from src.utils.data_reading.features_extractor import STFTFeaturesExtractor, RelativeDWTFeaturesExtractor, WaveformDataFeaturesExtractor
from src.utils.data_reading.sound_file_manager import WavFilesManager

## Parameters

In [None]:
# User-editable settings of the notebook; every later cell reads these names.
dataset_root_path = "/PATH/TO/DATASET"  # directory of the downloaded dataset (the one containing the .yaml)
output_path = f"{dataset_root_path}/output/train/spectrograms"  # directory where the dataset is written, can be changed

# half-width of an output segment: each segment spans [center - delta_output, center + delta_output]
delta_output = datetime.timedelta(seconds=50)

# type of data that we want: keep exactly one of the three extractors below active
features_extractor = STFTFeaturesExtractor(None, vmin=-35, vmax=140)  # spectrograms generator
# features_extractor = RelativeDWTFeaturesExtractor(None)  # DWT features generator
# features_extractor = WaveformDataFeaturesExtractor(None, 1)  # waveforms generator

# 0 keeps each event centered in its segment; 1 lets it be shifted randomly by up to
# 0.95 * delta_output on either side of the center (i.e. almost up to the segment borders)
random_offset_multiplicator = 1

random.seed(0)  # fixed seed so that the random offsets are reproducible

## Initialization

In [None]:
# Read the description of the training dataset from the .yaml file.
yaml_path = f"{dataset_root_path}/datasets.yaml"
with open(yaml_path, "r") as f:  # parameters of the dataset
    # BaseLoader only builds plain YAML structures (every scalar is read as a string)
    params = yaml.load(f, Loader=yaml.BaseLoader)["train_dataset"]

# Path of the dataset root (where station directories are).
# It is built from the directory that holds the .yaml file: building it from yaml_path would
# yield ".../datasets.yaml/<root_dir>", i.e. a path going *through* a regular file.
# NOTE(review): assumes "root_dir" is expressed relative to the .yaml directory - confirm.
prefix = f'{dataset_root_path}/{params["root_dir"]}'
stations = list(params["stations"])  # list of station names (like ["ELAN", "MADE",...])

dataset_csv_path = f"{output_path}/dataset.csv"  # csv file that will contain information about the created dataset

# per-station lists of event dates, filled by the data loading cell
positives = {h: [] for h in stations}
negatives = {h: [] for h in stations}
# one WavFilesManager per station directory, used by the features extractor to read the .wav files
managers = {h: WavFilesManager(f"{prefix}/{h}") for h in stations}

# make sure both output directories exist before anything is written
for label_dir in ("positives", "negatives"):
    Path(f"{output_path}/{label_dir}").mkdir(parents=True, exist_ok=True)

csv_data = []  # rows of the output dataset.csv, accumulated by the dataset creation cells

## Data loading

In [None]:
# Read the annotations and dispatch them, per station, into positives and negatives.
timestamp_format = "%Y%m%d_%H%M%S"

# newline="" is the mode the csv module expects for the files it reads
with open(f"{prefix}/dataset.csv", newline="") as f:  # annotations list
    csv_reader = csv.reader(f, delimiter=",")
    next(csv_reader)  # skip first line which contains column names
    lines = [line for line in csv_reader if line]  # blank rows carry no annotation

for line in lines:
    station, label = line[0], line[1]
    date = datetime.datetime.strptime(line[2], timestamp_format)
    if label == "geophony":
        positives[station].append(date)
    elif label == "negative":
        # A negative is stored as the middle of its [start, end] interval.
        # The end used to be parsed from line[2] again, which made the middle always equal
        # to the start.
        # NOTE(review): assumes the end timestamp sits in the column right after the start
        # (line[3]) - confirm against the annotation file. When that column is missing or
        # empty, the start is used, which is what the previous code always did.
        end_field = line[3] if len(line) > 3 else ""
        date_end = datetime.datetime.strptime(end_field, timestamp_format) if end_field else date
        negatives[station].append(date + (date_end - date) / 2)

## Dataset creation

### Positives

In [None]:
# For each station: positions (in seconds, relative to the segment center) of the events
# visible in each segment, and the (start, end) bounds of each segment.
dates = {station: [] for station in stations}
segments_to_save = {station: [] for station in stations}

# Events in chronological order. The same ordering is used for the segments AND for the
# file names below, so that a file is always named after the event its segment was built for
# (naming from the unsorted list mismatches them when the annotations are not chronological).
sorted_positives = {station: sorted(events) for station, events in positives.items()}

# choose segments (with a random offset) for the positives
for station, d in sorted_positives.items():  # for each station
    for i, event in enumerate(d):  # for each positive event, chronologically
        # random shift of at most 0.95 * delta_output, so the event stays inside its segment
        offset = (random.random() - 0.5) * 1.9 * delta_output * random_offset_multiplicator
        center = event + offset
        segments_to_save[station].append((center - delta_output, center + delta_output))

        in_window = []
        # the event itself, then the events before it that also fall in the window
        j = i
        while j >= 0 and abs(d[j] - center) <= delta_output:
            in_window.append(f"{(d[j] - center).total_seconds():.1f}")
            j -= 1
        # then the events after it that also fall in the window
        j = i + 1
        while j < len(d) and abs(d[j] - center) <= delta_output:
            in_window.append(f"{(d[j] - center).total_seconds():.1f}")
            j += 1
        dates[station].append(in_window)

# compute and save the features of each segment
for station in tqdm(sorted_positives):
    segments = segments_to_save[station]
    if not segments:
        # nothing to save, and zip(*[]) cannot be unpacked into (starts, ends)
        continue
    features_extractor.manager = managers[station]
    starts, ends = zip(*segments)
    path_prefix = f'{output_path}/positives/{station}'

    if isinstance(features_extractor, RelativeDWTFeaturesExtractor):
        # make only 1 file for the station because data representation is small enough
        path = f'{path_prefix}.{features_extractor.EXTENSION}'
        features_extractor.save_features_batch_single_file(starts, ends, path)
        # row: file, index of the segment in the file, label, then the event positions
        csv_data.extend([path, str(i), "positive"] + dates[station][i] for i in range(len(starts)))
    else:
        # make 1 folder per station, 1 file per event (named after the event date)
        Path(path_prefix).mkdir(parents=True, exist_ok=True)
        paths = [
            f'{path_prefix}/{event.strftime("%Y%m%d_%H%M%S")}.{features_extractor.EXTENSION}'
            for event in sorted_positives[station]
        ]
        # row: file, label, then the event positions
        csv_data.extend([file_path, "positive"] + positions for file_path, positions in zip(paths, dates[station]))
        features_extractor.save_features_batch(starts, ends, paths)

### Negatives

In [None]:
# compute and save the features of the negative segments, centered on each negative date
for station in tqdm(negatives):
    centers = negatives[station]
    if not centers:
        # a station without negatives has nothing to save (and unpacking zip(*[]) would raise)
        continue
    features_extractor.manager = managers[station]
    starts = tuple(center - delta_output for center in centers)
    ends = tuple(center + delta_output for center in centers)
    path_prefix = f'{output_path}/negatives/{station}'

    if isinstance(features_extractor, RelativeDWTFeaturesExtractor):
        # make only 1 file for the station because data representation is small enough
        path = f'{path_prefix}.{features_extractor.EXTENSION}'
        features_extractor.save_features_batch_single_file(starts, ends, path)
        # row: file, index of the segment in the file, label
        csv_data.extend([path, str(i), "negative"] for i in range(len(starts)))
    else:
        # make 1 folder per station, 1 file per negative (named after its date)
        Path(path_prefix).mkdir(parents=True, exist_ok=True)
        paths = [
            f'{path_prefix}/{center.strftime("%Y%m%d_%H%M%S")}.{features_extractor.EXTENSION}'
            for center in centers
        ]
        # row: file, label
        csv_data.extend([file_path, "negative"] for file_path in paths)
        features_extractor.save_features_batch(starts, ends, paths)

In [None]:
# Write the description of the created dataset (one row per saved segment).
# newline="" lets the csv writer control line endings itself; without it, platforms that
# translate "\n" on write (Windows) end up with an empty line between rows.
with open(dataset_csv_path, "w", newline="") as f:
    csv.writer(f).writerows(csv_data)