This notebook creates datasets that are ready for testing. It requires a .wav dataset, the .yaml file describing it, and a .csv file providing the annotations.

In [None]:
import datetime

import numpy as np
import yaml
from tqdm import tqdm
import csv
from pathlib import Path

from src.utils.data_reading.features_extractor import STFTFeaturesExtractor, RelativeDWTFeaturesExtractor, WaveformDataFeaturesExtractor
from src.utils.data_reading.sound_file_manager import WavFilesManager

## Parameters

In [None]:
# Root folder of the downloaded dataset (the folder that holds datasets.yaml).
dataset_root_path = "PATH/TO/DATA"
# Name of the test set to build; it must be one of the top-level keys of the yaml file.
dataset = "test_dataset_HYDROMOMAR"
# Folder in which the generated dataset is written (can be changed).
output_path = "/".join((dataset_root_path, "output", dataset, "waveforms"))

# Half duration of each window: every generated window spans 2 * delta_output.
delta_output = datetime.timedelta(seconds=50)

# Kind of data to generate: keep exactly one of the following lines active.
#features_extractor = STFTFeaturesExtractor(None, vmin=-35, vmax=140)   # spectrograms generator
#features_extractor = RelativeDWTFeaturesExtractor(None)  # DWT features generator
features_extractor = WaveformDataFeaturesExtractor(None, 1)  # waveforms generator

## Initialization

In [None]:
# Read the description of the selected dataset from the yaml file.
# BaseLoader keeps every scalar as a plain string (dates are parsed later on).
yaml_path = f"{dataset_root_path}/datasets.yaml"
with open(yaml_path, "r") as yaml_file:
    all_datasets = yaml.load(yaml_file, Loader=yaml.BaseLoader)
params = all_datasets[dataset]

# Root of the .wav dataset: the folder holding one directory per station.
prefix = f'{dataset_root_path}/{params["root_dir"]}'
# Station names (like ["ELAN", "MADE", ...]), in the order of the yaml file.
stations = list(params["stations"])

# csv describing the .wav dataset, and csv that will describe the created test dataset.
dataset_csv_path = f"{prefix}/dataset.csv"
test_dataset_csv_path = f"{output_path}/dataset.csv"

# One WavFilesManager per station; it takes care of reading the .wav files.
managers = {}
for station_name in stations:
    managers[station_name] = WavFilesManager(f"{prefix}/{station_name}")

# Make sure both output sub-folders exist before anything is written.
for subset in ("positives", "negatives"):
    Path(f"{output_path}/{subset}").mkdir(parents=True, exist_ok=True)

csv_data = []

## Data loading

In [None]:
# Load the annotation csv of the .wav dataset.
# newline="" is the mode the csv module expects, so that it handles line
# endings (including those embedded in quoted fields) itself.
with open(dataset_csv_path, newline="") as f:
    csv_reader = csv.reader(f, delimiter=",")
    next(csv_reader, None)  # skip the header row; the default avoids StopIteration on an empty file
    # csv.reader returns a blank line as an empty row, which would break the
    # column accesses below, so such rows are dropped here.
    lines = [row for row in csv_reader if row]

# Keep only the rows whose second column is one of the accepted labels.
lines = [row for row in lines if row[1] in ("uncertain", "T", "H")]

# The third column holds the event date; parse it once so that it can be
# sorted and subtracted later on.
for line in lines:
    line[2] = datetime.datetime.strptime(line[2], "%Y%m%d_%H%M%S")

## Dataset creation

In [None]:
# positives[station][k]: flat list [offset_1, value_1, offset_2, value_2, ...]
# describing the events that fall in the k-th window of the station
# (an empty list means the window contains no event).
positives = {station: [] for station in stations}
# segments_to_save[station][k]: (start, end) of the k-th window of the station.
segments_to_save = {station: [] for station in stations}
csv_data = []

window_duration = 2 * delta_output

# browse the segments to save
for station in stations:
    # Sort the event dates together with the last csv column so that each date
    # stays paired with its own value (sorting the dates alone would shift the
    # values whenever the csv is not already in chronological order).
    events = sorted(
        ((l[2], l[-1]) for l in lines if l[0] == station),
        key=lambda event: event[0],
    )

    station_params = params["stations"][station]
    initial_start = datetime.datetime.strptime(station_params["date_start"], "%Y%m%d_%H%M%S")
    end = datetime.datetime.strptime(station_params["date_end"], "%Y%m%d_%H%M%S")

    j = 0  # index of the next event that has not been assigned to a window
    # Events dated before the first window belong to no window: skip them
    # instead of attaching them to the first window with an out-of-range offset.
    while j < len(events) and events[j][0] < initial_start:
        j += 1

    i = 0  # index of the current window
    start = initial_start  # start of the current window
    while start + window_duration <= end:
        segments_to_save[station].append((start, start + window_duration))
        middle = start + delta_output
        window_events = []

        # Consume every remaining event located at or before the end of the window.
        while j < len(events) and events[j][0] - middle <= delta_output:
            event_date, event_value = events[j]
            event_pos = event_date - middle  # offset relative to the window center
            window_events.append(f"{event_pos.total_seconds():.1f}")
            window_events.append(f"{event_value}")
            j += 1

        positives[station].append(window_events)
        i += 1
        # Recompute from the initial start so that windows stay contiguous.
        start = initial_start + i * window_duration

# Compute and save the features of every window, using the representation
# selected through features_extractor, and collect the rows of the output csv.
for station in tqdm(stations):
    features_extractor.manager = managers[station]
    segments_to_save[station] = np.array(segments_to_save[station])

    # With DWT features, only 1 file is made per station and per class because
    # the data representation is small enough; otherwise 1 folder is made per
    # station and per class, with 1 file per window.
    single_file = isinstance(features_extractor, RelativeDWTFeaturesExtractor)
    extension = features_extractor.EXTENSION

    for label, folder, with_events in (("positive", "positives", True), ("negative", "negatives", False)):
        # Indexes of the windows of this class (positive = at least one event).
        idx = [i for i, events in enumerate(positives[station]) if bool(events) == with_events]
        if not idx:
            # Nothing to save for this class; unpacking zip(*...) of an empty
            # selection into (starts, ends) would raise a ValueError.
            continue

        starts, ends = zip(*segments_to_save[station][idx])
        path_prefix = f'{output_path}/{folder}/{station}'

        if single_file:
            path = f'{path_prefix}.{extension}'
            features_extractor.save_features_batch_single_file(starts, ends, path)
            # Row layout: file path, rank of the window in the file, class, events.
            csv_data.extend(
                [path, str(rank), label] + positives[station][window]
                for rank, window in enumerate(idx)
            )
        else:
            Path(path_prefix).mkdir(parents=True, exist_ok=True)
            # Each file is named after the middle of its window.
            middles = [segment_start + delta_output for segment_start in starts]
            paths = [f'{path_prefix}/{middle.strftime("%Y%m%d_%H%M%S")}.{extension}' for middle in middles]
            features_extractor.save_features_batch(starts, ends, paths)
            # Row layout: file path, class, events.
            csv_data.extend(
                [file_path, label] + positives[station][window]
                for file_path, window in zip(paths, idx)
            )

In [None]:
# Write the csv describing the created test dataset.
# newline="" lets csv.writer emit its own line terminator untouched; without
# it, platforms that translate "\n" on write (e.g. Windows) end up with an
# extra blank line after every row.
with open(test_dataset_csv_path, "w", newline="") as f:
    csv.writer(f).writerows(csv_data)