In [13]:
import datetime
import random

import numpy as np
import yaml
from matplotlib import pyplot as plt
from tqdm import tqdm
import os
import csv

from utils.data_reading.features_extractor import STFTFeaturesExtractor, RelativeDWTFeaturesExtractor, WaveformDataFeaturesExtractor
from utils.data_reading.sound_file_manager import WavFilesManager
from utils.misc.misc import make_directory

## Parameters

In [14]:
# Path of the YAML file describing the datasets; the entry named by `dataset` is read from it.
yaml_path = "/media/plerolland/LaBoite/PublicData/dataset.yaml"
# Key of the dataset entry to use in the YAML file.
# NOTE: the assignments below overwrite each other, only the LAST one is effective
# (reorder the lines to select another dataset).
dataset = "test_dataset_HYD"
dataset = "test_dataset_OHA"

# duration taken before and after the event for the output
# (each saved segment therefore lasts 2 * delta_output)
delta_output = datetime.timedelta(seconds=50)

# type of data that we want
# NOTE: as above, only the LAST assignment is effective; the extractors built on the
# previous lines are discarded.
# The first argument is None here; a manager is attached to the extractor for each
# station later in the notebook (presumably this argument is the manager - TODO confirm).
features_extractor = RelativeDWTFeaturesExtractor(None)  # DWT features generator
features_extractor = STFTFeaturesExtractor(None, vmin=-35, vmax=140)  # spectrograms generator
features_extractor = WaveformDataFeaturesExtractor(None, 1)  # currently selected extractor

# Output directory template; {dataset_name} is replaced by the "name" field of the dataset.
output_path = '/media/plerolland/LaBoite/PublicData/test/{dataset_name}/waveforms'

## Initialization

In [15]:
# Read the description of the selected dataset.
# BaseLoader keeps every scalar as a plain string (dates are parsed explicitly later on).
with open(yaml_path, "r") as yaml_file:
    params = yaml.load(yaml_file, Loader=yaml.BaseLoader)[dataset]

# Location of the raw data and names of the stations of this dataset.
prefix = params["root_dir"]
stations = list(params["stations"].keys())

# Resolve the output directory for this dataset and the path of the CSV index written at the end.
output_path = output_path.format(dataset_name=params["name"])
dataset_csv_path = f"{output_path}/dataset.csv"

# One sound file manager per station, each rooted at <root_dir>/<station>.
managers = {}
for h in stations:
    managers[h] = WavFilesManager(f"{prefix}/{h}")

# Rows of the output CSV index, filled while the features are saved.
csv_data = []

# Output tree: one sub-directory per class.
for directory in (output_path, f"{output_path}/positives", f"{output_path}/negatives"):
    make_directory(directory)

## Data loading

In [16]:
# Load the annotated events of the raw dataset.
# Columns used in this notebook: 0 = station, 1 = event type, 2 = event date (YYYYmmdd_HHMMSS).
# newline="" is the way the csv module expects files to be opened (keeps quoted newlines intact).
with open(f"{prefix}/dataset.csv", newline="") as f:
    csv_reader = csv.reader(f, delimiter=",")
    next(csv_reader, None)  # skip first line which contains column names (no error on an empty file)
    # Keep only geophony events. Rows with fewer than 3 columns (e.g. blank lines, which
    # csv.reader yields as empty lists) cannot hold an event and are ignored instead of
    # raising an IndexError.
    lines = [row for row in csv_reader if len(row) >= 3 and row[1] == "geophony"]

# Parse the event dates in place so that they can be compared with segment bounds.
for line in lines:
    line[2] = datetime.datetime.strptime(line[2], "%Y%m%d_%H%M%S")

## Dataset creation

In [17]:
# Cut the recording period of each station into contiguous segments of duration
# 2 * delta_output and list, for each segment, the events it contains.
#   segments_to_save[station][k] : (start, end) datetimes of the k-th segment
#   dates[station][k]            : offsets (seconds, formatted with 1 decimal) of the events of
#                                  the k-th segment relative to the segment center; empty list
#                                  when the segment contains no event
dates = {station: [] for station in stations}
segments_to_save = {station: [] for station in stations}

for station in stations:
    # events of this station, in chronological order
    d = sorted(row[2] for row in lines if row[0] == station)
    start = datetime.datetime.strptime(params["stations"][station]["date_start"], "%Y%m%d_%H%M%S")
    end = datetime.datetime.strptime(params["stations"][station]["date_end"], "%Y%m%d_%H%M%S")
    j = 0  # index of the next event that has not been assigned to a segment yet
    while start+2*delta_output <= end:
        segment_center = start + delta_output
        segment_end = start + 2*delta_output
        segments_to_save[station].append((start, segment_end))
        dates[station].append([])

        # Events located before the current segment belong to no segment (this can only
        # happen for events preceding the first segment); skip them so that they do not
        # prevent the following events from being assigned.
        while j<len(d) and d[j] < start:
            j += 1

        # Assign every event lying inside [start, segment_end]. The test is made against the
        # segment itself (i.e. |event - center| <= delta_output), consistently with the offset
        # below, which is measured from the segment center.
        while j<len(d) and d[j] <= segment_end:
            event_pos = d[j] - segment_center
            dates[station][-1].append(f"{event_pos.total_seconds():.1f}")
            j += 1

        start = segment_end

# compute and save the features of every segment, and register the saved items in csv_data
for station in tqdm(stations):
    features_extractor.manager = managers[station]
    segments_to_save[station] = np.array(segments_to_save[station])

    # indices of the segments containing at least one event (positives) / no event (negatives)
    pos_idx = [i for i in range(len(dates[station])) if len(dates[station][i]) > 0]
    neg_idx = [i for i in range(len(dates[station])) if len(dates[station][i]) == 0]
    pos_path_prefix = f'{output_path}/positives/{station}'
    neg_path_prefix = f'{output_path}/negatives/{station}'

    # Positives are processed first, then negatives, with exactly the same logic.
    for label, idx, path_prefix in (("positive", pos_idx, pos_path_prefix),
                                    ("negative", neg_idx, neg_path_prefix)):
        if not idx:
            # no segment of this class for this station: nothing to save
            # (unpacking an empty zip below would raise a ValueError)
            continue
        starts, ends = zip(*segments_to_save[station][idx])

        if isinstance(features_extractor, RelativeDWTFeaturesExtractor):
            # make only 1 file for the station because data representation is small enough
            path = f'{path_prefix}.{features_extractor.EXTENSION}'
            features_extractor.save_features_batch_single_file(starts, ends, path)
            # One row per segment of this class (negative rows are built from neg_idx, not
            # pos_idx). dates[station][i] is empty for negatives, so nothing is appended.
            # NOTE(review): i is the index of the segment among ALL segments of the station,
            # not its position inside the saved file - confirm this is what consumers expect.
            csv_data.extend([[path, str(i), label] + dates[station][i] for i in idx])
        else:
            # make 1 folder per station, with 1 file per segment named after the segment center
            make_directory(path_prefix)
            d = [s + delta_output for s in starts]  # middle of each segment
            paths = [f'{path_prefix}/{c.strftime("%Y%m%d_%H%M%S")}.{features_extractor.EXTENSION}' for c in d]
            features_extractor.save_features_batch(starts, ends, paths)
            csv_data.extend([[p, label] + dates[station][i] for p, i in zip(paths, idx)])

100%|██████████| 6/6 [00:04<00:00,  1.36it/s]


In [18]:
# Write the index of the generated dataset (one row per saved item).
# newline="" lets the csv module control line endings itself; without it, platforms that
# translate "\n" on write end up with an extra blank line between rows.
with open(dataset_csv_path, "w", newline="") as f:
    csv.writer(f).writerows(csv_data)