In [None]:
# JUST RUN THIS CELL : NOTHING TO FILL !

import os, sys
from pathlib import Path
import pandas as pd
import subprocess
from OSmOSE import Spectrogram, Job_builder
from OSmOSE.cluster import reshape
from OSmOSE.utils import *
from time import sleep
from IPython.display import Image
import pickle
import numpy as np
import glob
from tqdm import tqdm
import random
import shutil

# Root of the OSmOSE workspace, and the dataset directory nested inside it.
# Both stay plain strings with a trailing slash because later cells build
# paths from them by string concatenation.
path_osmose_home = "/home/datawork-osmose/"
path_osmose_dataset = path_osmose_home + "dataset/"

# Job builder reused further down to create and submit the cluster job files.
jb = Job_builder()

# Presumably reports storage usage for the workspace root (helper comes from
# the OSmOSE.utils wildcard import -- verify).
display_folder_storage_info(path_osmose_home)

In [None]:
# Query the dataset root for the "DELGOST" campaign; presumably lists the
# datasets available there so one can be picked in the selection cell below
# (helper comes from the OSmOSE.utils wildcard import -- verify).
list_dataset(path_osmose=path_osmose_dataset, campaign="DELGOST")

### Summary

**I. Select dataset** : choose your dataset to be processed and get key metadata on it

**II. Configure spectrograms** : define all spectrogram parameters, and adjust them based on spectrograms computed on the fly

**III. Generate spectrograms** : launch the complete generation of spectrograms

# I. Select dataset 

If your dataset is part of a recording campaign, please provide its name with `campaign_name`; in that case your dataset should be located in `/home/datawork-osmose/dataset/{campaign_name}/{dataset_name}`. Otherwise, leave the default value `campaign_name = ""`.

In [None]:
# FILL RED PARTS !

# Name of the dataset folder and of the recording campaign that contains it
# (use campaign_name = "" when the dataset does not belong to a campaign).
dataset_name = "DELGOST_090623_ST"
campaign_name = "DELGOST"

# Build the Spectrogram object on <path_osmose_dataset>/<campaign_name>/<dataset_name>.
dataset = Spectrogram(
    dataset_path=Path(path_osmose_dataset, campaign_name, dataset_name),
    owner_group="gosmose",
    local=False,
)

# Print the dataset summary; per the notebook text it includes the original
# audio file duration and sample rate.
print(dataset)

# II. Configure spectrograms

The two following parameters `spectro_duration` (in s) and `dataset_sr` (in Hz) will allow you to process your data using different file durations (ie segmentation) and/or sampling rate (ie resampling) parameters. `spectro_duration` is the maximal duration of the spectrogram display window.

To process audio files from your original folder (ie without any segmentation and/or resampling operations), use the original audio file duration and sample rate parameters estimated at your dataset uploading (they are printed in the previous cell). 

In [None]:
# FILL GREEN PARTS !
# Segment duration and sampling rate used for processing. Set both to the
# original values (printed with the dataset summary) to work on the original
# audio files without segmentation or resampling.
dataset.spectro_duration = 10  # seconds
dataset.dataset_sr = 120000  # Hz

Then, you can set the value of `zoom_level`, which is the number of zoom levels you want (they are used in our web-based annotation tool APLOSE). With `zoom_level = 0`, your shortest spectrogram display window has a duration of `spectro_duration` seconds (that is, no zoom at all); with `zoom_level = 1`, a duration of `spectro_duration`/2 seconds; with `zoom_level = 2`, a duration of `spectro_duration`/4 seconds, and so on.

In [None]:
# FILL GREEN PARTS !
# Number of zoom levels: 0 means no zoom (the shortest display window lasts
# spectro_duration seconds); each additional level halves that duration.
dataset.zoom_level = 0  # int

After that, you can set the following classical spectrogram parameters: `nfft` (in samples), `window_size` (in samples) and `overlap` (in \%). **Note that these parameters set the resolution of the spectrogram display window with the smallest duration, i.e. the one obtained at the highest zoom level.**

In [None]:
# FILL GREEN PARTS !
# Classical spectrogram parameters; they set the resolution of the shortest
# display window (the one obtained at the highest zoom level).
dataset.nfft = 1024  # samples
dataset.window_size = 1024  # samples
dataset.overlap = 20  # %

In case of audio segmentation, you can use the following variable `audio_file_overlap` (in seconds, default value = 0) to set an overlap in seconds between two consecutive segments.

In [None]:
# FILL GREEN PARTS !
# Overlap between two consecutive audio segments; only relevant when the
# audio is segmented. 0 means no overlap.
dataset.audio_file_overlap = 0  # seconds

#### Amplitude normalization 

Finally, we also offer several modes of data/spectrogram normalization.

Normalization over raw data samples with the variable `data_normalization` (default value `'none'`, i.e. no normalization) :
- instrument-based normalization (`data_normalization = 'instrument'`) with the three parameters `sensitivity` (in dB, default value = 0), `gain_dB` (in dB, default value = 0) and `peak_voltage` (in V, default value = 1). Using default values, no normalization will be performed ;

- z-score normalization (`data_normalization = 'zscore'`) over a given time period set through the variable `zscore_duration`, applied directly on your raw time series. The possible values are:
    - `zscore_duration = 'original'` : the audio file duration will be used as time period ;
    - `zscore_duration = '10H'` : any time period given as a string using a classical [time alias](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases). This period should be longer than your file duration.

Normalization over spectra with the variable `spectro_normalization` (default value `'density'`, see OSmOSEanalytics/documentation/theory_spectrogram.pdf for details) :
- density-based normalization by setting `spectro_normalization = 'density'`
- spectrum-based normalization by setting `spectro_normalization = 'spectrum'` 

In the cell below, you can also set the amplitude dynamics in dB through the parameters `dynamic_max` and `dynamic_min`, choose the colormap to be used with `colormap` (see possible options in the [documentation](https://matplotlib.org/stable/tutorials/colors/colormaps.html)) and specify the cut-off frequency `hp_filter_min_freq` of a high-pass filter if needed.

In [None]:
# FILL GREEN and RED PARTS !
# Normalization applied to the raw audio samples.
dataset.data_normalization = "instrument"  # 'instrument' OR 'zscore' OR 'none'

# Time period used by the 'zscore' mode; 'original' means the audio file duration.
dataset.zscore_duration = (
    "original"  # parameter for 'zscore' mode, values = time alias OR 'original'
)
# Recorder characteristics used by the 'instrument' mode
# (per the notebook text: sensitivity and gain in dB, peak voltage in V).
dataset.sensitivity = -177.2  # parameter for 'instrument' mode
dataset.gain_dB = 0  # parameter for 'instrument' mode
dataset.peak_voltage = 2  # parameter for 'instrument' mode

# Normalization applied to the spectra.
dataset.spectro_normalization = "density"  # 'density' OR 'spectrum'

# Amplitude dynamics and colormap used to render the spectrograms.
dataset.dynamic_max = 120  # dB
dataset.dynamic_min = 0  # dB
dataset.colormap = "viridis"

# Cut-off frequency of the high-pass filter.
dataset.hp_filter_min_freq = 1  # Hz

You can now check the size of your spectrogram resulting from those parameters

In [None]:
# Check the spectrogram size that results from the parameters set above.
dataset.check_spectro_size()

#### Adjust spectrogram parameters

In the cell below you can visualize some spectrograms computed on the fly

- `number_adjustment_spectrogram` is the number of spectrogram examples used to adjust your parameters

- You can use the variable `file_list` in the cell below to adjust your spectrogram parameters using specific files; be careful, these files must already be present in the `temp_adjustment_output_dir` folder created by a previous run with a random selection. Put their names in this list as follows, e.g. `file_list = ['2020_06_05T15_10_00.wav','2020_06_07T15_41_40.wav']`

In [None]:
# FILL GREEN PARTS !
# Number of example spectrograms computed on the fly to tune the parameters.
number_adjustment_spectrogram = 2
# Optional file names (located in the temporary adjustment folder) to use
# instead of a random selection; leave empty to pick files at random.
file_list = []

In [None]:
# JUST RUN THIS CELL : NOTHING TO FILL !

# Original duration / sample rate, read from the metadata of the original audio folder.
orig_metadata = pd.read_csv(
    dataset._get_original_after_build().joinpath("metadata.csv"), header=0
)
orig_dura = orig_metadata["audio_file_origin_duration"][0]
orig_sr = orig_metadata["origin_sr"][0]

# Paths are kept as plain strings: they are concatenated and globbed below.
audio_root = path_osmose_dataset + f"{campaign_name}/{dataset_name}/data/audio/"
origin_files_pattern = audio_root + f"{orig_dura}_{orig_sr}/*wav"
origin_files = glob.glob(origin_files_pattern)
temp_adjustment_output_dir = (
    audio_root + f"temp_{dataset.spectro_duration}_{dataset.dataset_sr}"
)

# z-score normalized data is always rendered with 'spectrum' normalization.
if (
    dataset.data_normalization == "zscore"
    and dataset.spectro_normalization != "spectrum"
):
    dataset.spectro_normalization = "spectrum"
    print(
        "WARNING: the spectrogram normalization has been changed to spectrum because the data will be normalized using zscore."
    )

if len(file_list) > 0:
    # User-selected files: they must already exist in the temporary adjustment folder.
    files_adjust = [temp_adjustment_output_dir + "/" + ff for ff in file_list]

else:
    # Fail early with an explicit message instead of silently producing no spectrogram.
    if not origin_files:
        raise FileNotFoundError(
            f"No original audio file found with pattern '{origin_files_pattern}'."
        )

    # random.sample already returns at most `number_adjustment_spectrogram` files.
    sampled_files = random.sample(
        origin_files, min(number_adjustment_spectrogram, len(origin_files))
    )

    if dataset.spectro_duration == orig_dura and dataset.dataset_sr == orig_sr:
        # No segmentation / resampling requested: use the original files directly.
        files_adjust = sampled_files

    else:
        # Start from a clean temporary folder so that stale segments are never picked up.
        if os.path.exists(temp_adjustment_output_dir):
            shutil.rmtree(temp_adjustment_output_dir)

        # Segment / resample the sampled files into the temporary folder.
        reshape(
            input_files=sampled_files,
            chunk_size=dataset.spectro_duration,
            new_sr=dataset.dataset_sr,
            output_dir_path=temp_adjustment_output_dir,
            offset_beginning=0,
            offset_end=0,
            last_file_behavior="pad",
        )

        # One original file may yield several segments: keep only the requested number.
        files_adjust = glob.glob(temp_adjustment_output_dir + "/*wav")[
            :number_adjustment_spectrogram
        ]

# Compute the example spectrograms in adjustment mode.
for audio_file in files_adjust:
    dataset.process_file(audio_file, adjust=True)

dataset.save_spectro_metadata(True)

# III. Generate spectrograms

- `dataset.batch_number` indicates the number of concurrent jobs. A higher number can speed things up until a certain point. It still does not work very well.

- If you create your spectrograms for an APLOSE campaign, set `write_datasets_csv_for_APLOSE = True` below!

- The variable below `save_matrix` should be set to True if you want to generate the numpy matrices along your png spectrograms

In [None]:
# FILL GREEN PARTS !
# Number of concurrent jobs the spectrogram generation is split into.
dataset.batch_number = 5

# Set to True when the spectrograms are generated for an APLOSE campaign:
# an entry for this dataset is then added to the datasets csv file.
write_datasets_csv_for_APLOSE = False

# Set to True to also generate the numpy matrices along the png spectrograms.
save_matrix = False

#### Segmentation


In [None]:
# Initialize the dataset for the chosen parameters (segmentation step).
# The environment root is derived from the interpreter location
# (<env>/bin/<interpreter>) with pathlib, so that it is also correct when the
# interpreter is named python3 or python3.x rather than python.
dataset.initialize(
    env_name=str(Path(sys.executable).parent.parent),
    force_init=True,
    last_file_behavior="discard",
)

#### Spectrogram generation

In [None]:
# JUST RUN THIS CELL : NOTHING TO FILL !

# Environment root handed to the job builder, derived from the interpreter
# location (<env>/bin/<interpreter>); unlike a textual replace of "/bin/python"
# this stays correct when the interpreter is named python3 or python3.x.
env_root = str(Path(sys.executable).parent.parent)

# add entry to dataset csv file
if write_datasets_csv_for_APLOSE is True:

    dataset_info = {
        "project": campaign_name,
        "dataset": dataset.name,
        "spectro_duration": f"{dataset.spectro_duration}",
        "dataset_sr": f"{dataset.dataset_sr}",
        "files_type": ".wav",
        "identifier": campaign_name
        + "_"
        + dataset.name
        + "_"
        + str(dataset.spectro_duration)
        + "_"
        + str(dataset.dataset_sr),
    }
    dataset_info = pd.DataFrame(dataset_info, index=[0])

    # NOTE(review): the entry is written to "datasets_copy.csv" -- confirm this is the intended file.
    add_entry_for_APLOSE(
        path=path_osmose_dataset, file="datasets_copy.csv", info=dataset_info
    )

# compute expected_nber_segmented_files
# The metadata of the original audio folder is read once and reused for both branches.
original_dir = str(dataset._get_original_after_build())
original_metadata = pd.read_csv(original_dir + "/metadata.csv", header=0)

if dataset.spectro_duration != original_metadata["audio_file_origin_duration"][0]:
    # Segmentation: estimate the number of segments from the duration of each original file.
    origin_file_metadata = pd.read_csv(original_dir + "/file_metadata.csv")
    segment_step = dataset.spectro_duration - dataset.audio_file_overlap
    nber_files_to_process = int(
        sum(dd / segment_step for dd in origin_file_metadata["duration"].values)
    )
else:
    # No segmentation: one file to process per original audio file.
    nber_files_to_process = original_metadata["audio_file_count"][0]

batch_size = nber_files_to_process // dataset.batch_number

dataset.save_spectro_metadata(False)

# Build one job file per batch of files.
for batch in range(dataset.batch_number):
    i_min = batch * batch_size
    i_max = (
        i_min + batch_size
        if batch < dataset.batch_number - 1
        else nber_files_to_process
    )  # If it is the last batch, take all files

    jb.build_job_file(
        script_path=Path(
            os.path.abspath("../src"), "qsub_spectrogram_generator_pkg.py"
        ),
        script_args=f"--dataset-path {dataset.path}\
                --dataset-sr {dataset.dataset_sr} \
                --batch-ind-min {i_min}\
                --batch-ind-max {i_max}\
                {'--save-matrix' if save_matrix else ''}",
        jobname="OSmOSE_SpectroGenerator",
        preset="low",
        env_name=env_root,
        mem="70G",
        walltime="10:00:00",
        logdir=dataset.path.joinpath("log"),
    )

# Keep as dependencies only the dataset jobs whose qstat error output does not
# contain "finished".
pending_jobs = [
    jobid
    for jobid in dataset.pending_jobs
    if b"finished" not in subprocess.run(["qstat", jobid], capture_output=True).stderr
]
job_id_list = jb.submit_job(dependency=pending_jobs)  # submit all built job files
nb_jobs = len(jb.finished_jobs) + len(job_id_list)

print(f"The job ids are {job_id_list}")

# Track progress

In [None]:
# JUST RUN THIS CELL : NOTHING TO FILL !

# Count the audio files and png spectrograms currently present on disk.
nber_audio_files = len(list(dataset.audio_path.glob("*wav")))
nber_spectros = len(list(dataset.path_output_spectrogram.glob("*png")))

# Audio preparation is complete once every expected file exists.
status = "DONE" if nber_audio_files == nber_files_to_process else "ONGOING"

print(
    "o Audio file preparation : " + status + " (",
    nber_audio_files,
    "/",
    str(nber_files_to_process),
    ")",
)

# Zoom level i contributes 2**i spectrograms per audio file.
nber_expected_spectros = nber_files_to_process * sum(
    2**level for level in range(dataset.zoom_level + 1)
)
status = "DONE" if nber_spectros == nber_expected_spectros else "ONGOING"

print(
    "o Spectrogram generation : " + status + " (",
    nber_spectros,
    "/",
    str(nber_expected_spectros),
    ")",
)