In [None]:
import os
# Work from the ``source`` folder of the OSmOSEanalytics copy this notebook was
# opened from: the first component of the current working directory whose name
# contains 'OSmOSEanalytics' is looked up under /home/datawork-osmose.
_analytics_dirs = [part for part in os.getcwd().split('/') if 'OSmOSEanalytics' in part]
if not _analytics_dirs:
    # Without this guard the lookup fails with an unexplained
    # "IndexError: list index out of range".
    raise RuntimeError(
        "The current working directory does not contain an 'OSmOSEanalytics' folder: "
        f"{os.getcwd()}"
    )
os.chdir(os.path.join('/home/datawork-osmose', _analytics_dirs[0], 'source'))
from OSmOSE import Spectrogram, Job_builder
from time import sleep
from pathlib import Path

# Root of the OSmOSE workspace; the datasets are stored in its ``dataset/`` sub-folder.
path_osmose_home = "/home/datawork-osmose/"
path_osmose_dataset = path_osmose_home + "dataset/"

## <span style="color:red">*FILL & RUN CELLS*</span> Dataset preparation

- ``dataset_ID`` is the name of the dataset to be processed;
- ``sr_analysis`` is the sampling frequency you want to use for your analysis, which can be different from the original one.

In [None]:
# Name of the dataset folder to process (to be filled in before running the cell).
dataset_ID = ''
# Sampling frequency used for the analysis.
sr_analysis = 240
# Forwarded to Spectrogram as its ``local`` argument.
local_execution = False

# The dataset folder is looked up under path_osmose_dataset.
dataset = Spectrogram(
    dataset_path=Path(path_osmose_dataset) / dataset_ID,
    sr_analysis=sr_analysis,
    owner_group="gosmose",
    local=local_execution,
)

# Print the string representation of the dataset object.
print(dataset)

### Dataset subset

Note that you can process only a subset of your entire dataset by creating the file `/home/datawork-osmose/dataset/dataset_ID/analysis/subset_files.csv`, which is a simple list of files to be processed, for example:

```
% head /home/datawork-osmose/dataset/fecampOWFSOMM/analysis/subset_files.csv
channelA_2020_11_20_15_40_17.wav
channelA_2020_11_20_15_43_20.wav
channelA_2020_11_20_16_20_17.wav
channelA_2020_11_20_16_23_20.wav
channelA_2020_11_20_16_30_17.wav
channelA_2020_11_20_16_33_20.wav
channelA_2020_11_20_16_43_20.wav
channelA_2020_11_20_16_50_17.wav
channelA_2020_11_20_16_53_20.wav
channelA_2020_11_20_17_10_17.wav
```

## <span style="color:red">*FILL & RUN CELLS*</span> Configure spectrogram parameters

### Main parameters 

Start by setting the value of `dataset.spectro_duration` in seconds. It corresponds to the maximal duration of the spectrogram display window. If it differs from the original file duration, you have to reshape the audio files to fit this time window.

Then, you can set the value of `dataset.zoom_levels`, which is the number of zoom levels you want (they are used in our web-based annotation tool APLOSE). With `dataset.zoom_levels = 1`, your shortest spectrogram display window has a duration of `dataset.spectro_duration` seconds (that is, no zoom at all); with `dataset.zoom_levels = 2`, a duration of `dataset.spectro_duration`/2 seconds; with `dataset.zoom_levels = 3`, a duration of `dataset.spectro_duration`/4 seconds; and so on.

After that, you can set the following classical spectrogram parameters: `nfft` (in samples), `window_size` (in samples), `overlap` (in \%). **Note that with those parameters you set the resolution of your spectrogram display window with the smallest duration, obtained with the highest zoom level.**

In [None]:
# Maximal duration of one spectrogram display window, in seconds (per the notes
# above this cell); audio files of a different duration have to be reshaped to fit it.
dataset.spectro_duration = 900 # this default value corresponds to your audio file duration

# Number of zoom levels (used by the APLOSE annotation tool). Per the notes above,
# each additional level halves the duration of the shortest display window.
dataset.zoom_levels = 2

# Classical spectrogram parameters. Per the notes above, they set the resolution of
# the shortest display window, i.e. the one obtained at the highest zoom level.
dataset.nfft = 512 # samples
dataset.window_size = 512 # samples
dataset.overlap = 97 # %

### Amplitude normalization 

Finally, we also offer several modes of data/spectrogram normalization.

Normalization over raw data samples with the variable `data_normalization` (default value `''`, i.e. no normalization) :
- instrument-based normalization (`data_normalization = 'instrument'`) with the three parameters `sensitivity` (in dB, default value = 0), `gain_dB` (in dB, default value = 0) and `peak_voltage` (in V, default value = 1). Using default values, no normalization will be performed ;

- z-score normalization over a given time period through the variable `zscore_duration`, applied directly on your raw timeseries. The possible values are:
    - `zscore_duration = 'original'` : the audio file duration will be used as time period ;
    - `zscore_duration = '10H'` : any time period given as a string using a classical [time alias](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases). This period should be longer than your file duration. 

Normalization over spectra with the variable `spectro_normalization` (default value `'density'`, see OSmOSEanalytics/documentation/theory_spectrogram.pdf for details) :
- density-based normalization by setting `spectro_normalization = 'density'`
- spectrum-based normalization by setting `spectro_normalization = 'spectrum'` 

In the cell below, you can also set the amplitude dynamics in dB through the parameters `dynamic_max` and `dynamic_min`, choose the colormap `spectro_colormap` to be used (see possible options in the [documentation](https://matplotlib.org/stable/tutorials/colors/colormaps.html)) and specify the cut-off frequency `HPfilter_freq_min` of a high-pass filter if needed.

In [None]:
# Normalization applied to the raw data samples.
dataset.data_normalization = 'instrument' # 'instrument' OR 'zscore'

# Time period over which the z-score normalization is computed.
dataset.zscore_duration = 'original' # parameter for 'zscore' mode, values = time alias OR 'original' 
# Instrument-based normalization values; the notes above give sensitivity and
# gain in dB and peak voltage in V.
dataset.sensitivity = -164 # parameter for 'instrument' mode
dataset.gain_dB = 14.7 # parameter for 'instrument' mode
dataset.peak_voltage = 2.5 # parameter for 'instrument' mode

# Normalization applied to the spectra.
dataset.spectro_normalization = 'density' # 'density' OR 'spectrum' 

# Presumably the amplitude dynamics (in dB) and the matplotlib colormap described
# in the notes above - TODO confirm the mapping to these attribute names.
dataset.dynamic_max = 150
dataset.dynamic_min = 0
dataset.spectro_colormap = 'viridis'

# Presumably the cut frequency of the optional high-pass filter mentioned in the
# notes above - TODO confirm.
dataset.HPfilter_freq_min = 0

### Parameter adjustment 


In the cell below you can **check your spectrogram dimension w.r.t your screen resolution** (just run it). We calculate the number of time windows (or equivalently, the number of spectra) you have in your shortest spectrogram display window.

Be aware that this number should be as close as possible to your horizontal screen resolution (i.e. approximately 2000 pixels, as a classical screen resolution is 1920x1080 pixels, (horizontal pixels) x (vertical pixels)). This avoids numerical compression during image display on your screen, as well as needlessly over-resolved spectrograms obtained at a high computational cost. We warn you if you are above this value, but you can still compute higher-resolution spectrograms if you want.

In [None]:
# Check the spectrogram dimension against the screen resolution; per the notes above,
# a warning is given when the shortest display window holds more spectra than advised.
dataset.check_spectro_size()

## <span style="color:red">*FILL & RUN CELL*</span> Adjust spectrogram parameters and initialize

`dataset.number_adjustment_spectrograms` is the number of spectrogram examples used to adjust your parameters. If you are really not sure about your parameters, it is better to start with a small number, because each time you will have to wait for the generation of all your `dataset.number_adjustment_spectrograms` (x the different zoom levels) spectrograms before being able to re-generate spectrograms with another set of parameters.

Alternatively, you can use the `adjustFiles` to specify which files you want to use to adjust the spectrogram. If both are supplied, only `adjustFiles` will be used.

In [None]:
# Number of example spectrograms generated while tuning the parameters.
dataset.number_adjustment_spectrograms = 1

# Reshaping of the audio files so that they fit dataset.spectro_duration. The
# methods listed for this setting are "classic" and "legacy"; "none" is the value
# used here (presumably no reshaping - confirm against OSmOSE).
reshape_method = "none"
# Leave to None unless the analysis sample rate must differ from sr_analysis.
samplerate = None
dataset.initialize(reshape_method=reshape_method, sr_analysis=samplerate)

# The list of wav files to process is written, one name per line, to a CSV file
# in the current working directory.
path_to_list = Path(f"{dataset.name}_wav_list.csv").resolve()

with path_to_list.open("w") as wav_list_file:
    wav_list_file.write("\n".join(dataset.list_wav_to_process))

## <span style="color:blue">*JUST RUN CELL*</span> Adjust spectrogram parameters

### Compute `dataset.number_adjustment_spectrograms` spectrograms to adjust parameters. 

In [None]:
# Job builder dedicated to the parameter-adjustment run.
adjust_jb = Job_builder()

# Build the argument string one option at a time. A backslash continuation inside
# the string literal would embed the source indentation (a long run of spaces) in
# the arguments handed to the job script.
adjust_args = " ".join([
    f"--input-file-list {path_to_list}",
    f"--nb-files {dataset.number_adjustment_spectrograms}",
    f"--dataset-path {dataset.path}",
    f"--analysis-fs {sr_analysis}",
])

# NOTE(review): the full generation cell launches "qsub_spectrogram_generator_pkg.py"
# while this one launches "qsub_spectrogram_Generator.py"; confirm that both scripts
# exist under these exact (case-sensitive) names.
jobfile = adjust_jb.build_job_file(
    script_path=Path("qsub_spectrogram_Generator.py"),
    script_args=adjust_args,
    jobname="OSmOSE_AdjustSpectro",
    preset="low",
)

job_id = adjust_jb.submit_job()  # submit all built job files

### Visualize the `dataset.number_adjustment_spectrograms` spectrograms to adjust parameters. 

Re-run this cell several times to update the folder of images, because they keep being generated while you visualize them. If this set of parameters does not suit you, change them and re-run new spectrograms with the previous cells, as many times as you want.

In [None]:
# Imported here so the cell is self-contained: nothing else in the visible cells
# brings ``Image`` into scope.
from IPython.display import Image, display

# Block until the adjustment jobs are no longer reported as ongoing.
while adjust_jb.ongoing_jobs:
    for i in range(5):
        print(f"Parameter adjustment is still running{'.' * i}")
        sleep(1)

# os.listdir returns a list of file names, so each image has to be displayed on its
# own; passing the whole list to joinpath raises a TypeError.
spectro_dir = Path(dataset.path_output_spectrogram)
image_suffixes = {".png", ".jpg", ".jpeg", ".gif"}
spectro = sorted(
    file_name
    for file_name in os.listdir(spectro_dir)
    if Path(file_name).suffix.lower() in image_suffixes
)

if not spectro:
    print(f"No spectrogram image found in {spectro_dir}")

for image_name in spectro:
    display(Image(filename=str(spectro_dir / image_name)))

# DONE

## <span style="color:blue">*JUST RUN CELL*</span>  Prepare spectrogram generation

Just one thing : if you create your spectrograms for an APLOSE campaign, set `write_datasets_csv_for_APLOSE=True` below !

In [None]:
# Set to True when the spectrograms are created for an APLOSE campaign.
# TODO: none of the cells visible here reads this flag yet.
write_datasets_csv_for_APLOSE = False

## <span style="color:blue">*JUST RUN CELL*</span> Launch spectrogram generation

You might want to increase `dataset.batch_number` if the files need to be split up into more than 10 groups, or decrease it if 10 groups are too many.

In [None]:
# Maximum number of jobs the files are split across.
dataset.batch_number = 10

file_count = len(dataset.list_wav_to_process)
# Never create more batches than there are files: with fewer files than
# dataset.batch_number the integer division below would be 0 and empty jobs
# (--ind-min 0 --ind-max 0) would be submitted.
batch_count = max(1, min(dataset.batch_number, file_count))
# Nominal number of files per batch (some batches may receive one more file).
batch_size = file_count // batch_count

# Wait for the jobs tracked by the dataset's own job builder before launching new ones.
while dataset.Jb.ongoing_jobs:
    print(f"\rPlease wait for the following jobs to finish: {','.join(dataset.Jb.ongoing_jobs)}", end="")
    sleep(2)

print("\rAll previous jobs are completed, ready to launch spectrograms")

jb = Job_builder()
for batch in range(batch_count):
    # Contiguous slices [i_min, i_max) that cover every file exactly once and spread
    # the remainder of the division over the batches instead of giving it all to
    # the last one.
    i_min = batch * file_count // batch_count
    i_max = (batch + 1) * file_count // batch_count

    # Build the argument string one option at a time. A backslash continuation
    # inside the string literal would embed the source indentation in the arguments.
    spectro_args = " ".join([
        f"--input-file-list {path_to_list}",
        f"--dataset-path {dataset.path}",
        f"--analysis-fs {sr_analysis}",
        f"--ind-min {i_min}",
        f"--ind-max {i_max}",
    ])

    jobfile = jb.build_job_file(
        script_path=Path("qsub_spectrogram_generator_pkg.py"),
        script_args=spectro_args,
        jobname="OSmOSE_SpectroGenerator",
        preset="medium",
    )

job_id_list = jb.submit_job()  # submit all built job files

## Track progress

In [None]:
# Poll the job builder once per second and redraw a single status line.
job_total = len(job_id_list)
line_width = 0
while jb.ongoing_jobs:
    for i in range(5):
        status = f"Ongoing jobs: {len(jb.ongoing_jobs)}/{job_total}; Finished jobs: {len(jb.finished_jobs)}/{job_total}{'.' * i}"
        # Pad to the longest line printed so far: "\r" only moves the cursor back,
        # so a shorter update would otherwise leave stale characters behind.
        line_width = max(line_width, len(status))
        print(f"\r{status.ljust(line_width)}", end="")
        sleep(1)

# Print the final counts and terminate the status line so that later output
# starts on a fresh line.
status = f"Ongoing jobs: {len(jb.ongoing_jobs)}/{job_total}; Finished jobs: {len(jb.finished_jobs)}/{job_total}"
print(f"\r{status.ljust(line_width)}")
