In [None]:
# Version string (presumably of this notebook template; it is not read by any visible cell).
version = "0.1.0"
import os
from pathlib import Path
import pandas as pd
# Switch the working directory to the OSmOSE source checkout: later cells build the
# job script path from os.getcwd().
# NOTE(review): this call runs before `from OSmOSE import ...`, presumably so that the
# package is importable from this directory — confirm before reordering these lines.
os.chdir(Path('/home/datawork-osmose/git_osmose_datarmor_2/source'))
import subprocess
from OSmOSE import Spectrogram, Job_builder, utils
from time import sleep
from IPython.display import Image

# Root folder that contains the datasets, and the home folder passed to the storage report.
path_osmose_dataset = "/home/datawork-osmose/dataset/"
path_osmose_home = "/home/datawork-osmose/"
# Environment name forwarded to every job built below (env_name=...).
env_name = "osmose"

# Job builder used by this notebook to build and submit the spectrogram jobs.
jb = Job_builder()

#### <span style="color:blue">*JUST RUN CELL*</span>

In [None]:
# Display the storage information of the OSmOSE home folder.
utils.display_folder_storage_infos(path_osmose_home)

## <span style="color:red">*FILL & RUN CELLS*</span> Dataset preparation

- ``dataset_name`` is the name of the dataset to be processed;
- ``dataset_sr`` is the sampling frequency you want to use for your analysis, which can be different from the original one.

In [None]:
# Name of the dataset folder, looked up under path_osmose_dataset.
dataset_name = 'MPSU_ForestouHuella'

# Set to True if you want to generate the numpy matrices.
save_matrix = False

# Change to True if you execute this notebook on your computer and not on datarmor.
local_execution = False

# strftime format, used to build the dataset from scratch
# (ignore if the dataset is already built).
date_template = ""

# Spectrogram object configured by all the following cells.
dataset = Spectrogram(
    dataset_path=Path(path_osmose_dataset, dataset_name),
    owner_group="gosmose",
    local=local_execution,
)

### Dataset subset

Note that you can process only a subset of your entire dataset by creating the file `/home/datawork-osmose/dataset/dataset_ID/analysis/subset_files.csv`, which is a simple list of files to be processed, for example:

`% head /home/datawork-osmose/dataset/fecampOWFSOMM/analysis/subset_files.csv
channelA_2020_11_20_15_40_17.wav
channelA_2020_11_20_15_43_20.wav
channelA_2020_11_20_16_20_17.wav
channelA_2020_11_20_16_23_20.wav
channelA_2020_11_20_16_30_17.wav
channelA_2020_11_20_16_33_20.wav
channelA_2020_11_20_16_43_20.wav
channelA_2020_11_20_16_50_17.wav
channelA_2020_11_20_16_53_20.wav
channelA_2020_11_20_17_10_17.wav
`

## <span style="color:red">*FILL & RUN CELLS*</span> Configure spectrogram parameters

### Main parameters 

Start by setting the value of `spectro_duration` in seconds. It corresponds to the maximal duration of the spectrogram display window. If it is different than the original file duration, you have to reshape the audio files to fit this time window.

Then, you can set the value of `zoom_levels`, which is the number of zoom levels you want (they are used in our web-based annotation tool APLOSE). With `zoom_levels = 0`, your shortest spectrogram display window has a duration of `spectro_duration` seconds (that is no zoom at all) ; with `zoom_levels = 1`, a duration of `spectro_duration`/2 seconds ; with `zoom_levels = 2`, a duration of `spectro_duration`/4 seconds ...

After that, you can set the following classical spectrogram parameters : `nfft` (in samples), `winsize` (in samples), `overlap` (in \%). **Note that with those parameters you set the resolution of your spectrogram display window with the smallest duration, obtained with the highest zoom level.**

In [None]:
# Number of zoom levels (0 means no zoom at all, see the explanation above).
dataset.zoom_level = 1

# Classical spectrogram parameters; they set the resolution of the shortest display window.
dataset.nfft = 2048 # samples
dataset.window_size = 1024 # samples
dataset.overlap = 95 # %

The two following characteristics are set as identical to the original audio files by default. Change them and run the cell below only if you want other parameters for the audio files.

In [None]:
# Only run this cell if you want values different from the original audio files.
dataset.spectro_duration = 20 # seconds, maximal duration of the spectrogram display window
dataset.dataset_sr = 16000 # sampling frequency used for the analysis (presumably in Hz — confirm)

### Amplitude normalization 

Finally, we also offer several modes of data/spectrogram normalization.

Normalization over raw data samples with the variable `data_normalization` (default value `''`, i.e. no normalization) :
- instrument-based normalization with the three parameters `sensitivity_dB` (in dB, default value = 0), `gain` (in dB, default value = 0) and `peak_voltage` (in V, default value = 1). Using default values, no normalization will be performed ;

- z-score normalization over a given time period through the variable `zscore_duration`, applied directly on your raw timeseries. The possible values are:
    - `zscore_duration = 'original'` : the audio file duration will be used as time period ;
    - `zscore_duration = '10H'` : any time period put as a string using classical [time alias](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases). This period should be higher than your file duration. 

Normalization over spectra with the variable `spectro_normalization` (default value `'density'`, see OSmOSEanalytics/documentation/theory_spectrogram.pdf for details) :
- density-based normalization by setting `spectro_normalization = 'density'`
- spectrum-based normalization by setting `spectro_normalization = 'spectrum'` 

In the cell below, you can also set the amplitude dynamics in dB through the parameters `dynamic_max` and `dynamic_min`, choose the colormap to be used with `colormap` (see possible options in the [documentation](https://matplotlib.org/stable/tutorials/colors/colormaps.html)) and specify the cut-off frequency `hp_filter_min_freq` of a high-pass filter if needed.

In [None]:
# Normalization applied to the raw data samples.
dataset.data_normalization = 'instrument' # 'instrument' OR 'zscore'

dataset.zscore_duration = 'original' # parameter for 'zscore' mode, values = time alias OR 'original' 
dataset.sensitivity = -164 # parameter for 'instrument' mode
dataset.gain_dB = 14.7 # parameter for 'instrument' mode
dataset.peak_voltage = 2.5 # parameter for 'instrument' mode

# Normalization applied to the spectra.
dataset.spectro_normalization = 'density' # 'density' OR 'spectrum' 

# Amplitude dynamics (in dB) and colormap of the spectrogram images.
dataset.dynamic_max = 120
dataset.dynamic_min = 0
dataset.colormap = 'viridis'

# Cut-off frequency of the high-pass filter (presumably in Hz — confirm).
dataset.hp_filter_min_freq = 2

### Parameter adjustment 


In the cell below you can **check your spectrogram dimension w.r.t your screen resolution** (just run it). We calculate the number of time windows (or equivalently, the number of spectra) you have in your shortest spectrogram display window.

Be aware that this number should be as close as possible to your horizontal screen resolution (i.e. approximately 2000 pixels, as a classical screen resolution is 1920x1080 pixels, (horizontal pixels) x (vertical pixels)) to avoid numerical compression during image display on your screen, as well as uselessly over-resolved spectrograms obtained at a high computational cost. We warn you if you are higher, but you can still compute higher-resolution spectrograms if you want.

In [None]:
# Check the spectrogram dimension against the screen resolution (see the explanation above).
dataset.check_spectro_size()

## <span style="color:red">*FILL & RUN CELL*</span> Adjust spectrogram parameters and initialize

`dataset.number_adjustment_spectrogram` is the number of spectrogram examples used to adjust your parameters. If you are really not sure about your parameters, it is better to start with a small number, because each time you will have to wait for the generation of all your `dataset.number_adjustment_spectrogram` (x the different zoom levels) spectrograms before being able to re-generate spectrograms with another set of parameters.

`dataset.batch_number` indicates the number of concurrent jobs. A higher number can speed things up until a certain point. It still does not work very well.

In [None]:
# Number of example spectrograms generated to adjust the parameters.
dataset.number_adjustment_spectrogram = 5
# Number of concurrent jobs.
dataset.batch_number = 6

reshape_method = "classic" # Automatically reshape the audio files to fit the spectro_duration value. Available methods : "classic" or "legacy"
merge_on_reshape = False # Set to False if you don't want to merge audio files while reshaping them (if they do not follow each other chronologically for example)
force_init = False # Force every initialization parameter, including force_reshape and other computing jobs. It is best to avoid using it.
dataset.initialize(reshape_method=reshape_method, date_template=date_template, force_init=force_init, merge_on_reshape=merge_on_reshape)
# Pass the path of the adjustment metadata file (processed/spectrogram/adjust_metadata.csv) to update_parameters.
dataset.update_parameters(dataset.path.joinpath("processed","spectrogram","adjust_metadata.csv"))

## <span style="color:blue">*JUST RUN CELL*</span> Adjust spectrogram parameters

### Compute `dataset.number_adjustment_spectrogram` spectrograms to adjust parameters. 

In [None]:
# Fill audio file names when you want to generate specific adjustment spectrograms
file_list = []

# Command-line arguments handed to the spectrogram generator script.
adjust_args = f"""--nb-adjust-files {dataset.number_adjustment_spectrogram} \
            --dataset-path {dataset.path} \
            --dataset-sr {dataset.dataset_sr} \
            --files "{" ".join(file_list)}" """

jobfile = jb.build_job_file(
    script_path=Path(os.getcwd(), "qsub_spectrogram_generator_pkg.py"),
    script_args=adjust_args,
    jobname="OSmOSE_AdjustSpectro",
    preset="low",
    env_name=env_name,
    mem="20G",
    walltime="01:00:00",
    logdir=dataset.path.joinpath("log"),
)

# Keep only the dataset jobs for which qstat does not report "finished" on stderr;
# the adjustment job is submitted with a dependency on them.
pending_jobs = []
for jobid in dataset.pending_jobs:
    qstat_result = subprocess.run(["qstat", jobid], capture_output=True)
    if b"finished" not in qstat_result.stderr:
        pending_jobs.append(jobid)

# Submit all built job files.
job_id = jb.submit_job(dependency=pending_jobs)

### Visualize the `dataset.number_adjustment_spectrogram` spectrograms to adjust parameters. 

Re-run this cell several times to update the folder of images, because they keep being generated while you visualize them. If this set of parameters does not suit you, change them and re-run new spectrograms with the previous cells, as many times as you want.

In [None]:
# Display the adjustment spectrograms once no job of `jb` is ongoing any more.
# Raises UserWarning when no image was produced.
if jb.ongoing_jobs:
    print(f"\rParameter adjustment is still running, come back later!")
else:
    print("\rParameter adjustment finished!                           ")

    path_output_spectro = dataset.path_output_spectrogram.parent.parent.joinpath("adjustment_spectros","image")

    # Sort the listing so that the images are always displayed in the same order
    # (os.listdir returns entries in arbitrary order).
    spectro_list = sorted(os.listdir(path_output_spectro)) if path_output_spectro.exists() else []

    if not spectro_list:
        # Only try to show a job trace when there is a finished job to read:
        # indexing an empty list would raise an IndexError and hide the real problem.
        if jb.finished_jobs:
            jb.read_output_file(outtype = "out", job_file_name=jb.finished_jobs[-1]["outfile"])
            raise UserWarning("Something went wrong with the spectro adjustment job. Full job trace above.")
        raise UserWarning("No adjustment spectrogram was found and no finished job is available. Run the adjustment cell above first.")

    for spectro in spectro_list:
        display(Image(path_output_spectro.joinpath(spectro)))

# DONE

## <span style="color:blue">*JUST RUN CELL*</span>  Prepare spectrogram generation

Just one thing : if you create your spectrograms for an APLOSE campaign, set `write_datasets_csv_for_APLOSE=True` below !

In [None]:
# Set to True to register this dataset configuration in <path_osmose_dataset>/datasets.csv,
# the file read by APLOSE.
write_datasets_csv_for_APLOSE=False

if write_datasets_csv_for_APLOSE:

    dataset_csv = Path(path_osmose_dataset, "datasets.csv")

    conf_folder = f"{dataset.spectro_duration}_{dataset.dataset_sr}"
    # Use a dedicated name: overwriting `dataset_name` would break the dataset
    # preparation cell if it is run again afterwards.
    aplose_dataset_name = f"{dataset.name} ({conf_folder})"
    dataset_info = {'name': aplose_dataset_name,
           'folder_name': dataset.name,
           'conf_folder': conf_folder,
           'dataset_type_name':'',
           'dataset_type_desc':'',
           'files_type': '.wav',
           'location_name': '',
           'location_desc': '',
           'location_lat':'',
           'location_lon':''}

    if dataset_csv.exists():
        meta = pd.read_csv(dataset_csv)
        # Only add the configuration once.
        if aplose_dataset_name not in meta['name'].values:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
            meta = pd.concat([meta, pd.DataFrame([dataset_info])], ignore_index=True)
            # sort_values returns a new frame: keep the result, otherwise the sort is lost.
            meta = meta.sort_values(by=['folder_name'], ascending=False)
            meta.to_csv(dataset_csv , index=False)

    else:
        # First dataset ever registered: create the file from this single record.
        meta = pd.DataFrame.from_records([dataset_info])
        meta.to_csv(dataset_csv , index=False)



## <span style="color:blue">*JUST RUN CELL*</span> Launch spectrogram generation

**If this cell fails, please run it again; it can fail twice in a row.**

You might want to increase `dataset.batch_number` if the files need to be split up into more than 10 groups, or decrease it if 10 groups are too many.

In [None]:
# Split the audio files into batches and submit one spectrogram generation job per batch.
dataset.batch_number = 10
all_files = list(dataset.audio_path.glob("*.wav"))

# Never build more batches than there are files: with fewer files than batches the
# integer division below would give a batch size of 0 and submit empty jobs.
nb_batches = max(1, min(dataset.batch_number, len(all_files)))
batch_size = len(all_files) // nb_batches

if dataset.jb.ongoing_jobs:
    # The job paths are path objects (they are unlink()-ed in a later cell), so they
    # must be converted to str before being joined.
    print(f"\rPlease wait for the following jobs to finish: {','.join([str(jobinfo['path']) for jobinfo in dataset.jb.ongoing_jobs])}", end="")
else:

    print("\rAll previous jobs are completed, ready to launch spectrograms")

    for batch in range(nb_batches):
        i_min = batch * batch_size
        i_max = (i_min + batch_size if batch < nb_batches - 1 else len(all_files)) # If it is the last batch, take all files

        jobfile = jb.build_job_file(script_path=Path(os.getcwd(),"qsub_spectrogram_generator_pkg.py"), \
                    script_args=f"--dataset-path {dataset.path}\
                    --dataset-sr {dataset.dataset_sr} \
                    --batch-ind-min {i_min}\
                    --batch-ind-max {i_max}\
                    {'--save-matrix' if save_matrix else ''}", 
                    jobname="OSmOSE_SpectroGenerator", 
                    preset="low",
                    env_name=env_name,
                    mem="70G",
                    walltime="10:00:00",
                    logdir=dataset.path.joinpath("log"))


    job_id_list = jb.submit_job() #submit all built job files
    # Total number of jobs, read by the progress tracking cell.
    nb_jobs = len(jb.finished_jobs) + len(job_id_list)
    
    print(f"The job ids are {job_id_list}")

## Track progress

### Check job status

In [None]:
# Report the number of ongoing/finished jobs and the share of spectrograms already generated.
if jb.ongoing_jobs:
    spectros = len(os.listdir(dataset.path_output_spectrogram))
    # NOTE(review): the -2 presumably discounts two non-audio files stored in the
    # audio folder — confirm.
    audio = len(os.listdir(dataset.audio_path))-2
    # Each zoom level z in 0..zoom_level splits a file into 2**z spectrograms, hence
    # 2**(zoom_level+1) - 1 spectrograms per audio file. The previous expression,
    # audio * (2**zoom_level) - 1, subtracted 1 from the grand total instead.
    total = audio * (2**(dataset.zoom_level + 1) - 1)
    
    print(f"Ongoing jobs: {len(jb.ongoing_jobs)}/{nb_jobs}; Finished jobs: {len(jb.finished_jobs)}/{nb_jobs}...")
    if total > 0:
        print(f"Spectrograms: {spectros}/{total} ({spectros*100//total}%).")
    else:
        # Avoid a ZeroDivisionError when no audio file is counted.
        print(f"Spectrograms: {spectros} (the expected total could not be computed).")
 
    
else:
    print("All jobs are finished.")

## Read output files

Use the cell below to read the output files of your job. You have two available job builders to choose from:
 
 - dataset.jb to read the output of dataset initialization jobs.
 - jb to read the output of spectrogram generation jobs.
 
Once you've chosen the job_builder, select an output file name to read. You can set the read mode to err if you wish to read the error output file (usually empty).

In [None]:
# Job builder inspected by the cells below.
job_builder = jb #jb or dataset.jb

# List the jobs of the selected job builder.
job_builder.list_jobs()

In [None]:
# Name of the job whose output file is read.
job_to_read ="OSmOSE_SpectroGenerator2"
read_mode = "out" # set to "err" to read the error output file

# Read the selected output file of the selected job.
job_builder.read_output_file(outtype = read_mode, job_name=job_to_read) 

In [None]:
# Call update_job_access() on both job builders (dataset initialization jobs and
# spectrogram generation jobs).
dataset.jb.update_job_access()
jb.update_job_access()

In [None]:
job_id = "" # Get the job id from the list above

# IPython shell escape: run qstat for the job id set above.
!qstat -fx {job_id}

In [None]:
# Delete the file referenced by the "path" entry of every ongoing job
# of the selected job builder.
for ongoing_job in job_builder.ongoing_jobs:
    job_file = ongoing_job["path"]
    job_file.unlink()

In [None]:
# Display the finished jobs of the selected job builder.
job_builder.finished_jobs