In [3]:
#============================== EXPERIMENT IMPORTS ==============================
#===== Standard imports
import warnings                   # This block prevents display of harmless warnings, but should be
warnings.filterwarnings('ignore') # commented out till the final version, to avoid missing "real" warnings 

#===== 3rd party imports
# None

#===== Repository imports
import proxycodelib               # Mandatory. Allow access to shared python code in the upper 'codelib' directory
from jupytools import mooltipath  # Magic absolute path builder
 

#============================= EXPERIMENT PARAMETERS =============================
# Name of the predefined target dataset (first argument of the chunker
# manifest/build calls in the cells below).
DATASET = 'MAIN'

# Root folder of the initial annotated dataset (audio and lab files).
# NOTE(review): absolute, machine-specific location -- adjust before running
# this notebook on another workstation.
INPUT_PATH = 'D:/datasets/sounds/Nolasco'

In [4]:
from chunker import load_dataset_manifest

# Read the manifest of the target dataset. It unpacks into five fields:
# the audio file names, the sample rate, the duration, the overlap and the
# MD5 value recorded for the chunks.
filenames, sr, duration, overlap, chunks_md5 = load_dataset_manifest(DATASET)

# Print a one-line summary per manifest field (labels are padded so that the
# colons line up in the cell output).
for label, value in (
    ('NB AUDIO FILES :', len(filenames)),
    ('SAMPLE RATE    :', sr),
    ('DURATION       :', duration),
    ('OVERLAP        :', overlap),
    ('CHUNKS MD5     :', chunks_md5),
):
    print(label, value)

NB AUDIO FILES : 48
SAMPLE RATE    : 22050
DURATION       : 1.0
OVERLAP        : 0.5
CHUNKS MD5     : HASH


In [6]:
# Build the lab files of the target dataset, reading the initial annotated
# data from INPUT_PATH.
# The call is deliberately the last expression of the cell so that its return
# value is displayed. In the recorded run it shows 48, which matches the number
# of audio files reported by the manifest -- presumably a count of processed
# files; TODO confirm against chunker.build_dataset_labs.
from chunker import build_dataset_labs
build_dataset_labs(DATASET, INPUT_PATH)

48

In [8]:
# Slice target dataset into chunks
from chunker import build_dataset_chunks

# NOTE(review): in the recorded run this call aborts with an AssertionError:
# the computed MD5 checksum is compared with the manifest value, which is
# still the placeholder 'HASH'. The summary below is then never printed.
# Presumably the manifest has to be updated with the real checksum -- TODO confirm.
chunk_dir, nb_files, nb_chunks, md5h = build_dataset_chunks(DATASET, INPUT_PATH)

# Report what the chunking step produced, one aligned line per returned field.
for label, value in (
    ("OUTPUT DIR      :", chunk_dir),
    ("PROCESSED FILES :", nb_files),
    ("CHUNKS BUILT    :", nb_chunks),
    ("MD5 HASH        :", md5h),
):
    print(label, value)

[2020-07-27 01:47:30 RAM74.1% 0.06GB] Starting to process 48 audio files.
[2020-07-27 01:47:30 RAM74.1% 0.06GB] CF001 - Missing Queen - Day -.mp3
[2020-07-27 01:47:31 RAM74.2% 0.07GB] GH001 - Active - Day - 141022_0659_0751.mp3
[2020-07-27 01:49:30 RAM63.3% 0.3GB] CF003 - Active - Day - (214).wav
[2020-07-27 01:49:41 RAM58.5% 0.07GB] CF003 - Active - Day - (215).wav
[2020-07-27 01:49:51 RAM59.3% 0.07GB] CF003 - Active - Day - (216).wav
[2020-07-27 01:50:01 RAM59.0% 0.07GB] CF003 - Active - Day - (217).wav
[2020-07-27 01:50:12 RAM60.4% 0.07GB] CF003 - Active - Day - (218).wav
[2020-07-27 01:50:23 RAM60.5% 0.07GB] CF003 - Active - Day - (219).wav
[2020-07-27 01:50:33 RAM59.2% 0.07GB] CF003 - Active - Day - (220).wav
[2020-07-27 01:50:43 RAM58.8% 0.07GB] CF003 - Active - Day - (221).wav
[2020-07-27 01:50:52 RAM59.1% 0.07GB] CF003 - Active - Day - (222).wav
[2020-07-27 01:51:02 RAM59.0% 0.07GB] CF003 - Active - Day - (223).wav
[2020-07-27 01:51:12 RAM59.0% 0.07GB] CF003 - Active - Day - (2

AssertionError: MD5 checksum (6a2ac3d4440d9678182174c073c50216) does not match manifest value (HASH).

In [9]:
# Run the threshold-building step of chunker on the target dataset.
# NOTE(review): [0, .5, 1] is presumably the list of threshold values to
# process -- TODO confirm their meaning against chunker.build_dataset_thresholds
# and consider naming the list in the parameters cell.
# The recorded log lists the same source files at several timestamps, so the
# files appear to be walked more than once (possibly once per threshold; unverified).
from chunker import build_dataset_thresholds
build_dataset_thresholds(DATASET, [0, .5, 1])

[2020-07-27 02:03:26 RAM62.8% 0.09GB] CF001 - Missing Queen - Day -
[2020-07-27 02:03:26 RAM62.8% 0.09GB] GH001 - Active - Day - 141022_0659_0751
[2020-07-27 02:03:28 RAM60.5% 0.09GB] CF003 - Active - Day - (214)
[2020-07-27 02:03:28 RAM60.5% 0.09GB] CF003 - Active - Day - (215)
[2020-07-27 02:03:28 RAM60.5% 0.09GB] CF003 - Active - Day - (216)
[2020-07-27 02:03:28 RAM60.5% 0.09GB] CF003 - Active - Day - (217)
[2020-07-27 02:03:28 RAM60.5% 0.09GB] CF003 - Active - Day - (218)
[2020-07-27 02:03:28 RAM60.5% 0.09GB] CF003 - Active - Day - (219)
[2020-07-27 02:03:29 RAM60.5% 0.09GB] CF003 - Active - Day - (220)
[2020-07-27 02:03:29 RAM60.5% 0.09GB] CF003 - Active - Day - (221)
[2020-07-27 02:03:29 RAM60.5% 0.09GB] CF003 - Active - Day - (222)
[2020-07-27 02:03:29 RAM60.5% 0.09GB] CF003 - Active - Day - (223)
[2020-07-27 02:03:29 RAM60.5% 0.09GB] CF003 - Active - Day - (224)
[2020-07-27 02:03:29 RAM60.5% 0.09GB] CF003 - Active - Day - (225)
[2020-07-27 02:03:29 RAM60.5% 0.09GB] CF003 - Acti

[2020-07-27 02:03:47 RAM60.2% 0.09GB] CF003 - Active - Day - (221)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CF003 - Active - Day - (222)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CF003 - Active - Day - (223)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CF003 - Active - Day - (224)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CF003 - Active - Day - (225)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CF003 - Active - Day - (226)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CF003 - Active - Day - (227)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CJ001 - Missing Queen - Day -  (100)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CJ001 - Missing Queen - Day -  (101)
[2020-07-27 02:03:47 RAM60.2% 0.09GB] CJ001 - Missing Queen - Day -  (102)
[2020-07-27 02:03:47 RAM60.1% 0.09GB] CJ001 - Missing Queen - Day -  (103)
[2020-07-27 02:03:48 RAM60.1% 0.09GB] CJ001 - Missing Queen - Day -  (104)
[2020-07-27 02:03:48 RAM60.1% 0.09GB] Hive1_12_06_2018_QueenBee_H1_audio___15_00_00
[2020-07-27 02:03:48 RAM60.1% 0.09GB] Hive1_12_06_2018_QueenBee_H1_audio

In [11]:
# NOTE(review): proxycodelib is also imported by the setup cell at the top of
# the notebook; it is repeated here.
import proxycodelib
from chunker import build_dataset_labels

# Build the labels of the target dataset. './workdir' is a relative path, so
# it is resolved against the kernel's current working directory.
# NOTE(review): the role of this second argument (input vs. output folder) is
# not visible from this notebook -- TODO confirm.
liste = build_dataset_labels(DATASET, './workdir')

# Bare expression: the notebook displays the returned value (a list of chunk
# names in the recorded output). The whole list is dumped, which is long.
liste

['CF001 - Missing Queen - Day -_chunk0021',
 'CF001 - Missing Queen - Day -_chunk0022',
 'CF001 - Missing Queen - Day -_chunk0023',
 'GH001 - Active - Day - 141022_0659_0751_chunk0120',
 'GH001 - Active - Day - 141022_0659_0751_chunk0121',
 'GH001 - Active - Day - 141022_0659_0751_chunk0122',
 'GH001 - Active - Day - 141022_0659_0751_chunk0137',
 'GH001 - Active - Day - 141022_0659_0751_chunk0138',
 'GH001 - Active - Day - 141022_0659_0751_chunk0139',
 'GH001 - Active - Day - 141022_0659_0751_chunk0148',
 'GH001 - Active - Day - 141022_0659_0751_chunk0149',
 'GH001 - Active - Day - 141022_0659_0751_chunk0150',
 'GH001 - Active - Day - 141022_0659_0751_chunk0151',
 'GH001 - Active - Day - 141022_0659_0751_chunk0152',
 'GH001 - Active - Day - 141022_0659_0751_chunk0153',
 'GH001 - Active - Day - 141022_0659_0751_chunk0154',
 'GH001 - Active - Day - 141022_0659_0751_chunk0186',
 'GH001 - Active - Day - 141022_0659_0751_chunk0187',
 'GH001 - Active - Day - 141022_0659_0751_chunk0188',
 'GH

In [9]:
import proxycodelib
from jupytools import mooltipath
from chunker import get_list_samples_names

# Chunk folder to inspect, expanded to an absolute path by mooltipath.
# NOTE(review): this cell targets the 'SMALL' dataset while the other cells use
# DATASET, and its recorded output is older than the other cells' logs --
# presumably a leftover from an earlier session; TODO confirm it is still needed.
WORKING_DIR = mooltipath('datasets/SMALL/chunks')

# Collect the sample names found for that folder and print how many there are.
sample_names = get_list_samples_names(WORKING_DIR)
print(len(sample_names))


[2020-07-26 21:52:21 RAM83.9% 0.2GB] D:\Jupyter\ShowBees\datasets\SMALL\chunks
4744
