In [1]:
import warnings                            # This block prevents display of harmless warnings, but should be
warnings.filterwarnings('ignore')          # commented out till the final version, to avoid missing "real" warnings

import kilroy_was_here                        # Mandatory. Allow access to shared python code from repository root
from audace.jupytools import iprint           # timestamped (to the ms) print with CPU and RAM consumption information
from audace.audiodataset import AudioDataset  # Class for audio dataset handling
from audace import providers
from audace import transformers
from audace import featurizers


# Location of the initial annotated dataset (audio files and their .lab files)
SOURCE_PATH = '/Users/jpg/Documents/Nolasco'

# The dataset name is the master key used to address the dataset
DATASET_NAME = 'MONO1000'

# Build the AudioDataset object, falling back to retrieval when it already exists.
# The (potentially long) creation is announced before it is attempted.
iprint("Dataset creation. This may take up to 10 mn. Go grab a coffee.")
try:
    # Passing a source path implicitly requests the CREATION of the dataset,
    # here with a pool of 4 processes.
    ds = AudioDataset(DATASET_NAME, SOURCE_PATH, nprocs=4)
except FileExistsError:
    # The dataset was already created: catching the exception allows the cell
    # to be rerun. Ideally the dataset is created once and for all in a
    # dedicated notebook, then retrieved from the other notebooks when needed.
    # Omitting the source path implicitly expresses the intent of RETRIEVING
    # an existing dataset rather than CREATING a new one.
    iprint("Failing back to existing dataset retrieval")
    ds = AudioDataset(DATASET_NAME)
    iprint("Dataset retrieved")

# Print some information about the newly created (or retrieved) AudioDataset object
ds.info()

# Declare the two labels carried by the dataset
for label_name in ("nobee", "queen"):
    ds.addLabel(label_name)

# The "queen" label value is deduced from the source file name, using a
# StringMapper transformer. This transformer iterates over a list of 2-tuples
# (regular expression, target value) and returns the target value as soon as
# a match is found. The list must therefore be ordered from the strictest
# pattern to the loosest one.
queen_rules = [
    ('(?i)active', 1),
    ('(?i)missing queen', 0),
    ('NO_QueenBee', 0),
    ('QueenBee', 1),
]
trsfrm_queen = transformers.StringMapper(queen_rules)

# The transformer is applied to the source file names through the FromFileName
# labelizer. This labelizer does not provide label strength.
queen_provider = providers.FromFileName(trsfrm_queen)
n = ds.setLabel('queen', queen_provider)
iprint(n, "samples where processed for 'queen' label")

# The "nobee" label value comes from annotation files (.lab files sharing the
# base name of the audio source file they annotate), read through the
# FromAnnotation labelizer, with no transformation.
# This labelizer takes 2 arguments:
# - a mandatory source path, pointing to the directory holding the .lab files
# - an optional threshold: any "label" event shorter than it is disregarded
# The label strength over a sample is the sum of the durations of the "label"
# events (those above the threshold) divided by the sample duration.

# A 0.5s threshold is used here
nobee_provider = providers.FromAnnotation(SOURCE_PATH, th=0.5)
n = ds.setLabel('nobee', nobee_provider)
iprint(n, "samples where processed for 'nobee' label")

# The StringMatcher transformer behaves differently from the StringMapper. It uses a regexp
# capture group to retrieve the part of a string matching a specific pattern. This can be used
# either for complex or very basic matching. Here we just ask for the first five chars,
# provided they belong to characters valid for identifiers (A-Z, a-z, 0-9 and underscore).
# The pattern is a raw string so that "\w" reaches the regexp engine untouched instead of
# being parsed as an (invalid) string escape sequence, which triggers a warning.
ds.addAttribute('hive')
ds.setAttribute('hive', providers.FromFileName(transformers.StringMatcher(r"^(\w{5})")))

# Declare the 'MFCC' feature, then fill it from the samples found under
# ds.samples_path using an MFCC featurizer configured with n_mfcc=20
ds.addFeature('MFCC')
mfcc_featurizer = featurizers.MFCC(n_mfcc=20)
ds.setFeature('MFCC', providers.FromSample(ds.samples_path, mfcc_featurizer))

# Display the dataset as a pandas dataframe (kept as the last expression of the cell)
ds.dumpDataFrame()

[2020-08-11/14:20:21.477|16.4%|78.0%|0.26GB] Dataset creation. This may take up to 10 mn. Go grab a coffee.
[2020-08-11/14:20:21.479|00.0%|78.0%|0.26GB] >>>>> Starting Dataset MONO1000 build
[2020-08-11/14:20:21.501|17.6%|78.0%|0.26GB] Starting to process 2 audio files.
[2020-08-11/14:20:53.404|27.7%|74.5%|0.26GB] Creating Database
[2020-08-11/14:20:53.417|00.0%|74.5%|0.26GB] Database created
[2020-08-11/14:20:53.418|00.0%|74.5%|0.26GB] Please wait, computing checksum...
[2020-08-11/14:20:53.927|20.8%|75.4%|0.26GB]   Computed checksum 28f6a83834ab4d0bdb7847b37b68c7b0
[2020-08-11/14:20:53.927|00.0%|75.4%|0.26GB]   Expected checksum 6671ad6663eb2019bd3af30170705bb3
[2020-08-11/14:20:53.927|00.0%|75.4%|0.26GB] >>>>> Dataset MONO1000 successfully created.
[2020-08-11/14:20:53.928|00.0%|75.4%|0.26GB] ------------------------------------------------------
[2020-08-11/14:20:53.928|00.0%|75.4%|0.26GB] DATASET PATH          : D:\Jupyter\ShowBees\datasets\MONO1000
[2020-08-11/14:20:53.928|00.0%|

HBox(children=(FloatProgress(value=0.0, description='Annotating nobee', max=2.0, style=ProgressStyle(descripti…


[2020-08-11/14:20:54.526|16.6%|75.4%|0.26GB] 2366 samples where processed for 'nobee' label


HBox(children=(FloatProgress(value=0.0, description='Computing MFCC', max=2366.0, style=ProgressStyle(descript…




Unnamed: 0,name,file_id,start_t,end_t,nobee,queen,hive,MFCC
0,00-000000,1,0.0,1.0,0.0,0.0,Hive3,"[[-394.88925, -395.52295, -411.27222, -399.069..."
1,00-000001,1,0.5,1.5,0.0,0.0,Hive3,"[[-401.44745, -394.54202, -396.4122, -404.9671..."
2,00-000002,1,1.0,2.0,0.0,0.0,Hive3,"[[-419.504, -409.64413, -407.69608, -416.86414..."
3,00-000003,1,1.5,2.5,0.0,0.0,Hive3,"[[-420.08606, -418.07767, -417.40793, -404.803..."
4,00-000004,1,2.0,3.0,0.0,0.0,Hive3,"[[-372.56345, -371.13123, -384.61908, -378.365..."
...,...,...,...,...,...,...,...,...
2361,01-001178,2,589.0,590.0,0.0,1.0,Hive3,"[[-456.26517, -455.3929, -456.27307, -448.8443..."
2362,01-001179,2,589.5,590.5,0.0,1.0,Hive3,"[[-438.46368, -431.82635, -440.27603, -449.474..."
2363,01-001180,2,590.0,591.0,0.0,1.0,Hive3,"[[-444.8451, -443.43448, -448.0073, -452.25623..."
2364,01-001181,2,590.5,591.5,0.0,1.0,Hive3,"[[-455.87585, -447.3699, -445.45035, -446.2260..."
