In [1]:
import os                                  # Standard library: used to read an optional source path override
import warnings                            # This block prevents display of harmless warnings, but should be
warnings.filterwarnings('ignore')          # commented out till the final version, to avoid missing "real" warnings

import kilroy_was_here                        # Mandatory. Allow access to shared python code from repository root
from audace.jupytools import iprint           # timstamped (to the ms) print with CPU and RAM consumption information
from audace.audiodataset import AudioDataset  # Class for audio dataset handling
from audace import providers
from audace import transformers
from audace import featurizers


# Path where to find the initial annotated dataset (audio and .lab files).
# This location is machine-specific, so it can be overridden without editing the
# notebook by setting the SHOWBEES_SOURCE_PATH environment variable (which is read
# only here, by this notebook). When the variable is unset or empty, the historical
# default path below is used, so existing setups keep working unchanged.
SOURCE_PATH = os.environ.get('SHOWBEES_SOURCE_PATH') or '/Users/jpg/Documents/Nolasco'

# Dataset name is the master key for dataset addressing
DATASET_NAME = 'MAIN1000'

# Initialize the dataset object.
try:
    # By providing a source path, we implicitly indicate that we want to CREATE the dataset.
    # Run with a pool of 4 processes
    iprint("Dataset creation. This may take up to 10 mn. Go grab a coffee.")
    ds = AudioDataset(DATASET_NAME, SOURCE_PATH, nprocs=4)

except FileExistsError:
    # To allow rerun, we catch the exception in case the dataset was already created.
    # Ideally, you should create the dataset once and for all in a dedicated notebook,
    # and then retrieve it from other notebooks when needed.
    # Here, by not providing a source path, we implicitly express the intent of RETRIEVING
    # an existing dataset rather than CREATING a new one.
    iprint("Failing back to existing dataset retrieval")
    ds = AudioDataset(DATASET_NAME)
    iprint("Dataset retrieved")

# The following line provides some information about the newly created (or retrieved) AudioDataset object
ds.info()

# Declare the two labels handled by this notebook, in the same order as before.
for label_name in ("nobee", "queen"):
    ds.addLabel(label_name)

# The "queen" label value is deduced from the source file name, using a StringMapper transformer.
# This transformer iterates over a list of 2-uples (regular expression, target value) and returns
# the target value as soon as a match is found. Thus, the list must be ordered from stricter to
# looser patterns (e.g. 'NO_QueenBee' has to be tried before 'QueenBee').
queen_rules = [
    ('(?i)active', 1),
    ('(?i)missing queen', 0),
    ('NO_QueenBee', 0),
    ('QueenBee', 1),
]
trsfrm_queen = transformers.StringMapper(queen_rules)

# The transformer is then applied to the source file names through the FromFileName labelizer.
# This labelizer does not provide label strength.
queen_provider = providers.FromFileName(trsfrm_queen)
n = ds.setLabel('queen', queen_provider)
iprint(n, "samples where processed for 'queen' label")

# The "nobee" label value comes from annotation files (.lab files sharing the base name of the
# audio source file they annotate), using the FromAnnotation labelizer, with no transformation.
# This labelizer takes 2 arguments:
# - a mandatory source path, pointing to the directory where the .lab files reside
# - an optional threshold, allowing to disregard any "label" event whose duration is under it
# The label strength over a sample is computed by summing the durations of the "label" events
# (those longer than the threshold) and dividing this sum by the sample duration.

# Here we use a 0.5s threshold
nobee_provider = providers.FromAnnotation(SOURCE_PATH, th=0.5)
n = ds.setLabel('nobee', nobee_provider)
iprint(n, "samples where processed for 'nobee' label")

# The StringMatcher transformer behaves differently from the StringMapper. It uses a regexp
# capture group to retrieve the part of a string matching a specific pattern. This can be used
# for either complex or very basic matching. Here we just ask for the first five chars,
# provided they belong to the characters valid for identifiers (A-Z, a-z, 0-9 and underscore).
# NOTE(review): on str patterns, "\w" also accepts non-ASCII word characters unless the
# re.ASCII flag is applied by StringMatcher - confirm if file names may contain such characters.
#
# The pattern is written as a raw string so that "\w" reaches the regexp engine untouched,
# instead of relying on Python's deprecated tolerance for unknown string escape sequences
# (a warning that the blanket warnings filter at the top of this cell would otherwise hide).
ds.addAttribute('hive')
ds.setAttribute('hive', providers.FromFileName(transformers.StringMatcher(r"^(\w{5})")))

# Display dataset as a pandas dataframe
#ds.dumpDataFrame()

[2020-08-10/12:35:17.547|12.0%|58.4%|0.26GB] Dataset creation. This may take up to 10 mn. Go grab a coffee.
[2020-08-10/12:35:17.547|00.0%|58.4%|0.26GB] >>>>> Starting Dataset MAIN1000 build
[2020-08-10/12:35:17.575|25.0%|58.4%|0.26GB] Starting to process 48 audio files.
[2020-08-10/12:41:44.027|51.6%|55.5%|0.26GB] Creating Database
[2020-08-10/12:41:44.097|22.5%|55.5%|0.26GB] Database created
[2020-08-10/12:41:44.100|00.0%|55.4%|0.26GB] Please wait, computing checksum...
[2020-08-10/12:41:54.493|11.8%|55.2%|0.26GB]   Computed checksum 964a87370f449298e0ef681efe6094bb
[2020-08-10/12:41:54.493|00.0%|55.2%|0.26GB]   Expected checksum 1ba14f84b713fcaf1c7dccff8b1e36a7
[2020-08-10/12:41:54.493|00.0%|55.2%|0.26GB] >>>>> Dataset MAIN1000 successfully created.
[2020-08-10/12:41:54.494|00.0%|55.2%|0.26GB] ------------------------------------------------------
[2020-08-10/12:41:54.494|00.0%|55.2%|0.26GB] DATASET PATH          : D:\Jupyter\ShowBees\datasets\MAIN1000
[2020-08-10/12:41:54.494|00.0%

HBox(children=(FloatProgress(value=0.0, description='Annotating nobee', max=48.0, style=ProgressStyle(descript…


[2020-08-10/12:42:01.408|14.1%|54.9%|0.27GB] 24788 samples where processed for 'nobee' label


24788

In [2]:
# Declare the 'MFCC' feature on the dataset before populating it.
ds.addFeature('MFCC')

# Build the MFCC featurizer (configured with n_mfcc=20) and the provider that applies it
# to the samples found under the dataset's samples path.
mfcc_featurizer = featurizers.MFCC(n_mfcc=20)
mfcc_provider = providers.FromSample(ds.samples_path, mfcc_featurizer)

# Kept as the last expression of the cell so that its return value is displayed as cell output.
ds.setFeature('MFCC', mfcc_provider)

HBox(children=(FloatProgress(value=0.0, description='Computing MFCC', max=24788.0, style=ProgressStyle(descrip…




24788