In [1]:
import warnings                            # This block prevents display of harmless warnings, but should be
warnings.filterwarnings('ignore')          # commented out till the final version, to avoid missing "real" warnings

import kilroy_was_here                        # Mandatory. Allow access to shared python code from repository root
from audace.jupytools import iprint           # timestamped (to the ms) print with CPU and RAM consumption information
from audace.audiodataset import AudioDataset  # Class for audio dataset handling
from audace import providers
from audace import transformers
from audace import featurizers


# Directory holding the initial annotated dataset (audio files and their .lab files)
SOURCE_PATH = 'D:/datasets/sounds/Nolasco'

# The dataset name is the master key used to address the dataset
DATASET_NAME = 'MAIN1000'


def _create_or_retrieve_dataset(name, source_path, nprocs):
    """Return the AudioDataset called `name`, creating it first when possible.

    Providing a source path to AudioDataset implicitly expresses the intent to
    CREATE the dataset (here with a pool of `nprocs` processes). When creation
    raises FileExistsError, the dataset is RETRIEVED instead by calling
    AudioDataset with the name only, which makes the cell safe to rerun.

    Ideally the dataset should be created once and for all in a dedicated
    notebook, and simply retrieved from the other notebooks that need it.
    """
    try:
        iprint("Dataset creation. This may take up to 10 mn. Go grab a coffee.")
        return AudioDataset(name, source_path, nprocs=nprocs)
    except FileExistsError:
        # No source path here: we want the existing dataset, not a new one
        iprint("Failing back to existing dataset retrieval")
        existing = AudioDataset(name)
        iprint("Dataset retrieved")
        return existing


# Initialize the dataset object, running creation with a pool of 4 processes
ds = _create_or_retrieve_dataset(DATASET_NAME, SOURCE_PATH, nprocs=4)

# Show some information about the newly created (or retrieved) AudioDataset object
ds.info()

# Declare the labels on the dataset, in the same order as before: "nobee" then "queen"
for label_name in ("nobee", "queen"):
    ds.addLabel(label_name)

# The "queen" label value is deduced from the source file name with a StringMapper transformer.
# This transformer iterates over a list of 2-tuples (regular expression, target value) and
# returns the target value of the first rule that matches. The rules must therefore be ordered
# from strictest to loosest (e.g. 'NO_QueenBee' has to be tried before 'QueenBee').
queen_rules = [
    ('(?i)active', 1),
    ('(?i)missing queen', 0),
    ('NO_QueenBee', 0),
    ('QueenBee', 1),
]
trsfrm_queen = transformers.StringMapper(queen_rules)

# The transformer is applied to the source file names through the FromFileName labelizer.
# This labelizer does not provide label strength.
queen_provider = providers.FromFileName(trsfrm_queen)

n = ds.setLabel('queen', queen_provider)
iprint(n, "samples where processed for 'queen' label")

# The "nobee" label value comes from annotation files (.lab files sharing the base name of the
# audio source file they annotate), read by the FromAnnotation labelizer with no transformation.
# This labelizer takes 2 arguments:
# - a mandatory source path, pointing to the directory where the .lab files reside
# - an optional threshold, used to disregard any "label" event whose duration is under it
# The label strength over a sample is the summed duration of the "label" events
# (when above the threshold) divided by the sample duration.

# Events shorter than 0.5s are disregarded
nobee_threshold = 0.5
nobee_provider = providers.FromAnnotation(SOURCE_PATH, th=nobee_threshold)

n = ds.setLabel('nobee', nobee_provider)
iprint(n, "samples where processed for 'nobee' label")

# The StringMatcher transformer behaves differently from the StringMapper: it uses a regexp
# capture group to retrieve the part of a string matching a specific pattern. This can be used
# for either complex or very basic matching. Here we just ask for the first five characters,
# provided they are all "word" characters (\w: letters, digits and underscore).
# NOTE(review): with Python's default str regexps, \w also accepts non-ASCII word
# characters - confirm whether StringMatcher restricts this.
# The pattern is a raw string so that "\w" is handed to the regexp engine as written, instead
# of being an invalid string escape sequence (DeprecationWarning, SyntaxWarning since 3.12).
ds.addAttribute('hive')
ds.setAttribute('hive', providers.FromFileName(transformers.StringMatcher(r"^(\w{5})")))

# Display dataset as a pandas dataframe
ds.dumpDataFrame()

[2020-08-10/08:04:04.540|15.9%|54.3%|0.26GB] Dataset creation. This may take up to 10 mn. Go grab a coffee.
[2020-08-10/08:04:04.540|00.0%|54.3%|0.26GB] The dataset directory (D:\Jupyter\ShowBees\datasets\MAIN1000) already exists.
[2020-08-10/08:04:04.541|00.0%|54.3%|0.26GB] If you really intent to CREATE this dataset, please erase this directory first
[2020-08-10/08:04:04.541|00.0%|54.3%|0.26GB] ### ABORTING! ###
[2020-08-10/08:04:04.541|00.0%|54.3%|0.26GB] Failing back to existing dataset retrieval
[2020-08-10/08:04:04.543|00.0%|54.3%|0.26GB] Dataset retrieved
[2020-08-10/08:04:04.543|00.0%|54.3%|0.26GB] ------------------------------------------------------
[2020-08-10/08:04:04.543|00.0%|54.3%|0.26GB] DATASET PATH          : D:\Jupyter\ShowBees\datasets\MAIN1000
[2020-08-10/08:04:04.543|00.0%|54.3%|0.26GB] DATASET DB PATH       : D:\Jupyter\ShowBees\datasets\MAIN1000\database.db
[2020-08-10/08:04:04.543|00.0%|54.3%|0.26GB] DATASET SAMPLES PATH  : D:\Jupyter\ShowBees\datasets\MAIN100

HBox(children=(FloatProgress(value=0.0, description='Annotating nobee', max=48.0, style=ProgressStyle(descript…


[2020-08-10/08:04:14.588|16.4%|54.2%|0.27GB] 24788 samples where processed for 'nobee' label


Unnamed: 0,name,file_id,start_t,end_t,nobee,queen,hive,MFCC
0,00-000000,1,0.0,1.0,0.00,0.0,CF001,"[[-452.69193, -455.08923, -466.46518, -480.284..."
1,00-000001,1,1.0,2.0,0.00,0.0,CF001,"[[-495.33542, -498.95233, -484.57263, -478.002..."
2,00-000002,1,2.0,3.0,0.00,0.0,CF001,"[[-445.4288, -443.79807, -444.25098, -442.6542..."
3,00-000003,1,3.0,4.0,0.00,0.0,CF001,"[[-460.95575, -462.3591, -463.8176, -466.13812..."
4,00-000004,1,4.0,5.0,0.00,0.0,CF001,"[[-466.61942, -456.0218, -446.7533, -444.27115..."
...,...,...,...,...,...,...,...,...
24783,47-000587,48,587.0,588.0,0.00,1.0,Hive3,"[[-413.74405, -412.19983, -413.3146, -416.2855..."
24784,47-000588,48,588.0,589.0,0.85,1.0,Hive3,"[[-400.12128, -390.9304, -391.67404, -399.0412..."
24785,47-000589,48,589.0,590.0,1.00,1.0,Hive3,"[[-431.32666, -431.38943, -435.44553, -431.093..."
24786,47-000590,48,590.0,591.0,1.00,1.0,Hive3,"[[-429.80112, -429.24075, -434.11508, -435.633..."


In [2]:

# Declare the 'MFCC' feature on the dataset before populating it
ds.addFeature('MFCC')

# The feature is computed from the samples found under ds.samples_path,
# using an MFCC featurizer configured with n_mfcc=20
mfcc_featurizer = featurizers.MFCC(n_mfcc=20)
mfcc_provider = providers.FromSample(ds.samples_path, mfcc_featurizer)

# Kept as the last expression of the cell so that its return value is displayed
# (presumably the number of samples processed - TODO confirm)
ds.setFeature('MFCC', mfcc_provider)

HBox(children=(FloatProgress(value=0.0, description='Computing MFCC', max=24788.0, style=ProgressStyle(descrip…




24788