In [1]:
import warnings                            # This block prevents display of harmless warnings, but should be
warnings.filterwarnings('ignore')          # commented out till the final version, to avoid missing "real" warnings

import kilroy_was_here                        # Mandatory. Allow access to shared python code from repository root
from audace.jupytools import iprint           # timstamped (to the ms) print with CPU and RAM consumption information
from audace.audiodataset import AudioDataset  # Class for audio dataset handling
from audace import providers
from audace import transformers
from audace import featurizers


# Directory holding the initial annotated dataset (audio and .lab files)
SOURCE_PATH = '/Users/jpg/Documents/Nolasco'

# The dataset name is the master key used to address a dataset. Change it
# according to the dataset you want to generate
DATASET_NAME = 'DUO0500'

# Initialize the Dataset object: create it on a first run, retrieve it on a rerun.
iprint("Dataset creation. This may take up to 10 mn. Go grab a coffee.")
try:
    # By providing a source path, we implicitly indicate that we want to CREATE
    # the dataset. The build runs with a pool of 4 processes.
    # Only this call sits in the try body, so the handler below reacts to the
    # dataset constructor alone and cannot mask an error raised elsewhere.
    ds = AudioDataset(DATASET_NAME, SOURCE_PATH, nprocs=4)
except FileExistsError:
    # To allow reruns, we catch the exception raised when the dataset was already
    # created. Ideally, you should create the dataset once and for all in a
    # dedicated notebook, and then retrieve it from other notebooks when needed.
    # Here, by not providing a source path, we implicitly express the intent of
    # RETRIEVING an existing dataset rather than CREATING a new one.
    iprint("Failing back to existing dataset retrieval")
    ds = AudioDataset(DATASET_NAME)
    iprint("Dataset retrieved")

# The following line provides some information about the newly created
# (or retrieved) AudioDataset object
ds.info()

# Declare the two labels; their values are assigned by the setLabel calls.
for label_name in ("nobee", "queen"):
    ds.addLabel(label_name)

# The "queen" label value is deduced from the source file name, using a
# StringMapper transformer. This transformer iterates over a list of 2-tuples
# (regular expression, target value) and returns the target value as soon as a
# match is found. The rules must therefore be ordered from strictest to loosest.
queen_rules = [
    ('(?i)active', 1),
    ('(?i)missing queen', 0),
    ('NO_QueenBee', 0),
    ('QueenBee', 1),
]
trsfrm_queen = transformers.StringMapper(queen_rules)

# The transformer is then applied to the source file names through the
# FromFileName labelizer. This labelizer does not provide label strength.
queen_provider = providers.FromFileName(trsfrm_queen)
n = ds.setLabel('queen', queen_provider)
iprint(n, "samples where processed for 'queen' label")

# The "nobee" label value comes from annotation files (.lab files sharing the
# base name of the audio source file they annotate), read through the
# FromAnnotation labelizer with no transformation.
# This labelizer takes 2 arguments:
# - a mandatory source path, pointing to the directory where the .lab files reside
# - an optional threshold: any "label" event with a duration under this
#   threshold is disregarded
# The label strength over a sample is computed by summing the duration of the
# "label" events (if > th) and dividing this sum by the sample duration.

# Here a 0.0s threshold is used
nobee_provider = providers.FromAnnotation(SOURCE_PATH, th=0.0)
n = ds.setLabel('nobee', nobee_provider)
iprint(n, "samples where processed for 'nobee' label")

# The StringMatcher transformer behaves differently from the StringMapper. It uses
# a regexp capture group to retrieve the part of a string matching a specific
# pattern. This can be used either for complex or very basic matching. Here we just
# ask for the first five chars, provided they belong to characters valid for
# identifiers (A-Z, a-z, 0-9 and underscore).
# The pattern is written as a raw string so that "\w" reaches the regexp engine
# verbatim instead of being parsed as an (invalid) string escape sequence.
ds.addAttribute('hive')
ds.setAttribute('hive', providers.FromFileName(transformers.StringMatcher(r"^(\w{5})")))

# Here an MFCC featurizer is used; its provider, FromSample, uses audio chunks
# as a source
ds.addFeature('mfcc20')
mfcc20_provider = providers.FromSample(ds.samples_path, featurizers.MFCC(n_mfcc=20))
ds.setFeature('mfcc20', mfcc20_provider)




# Display the dataset as a pandas dataframe.
# NOTE(review): the call is kept as a bare expression; presumably it is the last
# statement of a notebook cell, whose value the notebook renders as the cell
# output - confirm before assigning the result to a variable.
ds.dumpDataFrame()

[2020-08-14/09:55:58.550|13.2%|67.5%|0.26GB] Dataset creation. This may take up to 10 mn. Go grab a coffee.
[2020-08-14/09:55:58.551|00.0%|67.5%|0.26GB] >>>>> Starting Dataset DUO0500 build
[2020-08-14/09:55:58.579|00.0%|67.5%|0.26GB] Starting to process 4 audio files.
[2020-08-14/09:56:37.137|42.9%|65.5%|0.26GB] Creating Database
[2020-08-14/09:56:37.155|25.0%|65.5%|0.26GB] Database created
[2020-08-14/09:56:37.157|37.5%|65.5%|0.26GB] Please wait, computing checksum...
[2020-08-14/09:56:37.954|14.7%|66.4%|0.26GB]   Computed checksum 911ddb192d1d338cf187b0b58ecdca34
[2020-08-14/09:56:37.955|00.0%|66.4%|0.26GB]   Expected checksum e1de6cad8d2f2f15ec2df61c518af462
[2020-08-14/09:56:37.955|00.0%|66.4%|0.26GB] >>>>> Dataset DUO0500 successfully created.
[2020-08-14/09:56:37.955|00.0%|66.4%|0.26GB] ------------------------------------------------------
[2020-08-14/09:56:37.956|00.0%|66.4%|0.26GB] DATASET NAME          : DUO0500
[2020-08-14/09:56:37.956|00.0%|66.4%|0.26GB] DATASET PATH      

HBox(children=(FloatProgress(value=0.0, description='Annotating nobee', max=4.0, style=ProgressStyle(descripti…


[2020-08-14/09:56:39.086|14.6%|66.4%|0.26GB] 4748 samples where processed for 'nobee' label


HBox(children=(FloatProgress(value=0.0, description='Computing mfcc20', max=4748.0, style=ProgressStyle(descri…




Unnamed: 0,name,file_id,start_t,end_t,nobee,queen,hive,mfcc20
0,00-000000,1,0.0,0.5,0.0,1.0,Hive1,"[-408.8679504394531, -408.73077392578125, -409..."
1,00-000001,1,0.5,1.0,0.0,1.0,Hive1,"[-412.67633056640625, -409.9009704589844, -411..."
2,00-000002,1,1.0,1.5,0.0,1.0,Hive1,"[-423.73028564453125, -417.4234619140625, -409..."
3,00-000003,1,1.5,2.0,0.0,1.0,Hive1,"[-412.2464294433594, -414.35784912109375, -417..."
4,00-000004,1,2.0,2.5,0.0,1.0,Hive1,"[-402.0168762207031, -405.77630615234375, -413..."
...,...,...,...,...,...,...,...,...
4743,03-001179,4,589.5,590.0,0.0,1.0,Hive3,"[-438.46368408203125, -431.82635498046875, -44..."
4744,03-001180,4,590.0,590.5,0.0,1.0,Hive3,"[-444.8450927734375, -443.4344787597656, -448...."
4745,03-001181,4,590.5,591.0,0.0,1.0,Hive3,"[-455.8758544921875, -447.3699035644531, -445...."
4746,03-001182,4,591.0,591.5,0.0,1.0,Hive3,"[-449.1347961425781, -441.712890625, -444.2016..."
