In [1]:
import os                                  # Standard library: access to environment variables
import warnings                            # This block prevents display of harmless warnings, but should be
warnings.filterwarnings('ignore')          # commented out till the final version, to avoid missing "real" warnings

import kilroy_was_here                        # Mandatory. Allow access to shared python code from repository root
from audace.jupytools import iprint           # timstamped (to the ms) print with CPU and RAM consumption information
from audace.audiodataset import AudioDataset  # Class for audio dataset handling
from audace import providers
from audace import transformers
from audace import featurizers


# Path where to find the initial annotated dataset (audio and .lab files).
# This location is machine-specific, so it can be overridden by setting the
# AUDACE_SOURCE_PATH environment variable before starting the kernel. When the
# variable is unset (or empty), the historical default location is used.
SOURCE_PATH = os.environ.get('AUDACE_SOURCE_PATH') or '/Users/jpg/Documents/Nolasco'

# Dataset name is the master key for dataset addressing. Change it according to the
# dataset you want to generate
DATASET_NAME = 'MAIN1000'

          #############################
########### Initialize Dataset Object #
          #############################
def _create_or_retrieve_dataset(name, source_path, nprocs):
    """Return the AudioDataset called `name`, creating it when needed.

    Giving AudioDataset a source path expresses the intent to CREATE the
    dataset (here with a pool of `nprocs` processes). If the dataset was
    already created, FileExistsError is caught and the existing dataset is
    RETRIEVED instead by omitting the source path, so the cell can be rerun.
    Ideally the dataset is created once in a dedicated notebook and simply
    retrieved from the other notebooks.
    """
    try:
        iprint("Dataset creation. This may take up to 10 mn. Go grab a coffee.")
        return AudioDataset(name, source_path, nprocs=nprocs)
    except FileExistsError:
        iprint("Failing back to existing dataset retrieval")
        dataset = AudioDataset(name)
        iprint("Dataset retrieved")
        return dataset


# Create the dataset with a pool of 4 processes, or retrieve it if it already exists
ds = _create_or_retrieve_dataset(DATASET_NAME, SOURCE_PATH, nprocs=4)

# Display AudioDataset summary
ds.info()

          ##################
########### Compute Labels #
          ##################
# Declare the labels attached to every sample: "nobee" first, then "queen"
for label_name in ("nobee", "queen"):
    ds.addLabel(label_name)

# --- "queen" label -----------------------------------------------------------
# Its value is deduced from the source file name with a StringMapper transformer.
# The mapper goes through an ordered list of (regular expression, target value)
# pairs and returns the target value of the first pattern that matches, which is
# why the rules must be ordered from the strictest to the loosest.
queen_rules = [
    ('(?i)active', 1),
    ('(?i)missing queen', 0),
    ('NO_QueenBee', 0),
    ('QueenBee', 1),
]
trsfrm_queen = transformers.StringMapper(queen_rules)

# The FromFileName labelizer applies the transformer to the source file names.
# This labelizer does not provide label strength.
queen_provider = providers.FromFileName(trsfrm_queen)
n = ds.setLabel('queen', queen_provider)
iprint(n, "samples were processed for 'queen' label")

# --- "nobee" label -----------------------------------------------------------
# Its value comes from the annotation files (.lab files sharing the base name of
# the audio source file they annotate), read by the FromAnnotation labelizer
# without any transformation. The labelizer takes two arguments:
# - a mandatory source path, pointing to the directory holding the .lab files
# - an optional threshold; "label" events shorter than this threshold are ignored
# Label strength over a sample is the summed duration of the "label" events
# (those above the threshold) divided by the sample duration.
# A 0.0s threshold is used here.
nobee_provider = providers.FromAnnotation(SOURCE_PATH, th=0.0)
n = ds.setLabel('nobee', nobee_provider)
iprint(n, "samples were processed for 'nobee' label")


          ######################
########### Compute Attributes #
          ######################
# The StringMatcher transformer behaves differently from the StringMapper. It uses a regexp
# capture group to retrieve the part of a string matching a specific pattern. This can be used
# either for complex or very basic matching. Here we just ask for the first five chars,
# provided they are word characters (letters, digits and underscore).
# The pattern is a raw string so that the regex escape \w is passed through unchanged:
# in a plain string literal "\w" is an invalid escape sequence, which Python reports
# with a warning.
ds.addAttribute('hive')
ds.setAttribute('hive', providers.FromFileName(transformers.StringMatcher(r"^(\w{5})")))


# Compute attribute fold from hive: each hive identifier is decoded into a fold number
# (the six hives are spread over folds 1 to 4)
ds.addAttribute('fold')

e = {'CF001':1, 'CF003':1, 'CJ001':2, 'GH001':2, 'Hive1':3, 'Hive3': 4}
ds.setAttribute('fold', providers.FromQuery('hive', transformers.Decode(e)))


# Display dataset as a pandas dataframe
ds.dumpDataFrame()


[2020-08-18/14:59:34.289|14.5%|69.5%|0.26GB] Dataset creation. This may take up to 10 mn. Go grab a coffee.
[2020-08-18/14:59:34.290|00.0%|69.5%|0.26GB] >>>>> Starting Dataset MAIN1000 build
[2020-08-18/14:59:34.314|40.0%|69.5%|0.26GB] Starting to process 48 audio files.
[2020-08-18/15:05:58.900|48.8%|57.1%|0.26GB] Creating Database
[2020-08-18/15:05:58.999|62.5%|57.0%|0.26GB] Database created
[2020-08-18/15:05:59.003|37.5%|56.9%|0.26GB] Please wait, computing checksum...
[2020-08-18/15:06:08.284|11.9%|59.1%|0.26GB]   Computed checksum 964a87370f449298e0ef681efe6094bb
[2020-08-18/15:06:08.285|00.0%|59.1%|0.26GB]   Expected checksum 1ba14f84b713fcaf1c7dccff8b1e36a7
[2020-08-18/15:06:08.285|00.0%|59.1%|0.26GB] >>>>> Dataset MAIN1000 successfully created.
[2020-08-18/15:06:08.285|00.0%|59.1%|0.26GB] ------------------------------------------------------
[2020-08-18/15:06:08.285|00.0%|59.1%|0.26GB] DATASET NAME          : MAIN1000
[2020-08-18/15:06:08.285|00.0%|59.1%|0.26GB] DATASET PATH  

HBox(children=(FloatProgress(value=0.0, description='Annotating nobee', max=48.0, style=ProgressStyle(descript…


[2020-08-18/15:06:15.653|18.1%|55.4%|0.21GB] 24788 samples were processed for 'nobee' label


Unnamed: 0,hive,queen,count(*)
0,CF001,0.0,14
1,CF003,1.0,3649
2,CJ001,0.0,790
3,GH001,1.0,1396
4,Hive1,0.0,1473
5,Hive1,1.0,2684
6,Hive3,0.0,6545
7,Hive3,1.0,654


In [2]:
# Display cardinalities by fold attribute and queen label for samples with no external
# perturbation (nobee = 0).
# GROUP BY already returns a single row per (fold, queen) pair, so DISTINCT is not needed.
# Ordering on both grouping columns makes the row order deterministic within each fold.
sql = """
    select fold, queen, count(*)
    from samples
    where nobee = 0
    group by fold, queen
    order by fold, queen
    """
ds.queryDataFrame(sql)

Unnamed: 0,fold,queen,count(*)
0,1,0.0,14
1,1,1.0,3649
2,2,0.0,790
3,2,1.0,1396
4,3,0.0,1473
5,3,1.0,2684
6,4,0.0,6545
7,4,1.0,654
