In [1]:
import warnings                            # This block prevents display of harmless warnings, but should be
warnings.filterwarnings('ignore')          # commented out till the final version, to avoid missing "real" warnings 

import kilroy_was_here                     # Mandatory. Allow access to shared python code from repository root
from lib.jupytools import iprint           # timstamped (to the ms) print with CPU and RAM consumption information
from lib.audiodataset import AudioDataset  # Class for audio dataset handling
from lib import labelizers
from lib import attributors
from lib import transformers


# Directory holding the initial annotated dataset (audio files and their .lab annotation files).
# NOTE(review): this is an absolute, machine-specific path — adjust it to your local copy.
SOURCE_PATH ='D:/datasets/sounds/Nolasco'

# The dataset name is the master key used to address the dataset
DATASET_NAME = 'MAIN1000'

# Initialize the dataset object: try to create it first, fall back to retrieval if it exists.
try:
    # Providing a source path implicitly expresses the intent to CREATE the dataset.
    # The creation is run with a pool of 4 processes.
    iprint("Dataset creation. This may take up to 10 mn. Go grab a coffee.")
    ds = AudioDataset(DATASET_NAME, SOURCE_PATH, nprocs=4)
    
except FileExistsError:
    # To allow reruns, we catch the exception raised when the dataset was already created.
    # Ideally, you should create the dataset once and for all in a dedicated notebook,
    # and then retrieve it from other notebooks when needed.
    # Here, by not providing a source path, we implicitly express the intent of RETRIEVING
    # an existing dataset rather than CREATING a new one.
    iprint("Failing back to existing dataset retrieval")
    ds = AudioDataset(DATASET_NAME)
    iprint("Dataset retrieved")
    
# Print some information about the newly created (or retrieved) AudioDataset object
ds.info()

# Declare the two labels before assigning any value to them
ds.addLabel("nobee")
ds.addLabel("queen")

# The "queen" label value is deduced from the source file name, using a StringMapper transformer.
# This transformer iterates over a list of 2-tuples (regular expression, target value) and returns
# the target value as soon as a match is found. Thus, you must order the list from strictest to
# loosest pattern (e.g. 'NO_QueenBee' must come before 'QueenBee', which it contains).
trsfrm_queen = transformers.StringMapper(
        [('(?i)active', 1), 
         ('(?i)missing queen', 0),
         ('NO_QueenBee', 0),
         ('QueenBee', 1)     
        ])

# The transformer is then applied to the source file names, using the FromFileName labelizer.
# This labelizer does not provide label strength.

n = ds.setLabel('queen', labelizers.FromFileName(trsfrm_queen))
iprint(n, "samples where processed for 'queen' label")

# The "nobee" label value comes from annotation files (.lab files sharing the base name of the audio
# source file they annotate), using the FromAnnotation labelizer, with no transformation.
# This labelizer takes 2 arguments:
# - a mandatory source path, pointing to the directory where the .lab files reside
# - an optional threshold, allowing to disregard any "label" event with a duration under this threshold
# The label strength over a sample is computed by summing the durations of the "label" events
#   (those above the threshold) and dividing this sum by the sample duration.
# Here we use a 0.5 threshold.
 
n = ds.setLabel('nobee', labelizers.FromAnnotation(SOURCE_PATH, th=0.5))
iprint(n, "samples where processed for 'nobee' label")

# The StringMatcher transformer behaves differently from the StringMapper: it uses a regexp
# capture group to retrieve the part of a string matching a specific pattern. This can be used
# for either complex or very basic matching. Here we just ask for the first five characters,
# provided they are all valid identifier characters (A-Z, a-z, 0-9 and underscore).
ds.addAttribute('hive')
# The pattern is written as a raw string so that "\w" reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape sequence.
ds.setAttribute('hive', attributors.FromFileName(transformers.StringMatcher(r"^(\w{5})")))

# Display the dataset as a pandas dataframe
ds.getDataFrame()

[2020-08-07/16:38:42.125|15.4%|71.7%|0.25GB] Dataset creation. This may take up to 10 mn. Go grab a coffee.
[2020-08-07/16:38:42.125|00.0%|71.7%|0.25GB] The dataset directory (D:\Jupyter\ShowBees\datasets\MAIN1000) already exists.
[2020-08-07/16:38:42.125|00.0%|71.7%|0.25GB] If you really intent to CREATE this dataset, please erase this directory first
[2020-08-07/16:38:42.125|00.0%|71.7%|0.25GB] ### ABORTING! ###
[2020-08-07/16:38:42.125|00.0%|71.7%|0.25GB] Failing back to existing dataset retrieval
[2020-08-07/16:38:42.133|12.5%|71.7%|0.25GB] Dataset retrieved
[2020-08-07/16:38:42.133|00.0%|71.7%|0.25GB] ------------------------------------------------------
[2020-08-07/16:38:42.133|00.0%|71.7%|0.25GB] DATASET PATH          : D:\Jupyter\ShowBees\datasets\MAIN1000
[2020-08-07/16:38:42.133|00.0%|71.7%|0.25GB] DATASET DB PATH       : D:\Jupyter\ShowBees\datasets\MAIN1000\database.db
[2020-08-07/16:38:42.133|00.0%|71.7%|0.25GB] DATASET SAMPLES PATH  : D:\Jupyter\ShowBees\datasets\MAIN100

Unnamed: 0,parent_id,name,file_id,augment_id,start_t,end_t,hive,nobee,queen
0,0,00-000000,1,,0.0,1.0,CF001,0.00,0.0
1,0,00-000001,1,,1.0,2.0,CF001,0.00,0.0
2,0,00-000002,1,,2.0,3.0,CF001,0.00,0.0
3,0,00-000003,1,,3.0,4.0,CF001,0.00,0.0
4,0,00-000004,1,,4.0,5.0,CF001,0.00,0.0
...,...,...,...,...,...,...,...,...,...
24783,0,47-000587,48,,587.0,588.0,Hive3,0.00,1.0
24784,0,47-000588,48,,588.0,589.0,Hive3,0.85,1.0
24785,0,47-000589,48,,589.0,590.0,Hive3,1.00,1.0
24786,0,47-000590,48,,590.0,591.0,Hive3,1.00,1.0


In [14]:
# NOTE(review): a 'date' attribute is declared here, but the call below populates 'hive'
# (with a five-character prefix pattern), so 'date' is never given a value in this cell.
# This looks like an unfinished copy/paste — confirm the intended attribute and pattern.
ds.addAttribute('date')
# Raw string so that "\w" is passed verbatim to the regex engine rather than being
# treated as an (invalid) Python string escape sequence.
ds.setAttribute('hive', attributors.FromFileName(transformers.StringMatcher(r"^(\w{5})")))

24788

In [17]:
# Custom SQL query passed to getDataFrame():
# - joins each sample with its attributes and labels rows (sample_id = samples.rowid)
# - keeps only the samples whose 'hive' attribute is 'Hive1'
# - adds b_nobee, a binary version of the 'nobee' label strength (0 below 0.5, 1 otherwise)
# NOTE(review): iif() is only available in recent SQLite releases (3.32+ according to the
# SQLite documentation) — confirm the SQLite version used by the runtime.
sql = """
select
    s.rowid,
    s.name,
    s.file_id,
    a.hive,
    l.nobee,
    iif(l.nobee < 0.5, 0, 1) as b_nobee, -- using sqlite builtin function 
    l.queen
from samples s, attributes a, labels l
where a.sample_id = s.rowid
and l.sample_id = s.rowid
and a.hive = 'Hive1'
"""
# Last expression of the cell: the resulting dataframe is displayed
ds.getDataFrame(sql)

Unnamed: 0,rowid,name,file_id,hive,nobee,b_nobee,queen
0,8840,21-000000,22,Hive1,0.0,0,1.0
1,8841,21-000001,22,Hive1,0.0,0,1.0
2,8842,21-000002,22,Hive1,0.0,0,1.0
3,8843,21-000003,22,Hive1,0.0,0,1.0
4,8844,21-000004,22,Hive1,0.0,0,1.0
...,...,...,...,...,...,...,...
5953,14793,30-000593,31,Hive1,0.0,0,0.0
5954,14794,30-000594,31,Hive1,0.0,0,0.0
5955,14795,30-000595,31,Hive1,0.0,0,0.0
5956,14796,30-000596,31,Hive1,0.0,0,0.0


In [39]:
# File ids interpolated into the "file_id in (...)" SQL filter at the end of this cell
l = (1, 4)
# Attribute name; not referenced anywhere else in this cell
a = "hive"


def test(*args):
    """Print each positional argument on its own line.

    The parameter collects *positional* arguments (it was previously named
    ``kwargs``, which wrongly suggested keyword arguments).

    Args:
        *args: any number of objects to print.

    Returns:
        None.
    """
    for arg in args:
        print(arg)

def getOnAttribute(dict):
    """Print one line per entry of a mapping: its position, its key and its value.

    Args:
        dict: mapping to enumerate. The parameter name shadows the builtin
            ``dict``; it is kept as is so that keyword callers keep working.

    Returns:
        None.
    """
    for position, entry in enumerate(dict.items()):
        key, value = entry
        print(position, key, value)

        
# Quick checks of the two helpers defined above
test({'toto':'a', 'titi':'c'})

getOnAttribute({'toto':('a', 'b'), 'titi':('c','d')})

# Build the IN list explicitly rather than relying on str(tuple): a one-element tuple
# renders as "(1,)", which is not valid SQL. Coercing each id to int also ensures that
# only integers can end up interpolated into the query text.
id_list = ", ".join(str(int(file_id)) for file_id in l)
sql = F"select rowid, file_id, name from samples where file_id in ({id_list})"
ds.getDataFrame(sql)

{'toto': 'a', 'titi': 'c'}
0 toto ('a', 'b')
1 titi ('c', 'd')


Unnamed: 0,rowid,file_id,name
0,1,1,00-000000
1,2,1,00-000001
2,3,1,00-000002
3,4,1,00-000003
4,5,1,00-000004
...,...,...,...
310,3735,4,03-000295
311,3736,4,03-000296
312,3737,4,03-000297
313,3738,4,03-000298
