In [16]:
import warnings                            # This block prevents display of harmless warnings, but should be
warnings.filterwarnings('ignore')          # commented out till the final version, to avoid missing "real" warnings 

import kilroy_was_here                     # Mandatory. Allow access to shared python code from repository root
from audace.jupytools import iprint           # timstamped (to the ms) print with CPU and RAM consumption information
from audace.audiodataset import AudioDataset  # Class for audio dataset handling
from audace import providers
from audace import featurizers


# The dataset name is the master key used to address a dataset.
# Change it to match the dataset you want to process.
DATASET_NAME = 'DUO1000'

# Create the dataset object for that name, then display some information
# about the AudioDataset that was retrieved.
ds = AudioDataset(DATASET_NAME)
ds.info()

# Disabled feature computations, left commented out by the author:
# ds.addFeature('mfcc10')
# ds.setFeature('mfcc10', providers.FromSample(ds.samples_path, featurizers.MFCC(n_mfcc=10)))
# ds.addFeature('mfcc05')
# ds.setFeature('mfcc05', providers.FromSample(ds.samples_path, featurizers.MFCC(n_mfcc=5)))

# Declare the 'magic05' feature, then fill it from the sample files.
# NOTE(review): the meaning of the Magic(10, 1000, 10) arguments is not visible
# from this notebook - confirm against audace.featurizers.
ds.addFeature('magic05')
magic_featurizer = featurizers.Magic(10, 1000, 10)
magic_provider = providers.FromSample(ds.samples_path, magic_featurizer)
# Kept as the last expression so the notebook still displays the call's result.
ds.setFeature('magic05', magic_provider)


[2020-08-13/16:40:54.571|31.0%|84.8%|0.09GB] ------------------------------------------------------
[2020-08-13/16:40:54.572|00.0%|84.8%|0.09GB] DATASET NAME          : DUO1000
[2020-08-13/16:40:54.572|00.0%|84.8%|0.09GB] DATASET PATH          : D:\Jupyter\ShowBees\datasets\DUO1000
[2020-08-13/16:40:54.572|00.0%|84.8%|0.09GB] DATASET DB PATH       : D:\Jupyter\ShowBees\datasets\DUO1000\database.db
[2020-08-13/16:40:54.574|00.0%|84.8%|0.09GB] DATASET SAMPLES PATH  : D:\Jupyter\ShowBees\datasets\DUO1000\samples
[2020-08-13/16:40:54.574|00.0%|84.8%|0.09GB] NB SOURCE AUDIO FILES : 4
[2020-08-13/16:40:54.574|00.0%|84.8%|0.09GB] SAMPLE RATE           : 22050
[2020-08-13/16:40:54.574|00.0%|84.8%|0.09GB] DURATION              : 1.0
[2020-08-13/16:40:54.574|00.0%|84.8%|0.09GB] OVERLAP               : 0.0
[2020-08-13/16:40:54.575|00.0%|84.8%|0.09GB] NB AUDIO CHUNKS       : 2374
[2020-08-13/16:40:54.575|00.0%|84.8%|0.09GB] ------------------------------------------------------


HBox(children=(FloatProgress(value=0.0, description='Computing magic05', max=2374.0, style=ProgressStyle(descr…




2374

In [4]:
# Display cardinalities by hive and queen/noqueen, restricted to rows where
# nobee = 0 (per the original note: samples with no external perturbation).
# GROUP BY already yields exactly one row per (hive, queen) pair, so DISTINCT
# is unnecessary; ordering on both keys makes the row order fully determined
# instead of leaving the order inside each hive up to the database engine.
sql = """
    select hive, queen, count(*)
    from samples
    where nobee = 0
    group by hive, queen
    order by hive, queen
    """
ds.queryDataFrame(sql)

Unnamed: 0,hive,queen,count(*)
0,Hive1,0.0,409
1,Hive1,1.0,391
2,Hive3,0.0,406
3,Hive3,1.0,304


In [5]:
# Get a pandas dataframe holding, for every sample with nobee = 0, its name,
# source file id, hive, queen and hivequeen labels, and the mfcc20 feature.
sql = "select name, file_id, hive, queen, hivequeen, mfcc20 from samples where nobee = 0"
df = ds.queryDataFrame(sql)
# Bare last expression: the notebook renders the frame below the cell.
df

Unnamed: 0,name,file_id,hive,queen,hivequeen,mfcc20
0,00-000000,1,Hive1,1.0,Hive1-1.0,"[-408.8679504394531, -408.73077392578125, -409..."
1,00-000001,1,Hive1,1.0,Hive1-1.0,"[-423.73028564453125, -417.4234619140625, -409..."
2,00-000002,1,Hive1,1.0,Hive1-1.0,"[-402.0168762207031, -405.77630615234375, -413..."
3,00-000003,1,Hive1,1.0,Hive1-1.0,"[-415.6734619140625, -411.8334655761719, -410...."
4,00-000004,1,Hive1,1.0,Hive1-1.0,"[-410.3650817871094, -402.5377197265625, -407...."
...,...,...,...,...,...,...
1505,03-000587,4,Hive3,1.0,Hive3-1.0,"[-435.4527893066406, -431.11077880859375, -440..."
1506,03-000588,4,Hive3,1.0,Hive3-1.0,"[-462.9355163574219, -454.59075927734375, -456..."
1507,03-000589,4,Hive3,1.0,Hive3-1.0,"[-456.2651672363281, -455.3929138183594, -456...."
1508,03-000590,4,Hive3,1.0,Hive3-1.0,"[-444.8450927734375, -443.4344787597656, -448...."


In [4]:
# Run the query through ds.query; the recorded output shows one tuple per
# matching row.
# NOTE(review): every matching row is listed, which floods the cell output -
# consider adding a LIMIT clause while exploring.
ds.query("select name, file_id, hive, queen from samples where nobee = 0")

('00-000000', 1, 'Hive1', 1.0)
('00-000001', 1, 'Hive1', 1.0)
('00-000002', 1, 'Hive1', 1.0)
('00-000003', 1, 'Hive1', 1.0)
('00-000004', 1, 'Hive1', 1.0)
('00-000005', 1, 'Hive1', 1.0)
('00-000006', 1, 'Hive1', 1.0)
('00-000007', 1, 'Hive1', 1.0)
('00-000008', 1, 'Hive1', 1.0)
('00-000009', 1, 'Hive1', 1.0)
('00-000010', 1, 'Hive1', 1.0)
('00-000011', 1, 'Hive1', 1.0)
('00-000012', 1, 'Hive1', 1.0)
('00-000013', 1, 'Hive1', 1.0)
('00-000014', 1, 'Hive1', 1.0)
('00-000015', 1, 'Hive1', 1.0)
('00-000016', 1, 'Hive1', 1.0)
('00-000017', 1, 'Hive1', 1.0)
('00-000018', 1, 'Hive1', 1.0)
('00-000019', 1, 'Hive1', 1.0)
('00-000020', 1, 'Hive1', 1.0)
('00-000021', 1, 'Hive1', 1.0)
('00-000022', 1, 'Hive1', 1.0)
('00-000023', 1, 'Hive1', 1.0)
('00-000024', 1, 'Hive1', 1.0)
('00-000025', 1, 'Hive1', 1.0)
('00-000026', 1, 'Hive1', 1.0)
('00-000027', 1, 'Hive1', 1.0)
('00-000028', 1, 'Hive1', 1.0)
('00-000029', 1, 'Hive1', 1.0)
('00-000030', 1, 'Hive1', 1.0)
('00-000031', 1, 'Hive1', 1.0)
('00-000

In [19]:
import csv
from pathlib import Path



# Iterating over one column - `f` is some function that processes your data
# result = [f(x) for x in df['col']]
# Iterating over two columns, use `zip`
# result = [f(x, y) for x, y in zip(df['col1'], df['col2'])]
# Iterating over multiple columns - same data type
#result = [f(row[0], ..., row[n]) for row in df[['col1', ...,'coln']].to_numpy()]
# Iterating over multiple columns - differing data type
# result = [f(row[0], ..., row[n]) for row in zip(df['col1'], ..., df['coln'])]



def exportTSV(ds, sql, output_dir, label_names, feature_names):
    """Export the labels and features returned by a query as two TSV files.

    The query is run through ``ds.queryDataFrame(sql)``; the resulting frame
    must contain every column named in ``label_names`` and ``feature_names``.

    Two files are written into ``output_dir`` (created if missing, including
    parent directories):

    * ``<ds.ds_name>_labels_<labels>.tsv``: one line per row holding the label
      column(s). A header line is written only when more than one label column
      is requested.
      NOTE(review): presumably the consumer expects header-less single-column
      label files - confirm against the tool that reads them.
    * ``<ds.ds_name>_features_<features>.tsv``: one line per row, without
      header, holding the tab-separated values of the feature vector(s).

    ``<labels>`` / ``<features>`` are the given name, or the names joined with
    ``_`` when a list is given.

    Args:
        ds: Dataset object exposing ``queryDataFrame(sql)`` and ``ds_name``.
        sql: Query selecting at least the label and feature columns.
        output_dir: Destination directory (str or path-like).
        label_names: A column name, or a list of column names.
        feature_names: A column name, or a list of column names. Each cell of
            a feature column must be an iterable of values. With a list, the
            vectors of all listed columns are concatenated, in list order, on
            each output line.

    Returns:
        None.
    """
    output_path = Path(output_dir)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    output_path.mkdir(parents=True, exist_ok=True)

    df = ds.queryDataFrame(sql)

    # ---- Labels -----------------------------------------------------------
    multiple_labels = isinstance(label_names, list)
    str_labels = '_'.join(label_names) if multiple_labels else label_names
    labels_output_path = Path(output_path, ds.ds_name + '_labels_' + str_labels + '.tsv')

    df[label_names].to_csv(
        labels_output_path,
        sep='\t',
        index=False,
        # Header only when there are several label columns to tell apart.
        header=(multiple_labels and len(label_names) > 1)
    )

    # ---- Features ---------------------------------------------------------
    multiple_features = isinstance(feature_names, list)
    str_features = '_'.join(feature_names) if multiple_features else feature_names
    features_output_path = Path(output_path, ds.ds_name + '_features_' + str_features + '.tsv')

    with open(features_output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        if multiple_features:
            # Iterating a DataFrame yields its column NAMES, not its rows, so
            # walk the selected columns in lockstep and concatenate the
            # per-column vectors of each row into a single output line.
            feature_columns = [df[name] for name in feature_names]
            for vectors in zip(*feature_columns):
                writer.writerow([value for vector in vectors for value in vector])
        else:
            # Single column: iterating the Series yields one vector per row.
            for vector in df[feature_names]:
                writer.writerow(vector)

    
# Every export targets the same dataset and the same output directory; only
# the query, the label column(s) and the feature column vary. The jobs are
# listed as (sql, label_names, feature_names) and run in the order given.
export_jobs = [
    (
        "select hive, queen, mfcc20 from samples where nobee = 0",
        'hive',
        'mfcc20',
    ),
    (
        "select hive, queen, mfcc20 from samples where nobee = 0",
        ['hive', 'queen'],
        'mfcc20',
    ),
    (
        "select hive||'_'||queen as hivequeen, mfcc20 from samples where nobee = 0",
        'hivequeen',
        'mfcc20',
    ),
    (
        "select hive, queen, mfcc20 from samples where nobee = 0",
        'queen',
        'mfcc20',
    ),
    (
        "select queen, mfcc10 from samples where nobee = 0",
        'queen',
        'mfcc10',
    ),
    (
        "select queen, mfcc05 from samples where nobee = 0",
        'queen',
        'mfcc05',
    ),
    (
        "select queen, magic05 from samples where nobee = 0",
        'queen',
        'magic05',
    ),
]

for export_sql, export_labels, export_feature in export_jobs:
    exportTSV(ds, export_sql, "./output", export_labels, export_feature)
    