In [1]:
import os
import sys
import glob
import numpy as np
import librosa
import librosa.display
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import wavfile
from aubio import onset
from scipy.signal import butter, lfilter
from multiprocessing import Pool, cpu_count

import signalproc
from birdutils import read_labels

%matplotlib inline

In [2]:
def find_overlap(features, labels):
    """Flag detected windows and annotated calls that intersect in time.

    Rows are only compared when they share the same 'File Name'. Two
    intervals count as overlapping when
    ``label_start <= feat_end and label_end >= feat_start``, i.e. both
    bounds are inclusive, so intervals that merely touch are a match.

    Flags are written by row *position*, not by index label, so the result
    is correct even when ``features`` or ``labels`` carries duplicate index
    labels (e.g. a ``pd.concat`` of per-file frames without
    ``ignore_index=True``).

    Parameters
    ----------
    features : pandas.DataFrame
        Needs the columns 'File Name', 'Time Start' and 'Time End'.
    labels : pandas.DataFrame
        Needs the same three columns.

    Returns
    -------
    (features, labels)
        The same two objects that were passed in; each has gained (or had
        overwritten) a boolean 'overlap' column. The inputs are modified
        in place.
    """
    feat_hit = np.zeros(len(features), dtype=bool)
    label_hit = np.zeros(len(labels), dtype=bool)

    feat_start = features['Time Start'].to_numpy()
    feat_end = features['Time End'].to_numpy()
    label_start = labels['Time Start'].to_numpy()
    label_end = labels['Time End'].to_numpy()

    for filename in labels['File Name'].unique():
        # Positional row numbers of this file's rows in each frame.
        label_pos = np.flatnonzero((labels['File Name'] == filename).to_numpy())
        feat_pos = np.flatnonzero((features['File Name'] == filename).to_numpy())
        if label_pos.size == 0 or feat_pos.size == 0:
            continue

        # (n_labels, n_features) matrix of pairwise interval intersections.
        hits = (
            (label_start[label_pos][:, None] <= feat_end[feat_pos][None, :])
            & (label_end[label_pos][:, None] >= feat_start[feat_pos][None, :])
        )
        label_hit[label_pos] |= hits.any(axis=1)
        feat_hit[feat_pos] |= hits.any(axis=0)

    features['overlap'] = feat_hit
    labels['overlap'] = label_hit
    return features, labels

In [3]:
def find_onsets(path, sr=16000, lowcut=200, highcut=5000, fmax=8000, duration=0.8):
    """Detect onsets in one recording and return them as fixed-length windows.

    The audio is loaded with librosa at ``sr``, passed through
    ``signalproc.bandpass_filter``, and turned into an onset-strength
    envelope computed from a 128-band mel spectrogram (mean-aggregated,
    detrended, ``center=False``). Onsets are then picked from that envelope
    with backtracking, de-duplicated, and each is extended by ``duration``
    seconds to form a window.

    The defaults reproduce the previously hard-coded settings, so
    ``find_onsets(path)`` behaves as before (and still works with
    ``Pool.map``).

    Parameters
    ----------
    path : str
        Path of the audio file to analyse.
    sr : int
        Sample rate requested from ``librosa.load``.
    lowcut, highcut : number
        Passed as the third and fourth arguments of
        ``signalproc.bandpass_filter`` (presumably the pass-band edges in
        Hz -- confirm against that helper).
    fmax : number
        Upper frequency handed to the mel spectrogram. NOTE(review): should
        not exceed ``sr / 2`` when ``sr`` is changed.
    duration : float
        Length in seconds added to each onset time to obtain 'Time End'.

    Returns
    -------
    pandas.DataFrame
        One row per detected onset with the columns 'File Name' (file base
        name without extension), 'Time Start' and 'Time End' (seconds).
    """
    name = os.path.splitext(os.path.basename(path))[0]
    y, sr = librosa.load(path, sr=sr)
    y = signalproc.bandpass_filter(y, sr, lowcut, highcut)
    onset_env = librosa.onset.onset_strength(
        y=y,
        sr=sr,
        feature=librosa.feature.melspectrogram,
        n_mels=128,
        fmax=fmax,
        aggregate=np.mean,
        detrend=True,
        center=False,
    )
    onset_s = librosa.onset.onset_detect(
        onset_envelope=onset_env,
        sr=sr,
        units='time',
        backtrack=True,
        hop_length=512,
    )
    # np.unique drops repeated onset times and returns them sorted.
    onset_s = np.unique(onset_s)
    onset_end_s = onset_s + duration
    return pd.DataFrame(
        data={'File Name': name, 'Time Start': onset_s, 'Time End': onset_end_s}
    )

In [4]:
# Annotation spreadsheet and the directory holding the recordings to analyse.
bird_calls_labels_path = '/mnt/data/Birdman/sthelena_labels.xls'
samples_dir = '/mnt/data/Birdman/samples/recordings/'

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the order of the recordings (and of everything derived from it) reproducible.
samples_paths = sorted(
    glob.glob(os.path.join(samples_dir, 'STHELENA-02_20140605_200000*.wav'))
)

# read_labels is a project helper; its result is concatenated row-wise into a
# single frame with a fresh 0..n-1 index.
# NOTE(review): presumably a dict of DataFrames -- confirm against birdutils.
labels_dict = read_labels(bird_calls_labels_path)
labels = pd.concat(labels_dict, axis=0, ignore_index=True)

In [5]:
# Run onset detection on every recording in parallel, one worker per CPU.
# pool.map keeps the results in the same order as samples_paths.
with Pool(processes=cpu_count()) as pool:
    onset_all = pool.map(find_onsets, samples_paths)

In [6]:
# Each per-file frame is indexed 0..n-1; ignore_index=True gives the combined
# frame a unique index so that label-based writes (e.g. DataFrame.at) address
# exactly one row instead of every row sharing that label.
onsets_df = pd.concat(onset_all, ignore_index=True)
print(len(onsets_df))

2870


In [7]:
# Mark detected windows and annotated calls that intersect in time.
feat, lab = find_overlap(features=onsets_df, labels=labels)
# Share of annotated calls hit by at least one detected window.
sum(lab['overlap']) / lab.shape[0]

0.9226804123711341