In [1]:
import os
import gcsfs
import pandas as pd
from birdutils import read_labels

In [2]:
# Open the GCS bucket once; every read below goes through this filesystem handle.
fs = gcsfs.GCSFileSystem(project='birdman-project')

# Feature table from the warbler run with a 250 ms buffer (per the file name).
with fs.open('storm-petrels/samples/features/features_warbler_buffer_250ms.csv') as warbler_file:
    warbler_df = pd.read_csv(warbler_file)

# Label table in CSV form.
with fs.open('storm-petrels/samples/labels/petrels_all.csv') as labels_file:
    labels_df = pd.read_csv(labels_file)

# Label workbook, parsed by the project helper `read_labels`
# (its return shape is not visible here; it is later handed to pd.concat).
with fs.open('storm-petrels/samples/labels/sthelena_labels.xls') as workbook_file:
    labels_dict = read_labels(workbook_file)

In [3]:
# Distinct recording names, in order of first appearance. At this point
# labels_df is still the frame read from petrels_all.csv.
filenames = pd.unique(labels_df['File Name'])

In [4]:
# Stack everything returned by read_labels into one table with a fresh
# 0..n-1 index (presumably one DataFrame per sheet/recording; verify in birdutils).
# NOTE: this rebinds labels_df, replacing the frame loaded from petrels_all.csv.
labels_df = pd.concat(
    labels_dict,
    ignore_index=True,  # rows are stacked; axis=0 is pd.concat's default
)

In [5]:
def get_acc(features, labels, filenames):
    """Return the fraction of label rows hit by at least one feature row.

    A label row and a feature row "overlap" when they belong to the same
    recording and their closed intervals ``[Time Start, Time End]`` intersect
    (touching end points count as an overlap).  The score only looks at the
    labels: feature rows that match no label do not lower it.

    Side effects (kept so existing callers see the same frames afterwards):
      * every literal ``.wav`` is removed from ``features['sound.files']``;
      * a boolean ``overlap`` column is reset to ``False`` on both frames and
        then set to ``True`` on every row that takes part in an overlap.

    Parameters
    ----------
    features : pandas.DataFrame
        Needs the columns ``'sound.files'``, ``'Time Start'`` and ``'Time End'``.
    labels : pandas.DataFrame
        Needs the columns ``'File Name'``, ``'Time Start'`` and ``'Time End'``.
    filenames : iterable of str
        Recording names (without ``.wav``) to compare.  Rows belonging to
        other recordings are never marked but still count in the denominator.

    Returns
    -------
    float
        ``(# label rows with overlap) / len(labels)``; ``0.0`` when ``labels``
        has no rows (the ratio is undefined there and used to raise
        ``ZeroDivisionError``).
    """
    # regex=False: '.wav' must be treated literally.  As a regular expression
    # the dot matches any character, so e.g. 'xwav_rec.wav' would be mangled on
    # pandas versions where str.replace defaults to regex=True.
    features['sound.files'] = features['sound.files'].str.replace('.wav', '', regex=False)
    features['overlap'] = False
    labels['overlap'] = False

    if len(labels) == 0:
        return 0.0

    for filename in filenames:
        file_labels = labels[labels['File Name'] == filename]
        file_feats = features[features['sound.files'] == filename]
        if file_labels.empty or file_feats.empty:
            continue

        # Compare each label against all feature rows of the recording at once
        # instead of a nested Python loop over iterrows().
        feat_start = file_feats['Time Start'].to_numpy()
        feat_end = file_feats['Time End'].to_numpy()
        label_rows = zip(file_labels.index,
                         file_labels['Time Start'],
                         file_labels['Time End'])
        for label_idx, label_start, label_end in label_rows:
            hits = (label_start <= feat_end) & (label_end >= feat_start)
            if hits.any():
                labels.at[label_idx, 'overlap'] = True
                features.loc[file_feats.index[hits], 'overlap'] = True

    return float(labels['overlap'].sum()) / len(labels)

In [6]:
# Score the 250 ms-buffer warbler features against the labels; the cell
# output is the fraction of label rows overlapped by at least one feature row.
get_acc(
    features=warbler_df,
    labels=labels_df,
    filenames=filenames,
)

0.6726804123711341

In [7]:
# Display the unique 'File Name' values that were passed to get_acc.
filenames

array(['STHELENA-02_20140605_200000_1', 'STHELENA-02_20140605_200000_10',
       'STHELENA-02_20140605_200000_11', 'STHELENA-02_20140605_200000_12',
       'STHELENA-02_20140605_200000_13', 'STHELENA-02_20140605_200000_2',
       'STHELENA-02_20140605_200000_3', 'STHELENA-02_20140605_200000_4',
       'STHELENA-02_20140605_200000_5', 'STHELENA-02_20140605_200000_6',
       'STHELENA-02_20140605_200000_7', 'STHELENA-02_20140605_200000_8',
       'STHELENA-02_20140605_200000_9'], dtype=object)

In [8]:
# Score every feature CSV in the bucket folder against the same label table.
# The explicit '*.csv' wildcard matters: a pattern that merely ends in '/'
# is handled differently across fsspec versions (older ones list the folder,
# newer ones return only folders), so the bare directory is not reliable.
for feature_name in fs.glob('storm-petrels/samples/features/*.csv'):
    with fs.open(feature_name) as f:
        feature = pd.read_csv(f)
    name = os.path.basename(feature_name)
    # get_acc returns a fraction; scale it to a percentage for display.
    acc = get_acc(feature, labels_df, filenames) * 100
    print(f'{name}: {acc:.2f}%')

features_samples_ssmooth800_th3.csv: 63.92%
features_samples_ssmooth800_th4.csv: 78.35%
features_samples_th2.csv: 81.44%
features_samples_th3.csv: 76.55%
features_warbler_buffer_250ms.csv: 67.27%
features_warbler_no_buffer.csv: 66.75%


In [4]:
# List the feature CSVs available in the bucket folder.  An explicit wildcard
# is used because a glob pattern ending in '/' is treated differently across
# fsspec versions (directory listing vs. folders only).
fs.glob('storm-petrels/samples/features/*.csv')

['storm-petrels/samples/features/features_samples_ssmooth800_th3.csv',
 'storm-petrels/samples/features/features_samples_ssmooth800_th4.csv',
 'storm-petrels/samples/features/features_samples_th2.csv',
 'storm-petrels/samples/features/features_samples_th3.csv',
 'storm-petrels/samples/features/features_warbler_buffer_250ms.csv',
 'storm-petrels/samples/features/features_warbler_no_buffer.csv']