In [1]:
import os
import glob
import gcsfs
import numpy as np
import pandas as pd

In [3]:
# Open the project's GCS filesystem and read both input tables from it.
fs = gcsfs.GCSFileSystem(project='birdman-project')

# warbleR feature table (the file name suggests a 250 ms buffer).
with fs.open('storm-petrels/samples/features/features_warbler_buffer_250ms.csv') as features_file:
    warbler_df = pd.read_csv(features_file)

# Label table, read from the "labels_overlap" CSV.
with fs.open('storm-petrels/samples/labels/sthelena_labels_overlap.csv') as labels_file:
    labels_df = pd.read_csv(labels_file)

In [None]:
# Preview the first five rows of the label table.
labels_df.head(n=5)

In [None]:
# Preview the first five rows of the warbleR feature table.
warbler_df.head(n=5)

In [None]:
# Rename warbleR's 'sound.files' column to 'File Name', the column name the
# later cells select from this frame.
warbler_df = warbler_df.rename({'sound.files': 'File Name'}, axis='columns')

In [None]:
# Boolean selector for warbleR rows whose ground-truth flag is set.
warbler_petrels_mask = warbler_df['storm_petrel_ground_truth'].astype(bool)

# Labelled storm petrels whose `overlap` flag is False.
extra_petrels_mask_labels = (
    labels_df['Species'].eq('Storm Petrel') & ~labels_df['overlap']
)

# Ground-truth petrel rows in the warbleR table whose `overlap` flag is False.
extra_petrels_mask_warbler = (
    warbler_df['storm_petrel_ground_truth'] & ~warbler_df['overlap']
)

print('No of petrels calls NOT found by warbleR:', sum(extra_petrels_mask_labels))
print('No of petrels calls NOT found in labels:', sum(extra_petrels_mask_warbler))

In [None]:
# Columns kept for every petrel segment.
column_selection = ['File Name', 'Time Start', 'Time End', 'duration']

# Select the ground-truth petrel rows and the wanted columns in one .loc call
# (avoids chained indexing). No rename is needed here: the selection above can
# only contain the columns in `column_selection`, so 'sound.files' is never
# present in the result.
warbler_petrels = warbler_df.loc[warbler_petrels_mask, column_selection]

In [None]:
# Preview the first five selected warbleR petrel rows.
warbler_petrels.head(n=5)

In [None]:
# Fraction of label rows whose `overlap` flag is set.
sum(labels_df['overlap']) / labels_df.shape[0]

In [None]:
# Take the labelled petrels selected by `extra_petrels_mask_labels`, derive a
# `duration` column from the start/end times, and keep only the columns in
# `column_selection`.
extra_petrels_labels = (
    labels_df.loc[extra_petrels_mask_labels]
    .assign(duration=lambda calls: calls['Time End'] - calls['Time Start'])
    [column_selection]
)

In [None]:
# Preview the first ten label-only petrel rows.
extra_petrels_labels.head(n=10)

In [None]:
# Stack the warbleR petrel rows on top of the label-only petrel rows.
# NOTE(review): both inputs keep their own index labels (no ignore_index), so
# labels may repeat in `df` - confirm whether the `idx` column produced by
# split_long needs to be unique.
df = pd.concat((warbler_petrels, extra_petrels_labels))

In [None]:
def split_long(df, max_len):
    """Cut long segments into overlapping windows of length ``max_len``.

    For each row, candidate window starts are generated from 'Time Start' up
    to (but excluding) 'Time End' in steps of ``max_len / 2``. A row that
    yields only one or two candidate starts is emitted unchanged as a single
    record. Independently of that, every candidate window
    ``[start, start + max_len)`` whose end lies strictly before the row's
    'Time End' is emitted as its own record. Any remainder after the last
    such window is not emitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'File Name', 'Time Start' and 'Time End'.
    max_len : float
        Window length; consecutive windows start ``max_len / 2`` apart.

    Returns
    -------
    pandas.DataFrame
        Columns 'idx' (index label of the source row), 'File Name',
        'Time Start' and 'Time End', one row per emitted record.
    """
    hop = max_len / 2
    records = []

    for label, segment in df.iterrows():
        seg_start, seg_end = segment['Time Start'], segment['Time End']
        window_starts = np.arange(seg_start, seg_end, hop)

        # One or two candidate starts: keep the segment as a single record.
        if len(window_starts) in (1, 2):
            records.append((label, segment['File Name'], seg_start, seg_end))

        # Keep only the windows that end strictly before the segment does.
        window_ends = window_starts + max_len
        window_ends = window_ends[window_ends < seg_end]

        # zip() stops at the shorter sequence, so each surviving end is paired
        # with its own start and the surplus starts are ignored.
        records.extend(
            (label, segment['File Name'], w_start, w_end)
            for w_start, w_end in zip(window_starts, window_ends)
        )

    return pd.DataFrame.from_records(
        records, columns=['idx', 'File Name', 'Time Start', 'Time End']
    )

In [None]:
# Window every petrel segment with a window length of 0.8
# (same unit as the 'Time Start' / 'Time End' columns).
df_all_patrels = split_long(df, 0.8)

In [None]:
# df_all_patrels.to_csv('petrels.csv', index=False)
# df.to_csv('petrels_all.csv',index=False,  float_format='%.2f')

In [None]:
# Label rows whose species is 'Brown Noddy'.
df_noddy = labels_df.loc[labels_df['Species'].eq('Brown Noddy')]

In [None]:
# Window the Brown Noddy segments with the same 0.8 window length used for
# the petrel segments.
df_noddy_split = split_long(df_noddy, 0.8)

In [None]:
# warbleR rows whose ground-truth flag is false and whose `overlap` flag is
# false (presumably the false positives, going by the name `fp`).
# The ground-truth column is cast to bool first so that `~` is a logical NOT
# even if the CSV stored the flag as 0/1 integers; this matches how
# `warbler_petrels_mask` is built from the same column.
fp_mask = ~warbler_df['storm_petrel_ground_truth'].astype(bool) & ~warbler_df['overlap']

# Index with the mask itself: the original indexed with `fp`, which is not
# defined until this very assignment and therefore raised NameError.
fp = warbler_df.loc[fp_mask, ['File Name', 'Time Start', 'Time End', 'duration']]

In [None]:
# Number of rows in `fp`.
fp.shape[0]

In [None]:
# Number of Brown Noddy records after windowing.
df_noddy_split.shape[0]