In [15]:
import os
import glob
import gcsfs
import numpy as np
import pandas as pd

In [3]:
# Filesystem handle for the Google Cloud Storage project holding the sample CSVs.
fs = gcsfs.GCSFileSystem(project='birdman-project')


def _read_bucket_csv(path):
    """Open `path` through `fs` and parse its contents as CSV into a DataFrame."""
    with fs.open(path) as f:
        return pd.read_csv(f)


warbler_df = _read_bucket_csv('storm-petrels/samples/features_warbler_buffer_250ms.csv')
labels_df = _read_bucket_csv('storm-petrels/samples/sthelena_labels_overlap.csv')

In [4]:
# Preview the first rows of the labels table (sthelena_labels_overlap.csv).
labels_df.head()

Unnamed: 0,Date,File Name,Type of Call,Time Start,Time End,Species,Notes,overlap,storm_petrel
0,2014-06-05,STHELENA-02_20140605_200000_1,Flight Call,2.905,4.197,Storm Petrel,,True,True
1,2014-06-05,STHELENA-02_20140605_200000_1,Flight Call,10.008,11.622,Storm Petrel,,True,True
2,2014-06-05,STHELENA-02_20140605_200000_1,Flight Call,14.527,16.464,Storm Petrel,,True,True
3,2014-06-05,STHELENA-02_20140605_200000_1,Flight Call,17.11,18.724,Storm Petrel,,True,True
4,2014-06-05,STHELENA-02_20140605_200000_1,Flight Call,20.338,21.307,Storm Petrel,,True,True


In [5]:
# Preview the first rows of the feature table (features_warbler_buffer_250ms.csv).
warbler_df.head()

Unnamed: 0,index,sound.files,selec,overlap,storm_petrel,storm_petrel_ground_truth,brown_noddy,Time Start,Time End,duration,...,mindom,maxdom,dfrange,modindx,startdom,enddom,dfslope,meanpeakf,peakf,notes
0,0,STHELENA-02_20140605_200000_10,1,False,0,1,0,10.920001,11.527314,0.607313,...,1.0,2.9375,1.9375,10.677419,1.0625,1.25,0.308737,2.866071,0.016466,
1,1,STHELENA-02_20140605_200000_10,2,True,1,1,0,19.95544,20.619502,0.664063,...,1.0,3.625,2.625,10.714286,1.0,2.625,2.447059,3.181052,0.040659,
2,2,STHELENA-02_20140605_200000_10,3,True,1,1,0,20.296877,21.14094,0.844063,...,1.0,3.625,2.625,17.285714,3.625,1.25,-2.813773,3.118056,0.04502,
3,3,STHELENA-02_20140605_200000_10,4,True,1,1,0,20.700627,21.367252,0.666625,...,1.0,3.1875,2.1875,21.342857,3.0,1.0625,-2.906432,2.992063,0.021001,
4,4,STHELENA-02_20140605_200000_10,5,True,1,1,0,20.899127,21.600002,0.700875,...,1.0,3.3125,2.3125,17.864865,1.0,1.3125,0.445871,2.677083,0.022829,


In [6]:
# Use the same 'File Name' column label as labels_df for the recording identifier.
warbler_df = warbler_df.rename({'sound.files': 'File Name'}, axis='columns')

In [7]:
# Boolean view of the 0/1 ground-truth flag. Defined first so every mask below is built
# from real booleans instead of mixing an integer column with boolean operators.
warbler_petrels_mask = warbler_df['storm_petrel_ground_truth'].astype(bool)

# Labelled Storm Petrel calls that no warbleR selection overlaps.
extra_petrels_mask_labels = (labels_df['Species'] == 'Storm Petrel') & (~labels_df['overlap'])
# warbleR selections flagged as petrel ground truth that do not overlap any label.
extra_petrels_mask_warbler = warbler_petrels_mask & (~warbler_df['overlap'])

# Series.sum() counts the True values without iterating element by element in Python.
print('No of petrels calls NOT found by warbleR:', extra_petrels_mask_labels.sum())
print('No of petrels calls NOT found in labels:', extra_petrels_mask_warbler.sum())

No of petrels calls NOT found by warbleR: 114
No of petrels calls NOT found in labels: 93


In [8]:
# Columns shared by the warbleR selections and the label table after processing.
column_selection = ['File Name', 'Time Start', 'Time End', 'duration']
# Select rows and columns in a single .loc call. The former trailing
# rename of 'sound.files' was a no-op: that column is not part of column_selection.
warbler_petrels = warbler_df.loc[warbler_petrels_mask, column_selection]

In [9]:
# Preview the petrel selections kept from the warbleR table.
warbler_petrels.head()

Unnamed: 0,File Name,Time Start,Time End,duration
0,STHELENA-02_20140605_200000_10,10.920001,11.527314,0.607313
1,STHELENA-02_20140605_200000_10,19.95544,20.619502,0.664063
2,STHELENA-02_20140605_200000_10,20.296877,21.14094,0.844063
3,STHELENA-02_20140605_200000_10,20.700627,21.367252,0.666625
4,STHELENA-02_20140605_200000_10,20.899127,21.600002,0.700875


In [10]:
# Fraction of rows in labels_df whose `overlap` flag is set.
sum(labels_df['overlap']) / len(labels_df)

0.6726804123711341

In [11]:
# Labelled petrel calls without an overlapping selection, reduced to the shared columns.
# `duration` is derived from the label's own start/end times.
extra_petrels_labels = (
    labels_df.loc[extra_petrels_mask_labels]
    .assign(duration=lambda calls: calls['Time End'] - calls['Time Start'])
    .loc[:, column_selection]
)

In [12]:
# Preview the first ten label-only petrel calls.
extra_petrels_labels.head(10)

Unnamed: 0,File Name,Time Start,Time End,duration
6,STHELENA-02_20140605_200000_1,47.134,52.944,5.81
7,STHELENA-02_20140605_200000_1,60.692,61.984,1.292
9,STHELENA-02_20140605_200000_1,67.472,68.763,1.291
10,STHELENA-02_20140605_200000_1,73.606,75.543,1.937
22,STHELENA-02_20140605_200000_1,173.038,174.329,1.291
23,STHELENA-02_20140605_200000_1,184.983,185.629,0.646
24,STHELENA-02_20140605_200000_1,195.959,197.25,1.291
25,STHELENA-02_20140605_200000_1,218.235,219.849,1.614
26,STHELENA-02_20140605_200000_1,229.211,230.502,1.291
27,STHELENA-02_20140605_200000_1,231.148,231.794,0.646


In [13]:
# Stack both sources row-wise. Each frame keeps its own row labels, so the
# combined index is not guaranteed to be unique.
df = pd.concat((warbler_petrels, extra_petrels_labels))

In [36]:
def split_long(df, max_len, hop=None):
    """Cut long time intervals into overlapping fixed-length windows.

    Every row of `df` describes one interval via its 'Time Start' and
    'Time End' columns. Window starts are laid out every `hop` from the
    interval start, each window is `max_len` long, and only windows that
    end strictly before the interval end are kept (the remaining tail is
    not emitted). An interval that yields no such window is emitted once,
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'File Name', 'Time Start' and 'Time End'.
    max_len : float
        Length of each emitted window; must be positive.
    hop : float, optional
        Distance between consecutive window starts. Defaults to
        ``max_len / 2``, i.e. half-overlapping windows.

    Returns
    -------
    pandas.DataFrame
        Columns 'idx' (index label of the source row), 'File Name',
        'Time Start' and 'Time End'; one row per emitted window.

    Raises
    ------
    ValueError
        If `max_len` or `hop` is not positive.
    """
    if max_len <= 0:
        raise ValueError('max_len must be positive')
    if hop is None:
        hop = max_len / 2
    if hop <= 0:
        raise ValueError('hop must be positive')

    columns = ['idx', 'File Name', 'Time Start', 'Time End']
    records = []

    rows = zip(df.index, df['File Name'], df['Time Start'], df['Time End'])
    for idx, file_name, start, end in rows:
        # Empty or inverted intervals produce nothing.
        if end <= start:
            continue

        window_starts = np.arange(start, end, hop)
        window_ends = window_starts + max_len
        fits = window_ends < end

        if fits.any():
            records.extend(
                (idx, file_name, s, e)
                for s, e in zip(window_starts[fits], window_ends[fits])
            )
        else:
            # Decide "short interval" from the windows actually produced rather than
            # from len(np.arange(...)): with float steps that length can be off by one,
            # which used to drop intervals whose duration is about max_len.
            records.append((idx, file_name, start, end))

    return pd.DataFrame.from_records(records, columns=columns)

In [37]:
# Window the combined petrel intervals; max_len=0.8 is in the same units as the
# 'Time Start' / 'Time End' columns (presumably seconds — confirm).
df_all_patrels = split_long(df, max_len=0.8)

In [38]:
# Optional CSV exports, currently disabled:
# df_all_patrels.to_csv('petrels.csv', index=False)
# df.to_csv('petrels_all.csv', index=False, float_format='%.2f')

In [39]:
# Label rows whose species is Brown Noddy.
df_noddy = labels_df.loc[labels_df['Species'].eq('Brown Noddy')]

In [44]:
# Window the Brown Noddy labels with the same max_len used for the petrel intervals.
df_noddy_split = split_long(df_noddy, max_len=0.8)

In [50]:
# Selections that are neither flagged as petrel ground truth nor overlapping a label
# (presumably warbleR false positives — confirm against how `overlap` was computed).
# The ground-truth flag is stored as 0/1, so cast it to bool before negating: `~` on an
# integer column is a bitwise NOT (-1/-2), not a logical one.
fp_mask = (~warbler_df['storm_petrel_ground_truth'].astype(bool)) & (~warbler_df['overlap'])
# Index with `fp_mask`; the previous `warbler_df[fp]` read `fp` before it was assigned,
# which raises NameError on a fresh kernel.
fp = warbler_df.loc[fp_mask, ['File Name', 'Time Start', 'Time End', 'duration']]

In [52]:
# Number of rows selected into `fp`.
len(fp)

88

In [53]:
# Number of windows produced from the Brown Noddy labels.
len(df_noddy_split)

259