In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input tables: the automated labels and the manual labels, both read from the working directory
AUTOMATED_LABELS_CSV = "reduced_automated_labels.csv"
MANUAL_LABELS_CSV = "reduced_manual_labels.csv"

automated_df = pd.read_csv(AUTOMATED_LABELS_CSV)
manual_df = pd.read_csv(MANUAL_LABELS_CSV)

In [3]:
# Pseudocode outline

# Get a list of unique clips
#      Make sure that the clip lists are the same across both the automated and manual dataframes
# Create a list of the species identified in each clip for both the automated and manual dataframes
# Attach the highest confidence of each identified species to the automated clip-level dataframe

In [4]:
# previewing the automated labels (one row per labelled segment, with its "Confidence" value)
automated_df

Unnamed: 0,IN FILE,MANUAL ID,OFFSET,DURATION,Confidence
0,20210816_030000.WAV,Common Poorwill,36,3,0.108588
1,20210816_014000.WAV,California Towhee,18,3,0.843924
2,20210820_145000.WAV,Bewick's Wren,3,3,0.260280
3,20210820_145000.WAV,Bewick's Wren,9,3,0.200800
4,20210820_145000.WAV,Bewick's Wren,15,3,0.803835
...,...,...,...,...,...
177,20210820_020000_OTH1.WAV,California Towhee,42,3,0.923568
178,20210820_020000_OTH1.WAV,California Towhee,45,3,0.387194
179,20210820_020000_OTH1.WAV,California Towhee,48,3,0.811322
180,20210820_020000_OTH1.WAV,California Towhee,51,3,0.825185


In [5]:
# previewing the manual labels (one row per human annotation)
manual_df

Unnamed: 0,IN FILE,CLIP LENGTH,OFFSET,DURATION,MAX FREQ,MIN FREQ,SAMPLE RATE,MANUAL ID,TIME_SPENT,LAST MOD BY,LAST MOD DATE
0,20210818_023000.WAV,60,57.5400,0.50,24000,0,384000,California Gnatcatcher,350.585,JacobGlennAyers,09/15/2021
1,20210818_023000.WAV,60,1.4700,0.30,24000,0,384000,California Thrasher,350.570,JacobGlennAyers,09/15/2021
2,20210812_005000.WAV,60,5.2700,1.02,24000,0,384000,Glaucus-winged Gull,70.793,JacobGlennAyers,09/15/2021
3,20210821_015000.WAV,60,59.7150,0.11,24000,0,384000,California Towhee,849.662,JacobGlennAyers,09/15/2021
4,20210821_015000.WAV,60,6.8750,0.17,24000,0,384000,California Thrasher,849.609,JacobGlennAyers,09/15/2021
...,...,...,...,...,...,...,...,...,...,...,...
131,20210820_020000_OTH1.WAV,60,48.6400,0.39,24000,0,384000,California Towhee,206.798,arathbone,09/26/2021
132,20210820_020000_OTH1.WAV,60,59.7200,0.27,24000,0,384000,California Towhee,206.803,arathbone,09/26/2021
133,20210820_020000_OTH1.WAV,60,52.3600,0.24,24000,0,384000,California Towhee,206.799,arathbone,09/26/2021
134,20210820_020000_OTH1.WAV,60,54.6700,0.27,24000,0,384000,California Towhee,206.800,arathbone,09/26/2021


In [14]:
# distinct clip filenames that received automated labels, in sorted order
clip_list_automated = sorted(set(automated_df["IN FILE"].to_list()))
# distinct clip filenames that received manual labels, in sorted order
clip_list_manual = sorted(set(manual_df["IN FILE"].to_list()))

In [16]:
# quick sanity check: both label sources must cover exactly the same clips.
# Only the manual clip list is carried forward as `clip_list`, so a mismatch
# would otherwise go unnoticed by anything that iterates over `clip_list`.
clip_lists_match = clip_list_manual == clip_list_automated
print(clip_lists_match)
# fail loudly (and name the offending clips) instead of relying on someone reading the printed flag
assert clip_lists_match, (
    "Manual and automated labels do not cover the same clips; clips present in only one source: "
    f"{sorted(set(clip_list_manual) ^ set(clip_list_automated))}"
)
clip_list = clip_list_manual

True


In [27]:
# creating clip-level summaries for both the manual and automated labels:
#   manual_clip_dict    -> one entry per (clip, species) pair found in the manual labels
#   automated_clip_dict -> one entry per (clip, species) pair found in the automated labels,
#                          together with the highest "Confidence" of that species in that clip
# the dictionaries are (re)initialised here so re-running this cell does not duplicate entries
manual_clip_dict = {
    "CLIP" : [],
    "SPECIES" : []
}
automated_clip_dict = {
    "CLIP" : [],
    "SPECIES" : [],
    "CONFIDENCE" : []
}
# going through each audio clip
for clip in clip_list:
    # isolating the labels that belong to the current clip
    temp_clip_manual_df = manual_df[manual_df["IN FILE"] == clip]
    temp_clip_automated_df = automated_df[automated_df["IN FILE"] == clip]
    # unique manual species for this clip, kept in order of first appearance
    temp_manual_species_list = list(dict.fromkeys(temp_clip_manual_df["MANUAL ID"].to_list()))
    manual_clip_dict["SPECIES"].extend(temp_manual_species_list)
    # repeating the clip name once per species so both columns stay aligned
    manual_clip_dict["CLIP"].extend([clip]*len(temp_manual_species_list))
    # highest confidence per automated species in this clip, computed in a single pass;
    # sort=False keeps the species in order of first appearance.
    # NOTE: groupby drops rows whose species is missing (NaN), so such rows produce no entry
    temp_max_confidence = temp_clip_automated_df.groupby("MANUAL ID", sort=False)["Confidence"].max()
    for species, temp_confidence in temp_max_confidence.items():
        automated_clip_dict["CLIP"].append(clip)
        automated_clip_dict["SPECIES"].append(species)
        automated_clip_dict["CONFIDENCE"].append(temp_confidence)

In [28]:
# turning the clip-level dictionaries into dataframes; each dictionary key becomes a column
manual_clip_df = pd.DataFrame(data=manual_clip_dict)
automated_clip_df = pd.DataFrame(data=automated_clip_dict)

In [29]:
# inspecting the clip-level manual labels (one row per clip/species pair)
manual_clip_df

Unnamed: 0,CLIP,SPECIES
0,20210812_005000.WAV,Glaucus-winged Gull
1,20210813_125000.WAV,California Towhee
2,20210816_014000.WAV,American Crow
3,20210816_014000.WAV,Anna's Hummingbird
4,20210816_014000.WAV,Wrentit
5,20210816_030000.WAV,Common Poorwill
6,20210816_132000.WAV,California Thrasher
7,20210816_132000.WAV,California Towhee
8,20210816_132000.WAV,Wrentit
9,20210816_161000.WAV,California Scrub-Jay


In [30]:
# inspecting the clip-level automated labels (one row per clip/species pair with its highest confidence)
automated_clip_df

Unnamed: 0,CLIP,SPECIES,CONFIDENCE
0,20210812_005000.WAV,California Towhee,0.986042
1,20210812_005000.WAV,Scissor-tailed Nightjar,0.106793
2,20210812_005000.WAV,Malachite Kingfisher,0.766413
3,20210812_005000.WAV,Hooded Warbler,0.115692
4,20210812_005000.WAV,California Gnatcatcher,0.152474
...,...,...,...
65,20210821_015000.WAV,Wrentit,0.667216
66,20210821_015000.WAV,Reed Parrotbill,0.315138
67,20210821_015000.WAV,Black-tailed Crake,0.148713
68,20210821_024000_OFF5.WAV,Arabian Scops-Owl,0.114621


In [34]:
# writing both clip-level tables to CSV; index=False keeps the row index out of the files
for clip_level_df, csv_path in (
    (manual_clip_df, "manual_clip-level_reduced.csv"),
    (automated_clip_df, "birdnet-lite_clip-level_reduced.csv"),
):
    clip_level_df.to_csv(csv_path, index=False)