In [1]:
import pandas as pd
import matplotlib.pyplot as plot
import numpy as np

In [4]:
automated_df = pd.read_csv("birdnet-lite_clip-level_reduced.csv")
manual_df = pd.read_csv("manual_clip-level_reduced.csv")

In [5]:
# Getting a unique list of all of the species with respect to the automated labels and with
# respect to the manual labels
automated_species_list = automated_df["SPECIES"].to_list()
automated_species_list = list(dict.fromkeys(automated_species_list))
manual_species_list = manual_df["SPECIES"].to_list()
manual_species_list = list(dict.fromkeys(manual_species_list))
# Creating a list of all of the species
union_species_list = list(set().union(manual_species_list,automated_species_list))

In [7]:
# creating a list of the days in which there were both automated and manual classifications
automated_clips_list = automated_df["CLIP"].to_list()
automated_clips_list = list(dict.fromkeys(automated_clips_list))
manual_clips_list = manual_df["CLIP"].to_list()
manual_clips_list = list(dict.fromkeys(manual_clips_list))
union_clips_list = list(set().union(automated_clips_list,manual_clips_list))

In [8]:
# initializing the dictionary that will be used to create the final dataframe
output_dict = {
    "SPECIES" : union_species_list,
    "TRUE POSITIVES" : [],
    "FALSE POSITIVES" : [],
    "FALSE NEGATIVES" : [],
    "TRUE NEGATIVES" : []
}
# initializing the list that will contain all of the relevant metrics
# initializing all of the values to zero
tp_list = [0]*len(union_species_list)
fp_list = [0]*len(union_species_list)
fn_list = [0]*len(union_species_list)
tn_list = [0]*len(union_species_list)

In [11]:
# The idea is that I am going to create a 2-layer nested for loop where the outer loop goes through the 
# species list while the inner for loop goes through each relevant deployment clip, deciding whether each
# species was IDed as a TP, FP, FN, TN with respect to each clip

# Setting threshold by which we consider the confidence of an automated label to be good enough
threshold = 0.3

# looping through each bird species
for ndx in range(len(union_species_list)):
    # collecting current species of interest
    cur_species = union_species_list[ndx]
    # looping through the relevant clips
    for clip in union_clips_list:
        # creating a temporary dataframe that contains the manual annotations from the relevant clip
        tmp_manual_df = manual_df[manual_df["CLIP"] == clip]
        # creating a temporary dataframe that contains the automated annotations from the relevant clip
        tmp_automated_df = automated_df[automated_df["CLIP"] == clip]
        
        # comparing the automated and manual annotations. Breaking down into TP, FP, FN, TN scores
        manual_species_df = tmp_manual_df[tmp_manual_df["SPECIES"] == cur_species]
        automated_species_df = tmp_automated_df[tmp_automated_df["SPECIES"] == cur_species]

        #  resetting the indices to make further calculations easier
        if (not manual_species_df.empty):
            manual_species_df.reset_index(drop=True,inplace=True)
        # 
        if (not automated_species_df.empty):
            automated_species_df.reset_index(drop=True,inplace=True)
        
        # cases for true negative (TN)
        if (automated_species_df.empty and manual_species_df.empty):
            tn_list[ndx] += 1
            
        elif (not automated_species_df.empty) and (automated_species_df["CONFIDENCE"][0] < threshold and manual_species_df.empty):
            tn_list[ndx] += 1
            
        # case for false positive (FP)
        elif (manual_species_df.empty and (automated_species_df["CONFIDENCE"][0] > threshold)):
            fp_list[ndx] += 1
            
        # cases for false negative (FN)
        elif ((not manual_species_df.empty) and (not automated_species_df.empty) and (automated_species_df["CONFIDENCE"][0] < threshold)) :
            fn_list[ndx] += 1
            
        elif ((not manual_species_df.empty) and automated_species_df.empty) :
            fn_list[ndx] += 1
            
        # case for true positive (TP)
        elif((not manual_species_df.empty) and (automated_species_df["CONFIDENCE"][0] > threshold)):
            tp_list[ndx] += 1

In [12]:
output_dict["TRUE POSITIVES"] = tp_list
output_dict["FALSE POSITIVES"] = fp_list
output_dict["FALSE NEGATIVES"] = fn_list
output_dict["TRUE NEGATIVES"] = tn_list
output_df = pd.DataFrame.from_dict(output_dict)

In [13]:
output_df

Unnamed: 0,SPECIES,TRUE POSITIVES,FALSE POSITIVES,FALSE NEGATIVES,TRUE NEGATIVES
0,Black-tailed Crake,0,0,0,13
1,Southern Whiteface,0,0,0,13
2,Common Bulbul,0,0,0,13
3,Bewick's Wren,0,1,0,12
4,Anna's Hummingbird,0,0,1,12
5,Española Mockingbird,0,0,0,13
6,Brown-crested Flycatcher,0,1,0,12
7,Pearl Kite,0,0,0,13
8,Chalk-browed Mockingbird,0,0,0,13
9,Song Thrush,0,1,0,12


In [14]:
# precision = tp/(tp+fp)
precision_list = []
# recall = tp/(tp+fn)
recall_list = []
# f1 = 2*precision*recall/(precision+recall)
f1_list = []
# accuracy = tp/(tp+tn)
accuracy_list = []
for row in output_df.index:
    # getting in confusion matrix scores
    tp = output_df["TRUE POSITIVES"][row]
    fp = output_df["FALSE POSITIVES"][row]
    fn = output_df["FALSE NEGATIVES"][row]
    tn = output_df["TRUE NEGATIVES"][row]
    precision = 0.0
    recall = 0.0
    f1 = 0.0
    accuracy = 0.0
    # calculating precision
    if ((tp + fp) == 0):
        precision_list.append(precision)
    else:
        precision = tp/(tp+fp)
        precision_list.append(precision)
        
    # calculating recall
    if ((tp+fn) == 0):
        recall_list.append(recall)
    else:
        recall = tp/(tp+fn)
        recall_list.append(recall)
    
    # calculating f1
    if ( (precision == 0.0 ) and (recall == 0.0 ) ):
        f1_list.append(f1)
    else:
        f1 = 2*precision*recall/(precision+recall)
        f1_list.append(f1)
        
    # calculating accuracy
    if ((tp+tn) == 0):
        accuracy_list.append(accuracy)
    else:
        accuracy = tp/(tp+tn)
        accuracy_list.append(accuracy)
output_df["PRECISION"] = precision_list
output_df["RECALL"] = recall_list
output_df["F1"] = f1_list
output_df["ACCURACY"] = accuracy_list
output_df["F1"] = output_df["F1"].fillna(0.0)

In [15]:
output_df

Unnamed: 0,SPECIES,TRUE POSITIVES,FALSE POSITIVES,FALSE NEGATIVES,TRUE NEGATIVES,PRECISION,RECALL,F1,ACCURACY
0,Black-tailed Crake,0,0,0,13,0.0,0.0,0.0,0.0
1,Southern Whiteface,0,0,0,13,0.0,0.0,0.0,0.0
2,Common Bulbul,0,0,0,13,0.0,0.0,0.0,0.0
3,Bewick's Wren,0,1,0,12,0.0,0.0,0.0,0.0
4,Anna's Hummingbird,0,0,1,12,0.0,0.0,0.0,0.0
5,Española Mockingbird,0,0,0,13,0.0,0.0,0.0,0.0
6,Brown-crested Flycatcher,0,1,0,12,0.0,0.0,0.0,0.0
7,Pearl Kite,0,0,0,13,0.0,0.0,0.0,0.0
8,Chalk-browed Mockingbird,0,0,0,13,0.0,0.0,0.0,0.0
9,Song Thrush,0,1,0,12,0.0,0.0,0.0,0.0


In [16]:
tp_count = sum(tp_list)
fp_count = sum(fp_list)
fn_count = sum(fn_list)
tn_count = sum(tn_list)
total_precision = (tp_count)/(tp_count+fp_count)
total_recall = (tp_count)/(fn_count+tp_count)
total_f1 = (2*total_precision*total_recall)/(total_precision+total_recall)
total_accuracy = tp_count/(tp_count+tn_count)
summary_dict = {
    "TRUE POSITIVES" : [tp_count],
    "FALSE POSITIVES" : [fp_count],
    "FALSE NEGATIVES" : [fn_count],
    "TRUE NEGATIVES" : [tn_count],
    "PRECISION" : [total_precision],
    "RECALL" : [total_recall],
    "F1" : [total_f1],
    "ACCURACY" : [total_accuracy]
}
summary_df = pd.DataFrame.from_dict(summary_dict)
summary_df

Unnamed: 0,TRUE POSITIVES,FALSE POSITIVES,FALSE NEGATIVES,TRUE NEGATIVES,PRECISION,RECALL,F1,ACCURACY
0,5,18,21,671,0.217391,0.192308,0.204082,0.007396
