# WTS Pipeline Integration with UMAP, HDBSCAN
Exploratory notebook for working with BirdNET embeddings.

### Imports and Setup

In [1]:
import pandas as pd
import os
import numpy as np
from annotation_post_processing import *

In [2]:
# Column layout used for the per-clip embedding tables:
# 420 stringified integer column names, then the two UMAP coordinates.
embeddingColumns = list(map(str, range(420))) + ["UMAP_0", "UMAP_1"]
# Full column list: the START/END bounds of each row followed by the embedding columns.
columnNames = ["START", "END", *embeddingColumns]
# Folder holding the per-file embeddings.
path = './input/cosmos_embeddings/'

In [3]:
# Load the embeddings table from CSV; the clustering cell below reads its
# "FILE SPECIES" column to enumerate the species to process.
embeddings_df = pd.read_csv("./input/umap_cosmos_embeddings.csv")

In [4]:
# Automated annotation tables, one CSV per labelling pipeline.
# Order matters: later cells address these tables by position.
automated_annotation_paths = [
    "./cosmos_annotations/automated_cosmos_tweety_to_file.csv",
    "./cosmos_annotations/COSMOS_BirdNET-Lite_Labels_05Conf.csv",
    "./cosmos_annotations/COSMOS_BirdNET-Lite_Labels_100.csv",
    "./cosmos_annotations/COSMOS_BirdNET-Lite-Filename_Labels_05Conf.csv",
    "./cosmos_annotations/COSMOS_Microfaune-Filename_Labels_100.csv",
]
automated_dfs:list[pd.DataFrame] = [pd.read_csv(csv_path) for csv_path in automated_annotation_paths]
print(automated_dfs)

[       Unnamed: 0    OFFSET  DURATION  \
0               0  1.883721  0.116279   
1               1  3.976744  0.046512   
2               2  5.976744  0.046512   
3               3  6.023256  0.046512   
4               4  6.069767  0.116279   
...           ...       ...       ...   
55642       55642  1.511628  0.325581   
55643       55643  1.860465  0.139535   
55644       55644  5.976744  0.372093   
55645       55645  6.627907  0.255814   
55646       55646  0.046512  0.255814   

                                         FOLDER  \
0      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
1      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
2      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
3      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
4      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
...                                         ...   
55642  C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
55643  C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
55644  C:/Users/Siloux/Desktop/E4E/Cosmos_data/

## Clustering with HDBSCAN

In [9]:
# Results for general embedding clustering
# hdbscan_results = pd.read_csv("./ClusteringModels/umap_general.csv")

# Results for species-specific clustering.
# hdbscan_all maps each (a, b) hyperparameter pair to one table holding the rows
# selected by the two filtering methods below, concatenated over all species.
# It stays a dict so later cells can look a table up with hdbscan_all[a, b].
# NOTE(review): a and b are only known here as the two numbers in the result
# file names; presumably HDBSCAN's min_cluster_size / min_samples - confirm.
hdbscan_all = dict()
unique_species = embeddings_df["FILE SPECIES"].unique()
result_columns = ["FILE SPECIES", "PATH"] + columnNames + ["IN FILE", "LABELS"]
hyperparameter_values = [5, 10, 20, 50, 100, 200, 500]
n = 2  # Method 2 keeps the n most frequent non-noise cluster labels per species
i = 0
for a in hyperparameter_values:
    for b in hyperparameter_values:
        # Collect the per-species pieces and concatenate once per (a, b);
        # growing a DataFrame with concat inside the loop is quadratic.
        species_filters = []
        for j, species in enumerate(unique_species, start=1):
            species_result = pd.read_csv(f"./ClusteringModels/umap_species_specific/{a}_{b}_{species}.csv").drop(["Unnamed: 0"], axis=1)
            # Method 1: Simply filters out what was labeled as noise in recording
            filter_1 = species_result[species_result["LABELS"] == -1]

            # Method 2: Filters out noise and creates the filter by checking the n most
            # frequent values of embedding labels (essentially to see most frequent bird
            # labels, should be dominant)
            species_result = species_result[species_result["LABELS"] != -1]
            max_nums = species_result["LABELS"].value_counts().iloc[:n].index.tolist() # picking n most frequent values
            filter_2 = species_result[~species_result["LABELS"].isin(max_nums)]

            # species_filter = filter_1
            species_filter = pd.concat([filter_1, filter_2], axis=0)

            species_filters.append(species_filter)
            print(f"Done with {j} of {len(unique_species)}")

        if species_filters:
            hdbscan_results = pd.concat(species_filters, axis=0)
            # Keep the expected columns first (missing ones become NaN columns),
            # followed by any extra columns found in the result files.
            extra_columns = [col for col in hdbscan_results.columns if col not in result_columns]
            hdbscan_results = hdbscan_results.reindex(columns=result_columns + extra_columns)
        else:
            # No species to process: keep an empty table with the expected columns.
            hdbscan_results = pd.DataFrame(columns=result_columns)

        hdbscan_all[a, b] = hdbscan_results.reset_index(drop = True)
        i += 1
        print(f"Done with {i} iterations of hyperparameters")

Done with 1 of 10
Done with 2 of 10
Done with 3 of 10
Done with 4 of 10
Done with 5 of 10
Done with 6 of 10
Done with 7 of 10
Done with 8 of 10
Done with 9 of 10
Done with 10 of 10
Done with 1 iterations of hyperparameters
Done with 1 of 10
Done with 2 of 10
Done with 3 of 10
Done with 4 of 10
Done with 5 of 10
Done with 6 of 10
Done with 7 of 10
Done with 8 of 10
Done with 9 of 10
Done with 10 of 10
Done with 2 iterations of hyperparameters
Done with 1 of 10
Done with 2 of 10
Done with 3 of 10
Done with 4 of 10
Done with 5 of 10
Done with 6 of 10
Done with 7 of 10
Done with 8 of 10
Done with 9 of 10
Done with 10 of 10
Done with 3 iterations of hyperparameters
Done with 1 of 10
Done with 2 of 10
Done with 3 of 10
Done with 4 of 10
Done with 5 of 10
Done with 6 of 10
Done with 7 of 10
Done with 8 of 10
Done with 9 of 10
Done with 10 of 10
Done with 4 iterations of hyperparameters
Done with 1 of 10
Done with 2 of 10
Done with 3 of 10
Done with 4 of 10


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Alias the clustering results under the name the masking cells use.
filtered_embeddings = hdbscan_all
print("Created filter")

# Inspect the table for the (5, 5) hyperparameter pair. This tuple lookup only
# works when hdbscan_all is a mapping keyed by (a, b); a plain list would raise
# TypeError here.
filtered_embeddings[5,5]

### Applying Mask

In [None]:
# Running total of annotations processed by split_annotations (progress output only).
count1 = 0
def split_annotations(df: pd.DataFrame):
    """Split every annotation at multiples of 3.0 so no piece crosses such a boundary.

    Each row of ``df`` describes an annotation through its "OFFSET" (start) and
    "DURATION" columns (presumably seconds - confirm). A row spanning one or
    more multiples of 3.0 is replaced by several rows, one per window it
    touches; all other columns are copied unchanged. Pieces whose duration is
    numerically zero (the annotation ends exactly on a boundary) are dropped.

    Args:
        df: Annotation table containing at least "OFFSET" and "DURATION".

    Returns:
        A new DataFrame with the same columns as ``df``, one row per piece, in
        the original row order, with a fresh 0..n-1 index.

    Side effects:
        Increments the module-level ``count1`` and prints a progress line for
        every input row.
    """
    global count1
    # Collect the pieces and concatenate once at the end; concatenating inside
    # the loops copies the accumulated frame on every row (quadratic time).
    pieces = []
    for row_pos in range(df.shape[0]):
        x = df.iloc[row_pos]
        ann_start = x["OFFSET"]
        ann_end = x["OFFSET"] + x["DURATION"]
        first_window = int(ann_start / 3)
        last_window = int(ann_end / 3)
        # Window edges from the start of the first touched window to the end of
        # the last one, spaced 3.0 apart.
        startsends = np.linspace(3.0 * first_window, 3.0 * (last_window + 1), last_window - first_window + 2)
        # starts/ends are views of startsends; only the two outermost edges are
        # overwritten so the pieces begin and end exactly where the annotation does.
        starts = startsends[:-1]
        starts[0] = ann_start
        ends = startsends[1:]
        ends[-1] = ann_end
        for piece_start, piece_end in zip(starts, ends):
            piece_duration = piece_end - piece_start
            if np.isclose(piece_duration, 0):
                continue
            new_x = pd.DataFrame(x.copy()).T
            new_x["OFFSET"] = piece_start
            new_x["DURATION"] = piece_duration
            pieces.append(new_x)
        count1 += 1
        print(f"Completed {count1} annotations")
    if not pieces:
        # Nothing to split: return an empty table that still has df's columns.
        return pd.DataFrame(columns = df.columns).reset_index(drop = True)
    return pd.concat(pieces).reset_index(drop = True)

# Running total of annotations processed by create_annotation_filter (progress output only).
count2 = 0
def create_annotation_filter(x: pd.Series, filter: pd.DataFrame) -> pd.Series:
    """Flag one annotation row as filtered when a filter row covers it.

    Only rows of ``filter`` whose "IN FILE" starts with the part of
    ``x["IN FILE"]`` before the first ".mp3" are considered. The annotation is
    flagged when any of those rows

    * has a "START" numerically equal to the annotation start ("OFFSET"), or
    * has an "END" numerically equal to the annotation end
      ("OFFSET" + "DURATION"), or
    * starts strictly before the annotation and ends strictly after it.

    NOTE(review): the file match is a plain prefix test, so a clip name that is
    a prefix of another clip's name would also match that clip - confirm the
    naming scheme rules this out.

    Args:
        x: Annotation row with "IN FILE", "OFFSET" and "DURATION".
        filter: Table with "IN FILE", "START" and "END" columns.

    Returns:
        A copy of ``x`` with an extra boolean "FILTERED" entry. The row passed
        in is not modified: pandas documents mutating the object handed to a
        function by ``DataFrame.apply`` as unsupported.

    Side effects:
        Increments the module-level ``count2`` and prints a progress line.
    """
    global count2
    clip_prefix = x["IN FILE"].split(".mp3")[0]
    filter_x = filter[filter["IN FILE"].str.startswith(clip_prefix)]
    starts = filter_x["START"].to_numpy()
    ends = filter_x["END"].to_numpy()
    ann_start = x["OFFSET"]
    ann_end = x["OFFSET"] + x["DURATION"]
    close_starts = np.isclose(starts, ann_start).sum()
    close_ends = np.isclose(ends, ann_end).sum()
    # Filter rows that strictly enclose the annotation on both sides.
    middle = ((starts < ann_start) * (ends > ann_end)).sum()
    result = x.copy()
    result["FILTERED"] = bool((close_starts + close_ends + middle) > 0)
    count2 += 1
    print(f"Completed {count2} annotations")
    return result

In [None]:
# Run split_annotations over every automated annotation table, keeping the
# tables in the same order as automated_dfs.
automated_dfs_split = list(map(split_annotations, automated_dfs))

print(automated_dfs_split)

In [None]:
# Maps each (a, b) hyperparameter pair to a list of annotation tables (same
# order as automated_dfs_split) with the flagged annotations removed.
automated_dfs_filtered = dict()
for a in [5, 10, 20, 50, 100, 200, 500]:
    for b in [5, 10, 20, 50, 100, 200, 500]:
        embedding_filter = filtered_embeddings[a, b]
        # Use separate names for the intermediate lists: reusing
        # automated_dfs_filtered here would replace the dict with a list and
        # make the keyed assignment below fail.
        flagged_dfs = [df.apply(lambda x: create_annotation_filter(x, embedding_filter), axis = 1) for df in automated_dfs_split]
        kept_dfs = [df[~df["FILTERED"]] for df in flagged_dfs]
        automated_dfs_filtered[a,b] = kept_dfs
print(automated_dfs_filtered[5,5])

In [None]:
# Row counts per model at each stage: raw, after splitting, after filtering
# with the (5, 5) hyperparameter pair.
print(list(map(len, automated_dfs)))
print(list(map(len, automated_dfs_split)))
print(list(map(len, automated_dfs_filtered[5,5])))

### Statistics

In [None]:
from statistics_1 import *

In [None]:
# Manual annotations: underscores in "IN FILE" become spaces and every row
# gets the same "FOLDER" value.
manual_df = pd.read_csv("cosmos_annotations/cosmos_labeled_data_files_added.csv").assign(
    **{
        "IN FILE": lambda frame: frame["IN FILE"].apply(lambda name: name.replace("_", " ")),
        "FOLDER": "./cosmos_annotations/",
    }
)
manual_df

In [None]:
import warnings

# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): these two names are not read by the code below, which selects
# the (5, 5) entry directly - presumably they describe that pair; confirm.
min_cluster_size = 5
min_samples = 5

# Clip-level statistics against the manual labels, one result per model,
# computed for the unfiltered and for the filtered annotations.
clip_stats_original = [
    clip_statistics(automated, manual_df, "general")
    for automated in automated_dfs
]
clip_stats_filtered = [
    clip_statistics(automated, manual_df, "general")
    for automated in automated_dfs_filtered[5,5]
]

In [None]:
# Apply class_statistics to each model's clip-level statistics.
class_stats_original = list(map(class_statistics, clip_stats_original))
class_stats_filtered = list(map(class_statistics, clip_stats_filtered))

In [None]:
# Class statistics of the first model, computed from the unfiltered annotations.
class_stats_original[0]

In [None]:
# Class statistics of the first model, computed from the filtered annotations.
class_stats_filtered[0]

In [None]:
# For every model, stack the unfiltered and filtered class statistics and tag
# each row with the variant it came from in a "MODEL" column.
all_class_stats = [
    pd.concat([
        original_stats.assign(MODEL = "original"),
        class_stats_filtered[model_idx].assign(MODEL = "filtered"),
    ])
    for model_idx, original_stats in enumerate(class_stats_original)
]
# Exclude the rows whose "MANUAL ID" is "Lipaugus vociferans".
all_class_stats = [stats.loc[stats["MANUAL ID"].ne("Lipaugus vociferans")] for stats in all_class_stats]
all_class_stats[0]

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Plotting each model separately
figure, axes = plt.subplots(1, len(class_stats_original), figsize = (30, 5), sharex = False, sharey = True)

x = 0
model_list = ["Tweety to File", "BirdNET-Lite Labels 05Conf", "BirdNET-Lite Labels 100", "BirdNET-Lite to Filename", "Microfaune to Filename"]

for model in model_list:
    plot = sns.barplot(ax = axes[x], data = all_class_stats[x], x = "MANUAL ID", y = "PRECISION", hue = "MODEL")
    for label in plot.get_xticklabels():
        label.set_rotation(90)
    plot.set(title = model)
    x += 1

In [None]:
# One recall panel per model, original vs. filtered side by side.
figure, axes = plt.subplots(1, len(class_stats_original), figsize = (30, 5), sharex = False, sharey = True)

for x, model in enumerate(model_list):
    plot = sns.barplot(data = all_class_stats[x], x = "MANUAL ID", y = "RECALL", hue = "MODEL", ax = axes[x])
    # Rotate the x tick labels to vertical.
    for tick_label in plot.get_xticklabels():
        tick_label.set_rotation(90)
    plot.set(title = model)