# WTS Pipeline Integration
Exploratory notebook for working with BirdNET embeddings.

### Imports and Setup

In [None]:
import pandas as pd
import os
import numpy as np
from annotation_post_processing import *

In [None]:
# Directory that holds the embeddings files to load.
path = './input/cosmos_embeddings/'

# Names given to the columns of each embeddings file: "START" and "END"
# followed by 420 embedding columns named "0" through "419".
embeddingColumns = list(map(str, range(420)))
columnNames = ["START", "END", *embeddingColumns]

In [None]:
def generate_embeddings_from_file(path, filename):
    """Load one embeddings file into a DataFrame.

    The file is read as text, every tab is treated as a comma, and the result
    is parsed as header-less CSV whose columns are named by the module-level
    ``columnNames``. An ``"IN FILE"`` column is added holding the part of
    ``filename`` that precedes ``".birdnet"`` with ``".wav"`` appended.

    The file on disk is never modified: the tab-to-comma conversion happens
    in memory, so files that turn out to be unrelated are left untouched.

    Args:
        path: Directory containing the file (a trailing separator is optional).
        filename: Name of the embeddings file; must contain ``".birdnet"``.

    Returns:
        A DataFrame with the ``columnNames`` columns plus ``"IN FILE"``.

    Raises:
        ValueError: If ``filename`` does not contain ``".birdnet"``. Raised
            before the file is opened.
    """
    # Local import: io is not part of the notebook's import cell.
    from io import StringIO

    # Validate the name first so a non-embeddings file is rejected before any
    # file I/O takes place.
    wav_name = filename[:filename.index(".birdnet")] + ".wav"

    with open(os.path.join(path, filename), 'r') as f:
        data = f.read()

    # Tabs and commas are both treated as field separators; a file whose tabs
    # were already replaced by commas parses identically.
    file_df = pd.read_csv(StringIO(data.replace("\t", ",")), names=columnNames)
    file_df["IN FILE"] = wav_name
    return file_df

def generate_embeddings(path):
    """Load every embeddings file in a directory into one DataFrame.

    Each entry returned by ``os.listdir(path)`` is passed to
    ``generate_embeddings_from_file``. Entries that fail to load are reported
    on stdout and skipped, so one bad file does not abort the whole run.

    Args:
        path: Directory to scan. The same value is stored in the ``"PATH"``
            column of every row.

    Returns:
        A DataFrame whose first two columns are ``"IN FILE"`` and ``"PATH"``,
        followed by the columns of the per-file frames, sorted by
        ``"IN FILE"`` then ``"START"`` with a fresh 0..n-1 index. If no file
        could be loaded, an empty DataFrame with only the columns
        ``"IN FILE"``, ``"PATH"``, ``"START"`` and ``"END"`` is returned.
    """
    frames = []
    for filename in os.listdir(path):
        try:
            frames.append(generate_embeddings_from_file(path, filename))
        except Exception as e:
            # Deliberately best-effort: report which file failed and why,
            # then carry on with the remaining files.
            print(f"Something went wrong with: {filename} ({e!r})")

    if not frames:
        # Nothing loaded (empty directory or every file failed): there are no
        # "IN FILE"/"START" columns to sort on, so return an empty table.
        return pd.DataFrame(columns=["IN FILE", "PATH", "START", "END"])

    # Concatenate once instead of growing a DataFrame inside the loop.
    df = pd.concat(frames, ignore_index=True)
    df["PATH"] = path

    # "IN FILE" and "PATH" are the last two columns at this point; move them
    # to the front.
    columns = df.columns.tolist()
    df = df[columns[-2:] + columns[:-2]]

    df = df.sort_values(["IN FILE", "START"], ascending=True)
    return df.reset_index(drop=True)

In [None]:
# Build the combined embeddings table from every file under `path`.
embeddings_df = generate_embeddings(path)
# Bare expression so the notebook displays the DataFrame.
embeddings_df

In [None]:
# Load the annotations table from a CSV in the current working directory.
# NOTE(review): presumably the automated annotations for the same recordings
# as the embeddings above; confirm the file's provenance.
automated_df = pd.read_csv("cosmos_annotations.csv")
# Bare expression so the notebook displays the DataFrame.
automated_df

### Filtering Embeddings with HDBSCAN

In [None]:
%matplotlib inline
import hdbscan
from hdbscan import HDBSCAN
from hdbscan.prediction import approximate_predict
import pickle
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from sklearn.preprocessing import LabelEncoder

# Render all matplotlib figures in this notebook with the "ggplot" style sheet.
plt.style.use("ggplot")

In [None]:
# Load a previously pickled HDBSCAN clusterer from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
with open("./ClusteringModels/hdbscan_model.pkl", "rb") as f:
    hdbscan_model:HDBSCAN = pickle.load(f)

# Prepare the model for approximate_predict, which is called on it below.
# NOTE(review): presumably the model was pickled without prediction data;
# confirm against how the model was trained and saved.
hdbscan_model.generate_prediction_data()

In [None]:
# approximate_predict returns a (labels, membership strengths) pair. Only the
# per-row labels belong in the column; assigning the whole tuple would try to
# store two arrays as the column's values.
predicted_labels, _ = approximate_predict(hdbscan_model, embeddings_df[embeddingColumns])
embeddings_df["HDBSCAN FILTER"] = predicted_labels

# Keep only the rows labelled -1 (the label HDBSCAN gives to points it treats
# as noise), with just the identifying columns and the label.
filtered_embeddings = embeddings_df[embeddings_df["HDBSCAN FILTER"] == -1][["IN FILE", "PATH", "START", "END", "HDBSCAN FILTER"]]
# Bare expression so the notebook displays the DataFrame.
filtered_embeddings

### Applying Mask to WTS Pipeline

In [None]:
def create_annotation_filter(x: pd.DataFrame) -> bool:
    """Placeholder filter predicate; the real logic is not implemented yet.

    Args:
        x: Currently unused.

    Returns:
        Always ``False``.
    """
    return False