# WTS Pipeline Integration
Exploratory notebook for working with BirdNET embeddings.

### Imports and Setup

In [1]:
import pandas as pd
import os
import numpy as np
from annotation_post_processing import *

In [2]:
# Labels for the embedding columns: the strings "0" through "419".
embeddingColumns = list(map(str, range(420)))
# Column layout used when parsing an embeddings file: time bounds first, then the embedding values.
columnNames = ["START", "END", *embeddingColumns]
# Directory that holds the embedding files.
path = './input/cosmos_embeddings/'

In [3]:
import pandas as pd
import os

def generate_embeddings_from_file(path, filename, column_names=None):
    """Parse one embeddings text file into a DataFrame.

    The file is read as text, every tab is replaced by a comma in memory, and
    the result is parsed as header-less CSV. The file on disk is never
    modified.

    Args:
        path: Directory containing the file.
        filename: Name of the embeddings file; it must contain ".birdnet".
        column_names: Column labels to assign to the parsed values. Defaults
            to the notebook-level ``columnNames``.

    Returns:
        DataFrame with one row per line of the file, plus an "IN FILE" column
        holding the part of ``filename`` before ".birdnet" with ".wav"
        appended.

    Raises:
        ValueError: If ``filename`` does not contain ".birdnet".
    """
    import io

    if column_names is None:
        column_names = columnNames
    # Derive the audio file name first so that files which are not embeddings
    # files are rejected before anything is read.
    wav_name = filename[:filename.index(".birdnet")] + ".wav"
    with open(os.path.join(path, filename), 'r') as f:
        data = f.read()
    # Convert tabs to commas in memory instead of rewriting the source file.
    file_df = pd.read_csv(io.StringIO(data.replace("\t", ",")), names = column_names)
    file_df["IN FILE"] = wav_name
    return file_df

def generate_embeddings(path):
    """Load every embeddings file in a directory into one DataFrame.

    Each entry of ``os.listdir(path)`` is passed to
    ``generate_embeddings_from_file``. Files that fail to parse are reported
    (with the error that occurred) and skipped.

    Args:
        path: Directory to scan; it is also stored in the "PATH" column.

    Returns:
        DataFrame whose first two columns are "IN FILE" and "PATH", followed
        by the parsed columns, sorted by "IN FILE" then "START" with a fresh
        0-based index. If no file could be parsed, an empty DataFrame with
        only the "IN FILE" and "PATH" columns is returned.
    """
    frames = []
    for filename in os.listdir(path):
        try:
            frames.append(generate_embeddings_from_file(path, filename))
            print("Done with " + filename)
        except Exception as e:
            # Best-effort load: keep going, but say why the file was skipped.
            print("Something went wrong with: " + filename + f" ({type(e).__name__}: {e})")

    leading = ["IN FILE", "PATH"]
    if not frames:
        # Nothing parsed: there is no "START" column to sort on.
        return pd.DataFrame(columns = leading)

    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(frames, ignore_index = True)
    df["PATH"] = path
    df = df[leading + [column for column in df.columns if column not in leading]]
    df = df.sort_values(["IN FILE", "START"], ascending = True)
    df = df.reset_index(drop = True)
    return df

In [4]:
# Load every embeddings file found under `path` into a single DataFrame.
embeddings_df = generate_embeddings(path)
# Bare expression: displays the frame as the cell output.
embeddings_df

Done with XC63636 - Southern Nightingale-Wren - Microcerculus marginatus.birdnet.embeddings.txt
Done with XC84810 - Rufous-collared Sparrow - Zonotrichia capensis subtorquata.birdnet.embeddings.txt
Done with XC64386 - Buff-throated Woodcreeper - Xiphorhynchus guttatus.birdnet.embeddings.txt
Done with XC98265 - Yellow-olive Flatbill - Tolmomyias sulphurescens confusus.birdnet.embeddings.txt
Done with XC601007 - Rufous-collared Sparrow - Zonotrichia capensis.birdnet.embeddings.txt
Done with XC484117 - Buff-throated Woodcreeper - Xiphorhynchus guttatus.birdnet.embeddings.txt
Done with XC699683 - Yellow-olive Flatbill - Tolmomyias sulphurescens.birdnet.embeddings.txt
Done with XC301004 - Rufous-collared Sparrow - Zonotrichia capensis.birdnet.embeddings.txt
Done with XC104217 - Slate-throated Whitestart - Myioborus miniatus.birdnet.embeddings.txt
Done with XC609552 - White-throated Toucan - Ramphastos tucanus.birdnet.embeddings.txt
Done with XC128171 - Rufous-collared Sparrow - Zonotrichia 

Unnamed: 0,IN FILE,PATH,START,END,0,1,2,3,4,5,...,410,411,412,413,414,415,416,417,418,419
0,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,0.0,3.0,0.516491,0.650946,0.447532,0.008416,0.976974,0.210846,...,0.472520,0.764997,0.224773,0.307177,0.618153,0.572281,0.898645,0.424682,0.784120,0.582382
1,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,3.0,6.0,0.332206,0.504845,0.053778,0.049535,0.893746,0.282089,...,0.801616,0.414043,0.599959,0.683068,0.572789,0.827336,0.678146,0.956513,0.310204,0.560730
2,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,6.0,9.0,0.508598,0.422192,0.175864,0.046417,1.243934,0.341751,...,0.366465,0.344001,0.751044,0.310172,1.276110,0.756434,0.482373,0.850690,0.280638,1.044262
3,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,9.0,12.0,0.834292,0.763748,0.289305,0.254106,0.966129,0.475406,...,0.715341,0.837616,0.776913,0.834856,1.046136,1.034076,0.706145,0.880231,0.625652,0.480209
4,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,12.0,15.0,0.775968,0.195914,0.237553,0.111716,1.452784,0.297751,...,0.350133,0.739770,1.074138,0.476351,1.474918,1.059757,0.871452,1.035708,0.440124,0.510227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39953,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,0.0,3.0,1.055252,1.559146,1.135013,1.333304,0.308816,0.951709,...,0.162229,0.277709,0.623836,0.808401,0.348465,0.968598,0.382283,2.083897,0.000000,0.639748
39954,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,3.0,6.0,1.132612,0.653347,1.251634,0.573085,0.714166,1.560998,...,0.978767,0.457551,0.526104,0.413813,0.551209,0.755623,0.408722,0.310266,0.451188,0.295859
39955,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,6.0,9.0,1.082742,0.205875,0.709146,0.203989,0.907365,0.756531,...,0.632167,0.156727,0.067946,0.902726,0.399330,0.752100,0.304018,1.156913,0.044347,1.153144
39956,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,9.0,12.0,1.338971,0.901556,1.706767,0.925638,0.152560,1.294218,...,0.814605,0.189816,0.188148,0.987711,0.478155,0.284822,0.135123,0.540614,0.374931,0.047007


In [5]:
# Read each automated-annotation CSV into its own DataFrame, in this fixed order.
automated_dfs: list[pd.DataFrame] = [
    pd.read_csv(csv_path)
    for csv_path in (
        "cosmos_annotations/automated_cosmos_tweety_to_file.csv",
        "cosmos_annotations/COSMOS_BirdNET-Lite_Labels_05Conf.csv",
        "cosmos_annotations/COSMOS_BirdNET-Lite_Labels_100.csv",
        "cosmos_annotations/COSMOS_BirdNET-Lite-Filename_Labels_05Conf.csv",
        "cosmos_annotations/COSMOS_Microfaune-Filename_Labels_100.csv",
    )
]
print(automated_dfs)

[       Unnamed: 0    OFFSET  DURATION  \
0               0  1.883721  0.116279   
1               1  3.976744  0.046512   
2               2  5.976744  0.046512   
3               3  6.023256  0.046512   
4               4  6.069767  0.116279   
...           ...       ...       ...   
55642       55642  1.511628  0.325581   
55643       55643  1.860465  0.139535   
55644       55644  5.976744  0.372093   
55645       55645  6.627907  0.255814   
55646       55646  0.046512  0.255814   

                                         FOLDER  \
0      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
1      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
2      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
3      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
4      C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
...                                         ...   
55642  C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
55643  C:/Users/Siloux/Desktop/E4E/Cosmos_data/   
55644  C:/Users/Siloux/Desktop/E4E/Cosmos_data/

### Filtering Embeddings with HDBSCAN

In [6]:
%matplotlib inline
import hdbscan
from hdbscan import HDBSCAN
from hdbscan.prediction import approximate_predict
import pickle
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from sklearn.preprocessing import LabelEncoder

plt.style.use("ggplot")

In [7]:
# Load the pickled HDBSCAN model from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load model files that come from a trusted source.
with open("./ClusteringModels/hdbscan_cosmos_model.pkl", "rb") as f:
    hdbscan_model:HDBSCAN = pickle.load(f)

In [10]:
# Attach the model's stored cluster labels to the embeddings by position.
# NOTE(review): this assumes hdbscan_model.labels_ has exactly one entry per row of
# embeddings_df, in the same row order — confirm the model was fit on this frame.
embeddings_df["HDBSCAN PREDICTION"] = hdbscan_model.labels_
# Drop rows with a missing value in any embedding column, then renumber the index from 0.
# NOTE(review): embeddings_df is reassigned, so re-running this cell starts from the
# already-trimmed frame rather than the freshly loaded one.
embeddings_df = embeddings_df.dropna(subset = embeddingColumns).reset_index().drop("index", axis = 1)
embeddings_df

Unnamed: 0,IN FILE,PATH,START,END,0,1,2,3,4,5,...,411,412,413,414,415,416,417,418,419,HDBSCAN PREDICTION
0,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,0.0,3.0,0.516491,0.650946,0.447532,0.008416,0.976974,0.210846,...,0.764997,0.224773,0.307177,0.618153,0.572281,0.898645,0.424682,0.784120,0.582382,743
1,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,3.0,6.0,0.332206,0.504845,0.053778,0.049535,0.893746,0.282089,...,0.414043,0.599959,0.683068,0.572789,0.827336,0.678146,0.956513,0.310204,0.560730,743
2,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,6.0,9.0,0.508598,0.422192,0.175864,0.046417,1.243934,0.341751,...,0.344001,0.751044,0.310172,1.276110,0.756434,0.482373,0.850690,0.280638,1.044262,743
3,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,9.0,12.0,0.834292,0.763748,0.289305,0.254106,0.966129,0.475406,...,0.837616,0.776913,0.834856,1.046136,1.034076,0.706145,0.880231,0.625652,0.480209,613
4,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,12.0,15.0,0.775968,0.195914,0.237553,0.111716,1.452784,0.297751,...,0.739770,1.074138,0.476351,1.474918,1.059757,0.871452,1.035708,0.440124,0.510227,743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39946,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,0.0,3.0,1.055252,1.559146,1.135013,1.333304,0.308816,0.951709,...,0.277709,0.623836,0.808401,0.348465,0.968598,0.382283,2.083897,0.000000,0.639748,-1
39947,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,3.0,6.0,1.132612,0.653347,1.251634,0.573085,0.714166,1.560998,...,0.457551,0.526104,0.413813,0.551209,0.755623,0.408722,0.310266,0.451188,0.295859,-1
39948,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,6.0,9.0,1.082742,0.205875,0.709146,0.203989,0.907365,0.756531,...,0.156727,0.067946,0.902726,0.399330,0.752100,0.304018,1.156913,0.044347,1.153144,-1
39949,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,9.0,12.0,1.338971,0.901556,1.706767,0.925638,0.152560,1.294218,...,0.189816,0.188148,0.987711,0.478155,0.284822,0.135123,0.540614,0.374931,0.047007,-1


In [11]:
# Select the windows whose HDBSCAN label is -1 and keep only the identifying
# columns (file, path, time bounds) plus the label itself.
filtered_embeddings = embeddings_df.loc[
    embeddings_df["HDBSCAN PREDICTION"] == -1,
    ["IN FILE", "PATH", "START", "END", "HDBSCAN PREDICTION"],
]
print("Created filter")

filtered_embeddings

Created filter


Unnamed: 0,IN FILE,PATH,START,END,HDBSCAN PREDICTION
11,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,33.0,36.0,-1
12,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,36.0,39.0,-1
13,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,39.0,42.0,-1
14,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,42.0,45.0,-1
15,XC100027 - Southern Nightingale-Wren - Microce...,./input/cosmos_embeddings/,45.0,48.0,-1
...,...,...,...,...,...
39946,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,0.0,3.0,-1
39947,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,3.0,6.0,-1
39948,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,6.0,9.0,-1
39949,XC9881 - Blue-grey Tanager - Thraupis episcopu...,./input/cosmos_embeddings/,9.0,12.0,-1


### Applying Mask to WTS Pipeline

In [38]:
count1 = 0
def split_annotations(df: pd.DataFrame, window: float = 3.0, verbose: bool = True) -> pd.DataFrame:
    """Split each annotation at fixed window boundaries.

    Every row of ``df`` describes an annotation by "OFFSET" and "DURATION".
    An annotation that crosses one or more multiples of ``window`` is cut at
    those boundaries, producing one output row per piece; all other columns
    are copied unchanged onto each piece.

    An annotation that ends exactly on a boundary does not produce an extra
    zero-length piece in the following window.

    Args:
        df: Annotations with numeric "OFFSET" and "DURATION" columns.
        window: Window length used for the boundaries (same unit as
            "OFFSET"). Defaults to 3.0.
        verbose: When True, print a progress line after each annotation.

    Returns:
        DataFrame with the same columns as ``df`` and a fresh 0-based index,
        holding the pieces in input order.

    Side effects:
        Increments the module-level ``count1`` once per input annotation.
    """
    global count1
    pieces = []
    for record in df.to_dict("records"):
        ann_start = record["OFFSET"]
        ann_end = record["OFFSET"] + record["DURATION"]
        first_window = int(ann_start / window)
        last_window = int(ann_end / window)
        # Ending exactly on a boundary would otherwise yield an empty piece
        # at the start of the next window.
        if last_window > first_window and last_window * window >= ann_end:
            last_window -= 1
        for w in range(first_window, last_window + 1):
            # The first piece starts at the annotation start, the last ends at
            # the annotation end; every other edge is a window boundary.
            piece_start = ann_start if w == first_window else w * window
            piece_end = ann_end if w == last_window else (w + 1) * window
            pieces.append({**record, "OFFSET": piece_start, "DURATION": piece_end - piece_start})
        count1 += 1
        if verbose:
            print(f"Completed {count1} annotations")
    # Build the frame once from all pieces instead of concatenating per row.
    return pd.DataFrame(pieces, columns = df.columns).reset_index(drop = True)

count2 = 0
def create_annotation_filter(x: pd.Series, filter: pd.DataFrame, verbose: bool = True) -> pd.Series:
    """Flag an annotation that falls in any filtered window of its file.

    Windows are the rows of ``filter`` whose "IN FILE" starts with the
    annotation's file name up to ".mp3". The annotation is flagged when, for
    at least one such window, any of the following holds:

    * its start is close to the window "START",
    * its end (OFFSET + DURATION) is close to the window "END",
    * it lies entirely inside the window.

    Every window is considered; a match on one window is not undone by a
    later non-matching window. When the file has no windows in ``filter`` the
    annotation is explicitly marked as not filtered.

    Args:
        x: One annotation row with "IN FILE", "OFFSET" and "DURATION".
        filter: Windows to filter on, with "IN FILE", "START" and "END".
        verbose: When True, print a progress line after the annotation.

    Returns:
        ``x`` with a boolean "FILTERED" entry set.

    Side effects:
        Increments the module-level ``count2``.
    """
    global count2
    file_stem = x["IN FILE"].split(".mp3")[0]
    same_file = filter["IN FILE"].str.startswith(file_stem, na = False)
    window_starts = filter.loc[same_file, "START"].to_numpy(dtype = float)
    window_ends = filter.loc[same_file, "END"].to_numpy(dtype = float)

    ann_start = float(x["OFFSET"])
    ann_end = ann_start + float(x["DURATION"])

    # Edge alignment with a window (the original matching rule).
    touches_edge = np.isclose(ann_start, window_starts) | np.isclose(ann_end, window_ends)
    # Annotations strictly inside a window touch neither edge but are still covered by it.
    tolerance = 1e-8
    inside = (window_starts - tolerance <= ann_start) & (ann_end <= window_ends + tolerance)

    # np.any over empty arrays is False, so files without windows get FILTERED = False.
    x["FILTERED"] = bool(np.any(touches_edge | inside))

    count2 += 1
    if verbose:
        print(f"Completed {count2} annotations")
    return x

# Backup: earlier versions of split_annotations and create_annotation_filter, kept for reference.
# def split_annotations(df: pd.DataFrame):
#     all_split_ann = pd.DataFrame(columns=df.columns)
#     for i in range(df.shape[0]):
#         split_ann = pd.DataFrame(df.iloc[i]).T
#         while True:
#             ind = split_ann.shape[0] - 1
#             row = split_ann.index[ind]
#             if (int(split_ann.at[row,"OFFSET"] / 3) == int((split_ann.at[row,"OFFSET"] + split_ann.at[row,"DURATION"])/3)):
#                 break
#             new_x = pd.DataFrame(split_ann.iloc[ind].copy()).T
#             new_x.index = [row + 1]
#             new_x["OFFSET"] = 3.0 * (int(split_ann.at[row,"OFFSET"] / 3) + 1)
#             split_ann.at[row,"DURATION"] = 3.0 * (int(split_ann.at[row,"OFFSET"] / 3) + 1) - split_ann.at[row,"OFFSET"]
#             split_ann = pd.concat([split_ann, new_x])
#         all_split_ann = pd.concat([all_split_ann, split_ann])
#         global count1
#         count1 += 1
#         print(f"Completed {count1} annotations")
#     return all_split_ann

# def create_annotation_filter(x: pd.Series, filter: pd.DataFrame) -> pd.DataFrame:
#     filter_x = filter[filter["IN FILE"].str.startswith(x["IN FILE"].split(".mp3")[0])][["START", "END"]]
#     for i in range(len(filter_x["START"])):
#         start, end = filter_x.iloc[i, 0], filter_x.iloc[i, 1]
#         offset = x["OFFSET"]
#         duration = x["DURATION"]
#         if (start <= offset <= end):
#             x["OFFSET"] = end
#         if (start <= offset + duration <= end):
#             x["DURATION"] = start - x["OFFSET"]
#         if x["DURATION"] < 0:
#             x["FILTERED"] = True
#             break
#     if (np.isclose(x["DURATION"], 0) or x["DURATION"] < 0):
#         x["FILTERED"] = True
#     else:
#         x["FILTERED"] = False
#     global count2
#     count2 += 1
#     print(f"Completed {count2} annotations")
#     return x

In [40]:
# Variant that splits every annotation set (produces a list of DataFrames):
# automated_dfs_split = [split_annotations(df) for df in automated_dfs]
# Currently only the first annotation set is split, so automated_dfs_split is a
# single DataFrame rather than a list.
automated_dfs_split = split_annotations(automated_dfs[0])
automated_dfs_split

Completed 1 annotations
Completed 2 annotations
Completed 3 annotations
Completed 4 annotations
Completed 5 annotations
Completed 6 annotations
Completed 7 annotations
Completed 8 annotations
Completed 9 annotations
Completed 10 annotations
Completed 11 annotations
Completed 12 annotations
Completed 13 annotations
Completed 14 annotations
Completed 15 annotations
Completed 16 annotations
Completed 17 annotations
Completed 18 annotations
Completed 19 annotations
Completed 20 annotations
Completed 21 annotations
Completed 22 annotations
Completed 23 annotations
Completed 24 annotations
Completed 25 annotations
Completed 26 annotations
Completed 27 annotations
Completed 28 annotations
Completed 29 annotations
Completed 30 annotations
Completed 31 annotations
Completed 32 annotations
Completed 33 annotations
Completed 34 annotations
Completed 35 annotations
Completed 36 annotations
Completed 37 annotations
Completed 38 annotations
Completed 39 annotations
Completed 40 annotations
Completed

Unnamed: 0.1,Unnamed: 0,OFFSET,DURATION,FOLDER,IN FILE,CHANNEL,CLIP LENGTH,SAMPLE RATE,MANUAL ID,CONFIDENCE,FILEPATH
0,0,1.883721,0.116279,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC100027 - Southern Nightingale-Wren - Microce...,0,81.528163,44100,Microcerculus marginatus,0.543653,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC1000...
1,1,3.976744,0.046512,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC100027 - Southern Nightingale-Wren - Microce...,0,81.528163,44100,Microcerculus marginatus,0.230806,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC1000...
2,2,5.976744,0.023256,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC100027 - Southern Nightingale-Wren - Microce...,0,81.528163,44100,Microcerculus marginatus,0.204163,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC1000...
3,2,6.000000,0.023256,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC100027 - Southern Nightingale-Wren - Microce...,0,81.528163,44100,Microcerculus marginatus,0.204163,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC1000...
4,3,6.023256,0.046512,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC100027 - Southern Nightingale-Wren - Microce...,0,81.528163,44100,Microcerculus marginatus,0.264257,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC1000...
...,...,...,...,...,...,...,...,...,...,...,...
79884,55643,1.860465,0.139535,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC9881 - Blue-grey Tanager - Thraupis episcopu...,0,14.7,44100,Thraupis episcopus,0.855924,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC9881...
79885,55644,5.976744,0.023256,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC9881 - Blue-grey Tanager - Thraupis episcopu...,0,14.7,44100,Thraupis episcopus,0.775321,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC9881...
79886,55644,6.000000,0.348837,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC9881 - Blue-grey Tanager - Thraupis episcopu...,0,14.7,44100,Thraupis episcopus,0.775321,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC9881...
79887,55645,6.627907,0.255814,C:/Users/Siloux/Desktop/E4E/Cosmos_data/,XC9881 - Blue-grey Tanager - Thraupis episcopu...,0,14.7,44100,Thraupis episcopus,0.736337,C:/Users/Siloux/Desktop/E4E/Cosmos_data/XC9881...


In [42]:
# Run create_annotation_filter on every row, handing it the filtered windows
# as its second argument.
automated_dfs_filtered = automated_dfs_split.apply(
    create_annotation_filter,
    axis = 1,
    args = (filtered_embeddings,),
)
automated_dfs_filtered

Completed 1 annotations
Completed 2 annotations
Completed 3 annotations
Completed 4 annotations
Completed 5 annotations
Completed 6 annotations
Completed 7 annotations
Completed 8 annotations
Completed 9 annotations
Completed 10 annotations
Completed 11 annotations
Completed 12 annotations
Completed 13 annotations
Completed 14 annotations
Completed 15 annotations
Completed 16 annotations
Completed 17 annotations
Completed 18 annotations
Completed 19 annotations
Completed 20 annotations
Completed 21 annotations
Completed 22 annotations
Completed 23 annotations
Completed 24 annotations
Completed 25 annotations
Completed 26 annotations
Completed 27 annotations
Completed 28 annotations
Completed 29 annotations
Completed 30 annotations
Completed 31 annotations
Completed 32 annotations
Completed 33 annotations
Completed 34 annotations
Completed 35 annotations
Completed 36 annotations
Completed 37 annotations
Completed 38 annotations
Completed 39 annotations
Completed 40 annotations
Completed

KeyboardInterrupt: 

In [None]:
# automated_dfs_split is either one DataFrame (a single annotation set was split)
# or a list of them. Iterating a DataFrame directly yields its column labels, not
# frames, so normalize to a list of frames first.
split_frames = automated_dfs_split if isinstance(automated_dfs_split, list) else [automated_dfs_split]
automated_dfs_filtered = [
    frame.apply(lambda x: create_annotation_filter(x, filtered_embeddings), axis = 1)
    for frame in split_frames
]
# Keep the rows that were not flagged. .eq(True) yields a plain boolean mask even
# when FILTERED contains missing values, which `~` alone cannot handle.
automated_dfs_filtered = [frame[~frame["FILTERED"].eq(True)] for frame in automated_dfs_filtered]

In [None]:
# Display the first entry of automated_dfs_filtered.
automated_dfs_filtered[0]

In [None]:
# Generate class statistics
# Do this for manual --> automated WTS data
# Run class statistics for embedding filtered automated annotations