# Embeddings Work
##### By Pranjal Chatterjee
Exploratory notebook for working on birdnet embeddings

### Imports and Setup

In [1]:
import pandas as pd
import os
from annotation_post_processing import *

In [2]:
embeddingColumns = [str(i) for i in range(420)]
columnNames = ["START", "END"] + embeddingColumns
path = './input/xc_embeddings/'

In [3]:
def generate_embeddings_from_file(path, filename):
    with open(path + filename, 'r') as f:
        data = f.read()
    with open(path + filename, 'w') as f:
        f.write(",".join(data.split("\t")))
    file_df = pd.read_csv(path + filename, names = columnNames)
    file_df["IN FILE"] = filename[:filename.index(".birdnet")] + ".wav"
    return file_df

def generate_embeddings(path):
    df = pd.DataFrame()
    for filename in os.listdir(path):
        df = pd.concat([df, generate_embeddings_from_file(path, filename)], ignore_index = True)
    df["PATH"] = path
    columns = df.columns.tolist()
    columns = columns[-2:] + columns[:-2]
    df = df[columns]
    df = df.sort_values("IN FILE", ascending = True)
    df = df.reset_index(drop = True)
    return df

In [4]:
embeddings_df = generate_embeddings(path)
annotations_df = pd.read_csv("xc_annotations.csv")

In [5]:
manual_df = pd.read_csv("mixed_bird_sample.csv")
manual_df = annotation_chunker(manual_df, 3)
manual_df["MANUAL ID"] = manual_df["MANUAL ID"].apply(lambda x: " ".join(x.split(" ")[:2]))
manual_df

Unnamed: 0,IN FILE,CLIP LENGTH,CHANNEL,OFFSET,DURATION,SAMPLE RATE,MANUAL ID
0,Amazona-aestiva-52744.wav,19.5657,0,0.0,3.0,44100,Amazona aestiva
1,Amazona-aestiva-52744.wav,19.5657,0,3.0,3.0,44100,Amazona aestiva
2,Amazona-aestiva-52744.wav,19.5657,0,6.0,3.0,44100,Amazona aestiva
3,Amazona-aestiva-52744.wav,19.5657,0,9.0,3.0,44100,Amazona aestiva
4,Amazona-aestiva-52744.wav,19.5657,0,12.0,3.0,44100,Amazona aestiva
...,...,...,...,...,...,...,...
976,Zenaida-auriculata-105264.wav,82.9910,0,48.0,3.0,44100,Zenaida auriculata
977,Zenaida-auriculata-105264.wav,82.9910,0,57.0,3.0,44100,Zenaida auriculata
978,Zenaida-auriculata-105264.wav,82.9910,0,60.0,3.0,44100,Zenaida auriculata
979,Zenaida-auriculata-105264.wav,82.9910,0,66.0,3.0,44100,Zenaida auriculata


In [6]:
# Data cleaning to avoid file does not exist
intersection_files = list(set(embeddings_df["IN FILE"].unique()).intersection(set(annotations_df["IN FILE"])))
embeddings_df = embeddings_df[embeddings_df["IN FILE"].isin(intersection_files)]
annotations_df = annotations_df[annotations_df["IN FILE"].isin(intersection_files)]
manual_df = manual_df[manual_df["IN FILE"].isin(intersection_files)]

In [7]:
# To change all of the birdnet annotations manual ids to species names
birdnet_species = pd.read_csv("birdnet_species.csv")
birdnet_species.columns = ["SPECIES"]
birdnet_species = birdnet_species.assign(
    SPECIES = birdnet_species["SPECIES"].apply(lambda x: x.split("_")[0]), 
    COMMON = birdnet_species["SPECIES"].apply(lambda x: x.split("_")[1])
)
birdnet_species = birdnet_species.set_index("COMMON").to_dict()["SPECIES"]

annotations_df["MANUAL ID"] = annotations_df["MANUAL ID"].apply(lambda x: birdnet_species[x])

In [8]:
embeddings_df

Unnamed: 0,IN FILE,PATH,START,END,0,1,2,3,4,5,...,410,411,412,413,414,415,416,417,418,419
0,Amazona-aestiva-52744.wav,./input/xc_embeddings/,6.0,9.0,0.524409,1.336747,0.553674,0.334472,0.743905,0.547638,...,0.784357,0.484552,0.180524,0.618135,0.340749,0.336239,1.109600,0.923518,0.698893,0.677029
1,Amazona-aestiva-52744.wav,./input/xc_embeddings/,0.0,3.0,0.762929,1.000408,0.432522,0.452946,0.656488,0.737373,...,0.533952,1.013328,0.400595,0.883827,1.193243,0.283634,1.542077,0.981433,0.721521,1.163081
2,Amazona-aestiva-52744.wav,./input/xc_embeddings/,9.0,12.0,0.436074,0.507001,0.915852,0.123285,0.442399,0.537654,...,1.134432,0.630764,0.510933,0.126209,0.478544,0.347008,1.033731,0.530908,0.840781,0.675452
3,Amazona-aestiva-52744.wav,./input/xc_embeddings/,12.0,15.0,0.433620,1.535415,0.639228,0.053500,0.584404,0.518180,...,1.255511,1.091713,0.233526,0.647830,0.581350,0.101694,0.756994,0.921158,1.034858,0.702051
4,Amazona-aestiva-52744.wav,./input/xc_embeddings/,15.0,18.0,0.703729,1.012433,0.550976,0.607597,0.871026,0.796031,...,0.778416,1.413776,0.716892,0.309670,1.341814,0.523564,1.665481,1.057191,1.319292,1.052040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269,Zenaida-auriculata-105264.wav,./input/xc_embeddings/,48.0,51.0,0.567677,0.336204,2.099955,0.175008,0.208602,1.150878,...,0.055681,0.259498,0.530284,0.155591,0.500197,0.867391,0.263993,0.493390,0.452804,0.472982
1270,Zenaida-auriculata-105264.wav,./input/xc_embeddings/,51.0,54.0,0.134432,0.210063,2.103433,0.231113,0.575302,1.138219,...,0.168995,0.114395,0.727569,0.384680,0.215787,0.761537,0.530923,0.342721,0.165045,0.351401
1271,Zenaida-auriculata-105264.wav,./input/xc_embeddings/,54.0,57.0,0.398779,0.000000,1.539110,0.218812,0.344461,0.749494,...,0.339884,0.174788,0.237779,0.315368,0.519122,0.773385,0.105009,0.378415,0.301615,0.233420
1272,Zenaida-auriculata-105264.wav,./input/xc_embeddings/,57.0,60.0,0.675475,0.314947,1.514462,0.318162,0.175607,1.050441,...,0.427013,0.356942,0.381076,0.383396,0.739124,0.598633,0.789171,0.173363,0.460750,0.464910


In [9]:
annotations_df

Unnamed: 0,FOLDER,IN FILE,CLIP LENGTH,CHANNEL,OFFSET,DURATION,SAMPLE RATE,MANUAL ID,CONFIDENCE
0,../passive-acoustic-biodiversity/BirdnetEmbedd...,Amazona-aestiva-52744.wav,42.592063,1,0.0,3.0,48000,Amazona ochrocephala,0.4382
1,../passive-acoustic-biodiversity/BirdnetEmbedd...,Amazona-aestiva-52744.wav,42.592063,1,0.0,3.0,48000,Amazona aestiva,0.1709
2,../passive-acoustic-biodiversity/BirdnetEmbedd...,Amazona-aestiva-52744.wav,42.592063,1,3.0,3.0,48000,Amazona aestiva,0.9149
3,../passive-acoustic-biodiversity/BirdnetEmbedd...,Amazona-aestiva-52744.wav,42.592063,1,3.0,3.0,48000,Amazona ochrocephala,0.2826
4,../passive-acoustic-biodiversity/BirdnetEmbedd...,Amazona-aestiva-52744.wav,42.592063,1,6.0,3.0,48000,Amazona ochrocephala,0.3460
...,...,...,...,...,...,...,...,...,...
1482,../passive-acoustic-biodiversity/BirdnetEmbedd...,Zenaida-auriculata-105264.wav,180.660726,1,75.0,3.0,48000,Patagioenas fasciata,0.1653
1483,../passive-acoustic-biodiversity/BirdnetEmbedd...,Zenaida-auriculata-105264.wav,180.660726,1,75.0,3.0,48000,Dendrocopos leucotos,0.1399
1484,../passive-acoustic-biodiversity/BirdnetEmbedd...,Zenaida-auriculata-105264.wav,180.660726,1,78.0,3.0,48000,Podilymbus podiceps,0.4580
1485,../passive-acoustic-biodiversity/BirdnetEmbedd...,Zenaida-auriculata-105264.wav,180.660726,1,78.0,3.0,48000,Zenaida auriculata,0.4318


In [10]:
manual_df

Unnamed: 0,IN FILE,CLIP LENGTH,CHANNEL,OFFSET,DURATION,SAMPLE RATE,MANUAL ID
0,Amazona-aestiva-52744.wav,19.5657,0,0.0,3.0,44100,Amazona aestiva
1,Amazona-aestiva-52744.wav,19.5657,0,3.0,3.0,44100,Amazona aestiva
2,Amazona-aestiva-52744.wav,19.5657,0,6.0,3.0,44100,Amazona aestiva
3,Amazona-aestiva-52744.wav,19.5657,0,9.0,3.0,44100,Amazona aestiva
4,Amazona-aestiva-52744.wav,19.5657,0,12.0,3.0,44100,Amazona aestiva
...,...,...,...,...,...,...,...
976,Zenaida-auriculata-105264.wav,82.9910,0,48.0,3.0,44100,Zenaida auriculata
977,Zenaida-auriculata-105264.wav,82.9910,0,57.0,3.0,44100,Zenaida auriculata
978,Zenaida-auriculata-105264.wav,82.9910,0,60.0,3.0,44100,Zenaida auriculata
979,Zenaida-auriculata-105264.wav,82.9910,0,66.0,3.0,44100,Zenaida auriculata


In [11]:
# Checking to see if same number of files are in all of the dataframes
print(len(embeddings_df["IN FILE"].unique()), len(annotations_df["IN FILE"].unique()), len(manual_df["IN FILE"].unique()))

99 99 99


### Centroid Exploration