In [1]:
%pip install esp-aves pandas seaborn requests scikit-learn tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
print(os.getcwd())

/data2/nitin/main/clustering


In [3]:
import pandas as pd
import os
from glob import glob

# Directory containing your wav files
data_dir = "../separation/american_crow_calls"

# Get all .wav files in the directory, sorted
fps = sorted(glob(os.path.join(data_dir, '*.wav')))

individuals = []
call_types = []
file_names = []
fps_sub = []

for fp in fps:
    bn = os.path.basename(fp)
    # Example filename: brachyrynchos_00001_1.wav
    parts = bn.split("_")

    # Make sure filename has at least 3 parts before .wav
    if len(parts) < 3:
        continue

    individual = parts[0]  # e.g., brachyrynchos
    call_type = parts[2].split(".")[0]  # e.g., 1 (after removing .wav)

    individuals.append(individual)
    call_types.append(call_type)
    file_names.append(bn)
    fps_sub.append(fp)

# Create a DataFrame from the collected data
df = pd.DataFrame({
    "filename": file_names,
    "filepath": fps_sub,
    "individual": individuals,
    "call_type": call_types
})

# Display the first few rows
print(df.head())


                    filename  \
0  brachyrynchos_00001_1.wav   
1  brachyrynchos_00002_1.wav   
2  brachyrynchos_00003_1.wav   
3  brachyrynchos_00005_1.wav   
4  brachyrynchos_00006_1.wav   

                                            filepath     individual call_type  
0  ../separation/american_crow_calls/brachyryncho...  brachyrynchos         1  
1  ../separation/american_crow_calls/brachyryncho...  brachyrynchos         1  
2  ../separation/american_crow_calls/brachyryncho...  brachyrynchos         1  
3  ../separation/american_crow_calls/brachyryncho...  brachyrynchos         1  
4  ../separation/american_crow_calls/brachyryncho...  brachyrynchos         1  


In [4]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torchaudio.models import wav2vec2_model
import json
import torch.nn as nn

In [5]:
class Vox(Dataset):
    def __init__(self, dataset_dataframe, audio_sr, annotation_names):
        """ Dataset for vocalization classification with AVES

        Input
        -----
        dataset_dataframe (pandas dataframe): indicating the filepath, annotations and partition of a signal
        audio_sr (int): sampling rate expected by network
        annotation_name (list[str]): string corresponding to the annotation columns in the dataframe, e.g. ["call_type","recording_date"]
        """
        super().__init__()
        self.audio_sr = audio_sr
        self.annotation_names = annotation_names
        self.dataset_info = dataset_dataframe

    def __len__(self):
        return len(self.dataset_info)

    def get_one_item(self, idx):
      """ Load base audio """
      row = self.dataset_info.iloc[idx]
      x, sr = torchaudio.load(row["filepath"])
      if len(x.size()) == 2:
          x = x[0, :]
      if sr != self.audio_sr:
          x = torchaudio.functional.resample(x, sr, self.audio_sr)
      return x, row

    def __getitem__(self, idx):
        x, row = self.get_one_item(idx)
        out = {"x" : x, "filepath" : row['filepath'], "filename" : row['filename']}
        for k in self.annotation_names:
          out[k] = row[k]
        return out

def get_dataloader(dataset_dataframe, audio_sr, annotation_names):
    return DataLoader(
            Vox(dataset_dataframe, audio_sr, annotation_names),
            batch_size=1,
            shuffle=False,
            drop_last=False
        )

In [6]:
import requests

def download_file(url, filename):
    print(f"Downloading {filename}...")
    response = requests.get(url)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Saved {filename}")

download_file("https://storage.googleapis.com/esp-public-files/ported_aves/aves-base-bio.torchaudio.pt", "aves-base-bio.torchaudio.pt")
download_file("https://storage.googleapis.com/esp-public-files/ported_aves/aves-base-bio.torchaudio.model_config.json", "aves-base-bio.torchaudio.model_config.json")


Downloading aves-base-bio.torchaudio.pt...
Saved aves-base-bio.torchaudio.pt
Downloading aves-base-bio.torchaudio.model_config.json...
Saved aves-base-bio.torchaudio.model_config.json


In [7]:
import sklearn.cluster
from tqdm import tqdm
import matplotlib.pyplot as plt

In [8]:
class AvesMeanEmbedding(nn.Module):
    """ Uses AVES Hubert to embed sounds and classify """
    def __init__(self, config_path, model_path, embedding_dim=768):
        super().__init__()
        # reference: https://pytorch.org/audio/stable/_modules/torchaudio/models/wav2vec2/utils/import_fairseq.html
        self.config = self.load_config(config_path)
        self.model = wav2vec2_model(**self.config, aux_num_out=None)
        self.model.load_state_dict(torch.load(model_path))
        self.audio_sr = 16000

    def load_config(self, config_path):
        with open(config_path, 'r') as ff:
            obj = json.load(ff)
        return obj

    def forward(self, sig):
        """
        Input
          sig (Tensor): (batch, time)
        Returns
          mean_embedding (Tensor): (batch, output_dim)
          logits (Tensor): (batch, n_classes)
        """
        # extract_feature in the sorchaudio version will output all 12 layers' output, -1 to select the final one
        out = self.model.extract_features(sig)[0][-1]
        mean_embedding = out.mean(dim=1) #over time
        return mean_embedding

In [9]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

def run_save_embeddings_only(dataset_dataframe, annotation_names, aves_sr=16000, output_csv="aves_embeddings_base_bio.csv"):

    print("~~ Setting up model")
    embedding_model = AvesMeanEmbedding(
        config_path="aves-base-bio.torchaudio.model_config.json",
        model_path="aves-base-bio.torchaudio.pt"
    )
    embedding_model.eval()
    if torch.cuda.is_available():
        embedding_model.cuda()

    print("~~ Setting up dataloader")
    dataloader = get_dataloader(dataset_dataframe, embedding_model.audio_sr, annotation_names)

    print(f"~~ Compute AVES embeddings")
    features = []
    filenames = []

    with torch.no_grad():
        for data_idx, data_dict in enumerate(tqdm(dataloader)):
            x = data_dict["x"]
            if torch.cuda.is_available():
                x = x.cuda()
            emb = embedding_model(x).cpu().numpy()  # shape: (batch_size, embedding_dim)
            
            # Assuming batch size=1 usually, but handle batch dimension anyway:
            for i in range(emb.shape[0]):
                features.append(emb[i])
                filenames.append(data_dict["filename"][i])

    # Convert list of embeddings to numpy array
    X = np.vstack(features)  # shape (n_samples, embedding_dim)

    # Prepare DataFrame with filenames and embeddings as columns
    df_out = pd.DataFrame(X, columns=[f"embedding_{i}" for i in range(X.shape[1])])
    df_out.insert(0, "filename", filenames)

    # Save to CSV
    df_out.to_csv(output_csv, index=False)
    print(f"Saved embeddings to {output_csv}")

    return df_out

In [10]:
df_embeddings = run_save_embeddings_only(df, annotation_names=[])

~~ Setting up model


  self.model.load_state_dict(torch.load(model_path))


~~ Setting up dataloader
~~ Compute AVES embeddings


100%|██████████| 37665/37665 [1:08:17<00:00,  9.19it/s]


Saved embeddings to aves_embeddings_base_bio.csv
