# Cluster and Visualize Unlabeled Acoustic Embeddings (PCA)

This notebook:
1. Loads the 1536-dim Perch embeddings
2. Reduces dimensions with **PCA** to **256D** (configurable for clustering)
3. Clusters with **K-means** on PCA embeddings
4. Adds cluster assignments to the manifest
5. Saves PCA embeddings and clustering outputs for review

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from os import environ

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load Embeddings and Manifest

In [2]:
# Dataset root: the POSIDONIA_DATASET_DIR environment variable takes precedence;
# otherwise fall back to the hard-coded local Windows path below.
env_dir = environ.get("POSIDONIA_DATASET_DIR")
DATASET_DIR = Path(env_dir) if env_dir else Path("D:\\Posidonia Soundscapes\\Fondeo 1_Formentera Ille Espardell\\Embeddings_2\\dataset")

EMBEDDINGS_PATH = DATASET_DIR / "npy_files" / "unlabeled_embeddings.npy"
MANIFEST_PATH = DATASET_DIR / "unlabeled_manifest.csv"

embeddings = np.load(str(EMBEDDINGS_PATH))
manifest_df = pd.read_csv(str(MANIFEST_PATH))

# Later cells pair embedding row i with manifest row i purely by position,
# so a shape or row-count mismatch must stop the notebook here rather than
# surface later as a confusing error or a silently misaligned manifest.
if embeddings.ndim != 2:
    raise ValueError(
        f"Expected a 2-D embedding matrix (rows x dims), got shape {embeddings.shape}"
    )
if embeddings.shape[0] != len(manifest_df):
    raise ValueError(
        f"Embeddings have {embeddings.shape[0]} rows but the manifest has "
        f"{len(manifest_df)} rows; they must describe the same segments."
    )

print(f"Loaded {embeddings.shape[0]} embeddings of dimension {embeddings.shape[1]}")
print(f"Manifest has {len(manifest_df)} rows")
manifest_df.head()

Loaded 392400 embeddings of dimension 1536
Manifest has 392400 rows


Unnamed: 0,original_audio,embedding_path,segment_path,audio_path,file_name,embedding_dim
0,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-03.wav,1536
1,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-08.wav,1536
2,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-13.wav,1536
3,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-18.wav,1536
4,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-23.wav,1536


## 2. Dimensionality Reduction: PCA

Reduce 1536D embeddings to **256D** PCA components for clustering.

In [3]:
# PCA outputs live next to the raw embeddings, in their own sub-folder.
output_dir = Path(EMBEDDINGS_PATH).parent
pca_output_dir = output_dir / "PCA_256D"
pca_output_dir.mkdir(parents=True, exist_ok=True)

pca_file = pca_output_dir / "pca_embeddings_256d.npy"

if pca_file.exists():
    print("Loading pre-computed PCA embeddings...")
    pca_results = np.load(str(pca_file))
    print(f"Loaded PCA embeddings: {pca_results.shape}")

    # A cache left over from a different dataset (or a different component
    # count) would otherwise be used silently; reject it explicitly.
    if pca_results.ndim != 2 or pca_results.shape[0] != embeddings.shape[0]:
        raise ValueError(
            f"Cached PCA embeddings {pca_results.shape} do not match the "
            f"{embeddings.shape[0]} loaded embeddings. Delete {pca_file} to recompute."
        )
    if pca_results.shape[1] != 256:
        raise ValueError(
            f"Cached PCA embeddings have {pca_results.shape[1]} components, expected 256. "
            f"Delete {pca_file} to recompute."
        )
else:
    print("Running PCA to 256D...")
    # copy=False avoids duplicating the matrix when it is already float32.
    embeddings_fp32 = embeddings.astype(np.float32, copy=False)
    pca_model = PCA(n_components=256, random_state=42)
    pca_results = pca_model.fit_transform(embeddings_fp32)

    np.save(str(pca_file), pca_results)
    print(f"Saved PCA embeddings to: {pca_file}")

# Expose the first three principal components as manifest columns.
manifest_df["pca_x"] = pca_results[:, 0]
manifest_df["pca_y"] = pca_results[:, 1]
manifest_df["pca_z"] = pca_results[:, 2]

print("PCA embeddings added to manifest.")

Running PCA to 256D...
Saved PCA embeddings to: D:\Posidonia Soundscapes\Fondeo 1_Formentera Ille Espardell\Embeddings_2\dataset\npy_files\PCA_256D\pca_embeddings_256d.npy
PCA embeddings added to manifest.


## 3. K-means Clustering on PCA

Cluster the 256D PCA embeddings with K-means. The number of clusters `k` is adjustable in the cell below; this run uses `k = 25`.

In [4]:
k = 25  # Change this as needed

# The label cache is keyed by k, so each cluster count gets its own file.
pca_labels_file = pca_output_dir / f"pca_kmeans_labels_k{k}.npy"

if not pca_labels_file.exists():
    # No cached labels for this k: fit K-means and persist the assignments.
    print(f"Running K-means clustering with k={k} on PCA embeddings...")
    kmeans_pca = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans_pca.fit(pca_results)

    pca_labels, centroids_pca = kmeans_pca.labels_, kmeans_pca.cluster_centers_

    np.save(str(pca_labels_file), pca_labels)
    print(f"Saved clustering results to: {pca_labels_file}")
else:
    print(f"Loading pre-computed PCA K-means clustering (k={k})...")
    pca_labels = np.load(str(pca_labels_file))

    if len(pca_labels) != len(pca_results):
        raise ValueError("Loaded labels do not match PCA embedding length.")

    # Only the labels are cached, so each centroid is rebuilt as the mean of
    # the PCA rows assigned to that cluster.
    member_means = []
    for cluster_id in range(k):
        member_means.append(pca_results[pca_labels == cluster_id].mean(axis=0))
    centroids_pca = np.vstack(member_means)

# Attach the assignment to the manifest and report how many rows fell in each cluster.
manifest_df["pca_cluster"] = pca_labels
cluster_sizes = np.bincount(pca_labels, minlength=k)

print(f"Clustering done! Found {k} clusters")
print("\nCluster size distribution (np.bincount):")
print(cluster_sizes)

manifest_df.head()

Running K-means clustering with k=25 on PCA embeddings...


[WinError 2] The system cannot find the file specified
  File "c:\Users\USER\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\USER\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Saved clustering results to: D:\Posidonia Soundscapes\Fondeo 1_Formentera Ille Espardell\Embeddings_2\dataset\npy_files\PCA_256D\pca_kmeans_labels_k25.npy
Clustering done! Found 25 clusters

Cluster size distribution (np.bincount):
[20199 10360 27126  9163 44162 16135  3860  7366 11850 12733 22614 10742
 33893 12962 31618 17412 13520  8825 19238  7808  5844 15139  7389 15138
  7304]


Unnamed: 0,original_audio,embedding_path,segment_path,audio_path,file_name,embedding_dim,pca_x,pca_y,pca_z,pca_cluster
0,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-03.wav,1536,1.44626,-1.600876,0.444317,6
1,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-08.wav,1536,-0.665084,-0.287637,-0.277907,12
2,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-13.wav,1536,-0.639054,-0.512064,-0.482361,12
3,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-18.wav,1536,-0.627045,-0.383027,-0.420253,12
4,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,channelA_2025-05-16_14-00-23.wav,1536,-0.652139,-0.313657,-0.455247,12


## 4. Sample Strategy: 20 nearest + 20 farthest + 60 random per cluster

For each PCA cluster, select up to 100 audio samples for review.

In [5]:
from tqdm import tqdm

def sample_cluster(cluster_indices, cluster_embeddings, centroid, n_nearest=20, n_farthest=20, n_random=60):
    """Select a review subsample from one cluster.

    The subsample is the union of the members closest to ``centroid``, the
    members farthest from it, and a random draw from whatever is left.

    Args:
        cluster_indices: 1-D integer array of row indices belonging to the cluster.
        cluster_embeddings: 2-D array with one row per entry of ``cluster_indices``.
        centroid: vector from which Euclidean distances are measured.
        n_nearest: how many of the closest members to keep.
        n_farthest: how many of the farthest members to keep.
        n_random: how many additional members to draw at random from the rest.

    Returns:
        Sorted array of unique indices taken from ``cluster_indices``. It holds
        fewer than ``n_nearest + n_farthest + n_random`` entries when the
        cluster is small or the nearest and farthest sets overlap.
    """
    dists = np.linalg.norm(cluster_embeddings - centroid, axis=1)
    order = np.argsort(dists)
    n_members = len(order)

    # Clamp every count to [0, n_members] so zero or negative requests select nothing.
    n_near = max(0, min(n_nearest, n_members))
    n_far = max(0, min(n_farthest, n_members))
    n_rand = max(0, n_random)

    nearest_idx = cluster_indices[order[:n_near]]
    # Slice from an explicit start position: order[-0:] would be the WHOLE
    # array, so a request for zero farthest members used to return everything.
    farthest_idx = cluster_indices[order[n_members - n_far:]]

    selected = np.unique(np.concatenate([nearest_idx, farthest_idx]))
    remaining_idx = np.setdiff1d(cluster_indices, selected)

    if len(remaining_idx) > n_rand:
        # A fresh generator with a fixed seed keeps the draw reproducible per call.
        rng = np.random.default_rng(42)
        random_idx = rng.choice(remaining_idx, size=n_rand, replace=False)
    else:
        random_idx = remaining_idx

    sampled_idx = np.concatenate([selected, random_idx])
    return np.unique(sampled_idx)

# Subsample table for this k; reused when a previous run already wrote it.
pca_csv = pca_output_dir / f"subsample_pca_kmeans_k{k}.csv"

if not pca_csv.exists():
    print("Sampling from PCA clusters...")
    pca_subsample_indices = []

    for i in tqdm(range(k)):
        cluster_indices = np.flatnonzero(pca_labels == i)
        if len(cluster_indices) > 0:
            # Empty clusters contribute nothing; every other cluster is sampled
            # around its own centroid.
            cluster_embeddings = pca_results[cluster_indices]
            centroid = centroids_pca[i]
            sampled = sample_cluster(cluster_indices, cluster_embeddings, centroid)
            pca_subsample_indices += sampled.tolist()

    pca_subsample_indices = np.unique(pca_subsample_indices)
    print(f"Selected {len(pca_subsample_indices)} samples from PCA clustering")

    # Look the chosen manifest rows up once and reuse them for both path columns.
    chosen_rows = manifest_df.iloc[pca_subsample_indices]
    pca_subsample_df = pd.DataFrame({
        "audio_path": chosen_rows["audio_path"].values,
        "embedding_path": chosen_rows["embedding_path"].values,
        "reduced_embedding_filepath": str(pca_file),
        "reduced_embeddings_idx": pca_subsample_indices,
        "method": "pca_kmeans",
        "cluster": pca_labels[pca_subsample_indices],
    })
else:
    print("Loading pre-computed PCA sample indices...")
    pca_subsample_df = pd.read_csv(str(pca_csv))
    pca_subsample_indices = pca_subsample_df["reduced_embeddings_idx"].values
    print(f"Loaded {len(pca_subsample_indices)} PCA samples")

pca_subsample_df.head()

Sampling from PCA clusters...


100%|██████████| 25/25 [00:00<00:00, 73.06it/s]

Selected 2500 samples from PCA clustering





Unnamed: 0,audio_path,embedding_path,reduced_embedding_filepath,reduced_embeddings_idx,method,cluster
0,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,D:\Posidonia Soundscapes\Fondeo 1_Formentera I...,147,pca_kmeans,13
1,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,D:\Posidonia Soundscapes\Fondeo 1_Formentera I...,819,pca_kmeans,4
2,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,D:\Posidonia Soundscapes\Fondeo 1_Formentera I...,836,pca_kmeans,4
3,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,D:\Posidonia Soundscapes\Fondeo 1_Formentera I...,855,pca_kmeans,0
4,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,/mnt/d/Posidonia Soundscapes/Fondeo 1_Formente...,D:\Posidonia Soundscapes\Fondeo 1_Formentera I...,858,pca_kmeans,4


## 5. Save Results

Save PCA subsamples, labels, and manifest for downstream review.

In [6]:
# Make sure the output folder is present before anything is written into it.
pca_output_dir.mkdir(parents=True, exist_ok=True)

# All three outputs carry the cluster count in their file name.
pca_csv = pca_output_dir.joinpath(f"subsample_pca_kmeans_k{k}.csv")
pca_labels_out = pca_output_dir.joinpath(f"pca_kmeans_labels_k{k}.npy")
manifest_out = pca_output_dir.joinpath(f"manifest_pca_kmeans_k{k}.csv")

# Write order: subsample table, label array, then the manifest with the added PCA columns.
pca_subsample_df.to_csv(pca_csv, index=False)
np.save(pca_labels_out, pca_labels)
manifest_df.to_csv(manifest_out, index=False)

for saved_message in (
    f"✓ Saved PCA subsample: {pca_csv}",
    f"✓ Saved PCA labels: {pca_labels_out}",
    f"✓ Saved manifest with PCA columns: {manifest_out}",
):
    print(saved_message)

✓ Saved PCA subsample: D:\Posidonia Soundscapes\Fondeo 1_Formentera Ille Espardell\Embeddings_2\dataset\npy_files\PCA_256D\subsample_pca_kmeans_k25.csv
✓ Saved PCA labels: D:\Posidonia Soundscapes\Fondeo 1_Formentera Ille Espardell\Embeddings_2\dataset\npy_files\PCA_256D\pca_kmeans_labels_k25.npy
✓ Saved manifest with PCA columns: D:\Posidonia Soundscapes\Fondeo 1_Formentera Ille Espardell\Embeddings_2\dataset\npy_files\PCA_256D\manifest_pca_kmeans_k25.csv


## 6. Export PCA Audio Samples to Review Folder

Copy the sampled audio files to `Review/PCA_256D`: every file goes into `All`, and a second copy goes into `Clusters/<n>`, where the folder number `n` is the cluster label + 1.

In [7]:
import shutil

def convert_wsl_to_windows_path(wsl_path):
    r"""Convert a WSL drive-mount path (/mnt/d/...) to a Windows path (D:\...).

    Args:
        wsl_path: path as a string (or any object convertible with ``str``).
            A float, such as the NaN of a missing value, is returned unchanged.

    Returns:
        The Windows-style path when ``wsl_path`` is a single-letter drive mount
        (``/mnt/<letter>`` or ``/mnt/<letter>/...``). Any other path is returned
        with its backslashes normalised to forward slashes. A float input is
        returned as-is.
    """
    if isinstance(wsl_path, float):  # Handle NaN
        return wsl_path

    normalized = str(wsl_path).replace("\\", "/")
    prefix = "/mnt/"
    drive_pos = len(prefix)

    # Only "/mnt/<single ASCII letter>" followed by "/" or end-of-string is a
    # drive mount. Without this check "/mnt/data/x" was rewritten to "D:\ta\x"
    # and a bare "/mnt/" raised IndexError.
    is_drive_mount = (
        normalized.startswith(prefix)
        and len(normalized) > drive_pos
        and normalized[drive_pos] in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
        and (len(normalized) == drive_pos + 1 or normalized[drive_pos + 1] == "/")
    )
    if not is_drive_mount:
        return normalized

    drive = normalized[drive_pos].upper()  # Drive letter (d -> D)
    rest = normalized[drive_pos + 2:]  # Everything after "/mnt/d/"
    return f"{drive}:\\{rest}".replace("/", "\\")

# The review folder is written as a WSL path and translated to its Windows
# equivalent, because every file operation below goes through Windows paths.
REVIEW_DIR = Path("/mnt/d/Posidonia Soundscapes/Fondeo 1_Formentera Ille Espardell/Embeddings_2/diagnostics/Review")
REVIEW_DIR_WINDOWS = Path(convert_wsl_to_windows_path(str(REVIEW_DIR)))

PCA_DEST = REVIEW_DIR / "PCA_256D"
PCA_DEST_WINDOWS = REVIEW_DIR_WINDOWS / "PCA_256D"

# Use WINDOWS paths for all file operations
PCA_ALL = PCA_DEST_WINDOWS / "All"
PCA_CLUSTERS = PCA_DEST_WINDOWS / "Clusters"

PCA_ALL.mkdir(parents=True, exist_ok=True)
PCA_CLUSTERS.mkdir(parents=True, exist_ok=True)

print(f"PCA All destination (Windows): {PCA_ALL}")
print(f"PCA Clusters destination (Windows): {PCA_CLUSTERS}")

# Safety check: do nothing if destination already contains files
pca_has_files = any(p.is_file() for p in PCA_DEST_WINDOWS.rglob("*"))

if pca_has_files:
    print("\nDetected existing files in PCA destination folder.")
    print("Skipping copy step to avoid duplicating files.")
else:
    # The subsample is re-read from disk, so the save step must have run first.
    pca_csv = pca_output_dir / f"subsample_pca_kmeans_k{k}.csv"
    print(f"\nReading from: {pca_csv}")

    pca_df = pd.read_csv(str(pca_csv))
    print(f"Loaded {len(pca_df)} PCA samples")

    # Folder names are 1-based: cluster label c goes to folder c + 1.
    print(f"\nCreating {k} cluster folders for PCA...")
    for i in range(k):
        (PCA_CLUSTERS / str(i + 1)).mkdir(exist_ok=True)

    print("\n" + "=" * 60)
    print("Copying PCA audio files...")
    print("=" * 60)
    pca_copied = 0
    pca_failed = 0

    # Count iterations explicitly: the DataFrame index label is not guaranteed
    # to be a running row number.
    for row_number, (idx, row) in enumerate(pca_df.iterrows(), start=1):
        audio_path_wsl = row['audio_path']
        cluster_num = int(row['cluster']) + 1

        if pd.isna(audio_path_wsl):
            # The path converter passes NaN through unchanged and Path(nan)
            # raises TypeError, so a missing path is recorded as a failure
            # instead of aborting the whole export.
            print(f"  ✗ Missing audio_path in row {idx}")
            pca_failed += 1
        else:
            # Convert WSL path to Windows path
            audio_path_windows = convert_wsl_to_windows_path(audio_path_wsl)
            src = Path(audio_path_windows)

            if src.exists():
                dst_all = PCA_ALL / src.name
                dst_cluster = PCA_CLUSTERS / str(cluster_num) / src.name

                try:
                    shutil.copy2(src, dst_all)
                    shutil.copy2(src, dst_cluster)
                    pca_copied += 1
                except Exception as e:
                    # Best effort: report the file and carry on with the rest.
                    print(f"  ✗ Error copying {src.name}: {e}")
                    pca_failed += 1
            else:
                print(f"  ✗ Source not found: {src}")
                pca_failed += 1

        # Print progress every 100 files
        if row_number % 100 == 0:
            print(f"  Progress: {row_number}/{len(pca_df)}")

    print(f"\n✓ PCA: Copied {pca_copied} files, {pca_failed} failed")

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"PCA All folder ({PCA_ALL}): {len(list(PCA_ALL.glob('*.wav')))} files")

    print(f"\nPCA Clusters:")
    for i in range(k):
        cluster_folder = PCA_CLUSTERS / str(i + 1)
        file_count = len(list(cluster_folder.glob('*.wav')))
        print(f"  Cluster {i + 1}: {file_count} files")

    print(f"\nTotal copied: {pca_copied} files")
    print(f"Total failed: {pca_failed} files")

PCA All destination (Windows): D:\Posidonia Soundscapes\Fondeo 1_Formentera Ille Espardell\Embeddings_2\diagnostics\Review\PCA_256D\All
PCA Clusters destination (Windows): D:\Posidonia Soundscapes\Fondeo 1_Formentera Ille Espardell\Embeddings_2\diagnostics\Review\PCA_256D\Clusters

Reading from: D:\Posidonia Soundscapes\Fondeo 1_Formentera Ille Espardell\Embeddings_2\dataset\npy_files\PCA_256D\subsample_pca_kmeans_k25.csv
Loaded 2500 PCA samples

Creating 25 cluster folders for PCA...

Copying PCA audio files...
  Progress: 100/2500
  Progress: 200/2500
  Progress: 300/2500
  Progress: 400/2500
  Progress: 500/2500
  Progress: 600/2500
  Progress: 700/2500
  Progress: 800/2500
  Progress: 900/2500
  Progress: 1000/2500
  Progress: 1100/2500
  Progress: 1200/2500
  Progress: 1300/2500
  Progress: 1400/2500
  Progress: 1500/2500
  Progress: 1600/2500
  Progress: 1700/2500
  Progress: 1800/2500
  Progress: 1900/2500
  Progress: 2000/2500
  Progress: 2100/2500
  Progress: 2200/2500
  Progr