# DINOv3 Whale Individuals Experiments
Minimal notebook to load local images, compute DINOv3 embeddings + foreground masks, cluster by pose/individual, show examples with overlays, and export grouped images.


## 1. Install (if needed)
Uncomment to install dependencies.


In [1]:
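# Use %pip (not !pip) so packages are installed into the environment of the running kernel.
# %pip install --upgrade pip
# %pip install fiftyone jupyter git+https://github.com/huggingface/transformers huggingface_hub torch pillow scikit-learn numpy matplotlib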


## 2. Imports and dataset load from /Data
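Recursively ingests the JPEG images found under the local `Data` directory (`data_root` in the cell below) into a FiftyOne dataset named `whale-local-dinov3-exp`. Any existing dataset with that name is deleted first.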


In [None]:
import fiftyone as fo
from pathlib import Path

# Root folder that is scanned recursively for whale photos.
# NOTE(review): machine-specific absolute path — point it at your own copy of the data.
data_root = Path('/home/trudes/Projects/Dinov3-WhaleID/Data')

# Match JPEGs regardless of extension case or spelling (.jpg/.JPG/.jpeg/.JPEG).
# A plain rglob('*.jpg') is case-sensitive on Linux and silently skips files
# that cameras commonly name '*.JPG'.
jpeg_suffixes = {'.jpg', '.jpeg'}
image_paths = sorted(
    str(p)
    for p in data_root.rglob('*')
    if p.is_file() and p.suffix.lower() in jpeg_suffixes
)
print(f'Found {len(image_paths)} images under {data_root}')
if not image_paths:
    # Every later cell degrades to a no-op on an empty dataset, so say so up front.
    print(f'WARNING: no JPEG images found; check that {data_root} exists and contains images.')

# Start from a clean dataset so re-running this cell never accumulates duplicates.
if fo.dataset_exists('whale-local-dinov3-exp'):
    fo.delete_dataset('whale-local-dinov3-exp')
dataset = fo.Dataset.from_images(image_paths, name='whale-local-dinov3-exp')
dataset.persistent = True  # keep the dataset in FiftyOne's database after the kernel exits
dataset


Found 0 images under /Data
 100% |█████████████████████| 0/0 [7.7ms elapsed, ? remaining, ? samples/s]  


Name:        whale-local-dinov3-exp
Media type:  None
Num samples: 0
Persistent:  True
Tags:        []
Sample fields:
    id:               fiftyone.core.fields.ObjectIdField
    filepath:         fiftyone.core.fields.StringField
    tags:             fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:         fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.Metadata)
    created_at:       fiftyone.core.fields.DateTimeField
    last_modified_at: fiftyone.core.fields.DateTimeField

## 3. Build DINOv3 model wrapper


In [None]:
import transformers
import fiftyone.utils.transformers as fouhft

# Load the DINOv3 checkpoint once, then wrap it so FiftyOne can drive it.
dinov3_checkpoint = 'facebook/dinov3-vitl16-pretrain-lvd1689m'
transformers_model = transformers.AutoModel.from_pretrained(dinov3_checkpoint)

# The wrapper receives the already-instantiated model together with its hub id.
wrapper_settings = {
    'model': transformers_model,
    'name_or_path': dinov3_checkpoint,
}
model_config = fouhft.FiftyOneTransformerConfig(wrapper_settings)
model = fouhft.FiftyOneTransformer(model_config)


Loading weights:   0%|          | 0/415 [00:00<?, ?it/s]

## 4. Compute embeddings


In [None]:
# Run the wrapped model over the dataset and store the result per sample
# in the 'embeddings_dinov3' field.
dataset.compute_embeddings(model, embeddings_field='embeddings_dinov3')


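## 5. PCA/CLS foreground masks + heatmaps
Note: despite the `pca_fg` field names, the cell below does not compute a PCA. It scores every patch token by its cosine similarity to the CLS token, min-max normalizes and smooths that score into a heatmap, and thresholds it into a binary foreground mask.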


In [None]:
import numpy as np
from PIL import Image, ImageOps
import torch
import torch.nn.functional as F
from transformers import AutoImageProcessor, AutoModel
from fiftyone import Segmentation, Heatmap

def build_pca_fg_masks(
    dataset: fo.Dataset,
    model_id: str = 'facebook/dinov3-vits16-pretrain-lvd1689m',
    field: str = 'pca_fg',
    heatmap_field: str | None = 'pca_fg_heat',
    thresh: float = 0.5,
    smooth_k: int = 3,
    device: str | None = None,
):
    """Write a binary foreground mask (and optionally a soft heatmap) to every sample.

    Despite the ``pca`` in the name, no PCA is computed. For token-sequence
    outputs, each patch token is scored by its cosine similarity to the CLS
    token; for feature-map outputs, each location is scored against the spatial
    mean feature. Scores are min-max normalized to [0, 1], optionally smoothed
    with average pooling, thresholded, and resized to the (EXIF-transposed)
    image size.

    Args:
        dataset: FiftyOne dataset whose samples are updated in place.
        model_id: Hugging Face id passed to ``AutoImageProcessor`` / ``AutoModel``.
        field: sample field that receives the ``Segmentation`` (0 = background,
            1 = foreground). Declared on the dataset if missing.
        heatmap_field: sample field that receives the soft ``Heatmap``; pass
            ``None`` to skip heatmaps.
        thresh: cutoff applied to the normalized score to obtain the mask.
        smooth_k: average-pooling kernel size; values <= 1 disable smoothing.
        device: torch device string; defaults to CUDA when available, else CPU.

    Returns:
        None. Prints a summary including the number of skipped samples and the
        first few failure reasons.
    """
    # Bind torch.nn.functional to a private local name. Later notebook cells
    # rebind the global ``F`` to ``fiftyone.ViewField``; relying on the global
    # would make every sample fail (and be silently skipped) when this function
    # is re-run after those cells.
    import torch.nn.functional as nnf

    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    processor = AutoImageProcessor.from_pretrained(model_id)
    m = AutoModel.from_pretrained(model_id).to(device).eval()

    # Declare the output fields and register the mask's class names.
    if not dataset.has_field(field):
        dataset.add_sample_field(field, fo.EmbeddedDocumentField, embedded_doc_type=fo.Segmentation)
    if heatmap_field and not dataset.has_field(heatmap_field):
        dataset.add_sample_field(heatmap_field, fo.EmbeddedDocumentField, embedded_doc_type=fo.Heatmap)
    mt = dict(dataset.mask_targets or {})
    mt[field] = {0: 'background', 1: 'foreground'}
    dataset.mask_targets = mt
    dataset.save()

    @torch.inference_mode()
    def _fg_mask(path: str):
        """Return ``(mask, soft)`` for one image, both at the transposed image size."""
        # Close the file handle promptly; convert() yields an independent in-memory copy.
        with Image.open(path) as raw:
            img = ImageOps.exif_transpose(raw.convert('RGB'))
        W0, H0 = img.size
        bf = processor(images=img, return_tensors='pt').to(device)
        last = m(**bf).last_hidden_state
        if last.ndim == 3:
            # Token sequence: index 0 is CLS, followed by register tokens, then patches.
            hs = last[0].float()
            num_reg = getattr(m.config, 'num_register_tokens', 0)
            patch = getattr(m.config, 'patch_size', 16)
            patches = hs[1 + num_reg :, :]
            _, _, Hc, Wc = bf['pixel_values'].shape
            gh, gw = Hc // patch, Wc // patch
            cls = hs[0:1, :]
            # Cosine similarity of each patch token to the CLS token.
            sims = (nnf.normalize(patches, dim=1) @ nnf.normalize(cls, dim=1).T).squeeze(1)
            fg = sims.detach().cpu().view(gh, gw)
        else:
            # Feature map (C, gh, gw): compare each location to the spatial mean feature.
            fm = last[0].float()
            C, gh, gw = fm.shape
            grid = nnf.normalize(fm.permute(1, 2, 0).reshape(-1, C), dim=1)
            gvec = nnf.normalize(fm.mean(dim=(1, 2), keepdim=True).squeeze().unsqueeze(0), dim=1)
            fg = (grid @ gvec.T).detach().cpu().reshape(gh, gw)
        # Min-max normalize; the epsilon guards against a constant score map.
        fg01 = (fg - fg.min()) / (fg.max() - fg.min() + 1e-8)
        if smooth_k and smooth_k > 1:
            fg01 = nnf.avg_pool2d(fg01.unsqueeze(0).unsqueeze(0), smooth_k, 1, smooth_k // 2).squeeze()
        mask_small = (fg01 > thresh).to(torch.uint8).numpy()
        # Upsample to the image size: nearest for the hard mask, bilinear for the soft map.
        mask_full = Image.fromarray(mask_small * 255).resize((W0, H0), Image.NEAREST)
        soft_full = Image.fromarray((fg01.numpy() * 255).astype(np.uint8)).resize((W0, H0), Image.BILINEAR)
        mask = (np.array(mask_full) > 127).astype(np.uint8)
        soft = np.array(soft_full).astype(np.float32) / 255.0
        return mask, soft

    skipped = 0
    failure_notes = []  # first few failure reasons, so skips are never completely silent
    for s in dataset.iter_samples(autosave=True, progress=True):
        try:
            msk, soft = _fg_mask(s.filepath)
            s[field] = fo.Segmentation(mask=msk)
            if heatmap_field:
                s[heatmap_field] = fo.Heatmap(map=soft)
        except Exception as exc:
            # Best effort: one unreadable image must not abort the whole run.
            s[field] = None
            if heatmap_field:
                s[heatmap_field] = None
            skipped += 1
            if len(failure_notes) < 5:
                failure_notes.append(f'{s.filepath}: {type(exc).__name__}: {exc}')
    msg = "wrote masks to '{field}'".format(field=field)
    if heatmap_field:
        msg += " and heatmaps to '{hf}'".format(hf=heatmap_field)
    msg += f". skipped: {skipped}"
    print(msg)
    for note in failure_notes:
        print(f'  skipped {note}')

# Run with all defaults: masks go to the 'pca_fg' field, heatmaps to 'pca_fg_heat'.
build_pca_fg_masks(dataset)


## 6. Foreground-only CLS embeddings


In [None]:
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

# Image processor for the 'facebook/dinov3-vitl16-pretrain-lvd1689m' checkpoint;
# used below to preprocess the background-zeroed images.
fg_processor = AutoImageProcessor.from_pretrained('facebook/dinov3-vitl16-pretrain-lvd1689m')

def masked_cls_embedding(path, mask, model, processor, device=None):
    """Return the CLS embedding of an image whose background pixels are zeroed.

    Args:
        path: path of the image file to embed.
        mask: 2D array-like foreground mask (non-zero = foreground), or ``None``.
            It is resized with nearest-neighbor interpolation when its shape
            differs from the image's ``(height, width)``.
        model: model whose output exposes ``last_hidden_state``; it is moved to
            ``device`` before the forward pass.
        processor: image processor producing the model inputs.
        device: torch device string; defaults to CUDA when available, else CPU.

    Returns:
        A NumPy array holding token 0 (CLS) of ``last_hidden_state`` for the
        masked image, or ``None`` when ``mask`` is ``None``.
    """
    if mask is None:
        return None
    # Local import keeps the function usable even if only this cell's imports ran.
    from PIL import ImageOps

    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # The masks were computed on the EXIF-transposed image, so apply the same
    # orientation here; otherwise rotated photos get a misaligned mask.
    with Image.open(path) as raw:
        img = ImageOps.exif_transpose(raw.convert('RGB'))
    # Force a boolean mask. With a uint8 0/1 mask, ``~mask`` evaluates to 254/255
    # and ``arr[~mask]`` becomes integer row indexing instead of pixel masking,
    # which leaves the background untouched.
    fg = np.asarray(mask).astype(bool)
    if fg.shape != (img.height, img.width):
        resized = Image.fromarray(fg.astype(np.uint8) * 255).resize((img.width, img.height), Image.NEAREST)
        fg = np.array(resized) > 127
    arr = np.array(img)
    arr[~fg] = 0  # black out every background pixel across all channels
    inputs = processor(images=Image.fromarray(arr), return_tensors='pt').to(device)
    with torch.no_grad():
        out = model(**inputs).last_hidden_state
    cls = out[0, 0].detach().cpu().numpy()
    return cls

# Pick the accelerator once and park the backbone on it before looping.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
transformers_model = transformers_model.to(device)

# Store one foreground-only CLS vector per sample in 'embeddings_dinov3_fg'.
for sample in dataset.iter_samples(autosave=True, progress=True):
    seg = None
    if sample.has_field('pca_fg'):
        seg = sample['pca_fg']
    mask = seg.mask if seg is not None else None
    sample['embeddings_dinov3_fg'] = masked_cls_embedding(
        sample.filepath,
        mask,
        transformers_model,
        fg_processor,
        device=device,
    )


## 7. Cluster poses and individuals (KMeans on foreground embeddings)


In [None]:
import numpy as np
from sklearn.cluster import KMeans
from fiftyone import ViewField as F

# Pose clustering
pose_k = 3
embs = dataset.values('embeddings_dinov3_fg')
ids = dataset.values('id')

# Pair every sample id with its embedding and drop samples that have none.
embedded = [(sid, vec) for sid, vec in zip(ids, embs) if vec is not None]

if len(embedded) < 2:
    print('Not enough embeddings for pose clustering.')
else:
    embedded_ids = [sid for sid, _ in embedded]
    features = np.stack([vec for _, vec in embedded], axis=0)
    n_pose_clusters = min(pose_k, max(2, len(embedded_ids)))
    pose_kmeans = KMeans(n_clusters=n_pose_clusters, random_state=51, n_init='auto').fit(features)
    # Persist each label on its sample.
    for sid, lbl in zip(embedded_ids, pose_kmeans.labels_.tolist()):
        sample = dataset[sid]
        sample['pose_cluster'] = int(lbl)
        sample.save()
    print('Pose clusters sizes:', dict(zip(*np.unique(pose_kmeans.labels_, return_counts=True))))

# Individual clustering per pose
# KMeans is run separately inside every pose cluster. Labels are offset so that
# 'individual_cluster' ids are unique across poses; if every pose restarted at 0,
# a filter such as `individual_cluster == 0` would merge unrelated clusters
# from different poses.
individual_k = 5
pose_vals = dataset.values('pose_cluster') if dataset.has_field('pose_cluster') else []
if not pose_vals:
    print('No pose_cluster labels available; run pose clustering first.')
else:
    next_individual_id = 0  # first free global label for the next pose
    for pose_id in sorted(set(p for p in pose_vals if p is not None)):
        # Filter ids and embeddings with one shared condition so they stay aligned.
        members = [
            (sid, emb)
            for sid, emb, p in zip(ids, embs, pose_vals)
            if p == pose_id and emb is not None
        ]
        if len(members) < 2:
            # Too few samples to cluster; these samples get no individual label.
            continue
        k = min(individual_k, len(members))
        kmeans = KMeans(n_clusters=k, random_state=pose_id, n_init='auto').fit(
            np.stack([emb for _, emb in members])
        )
        for (sid, _), lbl in zip(members, kmeans.labels_.tolist()):
            sample = dataset[sid]
            sample['individual_cluster'] = next_individual_id + int(lbl)
            sample.save()
        next_individual_id += k
    print('Done assigning individual_cluster labels per pose.')


## 8. Visualize pose examples (mask overlay)


In [None]:
import matplotlib.pyplot as plt
from PIL import Image
from fiftyone import ViewField as F

# ImageOps is needed to display images in the same (EXIF-corrected) orientation
# in which the foreground masks were computed.
from PIL import ImageOps

max_poses = 3       # number of pose clusters to display
max_per_pose = 5    # number of example images per pose cluster
pose_vals = dataset.values('pose_cluster') if dataset.has_field('pose_cluster') else []
if not pose_vals:
    print('No pose_cluster field; run clustering first.')
else:
    pose_ids = sorted(set(p for p in pose_vals if p is not None))[:max_poses]
    for pid in pose_ids:
        view = dataset.match(F('pose_cluster') == pid)
        # Seeded shuffle gives a reproducible random pick per pose.
        samples = list(view.shuffle(seed=pid).limit(max_per_pose))
        print(f'Pose {pid}: showing {len(samples)} of {view.count()}')
        if not samples:
            continue
        # squeeze=False always returns a 2D axes array, even for a single image.
        fig, axes = plt.subplots(1, len(samples), figsize=(4 * len(samples), 4), squeeze=False)
        for ax, sample in zip(axes[0], samples):
            # Close the file promptly and apply the EXIF orientation so the
            # overlay lines up with the image for rotated photos.
            with Image.open(sample.filepath) as raw:
                img = ImageOps.exif_transpose(raw.convert('RGB'))
            ax.imshow(img)
            mask = None
            if sample.has_field('pca_fg') and sample['pca_fg'] is not None:
                mask = sample['pca_fg'].mask
            if mask is not None:
                ax.imshow(mask, cmap='inferno', alpha=0.35)
            ax.set_xticks([]); ax.set_yticks([])
        plt.show()
        plt.close(fig)  # release the figure so repeated runs do not accumulate memory


## 9. Visualize individual examples (heatmap overlay)


In [None]:
import matplotlib.pyplot as plt
from PIL import Image
from fiftyone import ViewField as F
import numpy as np

# ImageOps is needed to display images in the same (EXIF-corrected) orientation
# in which the heatmaps were computed.
from PIL import ImageOps

max_individuals = 5       # number of individual clusters to display
max_per_individual = 5    # number of example images per individual cluster
if not dataset.has_field('individual_cluster'):
    print('No individual_cluster field; run individual clustering first.')
else:
    clusters = sorted(set(c for c in dataset.values('individual_cluster') if c is not None))[:max_individuals]
    for cid in clusters:
        view = dataset.match(F('individual_cluster') == cid)
        # Seeded shuffle gives a reproducible random pick per cluster.
        samples = list(view.shuffle(seed=cid).limit(max_per_individual))
        print(f'individual_{cid}: showing {len(samples)} of {view.count()}')
        if not samples:
            continue
        # squeeze=False always returns a 2D axes array, even for a single image.
        fig, axes = plt.subplots(1, len(samples), figsize=(4 * len(samples), 4), squeeze=False)
        for ax, sample in zip(axes[0], samples):
            # Close the file promptly and apply the EXIF orientation so the
            # heatmap lines up with the image for rotated photos.
            with Image.open(sample.filepath) as raw:
                img = ImageOps.exif_transpose(raw.convert('RGB'))
            ax.imshow(img)
            heat = None
            if sample.has_field('pca_fg_heat') and sample['pca_fg_heat'] is not None:
                heat = sample['pca_fg_heat'].map
            if heat is not None:
                ax.imshow(heat, cmap='inferno', alpha=0.35)
            ax.set_xticks([]); ax.set_yticks([])
        plt.show()
        plt.close(fig)  # release the figure so repeated runs do not accumulate memory


## 10. Export clustered images to runs/run_name


In [None]:
from pathlib import Path
import shutil
from fiftyone import ViewField as F

run_name = 'run1'  # change per run
run_dir = Path('runs') / run_name
pose_dir = run_dir / 'pose_clusters'
indiv_dir = run_dir / 'individuals'
pose_dir.mkdir(parents=True, exist_ok=True)
indiv_dir.mkdir(parents=True, exist_ok=True)


def _export_clusters(collection, field, out_dir, prefix, label):
    """Copy every labelled image into ``out_dir/<prefix>_<cluster id>/``.

    Args:
        collection: FiftyOne dataset (or view) to export from.
        field: integer cluster field to group by, e.g. ``'pose_cluster'``.
        out_dir: ``Path`` under which one sub-folder per cluster is created.
        prefix: sub-folder name prefix.
        label: human-readable cluster kind used in the printed messages.

    Files are named ``<sample id>_<original file name>`` so identically named
    source files cannot collide. Existing destination files are left untouched.
    """
    if not collection.has_field(field):
        print(f'No {field} field; run {label} clustering first.')
        return
    cluster_ids = sorted(set(c for c in collection.values(field) if c is not None))
    for cid in cluster_ids:
        cluster_dir = out_dir / f'{prefix}_{cid}'
        cluster_dir.mkdir(parents=True, exist_ok=True)
        view = collection.match(F(field) == cid)
        # Fetch only the two fields that are needed; iterating full samples
        # would also load every stored mask and heatmap.
        sample_ids, filepaths = view.values(['id', 'filepath'])
        for sid, src in zip(sample_ids, filepaths):
            dest = cluster_dir / f"{sid}_{Path(src).name}"
            if not dest.exists():
                shutil.copy2(src, dest)
    print(f'Saved {label} clusters to {out_dir}')


_export_clusters(dataset, 'pose_cluster', pose_dir, 'pose', 'pose')
_export_clusters(dataset, 'individual_cluster', indiv_dir, 'individual', 'individual')
