In [2]:
# V15_unsup recap --> using SSL for unsup img learning on unlabeled data

In [6]:
import os
import torch
import platform
import datetime
import psutil
import numpy as np
import matplotlib.pyplot as plt
import time
from PIL import Image
from tqdm import tqdm
from sklearn.cluster import MiniBatchKMeans
from sklearn.manifold import TSNE
try:
    import umap
    HAVE_UMAP = True
except ImportError:
    HAVE_UMAP = False

from transformers import AutoImageProcessor, AutoModel

In [7]:
# ========== CONFIG SECTION (Version-controlled) ==========
CFG_VERSION = "V15_unsup_2025_06_03"
IMAGE_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/temp_autogluon_images"
MODEL_NAME = "facebook/dinov2-small"
DATASET_NAME = os.path.basename(os.path.normpath(IMAGE_DIR))
DATESTAMP = datetime.datetime.now().strftime("%Y%m%d")

N_CLUSTERS = 20               # Use 8, 12, 20+ for more granular grouping
KL_OOD_THRESHOLD = 0.55
EUCLIDEAN_OUTLIER_PERCENTILE = 98
BATCH_SIZE = 32
VALID_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp', '.jfif')
T_SNE_SUBSAMPLE = 5000        # Max points for t-SNE/UMAP for speed
CLUSTER_N_INIT = 10           # Increase for final clustering stability

# Auto-versioned output filenames
OUT_PREFIX = f"{CFG_VERSION}_{DATASET_NAME}_clusters{N_CLUSTERS}_kl{KL_OOD_THRESHOLD}_eu{EUCLIDEAN_OUTLIER_PERCENTILE}_{DATESTAMP}"

In [8]:
# ========== HARDWARE/ENV LOG ==========
print("===== SYSTEM INFO =====")
print("Python:", platform.python_version())
print("Platform:", platform.platform())
print("CPU:", platform.processor())
print("RAM (GB):", round(psutil.virtual_memory().total / 1e9, 2))
print("=======================")
print(f"üñ•Ô∏è Using CPU only for DINOv2 embedding extraction (Mac MPS/upsample_bicubic2d error workaround)")

===== SYSTEM INFO =====
Python: 3.10.16
Platform: macOS-15.5-arm64-arm-64bit
CPU: arm
RAM (GB): 17.18
üñ•Ô∏è Using CPU only for DINOv2 embedding extraction (Mac MPS/upsample_bicubic2d error workaround)


In [9]:
# ========== 1. LOAD SSL MODEL ==========
print(f"üîÑ Loading backbone: {MODEL_NAME}")
model = AutoModel.from_pretrained(MODEL_NAME).to("cpu").eval()
processor = AutoImageProcessor.from_pretrained(MODEL_NAME)

üîÑ Loading backbone: facebook/dinov2-small


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [10]:
# ========== 2. IMAGE PATHS ==========
image_paths = [
    os.path.join(IMAGE_DIR, f)
    for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith(VALID_EXTENSIONS)
]
print(f"üñºÔ∏è Found {len(image_paths)} images.")

üñºÔ∏è Found 31002 images.


In [11]:
#  ========== 3. EMBEDDING EXTRACTION (skipped if npy exists) ==========
EMBED_FILE = f"{OUT_PREFIX}_embeddings.npy"
PATHS_FILE = f"{OUT_PREFIX}_paths.npy"
if os.path.exists(EMBED_FILE) and os.path.exists(PATHS_FILE):
    all_embeddings = np.load(EMBED_FILE)
    all_files = np.load(PATHS_FILE).tolist()
    print(f"‚úÖ Loaded embeddings: {all_embeddings.shape}")
    print(f"‚úÖ Loaded image paths: {len(all_files)}")
else:
    all_embeddings = []
    all_files = []
    for i in tqdm(range(0, len(image_paths), BATCH_SIZE)):
        batch_paths = image_paths[i:i+BATCH_SIZE]
        images = []
        for path in batch_paths:
            try:
                img = Image.open(path).convert("RGB")
                images.append(img)
                all_files.append(path)
            except Exception as e:
                print(f"Skip: {path} ({e})")
        if not images:
            continue
        inputs = processor(images=images, return_tensors="pt")
        with torch.no_grad():
            features = model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
        all_embeddings.append(features)
    all_embeddings = np.vstack(all_embeddings)
    assert all_embeddings.shape[0] == len(all_files), "Some images were skipped unexpectedly."
    print("‚úÖ Extracted embeddings shape:", all_embeddings.shape)
    np.save(EMBED_FILE, all_embeddings)
    np.save(PATHS_FILE, np.array(all_files))
    print(f"üíæ Saved embeddings and image paths to disk as {EMBED_FILE}, {PATHS_FILE}")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 969/969 [17:39<00:00,  1.09s/it]

‚úÖ Extracted embeddings shape: (31002, 384)
üíæ Saved embeddings and image paths to disk as V15_unsup_2025_06_03_temp_autogluon_images_clusters20_kl0.55_eu98_20250603_embeddings.npy, V15_unsup_2025_06_03_temp_autogluon_images_clusters20_kl0.55_eu98_20250603_paths.npy





In [12]:
# ========== 4. MINI-BATCH KMEANS CLUSTERING ==========
print("‚è≥ Starting MiniBatchKMeans clustering...")
start = time.perf_counter()

kmeans = MiniBatchKMeans(
    n_clusters=N_CLUSTERS,
    batch_size=1024,
    max_iter=100,
    n_init=CLUSTER_N_INIT,
    init="random",
    random_state=42,
    verbose=1
)

cluster_labels = kmeans.fit_predict(all_embeddings)

end = time.perf_counter()
print(f"‚úÖ MiniBatchKMeans done in {end - start:.2f} sec")
np.save(f"{OUT_PREFIX}_cluster_labels.npy", cluster_labels)

‚è≥ Starting MiniBatchKMeans clustering...
Init 1/10 with method random
Inertia for init 1/10: 1589431.125
Init 2/10 with method random
Inertia for init 2/10: 1593941.5
Init 3/10 with method random
Inertia for init 3/10: 1618250.625
Init 4/10 with method random
Inertia for init 4/10: 1634503.0
Init 5/10 with method random
Inertia for init 5/10: 1615002.625
Init 6/10 with method random
Inertia for init 6/10: 1591923.5
Init 7/10 with method random
Inertia for init 7/10: 1629631.25
Init 8/10 with method random
Inertia for init 8/10: 1616996.5
Init 9/10 with method random
Inertia for init 9/10: 1527215.875
Init 10/10 with method random
Inertia for init 10/10: 1556265.75
Minibatch step 1/3027: mean batch inertia: 500.27264404296875
Minibatch step 2/3027: mean batch inertia: 321.0315856933594, ewa inertia: 321.0315856933594
Minibatch step 3/3027: mean batch inertia: 320.23876953125, ewa inertia: 320.9792137454834
Minibatch step 4/3027: mean batch inertia: 309.8197326660156, ewa inertia: 320.

In [13]:
# ========== 5. CLUSTER VISUALIZATION (t-SNE/UMAP, subsampled) ==========
def visualize_embeddings(embeddings, cluster_labels, method="tsne", out_path="embedding_grid.png", subsample=T_SNE_SUBSAMPLE):
    n_points = embeddings.shape[0]
    if n_points > subsample:
        idx = np.random.choice(n_points, subsample, replace=False)
        emb = embeddings[idx]
        lbl = np.array(cluster_labels)[idx]
    else:
        emb = embeddings
        lbl = np.array(cluster_labels)
    if method == "tsne":
        reducer = TSNE(n_components=2, perplexity=50, random_state=42)
    elif method == "umap" and HAVE_UMAP:
        reducer = umap.UMAP(n_components=2, random_state=42)
    else:
        print("Unknown or unavailable method; using t-SNE.")
        reducer = TSNE(n_components=2, perplexity=50, random_state=42)
    coords = reducer.fit_transform(emb)
    plt.figure(figsize=(10, 8))
    for k in range(N_CLUSTERS):
        mask = (lbl == k)
        plt.scatter(coords[mask, 0], coords[mask, 1], s=7, alpha=0.7, label=f"C{k}")
    plt.legend()
    plt.title(f"{MODEL_NAME} SSL {method.upper()} by cluster")
    plt.savefig(out_path)
    print(f"üñºÔ∏è Saved {method.upper()} visualization to {out_path}")
    plt.close()

visualize_embeddings(
    all_embeddings, cluster_labels,
    method="tsne",
    out_path=f"{OUT_PREFIX}_tsne.png"
)

üñºÔ∏è Saved TSNE visualization to V15_unsup_2025_06_03_temp_autogluon_images_clusters20_kl0.55_eu98_20250603_tsne.png


In [14]:
# ========== 6. EUCLIDEAN OUTLIER DETECTION ==========
distances = np.linalg.norm(all_embeddings - kmeans.cluster_centers_[cluster_labels], axis=1)
distance_threshold = np.percentile(distances, EUCLIDEAN_OUTLIER_PERCENTILE)
euclid_outlier_indices = np.where(distances > distance_threshold)[0]
print(f"Flagged {len(euclid_outlier_indices)} outlier images by Euclidean distance.")

Flagged 621 outlier images by Euclidean distance.


In [15]:
# ========== 7. KL-DIVERGENCE OOD DETECTION ==========
def compute_kl_divergence(feats, cluster_centers, labels):
    kl_scores = []
    for f, lbl in zip(feats, labels):
        p = torch.softmax(torch.tensor(f), dim=0)
        q = torch.softmax(torch.tensor(cluster_centers[lbl]), dim=0)
        kl = torch.sum(p * torch.log((p + 1e-8) / (q + 1e-8))).item()
        kl_scores.append(kl)
    return kl_scores

kl_scores = compute_kl_divergence(all_embeddings, kmeans.cluster_centers_, cluster_labels)
kl_outlier_indices = np.where(np.array(kl_scores) > KL_OOD_THRESHOLD)[0]
print(f"Flagged {len(kl_outlier_indices)} OOD images by KL-divergence.")

Flagged 3464 OOD images by KL-divergence.


In [16]:
# ========== 8. SAVE LABELS/OUTLIERS ==========
out_csv = os.path.join(IMAGE_DIR, f"{OUT_PREFIX}_ssl_pseudolabels.csv")
with open(out_csv, "w") as f:
    f.write("path,cluster,euclid_dist,kl,euclid_ood,kl_ood\n")
    for idx, (p, c, dist, kl) in enumerate(zip(all_files, cluster_labels, distances, kl_scores)):
        euclid_flag = int(idx in euclid_outlier_indices)
        kl_flag = int(idx in kl_outlier_indices)
        f.write(f"{p},{c},{dist:.3f},{kl:.3f},{euclid_flag},{kl_flag}\n")
print(f"‚úÖ Saved pseudo-labels and OOD flags to {out_csv}")

‚úÖ Saved pseudo-labels and OOD flags to /Users/natalyagrokh/AI/ml_expressions/img_datasets/temp_autogluon_images/V15_unsup_2025_06_03_temp_autogluon_images_clusters20_kl0.55_eu98_20250603_ssl_pseudolabels.csv


In [17]:
# ========== 9. PER-CLUSTER REPORT ==========
for c in range(N_CLUSTERS):
    cluster_indices = np.where(cluster_labels == c)[0]
    print(f"Cluster {c}: {len(cluster_indices)} samples")
    # Optionally: Save or inspect sample images per cluster

Cluster 0: 731 samples
Cluster 1: 1367 samples
Cluster 2: 1385 samples
Cluster 3: 2188 samples
Cluster 4: 1659 samples
Cluster 5: 1381 samples
Cluster 6: 1677 samples
Cluster 7: 1770 samples
Cluster 8: 1816 samples
Cluster 9: 1604 samples
Cluster 10: 1460 samples
Cluster 11: 1095 samples
Cluster 12: 1736 samples
Cluster 13: 1071 samples
Cluster 14: 2415 samples
Cluster 15: 1834 samples
Cluster 16: 1055 samples
Cluster 17: 1473 samples
Cluster 18: 1263 samples
Cluster 19: 2022 samples


In [18]:
# ========== 10. SAVE OOD/JUNK FILES ==========
OOD_DIR = os.path.join(IMAGE_DIR, f"{OUT_PREFIX}_ood_flagged")
os.makedirs(OOD_DIR, exist_ok=True)
for idx in set(list(euclid_outlier_indices) + list(kl_outlier_indices)):
    src = all_files[idx]
    try:
        fname = os.path.basename(src)
        os.symlink(src, os.path.join(OOD_DIR, fname))
    except FileExistsError:
        continue

print(f"‚úÖ OOD (junk) images symlinked to {OOD_DIR}")

‚úÖ OOD (junk) images symlinked to /Users/natalyagrokh/AI/ml_expressions/img_datasets/temp_autogluon_images/V15_unsup_2025_06_03_temp_autogluon_images_clusters20_kl0.55_eu98_20250603_ood_flagged


In [19]:
# ========== 11. CLUSTER EXPLORER & CENTROID FACES ==========
N_PER_CENTROID = 40   # Or adjust to your preferred # of centroids per cluster
CLUSTER_BROWSER_ROOT = f"{OUT_PREFIX}_cluster_browser"

# Compute centroids for each cluster
centroids = np.vstack([
    all_embeddings[cluster_labels == c].mean(axis=0)
    for c in range(N_CLUSTERS)
])

for c in range(N_CLUSTERS):
    cdir = os.path.join(CLUSTER_BROWSER_ROOT, f"cluster_{c:02d}")
    centroids_dir = os.path.join(cdir, "centroid_faces")
    os.makedirs(cdir, exist_ok=True)
    os.makedirs(centroids_dir, exist_ok=True)
    indices = np.where(cluster_labels == c)[0]
    if len(indices) == 0:
        continue
    # 1. Symlink/copy all cluster images
    for idx in indices:
        src = all_files[idx]
        dst = os.path.join(cdir, os.path.basename(src))
        try:
            os.symlink(src, dst)
        except FileExistsError:
            continue
        except OSError:
            shutil.copy(src, dst)
    # 2. Find and symlink/copy centroid images
    c_embeds = all_embeddings[indices]
    dists = np.linalg.norm(c_embeds - centroids[c], axis=1)
    sorted_idx = np.argsort(dists)
    chosen = indices[sorted_idx[:N_PER_CENTROID]]
    for idx in chosen:
        src = all_files[idx]
        dst = os.path.join(centroids_dir, os.path.basename(src))
        try:
            os.symlink(src, dst)
        except FileExistsError:
            continue
        except OSError:
            shutil.copy(src, dst)
    print(f"Cluster {c:02d}: {len(indices)} total, {len(chosen)} centroid faces exported")

print(f"‚úÖ Cluster browser and centroid faces exported to {CLUSTER_BROWSER_ROOT}/cluster_##/ and centroid_faces/")

Cluster 00: 731 total, 40 centroid faces exported
Cluster 01: 1367 total, 40 centroid faces exported
Cluster 02: 1385 total, 40 centroid faces exported
Cluster 03: 2188 total, 40 centroid faces exported
Cluster 04: 1659 total, 40 centroid faces exported
Cluster 05: 1381 total, 40 centroid faces exported
Cluster 06: 1677 total, 40 centroid faces exported
Cluster 07: 1770 total, 40 centroid faces exported
Cluster 08: 1816 total, 40 centroid faces exported
Cluster 09: 1604 total, 40 centroid faces exported
Cluster 10: 1460 total, 40 centroid faces exported
Cluster 11: 1095 total, 40 centroid faces exported
Cluster 12: 1736 total, 40 centroid faces exported
Cluster 13: 1071 total, 40 centroid faces exported
Cluster 14: 2415 total, 40 centroid faces exported
Cluster 15: 1834 total, 40 centroid faces exported
Cluster 16: 1055 total, 40 centroid faces exported
Cluster 17: 1473 total, 40 centroid faces exported
Cluster 18: 1263 total, 40 centroid faces exported
Cluster 19: 2022 total, 40 centr