In [8]:
import os
import shutil
import numpy as np
from deepface import DeepFace
from sklearn.cluster import DBSCAN
import cv2 # Still useful for reading images if needed, but DeepFace handles its own loading





In [None]:
import paths

model_name = 'Facenet512' # You can also try 'VGG-Face', 'OpenFace', 'DeepFace', etc.

# --- Configuration ---
INPUT_DIR = paths.INPUT_DIR # Folder containing the original photos
last_folder_name = os.path.basename(INPUT_DIR)
OUTPUT_DIR = f"results/{last_folder_name}_{model_name}" # Folder to save the final results

# Ensure directories exist
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# List of valid image extensions
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.webp', '.bmp')


| Model Name | Embedding Length (Dimensions) |
| :--- | :--- |
| VGG-Face | 2622 |
| FaceNet | 128 |
| FaceNet512 | 512 |
| OpenFace | 128 |
| DeepFace | 4096 |
| DeepID | 160 |
| ArcFace | 512 |
| SFace | 128 |
| GhostFaceNet | 512 |

In [None]:
def get_all_face_data(directory):
    """
    Scans the directory, detects ALL faces in ALL images, and generates embeddings.
    
    Crucial: DeepFace.represent() returns a list of dictionaries for ALL faces detected in an image.
    We need to flatten this into a single list of (embedding, path) pairs.
    """
    
    print("ðŸš€ Starting face detection and embedding generation...")
    
    filenames = os.listdir(directory)
    print(f'---Total files {len(filenames)}---')
    
    face_data = []
    
    for idx, filename in enumerate(filenames):
        print(idx, end='')
        if not filename.lower().endswith(IMAGE_EXTENSIONS):
            continue
            
        path = os.path.join(directory, filename)
        
        try:
            results = DeepFace.represent(
                img_path=path, 
                model_name=model_name, 
                detector_backend="yolov12n",
                # l2_normalize=True, # Normalize embeddings to unit length (important for distance-based clustering)
                enforce_detection=True # Set to False only if you know every image has a face
            )
            
            # Each 'results' item is a dictionary for ONE detected face in the image
            for face_obj in results:
                embedding = face_obj['embedding']
                x = face_obj['facial_area']['x']
                y = face_obj['facial_area']['y']
                w = face_obj['facial_area']['w']
                h = face_obj['facial_area']['h']
                face_location = (x, y, w, h)
                face_data.append({
                    'embedding': embedding,
                    'image_path': path,
                    'face_location': face_location
                })
            
            print(f"  -> Processed {filename} Found {len(results)} face(s).")
        
            
        except ValueError as e:
            # DeepFace raises ValueError if no face is detected
            if "Face could not be detected" in str(e):
                print(f"  -> No face detected in {filename}. Skipping.")
            else:
                print(f"  -> Error processing {filename}: {e}")
        except Exception as e:
             print(f"  -> Unexpected error processing {filename}: {e}")
    
    print(f"Length of embedding is {len(face_data[0]['embedding'])} and total faces detected: {len(face_data)}")

    return face_data


In [150]:
def cluster_and_sort_photos(all_face_data, eps=1.5):
    """
    Clusters the embeddings and sorts the photos into named folders.
    """
    
    if not all_face_data:
        print("\nðŸš« No faces found to cluster. Check your 'input_photos' folder.")
        return

    # delete all the contents of OUTPUT_DIR before saving new results
    for filename in os.listdir(OUTPUT_DIR):
        file_path = os.path.join(OUTPUT_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')
            
    # Extract only the embedding vectors into a NumPy array
    embeddings = np.array([d['embedding'] for d in all_face_data])
    
    print(f"\nClustering {len(embeddings)} total face embeddings...")

    # --- DBSCAN Clustering ---
    # eps: The max distance for two embeddings to be considered the same person.
    # ArcFace (used here) has 512-dimensional embeddings, so the distance is larger 
    # than the 128-dim vectors from Dlib/face_recognition.
    # You might need to tune this value (try 1.0 to 1.5)
    
    cl = DBSCAN(metric="cosine", n_jobs=-1, eps=eps, min_samples=2).fit(embeddings)
    label_ids = cl.labels_
    
    # Get all unique cluster labels/IDs
    unique_labels = np.unique(label_ids)
    
    # 2. Sorting and Saving
    print("\nðŸ“¦ Sorting photos into person folders...")
    
    # A set to keep track of which image files have already been copied
    copied_files = set()
    
    for label in unique_labels:
        # Check if the label is the noise cluster (-1)
        is_noise = (label == -1)
        folder_name = "Unknown_Faces" if is_noise else f"Person_{label}"
        person_dir = os.path.join(OUTPUT_DIR, folder_name)
        os.makedirs(person_dir, exist_ok=True)
        
        # Find the indices corresponding to this cluster ID
        indices = np.where(label_ids == label)[0]
        
        # Track the unique image paths that belong to this person/cluster
        images_for_this_person = set()
        for i in indices:
            image_path_and_face_location = (all_face_data[i]['image_path'], all_face_data[i]['face_location'])
            images_for_this_person.add((image_path_and_face_location))
            # images_for_this_person.add(all_face_data[i]['image_path'])
            
        print(f"  -> Folder '{folder_name}' contains {len(images_for_this_person)} unique photos.")
        
        # Copy the original image files to the new folder
        for src_path, face_location in images_for_this_person:
            # We copy the original photo, not the face crop
            dst_path = os.path.join(person_dir, os.path.basename(src_path))
            
            # shutil.copy(src_path, dst_path)

            img = cv2.imread(src_path)
            x, y, w, h = face_location
            cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2) 
            cv2.imwrite(dst_path, img)

    print("\nâœ… Separation complete!")
    print(f"Results are saved in the '{OUTPUT_DIR}' directory.")



In [199]:
# 1. Load data and generate encodings
all_face_data = get_all_face_data(INPUT_DIR)


ðŸš€ Starting face detection and embedding generation...
---Total files 423---
0  -> Processed 20251009_125323.jpg Found 1 face(s).
1  -> Processed 20251009_131644.jpg Found 1 face(s).
2  -> Processed 20251009_131646.jpg Found 2 face(s).
3  -> Processed 20251009_131649.jpg Found 1 face(s).
4  -> Processed 20251009_131651.jpg Found 1 face(s).
5  -> Processed 20251009_131655.jpg Found 1 face(s).
6  -> No face detected in 20251010_121415.jpg. Skipping.
7  -> No face detected in 20251010_181316.jpg. Skipping.
8  -> Processed 20251011_151256(1)(1).jpg Found 1 face(s).
9  -> Processed 20251011_171050(1).jpg Found 1 face(s).
10  -> Processed 20251011_184821.jpg Found 1 face(s).
11  -> Processed 20251013_201759.jpg Found 1 face(s).
12  -> Processed 20251013_201822.jpg Found 1 face(s).
13  -> Processed 20251013_203101.jpg Found 1 face(s).
14  -> Processed 20251013_203104.jpg Found 1 face(s).
15  -> Processed 20251013_203109.jpg Found 1 face(s).
16  -> No face detected in 20251021_090052.jpg. Sk

In [180]:
from sklearn.metrics.pairwise import pairwise_distances
import random
import cv2 as cv

def cluster_and_sort_photos_with_human_feedback(all_face_data, eps=1.5, buffer=0.1):
    """
    Clusters the embeddings and sorts the photos into named folders.
    """
    
    if not all_face_data:
        print("\nðŸš« No faces found to cluster. Check your 'input_photos' folder.")
        return

    # delete all the contents of OUTPUT_DIR before saving new results
    for filename in os.listdir(OUTPUT_DIR):
        file_path = os.path.join(OUTPUT_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')
            
    # Extract only the embedding vectors into a NumPy array
    embeddings = np.array([d['embedding'] for d in all_face_data])
    
    print(f"\nClustering {len(embeddings)} total face embeddings...")

    # --- DBSCAN Clustering ---
    # eps: The max distance for two embeddings to be considered the same person.
    # ArcFace (used here) has 512-dimensional embeddings, so the distance is larger 
    # than the 128-dim vectors from Dlib/face_recognition.
    # You might need to tune this value (try 1.0 to 1.5)
    metric = "cosine"
    cl = DBSCAN(metric=metric, n_jobs=-1, eps=eps, min_samples=2).fit(embeddings)
    label_ids = cl.labels_
    
    # Compute the full distance matrix (Cosine is best for faces)
    dist_matrix = pairwise_distances(embeddings, metric=metric)
    
    # Find indices of pairs within the 'Uncertainty Zone'
    neighbour_indices = np.where((dist_matrix > (eps)) & 
                                 (dist_matrix < (eps + buffer)))
    
    # Filter to avoid self-matches and duplicates (i < j)
    candidate_pairs = [(i, j) for i, j in zip(*neighbour_indices) if i < j]
    
    idx = 0
    for i, j in candidate_pairs:
        idx += 1
        print(f"\rQuestion {idx}/{len(candidate_pairs)}", end="")
        
        # Only show pairs that are in different clusters (potentially misclassified)
        if label_ids[i] == label_ids[j]:
            continue
        
        path1, path2 = all_face_data[i]['image_path'], all_face_data[j]['image_path']
        loc1, loc2 = all_face_data[i]['face_location'], all_face_data[j]['face_location']
        
        # Display side-by-side
        img1 = cv.imread(path1)
        img2 = cv.imread(path2)
        # faces are located at (x, y, w, h)
        x1, y1, w1, h1 = loc1
        x2, y2, w2, h2 = loc2
        # cv.rectangle(img1, (x1, y1), (x1+w1, y1+h1), (0, 255, 0), 2)
        # cv.rectangle(img2, (x2, y2), (x2+w2, y2+h2), (0, 255, 0), 2)
        face1 = img1[y1:y1+h1, x1:x1+w1]
        face2 = img2[y2:y2+h2, x2:x2+w2]
        
        if h1 < h2:
            face1 = cv.resize(face1, (int(w1 * h2 / h1), h2))
        else:
            face2 = cv.resize(face2, (int(w2 * h1 / h2), h1))
        # Resize for display
        combined = np.hstack((face1, face2))
        
        cv.imshow("Same Person? (y=Yes, n=No, q=Quit)", combined)
        key = cv.waitKey(0) & 0xFF

        if key == ord('y'):
            label1, label2 = label_ids[i], label_ids[j]
            if label1 == -1 and label2 == -1:
                # If both are noise, assign a new unique label (max existing + 1)
                new_label = max(label_ids) + 1
                label_ids[i] = new_label
                label_ids[j] = new_label
            elif label1 == -1:
                label_ids[i] = label2
            elif label2 == -1:
                label_ids[j] = label1
            else:
                target_label = min(label1, label2)
                collapsed_label = max(label1, label2) 
                # Update all instances of the collapsed label to the target label
                label_ids[label_ids == collapsed_label] = target_label
        elif key == ord('q'):
            break

    cv.destroyAllWindows()
        
    
    # Get all unique cluster labels/IDs
    unique_labels = np.unique(label_ids)
    
    # 2. Sorting and Saving
    print("\nðŸ“¦ Sorting photos into person folders...")
    
    for label in unique_labels:
        # Check if the label is the noise cluster (-1)
        is_noise = (label == -1)
        folder_name = "Unknown_Faces" if is_noise else f"Person_{label}"
        person_dir = os.path.join(OUTPUT_DIR, folder_name)
        os.makedirs(person_dir, exist_ok=True)
        
        # Find the indices corresponding to this cluster ID
        indices = np.where(label_ids == label)[0]
        
        # Track the unique image paths that belong to this person/cluster
        images_for_this_person = set()
        for i in indices:
            image_path_and_face_location = (all_face_data[i]['image_path'], all_face_data[i]['face_location'])
            images_for_this_person.add((image_path_and_face_location))
            # images_for_this_person.add(all_face_data[i]['image_path'])
            
        print(f"  -> Folder '{folder_name}' contains {len(images_for_this_person)} unique photos.")
        
        # Copy the original image files to the new folder
        for src_path, face_location in images_for_this_person:
            # We copy the original photo, not the face crop
            dst_path = os.path.join(person_dir, os.path.basename(src_path))
            
            # shutil.copy(src_path, dst_path)

            img = cv2.imread(src_path)
            x, y, w, h = face_location
            cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2) 
            cv2.imwrite(dst_path, img)

    print("\nâœ… Separation complete!")
    print(f"Results are saved in the '{OUTPUT_DIR}' directory.")



In [200]:
import math
epsilon = math.sin(math.radians(9)) # You may need to tune this value based on your dataset and embedding model
epsilon

0.15643446504023087

In [201]:
cluster_and_sort_photos_with_human_feedback(all_face_data, eps=epsilon, buffer=epsilon/100*20)


Clustering 1005 total face embeddings...
Question 2529/2529
ðŸ“¦ Sorting photos into person folders...
  -> Folder 'Unknown_Faces' contains 248 unique photos.
  -> Folder 'Person_0' contains 14 unique photos.
  -> Folder 'Person_1' contains 3 unique photos.
  -> Folder 'Person_2' contains 169 unique photos.
  -> Folder 'Person_3' contains 31 unique photos.
  -> Folder 'Person_4' contains 160 unique photos.
  -> Folder 'Person_5' contains 30 unique photos.
  -> Folder 'Person_7' contains 2 unique photos.
  -> Folder 'Person_8' contains 2 unique photos.
  -> Folder 'Person_9' contains 3 unique photos.
  -> Folder 'Person_11' contains 2 unique photos.
  -> Folder 'Person_12' contains 3 unique photos.
  -> Folder 'Person_13' contains 4 unique photos.
  -> Folder 'Person_15' contains 12 unique photos.
  -> Folder 'Person_16' contains 6 unique photos.
  -> Folder 'Person_17' contains 2 unique photos.
  -> Folder 'Person_18' contains 3 unique photos.
  -> Folder 'Person_19' contains 9 unique

In [146]:
# 2. Cluster and sort
cluster_and_sort_photos(all_face_data, eps=epsilon)



Clustering 719 total face embeddings...

ðŸ“¦ Sorting photos into person folders...
  -> Folder 'Unknown_Faces' contains 166 unique photos.
  -> Folder 'Person_0' contains 3 unique photos.
  -> Folder 'Person_1' contains 6 unique photos.
  -> Folder 'Person_2' contains 83 unique photos.
  -> Folder 'Person_3' contains 6 unique photos.
  -> Folder 'Person_4' contains 91 unique photos.
  -> Folder 'Person_5' contains 2 unique photos.
  -> Folder 'Person_6' contains 3 unique photos.
  -> Folder 'Person_7' contains 2 unique photos.
  -> Folder 'Person_8' contains 7 unique photos.
  -> Folder 'Person_9' contains 3 unique photos.
  -> Folder 'Person_10' contains 2 unique photos.
  -> Folder 'Person_11' contains 4 unique photos.
  -> Folder 'Person_12' contains 26 unique photos.
  -> Folder 'Person_13' contains 2 unique photos.
  -> Folder 'Person_14' contains 4 unique photos.
  -> Folder 'Person_15' contains 2 unique photos.
  -> Folder 'Person_16' contains 33 unique photos.
  -> Folder 'Pe

In [138]:
from sklearn.metrics.pairwise import pairwise_distances
import random
import cv2 as cv

def get_uncertain_pairs(face_data, epsilon=0.25, buffer=0.05, max_questions=None):
    """
    Finds pairs whose distance is within [epsilon - buffer, epsilon + buffer].
    These are the 'hard' cases for the AI.
    """
    embeddings = np.array([d['embedding'] for d in face_data])
    # Compute the full distance matrix (Cosine is best for faces)
    dist_matrix = pairwise_distances(embeddings, metric='cosine')
    
    # Find indices of pairs within the 'Uncertainty Zone'
    uncertain_indices = np.where((dist_matrix > (epsilon - buffer)) & 
                                 (dist_matrix < (epsilon + buffer)))
    
    # Filter to avoid self-matches and duplicates (i < j)
    candidate_pairs = [(i, j) for i, j in zip(*uncertain_indices) if i < j]
    
    random.shuffle(candidate_pairs)
    return candidate_pairs[:max_questions] if max_questions is not None else candidate_pairs

In [141]:
def collect_human_ground_truth(face_data, pairs):
    must_link = []
    cannot_link = []

    idx = 1
    for i, j in pairs:
        print(f"\rQuestion {idx}/{len(pairs)}", end="")
        path1, path2 = face_data[i]['image_path'], face_data[j]['image_path']
        loc1, loc2 = face_data[i]['face_location'], face_data[j]['face_location']
        
        # Display side-by-side
        img1 = cv.imread(path1)
        img2 = cv.imread(path2)
        # faces are located at (x, y, w, h)
        x1, y1, w1, h1 = loc1
        x2, y2, w2, h2 = loc2
        # cv.rectangle(img1, (x1, y1), (x1+w1, y1+h1), (0, 255, 0), 2)
        # cv.rectangle(img2, (x2, y2), (x2+w2, y2+h2), (0, 255, 0), 2)
        face1 = img1[y1:y1+h1, x1:x1+w1]
        face2 = img2[y2:y2+h2, x2:x2+w2]
        
        if h1 < h2:
            face1 = cv.resize(face1, (int(w1 * h2 / h1), h2))
        else:
            face2 = cv.resize(face2, (int(w2 * h1 / h2), h1))
        # Resize for display
        combined = np.hstack((face1, face2))
        
        cv.imshow("Same Person? (y=Yes, n=No, q=Quit)", combined)
        key = cv.waitKey(0) & 0xFF
        cv.destroyAllWindows()

        if key == ord('y'):
            must_link.append((i, j))
        elif key == ord('n'):
            cannot_link.append((i, j))
        elif key == ord('q'):
            break
        
        idx += 1
            
    return must_link, cannot_link

In [142]:
uncertain_pairs = get_uncertain_pairs(all_face_data, epsilon=epsilon, buffer=epsilon/100*1, max_questions=50)
print(f"Collected {len(uncertain_pairs)} uncertain pairs for human labeling.")

Collected 50 uncertain pairs for human labeling.


In [143]:
must_link, cannot_link = collect_human_ground_truth(face_data=all_face_data, pairs=uncertain_pairs)

Question 50/50

In [144]:
from metric_learn import ITML

# 1. Separate the indices from the labels
# metric-learn ITML.fit(pairs, y) expects:
# pairs: (n_constraints, 2) array of indices
# y: (n_constraints,) array of 1 (same) or -1 (different)
pairs = []
y = []

for i, j in must_link:
    pairs.append((i, j))
    y.append(1)

for i, j in cannot_link:
    pairs.append((i, j))
    y.append(-1)

# 2. Learn the 'New Math' using the base ITML class
# We provide the embeddings matrix as a 'preprocessor' so the 
# model knows how to resolve the indices in 'pairs'.
embeddings = np.array([d['embedding'] for d in all_face_data])
itml = ITML(preprocessor=embeddings)
itml.fit(pairs, y)

# 3. Transform your embeddings
# This applies the learned Mahalanobis transformation to the data
improved_embeddings = itml.transform(embeddings)

# 4. Rebuild your data structure
improved_all_face_data = []
for i, d in enumerate(all_face_data):
    improved_all_face_data.append({
        'embedding': improved_embeddings[i],
        'image_path': d['image_path'],
        'face_location': d['face_location']
    })

In [148]:
cluster_and_sort_photos(improved_all_face_data, eps=0.15)


Clustering 719 total face embeddings...

ðŸ“¦ Sorting photos into person folders...
  -> Folder 'Unknown_Faces' contains 199 unique photos.
  -> Folder 'Person_0' contains 3 unique photos.
  -> Folder 'Person_1' contains 5 unique photos.
  -> Folder 'Person_2' contains 84 unique photos.
  -> Folder 'Person_3' contains 6 unique photos.
  -> Folder 'Person_4' contains 101 unique photos.
  -> Folder 'Person_5' contains 3 unique photos.
  -> Folder 'Person_6' contains 7 unique photos.
  -> Folder 'Person_7' contains 3 unique photos.
  -> Folder 'Person_8' contains 2 unique photos.
  -> Folder 'Person_9' contains 2 unique photos.
  -> Folder 'Person_10' contains 2 unique photos.
  -> Folder 'Person_11' contains 19 unique photos.
  -> Folder 'Person_12' contains 2 unique photos.
  -> Folder 'Person_13' contains 4 unique photos.
  -> Folder 'Person_14' contains 2 unique photos.
  -> Folder 'Person_15' contains 2 unique photos.
  -> Folder 'Person_16' contains 6 unique photos.
  -> Folder 'Pe