In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import insightface
from tqdm import tqdm
import shutil
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pathlib import Path

input_folder = Path("../data").resolve()
output_folder = Path("../output").resolve()

try:
    if output_folder.exists():
        shutil.rmtree(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)
except Exception as e:
    print(f"Error: {e}")
    
# Delete the existing database
db_path = output_folder / "chroma_db"
if os.path.exists(db_path):
    shutil.rmtree(db_path)

In [3]:
# Initialize ChromaDB (version-safe approach)
try:
    from chromadb.config import Settings
    chroma_client = chromadb.PersistentClient(
        path=str(output_folder / "chroma_db"),
        settings=Settings(allow_reset=True, anonymized_telemetry=False))
    
except ImportError:
    chroma_client = chromadb.PersistentClient(
        path=str(output_folder / "chroma_db"))

# Clear existing collection if exists
if "face_embeddings" in chroma_client.list_collections():
    chroma_client.delete_collection("face_embeddings")


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


In [4]:
def process_folder_to_chroma(input_folder, output_folder, chroma_client):
    # Convert to absolute paths
    input_folder = os.path.abspath(input_folder)
    output_folder = os.path.abspath(output_folder)
    
    # Initialize model
    model = insightface.app.FaceAnalysis()
    model.prepare(ctx_id=0)  # 0=GPU, -1=CPU
    
    # Create output folders
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(os.path.join(output_folder, "images"), exist_ok=True)
    
    try:
            
        # Create collection (modern API)
        collection = chroma_client.get_or_create_collection(
            name="face_embeddings",
            metadata={"hnsw:space": "cosine"} if hasattr(chromadb, 'config') else None
        )
        
        # Data storage
        df = pd.DataFrame(columns=["image_path", "face_index", "bbox"])
        ids = []
        embeddings_list = []
        metadatas = []

        for img_file in tqdm(os.listdir(input_folder)):
            if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
                
            img_path = os.path.join(input_folder, img_file)
            output_img_path = os.path.join(output_folder, "images", img_file)
            
            try:
                img = cv2.imread(img_path)
                if img is None:
                    print(f"⚠️ Could not read {img_file}")
                    continue
                    
                faces = model.get(img)
                
                for i, face in enumerate(faces):
                    # # Visualization
                    bbox = face.bbox.astype(int)
                    cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
                    
                    # Generate unique ID
                    face_id = f"{os.path.splitext(img_file)[0]}_{i}"
                    
                    # Store metadata
                    df.loc[len(df)] = {
                        "image_path": img_path,
                        "face_index": i,
                        "bbox": f"{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}"
                    }
                    
                    # Prepare ChromaDB data
                    ids.append(face_id)
                    embeddings_list.append(face.embedding.tolist())
                    metadatas.append({
                        "image_path": img_path,
                        "bbox": f"{bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}",
                        "face_index": i
                    })
                
                # # Save image
                cv2.imwrite(output_img_path, img)
                
            except Exception as e:
                print(f"❌ Error processing {img_file}: {e}")
        
        # Add all embeddings to ChromaDB (batch insert)
        if ids:  # Only if faces were detected
            try:
                collection.add(
                    ids=ids,
                    embeddings=embeddings_list,
                    metadatas=metadatas
                )
            except Exception as e:
                print(f"❌ Failed to add to Chroma: {e}")
        
        # Save metadata CSV
        df.to_csv(os.path.join(output_folder, "metadata.csv"), index=False)
        
        print(f"\n✅ Saved:")
        print(f"- ChromaDB storage: {output_folder}/chroma_db")
        print(f"- Metadata CSV: {output_folder}/metadata.csv")
        print(f"- Visualized images: {output_folder}/images/")
        print(f"- Total faces processed: {len(ids)}")
        
    except Exception as e:
        print(f"🔥 Critical ChromaDB Error: {e}")
        raise

In [5]:
process_folder_to_chroma(input_folder, output_folder, chroma_client)



Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/linux_sumit/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/linux_sumit/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/linux_sumit/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/linux_sumit/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}


Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


find model: /home/linux_sumit/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127.5
set det-size: (640, 640)


  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_


✅ Saved:
- ChromaDB storage: /home/linux_sumit/Projects/face_recognition/output/chroma_db
- Metadata CSV: /home/linux_sumit/Projects/face_recognition/output/metadata.csv
- Visualized images: /home/linux_sumit/Projects/face_recognition/output/images/
- Total faces processed: 352


In [20]:

collection = chroma_client.get_collection("face_embeddings")

# Get all embeddings and metadata
results = collection.get(include=["embeddings", "metadatas"])
embeddings = np.array(results["embeddings"])
metadata = results["metadatas"]  # Contains image_path, bbox info

In [21]:
def parse_metadata(raw_metadata):
    """
    Converts raw metadata with string bbox to structured format.
    """
    parsed_metadata = []
    for item in raw_metadata:
        top, left, bottom, right = map(int, item["bbox"].split(','))
        parsed_metadata.append({
            "image_path": item["image_path"],
            "face_index": int(item["face_index"]),
            "bbox": (top, left, bottom, right)
        })
    return parsed_metadata

metadata = parse_metadata(metadata)

In [22]:
from sklearn.cluster import DBSCAN

# eps=0.5: Lower = stricter clusters (start with 0.5 for ArcFace)
dbscan = DBSCAN(eps=0.5,metric='cosine', min_samples=2)  
clusters = dbscan.fit_predict(embeddings)

print(f"Found {len(set(clusters))-1} people")  # -1 because -1 = noise

Found 68 people


In [29]:
metadata

[{'image_path': '/home/linux_sumit/Projects/face_recognition/face_recognition/data/1744892587052.jpg',
  'face_index': 0,
  'bbox': (536, 146, 622, 248)},
 {'image_path': '/home/linux_sumit/Projects/face_recognition/face_recognition/data/1744892587052.jpg',
  'face_index': 1,
  'bbox': (1028, 372, 1129, 467)},
 {'image_path': '/home/linux_sumit/Projects/face_recognition/face_recognition/data/1744892587052.jpg',
  'face_index': 2,
  'bbox': (1338, 450, 1397, 517)},
 {'image_path': '/home/linux_sumit/Projects/face_recognition/face_recognition/data/1744892587052.jpg',
  'face_index': 3,
  'bbox': (1509, 384, 1597, 481)},
 {'image_path': '/home/linux_sumit/Projects/face_recognition/face_recognition/data/1744892587052.jpg',
  'face_index': 4,
  'bbox': (1015, 161, 1093, 263)},
 {'image_path': '/home/linux_sumit/Projects/face_recognition/face_recognition/data/1744892587052.jpg',
  'face_index': 5,
  'bbox': (685, 184, 757, 275)},
 {'image_path': '/home/linux_sumit/Projects/face_recognition/f

In [31]:
import pandas as pd

df = pd.DataFrame({
    "cluster": clusters,
    "image_path": [item['image_path'] for item in metadata]
})

df.to_csv("clustered_image.csv", index=False)


In [None]:

def generate_cluster_report(output_folder, clusters, metadata):
    """Creates a summary CSV and visualization"""
    from collections import defaultdict
    import pandas as pd
    
    # Create cluster statistics
    cluster_stats = defaultdict(list)
    for cluster_id, meta in zip(clusters, metadata):
        cluster_stats[cluster_id].append(meta["image_path"])
    
    # Save to CSV
    report = []
    for cluster_id, images in cluster_stats.items():
        report.append({
            "cluster_id": cluster_id,
            "num_faces": len(images),
            "sample_image": images[0]
        })
    
    pd.DataFrame(report).to_csv(os.path.join(output_folder, "cluster_report.csv"), index=False)
    
    print(f"\nOrganization complete!")
    print(f"- Total clusters: {len(set(clusters))}")
    print(f"- Outliers (cluster -1): {len(cluster_stats.get(-1, []))} faces")
    print(f"- Report saved to: {os.path.join(output_folder, 'cluster_report.csv')}")

generate_cluster_report(output_folder, clusters, metadata)


Organization complete!
- Total clusters: 69
- Outliers (cluster -1): 171 faces
- Report saved to: /home/linux_sumit/Projects/face_recognition/face_recognition/output/cluster_report.csv


In [None]:
from typing import List, Tuple, Dict
import json

def create_cluster_dirs(cluster_id: int, OUTPUT_BASE_DIR: str):
    """
    Creates folder structure for a cluster: crop/ and originals/
    """
    cluster_dir = os.path.join(OUTPUT_BASE_DIR, f"cluster_{cluster_id}")
    crop_dir = os.path.join(cluster_dir, "crop")
    original_dir = os.path.join(cluster_dir, "originals")
    os.makedirs(crop_dir, exist_ok=True)
    os.makedirs(original_dir, exist_ok=True)
    return cluster_dir, crop_dir, original_dir

def crop_and_save_face(image_path: str, bboxes: Tuple[int, int, int, int], save_path: str):
    """
    Crops the face from the image and save it.
    """ 
    image = cv2.imread(image_path)
    if image is None:
        print(f"[ERROR] Failed to read image: {image_path}")
        return

    try:
                x1, y1, x2, y2 = bboxes #y1, x1, y2, x2
    except Exception as e:
        print(f"[ERROR] Invalid bbox format: {bboxes} for image {image_path}")
    
    # Validate bbox bounds
    h, w = image.shape[:2]
    x1, y1 = max(0, x1), max(0, y1)       # Ensure ≥ 0
    x2, y2 = min(w, x2), min(h, y2)       # Ensure ≤ image dimensions

    if x1 >= x2 or y1 >= y2:
        print(f"[ERROR] Invalid bbox: x1={x1}, y1={y1}, x2={x2}, y2={y2}. Check coordinate order.")
        return
    
    cropped_face = image[y1:y2, x1:x2]
    if cropped_face.size == 0:
        print(f"[ERROR] Cropped region is empty for bbox: {bboxes}")
        return
    cv2.imwrite(save_path, cropped_face)
    
    
def draw_multiple_bboxes(image_path: str, bboxes: List[Tuple[int, int, int, int]]):
    """
    Returns an image with all bounding boxes drawn.
    """
    
    image = cv2.imread(image_path)
    for top, right, bottom, left in bboxes:
        cv2.rectangle(image, (top, left), (bottom, right), (0, 255, 0), 2)
    return image
    
def process_clusters_from_metadata(clusters: List[int], metadata: List[Dict], OUTPUT_BASE_DIR: str):
    """
    For each cluster, save:
    - One cropped face (from first entry)
    - All images in that cluster with bounding boxes drawn for faces from that cluster
    """
    from collections import defaultdict
    
    # Group metadata by cluster_id
    cluster_to_faces = defaultdict(list)
    for cluster_id, meta in zip(clusters, metadata):
        cluster_to_faces[cluster_id].append(meta)
        
    print(f"[INFO] Processing {len(cluster_to_faces)} clusters...")
    
    for cluster_id, faces in cluster_to_faces.items():
        cluster_dir, crop_dir, original_dir = create_cluster_dirs(cluster_id, OUTPUT_BASE_DIR)
        cluster_dict = {}
        # 1. Save cropped face from the first face in the cluster
        rep_face = faces[0]
        crop_path = os.path.join(crop_dir, "face.jpg")
        crop_and_save_face(rep_face["image_path"], rep_face["bbox"], crop_path)
        
        # 2. Group faces by image to draw all boxes per image
        image_to_bboxes = defaultdict(list)
        for face in faces: # Only faces of this cluster
            image_to_bboxes[face["image_path"]].append(face["bbox"])
            
        for i, (img_path, bboxes) in enumerate(image_to_bboxes.items()):
            annotated_img = draw_multiple_bboxes(img_path, bboxes)
            save_path = os.path.join(original_dir, os.path.basename(img_path))
            cv2.imwrite(save_path, annotated_img)
            cluster_dict[str(i)]=img_path
            
        with open(os.path.join(original_dir,'cluster_images.json'), 'w') as f:
            json.dump(cluster_dict, f, indent=4)
            
    print("[INFO] All clusters processed and saved with json files.")
    

In [37]:
process_clusters_from_metadata(clusters=clusters, metadata=metadata, OUTPUT_BASE_DIR = "../clustered_output")

[INFO] Processing 69 clusters...
[INFO] All clusters processed and saved.
