In [2]:
import os
import torch
import numpy as np
import cv2
from facenet_pytorch import InceptionResnetV1
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import cosine
from torchvision import transforms
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
facenet_model = InceptionResnetV1(pretrained='vggface2').eval().to(device)

In [4]:
# Define preprocessing for images
transform = transforms.Compose([
    transforms.Resize((160, 160)),
    transforms.ToTensor(),
])

In [5]:
def preprocess_image(image_path):
    """Load and preprocess an image for FaceNet."""
    img = Image.open(image_path).convert('RGB')
    img = transform(img).unsqueeze(0).to(device)  # Add batch dimension
    return img

In [6]:
def get_embedding(image_path):
    """Generate a FaceNet embedding for an image."""
    img = preprocess_image(image_path)
    with torch.no_grad():
        embedding = facenet_model(img)
    return embedding.cpu().numpy().flatten()


# Use DBScan to filter images with very similar embeddings

In [7]:
def remove_duplicates(folder_path, eps=0.5, min_samples=2, threshold=0.3):
    """
    Remove duplicate face images based on FaceNet embeddings and DBSCAN clustering.
    """
    processed_count = 0  # Track processed folders

    person_folders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]
    total_folders = len(person_folders)

    for person in person_folders:
        person_path = os.path.join(folder_path, person)
        print(f"Processing folder {processed_count+1}/{total_folders}: {person}")

        embeddings = []
        image_paths = []
        
        for img_name in os.listdir(person_path):
            img_path = os.path.join(person_path, img_name)
            emb = get_embedding(img_path)
            if emb is not None:
                embeddings.append(emb)
                image_paths.append(img_path)

        if len(embeddings) < 2:
            print(f"Skipping {person} (not enough images).")
            processed_count += 1
            continue  # Skip if only one image

        embeddings = np.array(embeddings)

        # Cluster embeddings
        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="euclidean").fit(embeddings)
        labels = clustering.labels_

        unique_clusters = set(labels) - {-1}  # Remove noise (-1)

        for cluster in unique_clusters:
            cluster_indices = np.where(labels == cluster)[0]
            ref_idx = cluster_indices[0]  # First image as reference
            
            for idx in cluster_indices[1:]:
                if cosine(embeddings[ref_idx], embeddings[idx]) < threshold:
                    print(f"Removing duplicate: {image_paths[idx]}")
                    os.remove(image_paths[idx])

        print(f"Completed {person}.")
        processed_count += 1

    print("All folders processed. Exiting.")

# Run the script
folder_path = "data/processed/final_data/final_data"
remove_duplicates(folder_path)


Processing folder 1/23: abhiral
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final_data/abhiral/abhiral_2_9.jpg
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final_data/abhiral/abhiral_2_3.jpg
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final_data/abhiral/aabhiral_2_14.jpg
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final_data/abhiral/aabhiral_2_77.jpg
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final_data/abhiral/aabhiral_2_4.jpg
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final_data/abhiral/abhiral_2_5.jpg
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final_data/abhiral/abhiral_2_16.jpg
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final_data/abhiral/aabhiral_2_82.jpg
Removing duplicate: /home/ronak/ai_final_project/data/processed/final_data/final