In [1]:
import torch
from PIL import Image
import numpy as np
import os
import open_clip
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the OpenCLIP ViT-B-32 model with the 'laion2b_s34b_b79k' pretrained
# weights. The second return value (the training-time transform) is not needed
# here; `preprocess` is the transform applied to each image before encoding.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
# Use the GPU when one is available, otherwise fall back to CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The model is only used for inference, so put it in eval mode; OpenCLIP
# returns models in train mode by default.
model = model.to(device).eval()


def process_folder(folder_path):
    """Rank the images in a folder by mean cosine similarity to the others.

    Every file directly inside ``folder_path`` whose name ends in .png, .jpg,
    .jpeg, .bmp or .gif (case-insensitive) is passed through the module-level
    ``preprocess`` transform and encoded with the module-level ``model`` on
    ``device``. The embeddings are L2-normalised, the pairwise cosine
    similarity matrix is computed, and each image is scored by its average
    similarity to the *other* images in the folder.

    Args:
        folder_path: Path of the directory whose images are embedded.

    Returns:
        A tuple ``(sorted_files, sorted_distances)`` of equal-length lists:
        the file names ordered from lowest to highest average similarity, and
        the matching average similarities as Python floats. Both lists are
        empty when the folder contains no matching image; a folder with a
        single image yields a score of 0.0 for it (there is nothing to
        compare against).
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    embeddings = []
    file_names = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order reproducible between runs and machines.
    for img_file in sorted(os.listdir(folder_path)):
        if not img_file.lower().endswith(image_extensions):
            continue

        img_path = os.path.join(folder_path, img_file)
        # The context manager closes the underlying file handle as soon as
        # the pixels have been turned into a tensor, instead of leaving it
        # to garbage collection.
        with Image.open(img_path) as image:
            image_tensor = preprocess(image).unsqueeze(0).to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            embedding = model.encode_image(image_tensor)
        embeddings.append(embedding.squeeze(0))
        file_names.append(img_file)

    # torch.stack raises on an empty list; an image-less folder simply has
    # nothing to rank.
    if not embeddings:
        return [], []

    # Convert list of embeddings to a tensor
    embeddings_tensor = torch.stack(embeddings)

    # Normalize embeddings to unit vectors
    embeddings_norm = embeddings_tensor / embeddings_tensor.norm(dim=1, keepdim=True)

    # Pairwise cosine similarity is the matrix product of the unit vectors
    cosine_sim_matrix = embeddings_norm @ embeddings_norm.T

    # Zero the diagonal so self-similarity does not contribute to the sum
    cosine_sim_matrix.fill_diagonal_(0)

    # Average over the N-1 *other* images. Dividing by N (a plain mean over
    # the row) would count the zeroed diagonal entry and deflate every score
    # by (N-1)/N, making scores incomparable across folders of different
    # sizes. max(..., 1) keeps the single-image case at 0.0 instead of 0/0.
    num_others = max(len(file_names) - 1, 1)
    average_similarities = cosine_sim_matrix.sum(dim=1) / num_others

    # Lower average similarity first, i.e. the most atypical images lead
    sorted_indices = average_similarities.argsort().tolist()
    sorted_files = [file_names[i] for i in sorted_indices]
    sorted_distances = [average_similarities[i].item() for i in sorted_indices]

    return sorted_files, sorted_distances


In [None]:
main_folder = "data/face_images"
csv_file_path = "data/sorted_images.csv"

# One record per image, accumulated across all subfolders
data = []

for subfolder in tqdm(os.listdir(main_folder)):
    folder_path = os.path.join(main_folder, subfolder)
    # Entries of main_folder that are not directories are ignored
    if not os.path.isdir(folder_path):
        continue

    sorted_files, sorted_distances = process_folder(folder_path)

    # Add one row per file, keeping the order returned by process_folder
    data.extend(
        {"Folder": subfolder, "File Name": name, "Similarity": score}
        for name, score in zip(sorted_files, sorted_distances)
    )

# Build the results table from the collected records in a single step
df = pd.DataFrame(data)

 41%|█████████████████████████████████████████████████████████████████████▌                                                                                                    | 5180/12660 [29:19<1:06:37,  1.87it/s]

In [None]:
# Show the first rows of df as a quick sanity check of the collected results.
df.head()