In [1]:
import numpy as np
import cv2
import face_recognition
import os
from tqdm import tqdm
import shutil
from sklearn.metrics import silhouette_score, davies_bouldin_score


In [2]:
import logging
import contextlib
import tensorflow as tf
import warnings
from mtcnn import MTCNN
from tqdm import tqdm

# Suppress TensorFlow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow logs at level 3 (ERROR)
tf.get_logger().setLevel('ERROR')

# Configure logging to suppress logs from TensorFlow and MTCNN
logging.getLogger('tensorflow').setLevel(logging.FATAL)
logging.getLogger('mtcnn').setLevel(logging.ERROR)

# Suppress warnings
warnings.filterwarnings('ignore')

# Function to suppress stdout and stderr
@contextlib.contextmanager
def suppress_output():
    with open(os.devnull, 'w', encoding='utf-8') as fnull:
        with contextlib.redirect_stdout(fnull), contextlib.redirect_stderr(fnull):
            yield

def detect_faces_mtcnn(image):
    """
    Detect faces using MTCNN and return bounding boxes in a format compatible with face_recognition.
    """
    with suppress_output():
        # Initialize the MTCNN detector without printing logs
        detector = MTCNN()

        # Detect faces in the image
        faces = detector.detect_faces(image)

        # Filter faces with confidence higher than 0.70 and extract their bounding boxes
        rectangles = []
        for face in faces:
            if face['confidence'] > 0.50:
                x, y, width, height = face['box']
                # Convert to top, right, bottom, left format
                top, right, bottom, left = y, x + width, y + height, x
                rectangles.append((top, right, bottom, left))
                
        return rectangles

In [8]:

def get_embeddings(rgb_image, face_locations):
    """
    Extract face embeddings from the given image based on face locations.
    Args:
    rgb_image (np.array): RGB image from which to extract embeddings.
    face_locations (list): List of face locations in (top, right, bottom, left) format.
    Returns:
    list: List of face encodings.
    """
    if not face_locations:
        return []
    # Compute face embeddings
    face_encodings = face_recognition.face_encodings(rgb_image, face_locations)
    return face_encodings

def process_images(dirpath):
    """
    Process images in the given directory to extract face embeddings.
    Args:
    dirpath (str): Path to the directory containing images.
    Returns:
    list, list: List of embeddings and list of corresponding image paths.
    """
    embeddings = []
    image_paths = []

    # Iterate over all files in the directory
    for filename in tqdm(os.listdir(dirpath)):
        file_path = os.path.join(dirpath, filename)
        image = cv2.imread(file_path)

        # Check if the image was loaded successfully
        if image is None:
            print(f"Error: Could not load image from {file_path}")
            continue

        # Convert the image from BGR to RGB for face_recognition
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Detect faces and extract face images
        face_locations = detect_faces_mtcnn(rgb_image)

        # Extract embeddings for each detected face
        face_encodings = get_embeddings(rgb_image, face_locations)
    
        for encoding in face_encodings:
            embeddings.append(encoding)
            image_paths.append(file_path)

    return embeddings, image_paths


In [9]:
# embeddings[3]

In [11]:
dir= 'train3'
embeddings,image_paths = process_images(dir)

  0%|          | 0/21 [00:00<?, ?it/s]

100%|██████████| 21/21 [00:35<00:00,  1.69s/it]


In [12]:
from sklearn.cluster import DBSCAN
eps = 0.5  # The maximum distance between two samples for one to be considered as in the neighborhood of the other
min_samples = 2  # The number of samples (or total weight) in a neighborhood for a point to be considered as a core point
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
cluster_labels = dbscan.fit_predict(embeddings)


In [35]:
# from sklearn.cluster import KMeans

# num_clusters = 5

# # Apply K-means clustering on extracted features
# kmeans = KMeans(n_clusters=num_clusters, random_state=42)
# cluster_labels = kmeans.fit_predict(embeddings)

In [13]:
cluster_labels

array([ 0,  1,  1,  1,  1,  0,  1,  0,  0,  1,  1,  0,  1, -1,  1,  1,  0,
        0,  0, -1, -1,  0,  0,  0, -1, -1,  0], dtype=int64)

In [14]:
base_cluster_dir = f'Clusters_{dir}_m'

# Iterate over the image paths and cluster labels simultaneously
for i, image_path in enumerate(image_paths):
    cluster_label = cluster_labels[i]
    
    # Determine the destination cluster folder
    cluster_folder = f'{base_cluster_dir}/Cluster_{cluster_label}/'
    
    # Create the cluster directory if it doesn't exist
    os.makedirs(cluster_folder, exist_ok=True)
    
    # Get the filename from the image path
    filename = os.path.basename(image_path)
    
    # Determine the destination path
    destination_path = os.path.join(cluster_folder, filename)
    
    try:
        # Check if the file exists at the source path
        if not os.path.exists(image_path):
            print(f"Error: Source file '{image_path}' not found")
            continue
        
        # Perform the copy operation
        shutil.copyfile(image_path, destination_path)
        print(f"Copied '{filename}' to '{cluster_folder}'")

    except FileExistsError:
        print(f"Error: '{filename}' already exists in '{cluster_folder}'")
    except Exception as e:
        print(f"Unexpected error while copying '{filename}': {str(e)}")

print("\n\nProcessing completed.")

Copied '00b170622b.jpg' to 'Clusters_train3_m/Cluster_0/'
Copied '04f5eca5b8.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '0aacb5a14e.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '0ab0076764.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '0b9a662284.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '0bd2a5d439.jpg' to 'Clusters_train3_m/Cluster_0/'
Copied '0be0f65707.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '1b30f19f9a.jpg' to 'Clusters_train3_m/Cluster_0/'
Copied '1b9b1d8ba9.jpg' to 'Clusters_train3_m/Cluster_0/'
Copied '1cdae56e93.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '1cdae56e93.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '1d69e13527.jpg' to 'Clusters_train3_m/Cluster_0/'
Copied '26d23cf667.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '26d23cf667.jpg' to 'Clusters_train3_m/Cluster_-1/'
Copied '4d62ab4d63.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '4dcfc56ae0.jpg' to 'Clusters_train3_m/Cluster_1/'
Copied '5a09ab1835.jpg' to 'Clusters_train3_m/Cluster_0/'
Copied '63e6a

In [15]:
silhouette_avg = silhouette_score(embeddings, cluster_labels)
print(f'Silhouette Score: {silhouette_avg}')
    
# Calculate Davies-Bouldin Index
db_index = davies_bouldin_score(embeddings, cluster_labels)
print(f'Davies-Bouldin Index: {db_index}')

Silhouette Score: 0.27423334474129296
Davies-Bouldin Index: 1.5326189274338844
