# Face Clustering
Face clustering using sklearn

In [1]:
import pickle
import numpy as np
import cv2
import face_recognition
from imutils import paths, build_montages
from sklearn.cluster import DBSCAN

In [2]:
# Collect the path of every image under ./output. The list is sorted so the
# processing order (and therefore the order of `data`) is the same on every
# run and on every platform.
print("[INFO] quantifying faces...")
image_paths = sorted(paths.list_images("./output"))

# accumulator that receives one record per detected face
data = []

[INFO] quantifying faces...


In [3]:
# Name of the face detection model, intended for
# face_recognition.face_locations(model=...).
# NOTE(review): confirm that the detection loop actually reads this variable.
detection_method = "cnn"

In [4]:
# Detect every face in each input image and append one record per face
# (source path, bounding box, embedding) to `data`.
for (i, image_path) in enumerate(image_paths):
    print("[INFO] processing image {}/{}".format(i + 1, len(image_paths)))
    print(image_path)

    # cv2.imread returns None instead of raising when a file cannot be
    # decoded; skip such files rather than crashing inside cvtColor
    image = cv2.imread(image_path)
    if image is None:
        print("[WARN] could not read image, skipping: {}".format(image_path))
        continue

    # convert from BGR (OpenCV ordering) to RGB (dlib ordering)
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # detect the bounding boxes of each face in the input image, using the
    # model selected in the configuration cell
    boxes = face_recognition.face_locations(rgb, model=detection_method)

    # compute the facial embedding for each detected face
    encodings = face_recognition.face_encodings(rgb, boxes)
    data.extend(
        {"imagepath": image_path, "loc": box, "encoding": enc}
        for (box, enc) in zip(boxes, encodings)
    )

[INFO] processing image 1/16
./output\generated-image-1.jpg
[INFO] processing image 2/16
./output\generated-image-10.jpg
[INFO] processing image 3/16
./output\generated-image-11.jpg
[INFO] processing image 4/16
./output\generated-image-12.jpg
[INFO] processing image 5/16
./output\generated-image-13.jpg
[INFO] processing image 6/16
./output\generated-image-14.jpg
[INFO] processing image 7/16
./output\generated-image-15.jpg
[INFO] processing image 8/16
./output\generated-image-16.jpg
[INFO] processing image 9/16
./output\generated-image-2.jpg
[INFO] processing image 10/16
./output\generated-image-3.jpg
[INFO] processing image 11/16
./output\generated-image-4.jpg
[INFO] processing image 12/16
./output\generated-image-5.jpg
[INFO] processing image 13/16
./output\generated-image-6.jpg
[INFO] processing image 14/16
./output\generated-image-7.jpg
[INFO] processing image 15/16
./output\generated-image-8.jpg
[INFO] processing image 16/16
./output\generated-image-9.jpg


In [5]:
# dump the facial encodings data to disk
print("[INFO] serializing encodings...")
# the context manager flushes and closes the file even if pickling raises
with open("encodings", "wb") as f:
    pickle.dump(data, f)

[INFO] serializing encodings...


Now it's time to run the facial clustering algorithm. First things first, though: we need to reload the face data we stored.

In [6]:
# Number of parallel jobs passed to DBSCAN as n_jobs;
# -1 means use all available CPUs.
parallel_jobs = -1

In [7]:
# load the serialized face encodings + bounding box locations
print("[INFO] loading encodings...")
# NOTE: unpickling can execute arbitrary code; only load an "encodings" file
# that this notebook (or another trusted source) produced.
with open("encodings", "rb") as f:
    data = pickle.load(f)
data = np.array(data)
encodings = [d["encoding"] for d in data]

[INFO] loading encodings...


We then use the DBSCAN (Density-Based Spatial Clustering of Applications with Noise) algorithm to get all the unique faces. It calculates the Euclidean distance between all the 128-dimensional face encodings and groups them into clusters. Not all spaces have the same requirements, so tweak the algorithm's parameters as necessary.

In [8]:
# group the face embeddings into density-based clusters
print("[INFO] clustering...")
cluster = DBSCAN(
    metric="euclidean",
    n_jobs=parallel_jobs,
    min_samples=5,
)
cluster.fit(encodings)

# each distinct non-negative label is one clustered face; the label -1 is
# excluded from the count
labelIDs = np.unique(cluster.labels_)
numUniqueFaces = int(np.count_nonzero(labelIDs >= 0))
print("[INFO] # unique faces: {}".format(numUniqueFaces))

[INFO] clustering...
[INFO] # unique faces: 1


In [None]:
# Show one montage window per cluster label, built from a random sample of
# at most 25 faces belonging to that label.
for labelID in labelIDs:
    # find all indexes into the `data` array that belong to the current
    # label ID, then randomly sample a maximum of 25 of them
    print("[INFO] faces for face ID: {}".format(labelID))
    idxs = np.where(cluster.labels_ == labelID)[0]
    idxs = np.random.choice(idxs, size=min(25, len(idxs)), replace=False)

    # face crops to include in the montage
    faces = []
    for i in idxs:
        # cv2.imread returns None if the file has gone missing or cannot be
        # decoded; skip it instead of failing on the slice below
        image = cv2.imread(data[i]["imagepath"])
        if image is None:
            print("[WARN] could not read image, skipping: {}".format(
                data[i]["imagepath"]))
            continue

        # stored boxes are unpacked as (top, right, bottom, left)
        (top, right, bottom, left) = data[i]["loc"]
        face = image[top:bottom, left:right]
        if face.size == 0:
            # an empty crop would make cv2.resize raise
            continue

        # force resize the face ROI to 96x96 and add it to the montage list
        faces.append(cv2.resize(face, (96, 96)))

    # nothing usable for this label: build_montages would return an empty
    # list and indexing [0] would raise IndexError
    if not faces:
        continue

    montage = build_montages(faces, (96, 96), (5, 5))[0]

    # show the output montage; waitKey(0) blocks until a key is pressed
    # while the montage window has focus
    title = "Unknown Faces" if labelID == -1 else "Face ID #{}".format(labelID)
    cv2.imshow(title, montage)
    cv2.waitKey(0)

# close the HighGUI windows opened above so they do not linger
cv2.destroyAllWindows()
print("done!")

[INFO] faces for face ID: -1
