# Face detection and recognition inference pipeline

The following example illustrates how to use the `facenet_pytorch` python package to perform face detection and recogition on an image dataset using an Inception Resnet V1 pretrained on the VGGFace2 dataset.

The following Pytorch methods are included:
* Datasets
* Dataloaders
* GPU/CPU processing

In [1]:
from models.mtcnn import MTCNN
from models.inception_resnet_v1 import InceptionResnetV1

import torch
from torch.utils.data import DataLoader
from torchvision import datasets
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from collections import Counter

workers = 0 if os.name == 'nt' else 4

#### Determine if an nvidia GPU is available

In [2]:
device = torch.device('cuda:4' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

Running on device: cuda:4


#### Define MTCNN module

Default params shown for illustration, but not needed. Note that, since MTCNN is a collection of neural nets and other code, the device must be passed in the following way to enable copying of objects when needed internally.

See `help(MTCNN)` for more details.

In [3]:
mtcnn = MTCNN(
    image_size=160, margin=0, min_face_size=20,
    thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True,
    device=device
)

#### Define Inception Resnet V1 module

Set classify=True for pretrained classifier. For this example, we will use the model to output embeddings/CNN features. Note that for inference, it is important to set the model to `eval` mode.

See `help(InceptionResnetV1)` for more details.

In [41]:
resnet = InceptionResnetV1(pretrained=None).eval().to(device) #"vggface2"

#### Define a dataset and data loader

We add the `idx_to_class` attribute to the dataset to enable easy recoding of label indices to identity names later one.

In [5]:
def collate_fn(x):
    return x[0]
dataset_dir = 'data/lfw'
dataset = datasets.ImageFolder(dataset_dir)
dataset.idx_to_class = {i:c for c, i in dataset.class_to_idx.items()}
loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=workers)

#### Perfom MTCNN facial detection

Iterate through the DataLoader object and detect faces and associated detection probabilities for each. The `MTCNN` forward method returns images cropped to the detected face, if a face was detected. By default only a single detected face is returned - to have `MTCNN` return all detected faces, set `keep_all=True` when creating the MTCNN object above.

To obtain bounding boxes rather than cropped face images, you can instead call the lower-level `mtcnn.detect()` function. See `help(mtcnn.detect)` for details.

In [6]:
from PIL import Image
import torchvision.transforms as T
import matplotlib.pyplot as plt

In [7]:
aligned = []
names = []
# counter = 0
for x, y in tqdm(loader, desc="Detecting faces"):
    x_aligned, prob = mtcnn(x, return_prob=True)
    if x_aligned is not None:
        # print('Face detected with probability: {:8f}'.format(prob))
        aligned.append(x_aligned)
        names.append(dataset.idx_to_class[y])
        # counter += 1
        # if counter == 2000:
        #     break

# for img in aligned:
#     img = (img + 1) / 2
#     img = np.transpose(img, (1,2,0))
#     plt.axis("off")
#     plt.imshow(img)
#     plt.show()


Detecting faces: 100%|██████████| 13233/13233 [07:35<00:00, 29.04it/s]


#### Calculate image embeddings

MTCNN will return images of faces all the same size, enabling easy batch processing with the Resnet recognition module. Here, since we only have a few images, we build a single batch and perform inference on it. 

For real datasets, code should be modified to control batch sizes being passed to the Resnet, particularly if being processed on a GPU. For repeated testing, it is best to separate face detection (using MTCNN) from embedding or classification (using InceptionResnetV1), as calculation of cropped faces or bounding boxes can then be performed a single time and detected faces saved for future use.

In [42]:
embs_list = []
batch_size = 200
# Loop over the data in batches
for i in tqdm(range(0, len(aligned), batch_size), desc="Calculate embeddings"):
    # Get a batch of data
    batch = torch.stack(aligned[i:i+batch_size]).to(device)

    # calucalate embeddings for the batch
    embs = resnet(batch).detach().cpu()

    # collect embeddings in a list
    embs_list.append(embs)

embeddings = torch.cat(embs_list)
print(embeddings.shape)

Calculate embeddings: 100%|██████████| 67/67 [00:08<00:00,  7.55it/s]

torch.Size([13233, 512])





In [9]:
# aligned_gpu = torch.stack(aligned).to(device)
# embeddings = resnet(aligned_gpu).detach().cpu()
# print(embeddings.shape)

#### Print distance matrix for classes

In [10]:
# dists = [[(e1 - e2).norm().item() for e2 in embeddings] for e1 in embeddings]
# print(pd.DataFrame(dists, columns=names, index=names))

#### Group images by identity

In [30]:
print(names)

['AJ_Cook', 'AJ_Lamas', 'Aaron_Eckhart', 'Aaron_Guiel', 'Aaron_Patterson', 'Aaron_Peirsol', 'Aaron_Peirsol', 'Aaron_Peirsol', 'Aaron_Peirsol', 'Aaron_Pena', 'Aaron_Sorkin', 'Aaron_Sorkin', 'Aaron_Tippin', 'Abba_Eban', 'Abbas_Kiarostami', 'Abdel_Aziz_Al-Hakim', 'Abdel_Madi_Shabneh', 'Abdel_Nasser_Assidi', 'Abdel_Nasser_Assidi', 'Abdoulaye_Wade', 'Abdoulaye_Wade', 'Abdoulaye_Wade', 'Abdoulaye_Wade', 'Abdul_Majeed_Shobokshi', 'Abdul_Rahman', 'Abdulaziz_Kamilov', 'Abdullah', 'Abdullah', 'Abdullah', 'Abdullah', 'Abdullah_Ahmad_Badawi', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Gul', 'Abdullah_Nasseef', 'Abdullah_al-Attiyah', 'Abdullah_al-Attiyah', 'Abdullah_al-Attiyah', 'Abdullatif_Sener', 'Abdullatif_Sener', 'Abel_Aguilar', 'Abel_Pacheco'

In [43]:
name_counts = Counter(names)
names_iden = list(name_counts.keys())
print(name_counts)

Counter({'George_W_Bush': 530, 'Colin_Powell': 236, 'Tony_Blair': 144, 'Donald_Rumsfeld': 121, 'Gerhard_Schroeder': 109, 'Ariel_Sharon': 77, 'Hugo_Chavez': 71, 'Junichiro_Koizumi': 60, 'Jean_Chretien': 55, 'John_Ashcroft': 53, 'Jacques_Chirac': 52, 'Serena_Williams': 52, 'Vladimir_Putin': 49, 'Luiz_Inacio_Lula_da_Silva': 48, 'Gloria_Macapagal_Arroyo': 44, 'Arnold_Schwarzenegger': 42, 'Jennifer_Capriati': 42, 'Laura_Bush': 41, 'Lleyton_Hewitt': 41, 'Alejandro_Toledo': 39, 'Hans_Blix': 39, 'Nestor_Kirchner': 37, 'Andre_Agassi': 36, 'Alvaro_Uribe': 35, 'Megawati_Sukarnoputri': 33, 'Silvio_Berlusconi': 33, 'Tom_Ridge': 33, 'Kofi_Annan': 32, 'Roh_Moo-hyun': 32, 'Vicente_Fox': 32, 'David_Beckham': 31, 'John_Negroponte': 31, 'Guillermo_Coria': 30, 'Recep_Tayyip_Erdogan': 30, 'Bill_Clinton': 29, 'Mahmoud_Abbas': 29, 'Jack_Straw': 28, 'Juan_Carlos_Ferrero': 28, 'Ricardo_Lagos': 27, 'Gray_Davis': 26, 'Rudolph_Giuliani': 26, 'Tom_Daschle': 25, 'Atal_Bihari_Vajpayee': 24, 'Jeremy_Greenstock': 24, 

In [44]:
emb_dim = embeddings.shape[1]
mean_embeddings = torch.tensor([]).reshape(0, emb_dim)
mean_inner_dists = []

emb_copy = embeddings.clone()
img_counts = list(name_counts.values()).copy()

while emb_copy.shape[0] > 0:
    # calculate mean embedding for one person
    img_count = img_counts[0]
    embs_person = emb_copy[:img_count]
    dists = np.array([[(e1 - e2).norm().item() for e2 in embs_person] for e1 in embs_person])
    if len(dists) > 1:
        mean_inner_dists.append(np.mean(dists[dists>0]))
    # else:
    #     mean_inner_dists.append()

    mean_emb = torch.mean(embs_person, axis=0)
    mean_emb = torch.unsqueeze(mean_emb, dim=0)

    # append embedding to all persons' embeddings
    mean_embeddings = torch.concatenate((mean_embeddings, mean_emb), axis=0)

    emb_copy = emb_copy[img_count:]
    img_counts = img_counts[1:]
print(mean_embeddings.shape)
print(mean_inner_dists)
print(names_iden)

torch.Size([5749, 512])
[0.006764337886124849, 0.007168957032263279, 0.003230213187634945, 0.003908078690680365, 0.006652889618029197, 0.006528621151422461, 0.004940139828249812, 0.0051076957024633884, 0.005130886139037709, 0.007153821022560199, 0.004097191461672385, 0.005053945817053318, 0.005415619971851508, 0.009049697313457727, 0.010683519765734673, 0.008716171607375145, 0.009821638464927673, 0.005077262945247419, 0.005111487582325935, 0.006050701555795968, 0.009097032248973846, 0.005805729003623128, 0.005603918495277564, 0.0023779922630637884, 0.007028776531418164, 0.004383653712769349, 0.004586503375321627, 0.005277125173181828, 0.003160207338320712, 0.005837752140082773, 0.001690170494839549, 0.0055206298595294355, 0.005347494035959244, 0.006752905319444835, 0.004903486914311846, 0.005968105513602495, 0.00595871452242136, 0.0026756494771689177, 0.004337779711931944, 0.009542068311323723, 0.007809371221810579, 0.007749298742661874, 0.006507822608873098, 0.0048665024029711885, 0.0

In [45]:
# split by races
persons_df = pd.read_csv("prepare_lfw_for_split_validation/total.csv", index_col=0)
blacks = persons_df.loc[persons_df["Race"] == "Black"].index.values
whites = persons_df.loc[persons_df["Race"] == "White"].index.values
asians = persons_df.loc[persons_df["Race"] == "Asian"].index.values
all_names_races = [blacks, whites, asians]

mean_embs_races = [torch.tensor([]).reshape(0, emb_dim), 
                   torch.tensor([]).reshape(0, emb_dim), 
                   torch.tensor([]).reshape(0, emb_dim)]
names_races = [[], [], []]
mean_dists_races = [[], [], []]
num_images = [[], [], []]

for name, dist, emb in zip(names_iden, mean_inner_dists, mean_embeddings):
    
    name = name.replace("_", " ")
    if name in blacks:
        race_code = 0
    elif name in whites:
        race_code = 1
    else:
        race_code = 2
    num_images[race_code].append(name_counts[name.replace(" ", "_")])
    names_races[race_code].append(name)
    emb = torch.unsqueeze(emb, dim=0)
    mean_embs_races[race_code] = torch.concatenate((mean_embs_races[race_code], emb), axis=0)
    mean_dists_races[race_code].append(dist)

size = 130
for i in range(3):
    num_images[i] = num_images[i][-size:]
    names_races[i] = names_races[i][-size:]
    mean_embs_races[i] = mean_embs_races[i][-size:]
    mean_dists_races[i] = mean_dists_races[i][-size:]
    


print(mean_embs_races[0].shape)
print(mean_embs_races[1].shape)
print(mean_embs_races[2].shape)

# print(np.mean(num_images[0]))
# print(np.mean(num_images[1]))
# print(np.mean(num_images[2]))
inner_dists = []
for i in range(len(mean_dists_races)):
    inner_dists.append(np.round(np.mean(mean_dists_races[i]), decimals=5))

print("Black", inner_dists[0], "\tWhite", inner_dists[1], "\tAsian", inner_dists[2])

torch.Size([130, 512])
torch.Size([130, 512])
torch.Size([130, 512])
Black 0.00603 	White 0.00578 	Asian 0.00591


In [46]:
outer_dists = []
for names_persons, embeddings in zip(names_races, mean_embs_races):
    dists_iden = [[(e1 - e2).norm().item() for e2 in embeddings] for e1 in embeddings]
    df = pd.DataFrame(dists_iden, columns=names_persons, index=names_persons)
    outer_dists.append(np.round(df.mean().mean(), decimals=4))

outer_dists = outer_dists/max(outer_dists)
print("Black", outer_dists[0], "\tWhite", outer_dists[1], "\tAsian", outer_dists[2])

Black 0.9384615384615386 	White 0.9692307692307692 	Asian 1.0


In [35]:
ratio = np.array(outer_dists)/np.array(inner_dists)
ratio = ratio / max(ratio)
print(ratio)

[0.84101055 1.         0.98478891]


In [17]:
# key = blacks[1].replace(" ", "_")
# print(key)
# print(sorted(name_counts))