In [1]:
from ultralytics import YOLO
import cv2
from facenet_pytorch import MTCNN
import torch

  from .autonotebook import tqdm as notebook_tqdm


Load a pretrained model yolo11

In [2]:
model = YOLO("yolo11n.pt")
model.info()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
mtcnn = MTCNN(select_largest = True, device = device)

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 15.8MB/s]


YOLO11n summary: 319 layers, 2,624,080 parameters, 0 gradients, 6.6 GFLOPs


In [3]:
# Check camera
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Camera unavailable")
    exit()

Get the face with yolo

In [4]:
while True:
    ret, frame = cap.read()

    if not ret:
      print("Not capture")
      break

    # Predict with the model
    results = model(frame)  # predict on camera video

    # Extract bounding boxes,labels and confidence
    boxes = results[0].boxes
    for box in boxes:
      x1, y1, x2, y2 = box.xyxy[0]  # Extract coordinates and confidence
      conf = box.conf[0] # confidence score
      cls = int(box.cls)
      
      if cls == 0 and conf > 0.7:  # label 0 is person
        x1, y1, x2, y2 = map(int, box.xyxy[0]) 

        # get only the person bounding
        person_region = frame[y1:y2, x1:x2]

        # use MTCNN to dectect the faces
        faces, probs = mtcnn.detect(person_region, landmarks=False)

        if faces is not None:
          for face in faces:
            fx1, fy1, fx2, fy2 = map(int, face)
            cv2.rectangle(frame, (x1 + fx1, y1 + fy1), (x1 + fx2, y1 + fy2), (0, 0, 255), 2)
            cv2.putText(frame, "Face", (x1 + fx1, y1 + fy1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)


        label = f"{model.names[cls]} {conf:.2f}"  # Get class name and confidence
   
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
        # Put label on top of the bounding box
        cv2.putText(frame, label, (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    
    cv2.imshow("Detection", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
      break
    


# free the camera and close windows
cap.release()
cv2.destroyAllWindows()


0: 480x640 2 persons, 1 chair, 93.8ms
Speed: 2.4ms preprocess, 93.8ms inference, 290.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 6.5ms
Speed: 1.9ms preprocess, 6.5ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 chair, 6.5ms
Speed: 1.9ms preprocess, 6.5ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 laptop, 6.5ms
Speed: 2.1ms preprocess, 6.5ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 6.6ms
Speed: 2.0ms preprocess, 6.6ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 laptop, 6.7ms
Speed: 2.0ms preprocess, 6.7ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 6.9ms
Speed: 2.2ms preprocess, 6.9ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 laptop, 6.6ms
Speed: 1.7ms preprocess, 6.6ms inference, 0.9

KeyboardInterrupt: 

: 

Test FaceNet to recognize faces.

In [21]:
from keras.models import load_model
facenet = load_model('facenet_keras.h5')
print(facenet.input_shape)
print(facenet.output_shape)

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'facenet_keras.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)