In [1]:
import dlib
import torch
from torchvision import transforms
import warnings
from PIL import Image
import joblib
import os
import cv2
import math
import numpy as np
from tqdm import tqdm

## Face detection

In [2]:
# Location of dlib's 68-point landmark model file. Set the SHAPE_PREDICTOR_PATH
# environment variable to use a copy stored elsewhere; without it the original
# location is used (leading "//" normalised to "/").
shape_predictor_path = os.environ.get(
    "SHAPE_PREDICTOR_PATH",
    "/Users/valentinaemili/Desktop/shape_predictor_68_face_landmarks.dat",
)
detector = dlib.get_frontal_face_detector() # HOG-based detector
predictor = dlib.shape_predictor(shape_predictor_path)

In [3]:
def shape_to_normal(shape):
  """Flatten a landmark container into a plain list of ``(index, (x, y))`` tuples.

  ``shape`` must expose ``num_parts`` and a ``part(i)`` accessor whose result
  has ``x`` and ``y`` attributes.
  """
  return [
    (idx, (shape.part(idx).x, shape.part(idx).y))
    for idx in range(shape.num_parts)
  ]

def get_eyes_nose_dlib(shape):
  """Return ``(nose, left_eye_center, right_eye_center)`` from indexed landmarks.

  ``shape`` is a sequence of ``(index, (x, y))`` entries. The nose is the
  coordinate stored at entry 30; each eye centre is the integer midpoint of
  that eye's two corner landmarks (36/39 and 42/45).
  """
  def midpoint(first, second):
    a, b = shape[first][1], shape[second][1]
    return (a[0] + b[0]) // 2, (a[1] + b[1]) // 2

  nose_tip = shape[30][1]
  return nose_tip, midpoint(36, 39), midpoint(42, 45)

def rotate_points(shape, rotation_matrix):
  """Apply a 2x3 affine matrix to the first 68 landmarks.

  ``shape`` holds ``(index, (x, y))`` entries. Each point is lifted to
  homogeneous form ``(x, y, 1)``, multiplied by ``rotation_matrix``, and the
  result is truncated to integers. Returns an array with one ``(x, y)`` row
  per landmark.
  """
  coords = (shape[idx][1] for idx in range(68))
  homogeneous = np.array([(px, py, 1) for px, py in coords])
  transformed = np.dot(rotation_matrix, homogeneous.T).T
  return np.array([(int(row[0]), int(row[1])) for row in transformed])

def get_eyes_in_image(image, points):
  """Crop the eye region out of ``image``.

  Args:
    image: array indexed as ``image[row, column, ...]``.
    points: 2-D integer array of ``(x, y)`` landmarks; rows 36-47 are used as
      the eye points, rows 17-26 as the upper limit and row 29 as the lower
      limit (dlib 68-point layout).

  Returns:
    The slice of ``image`` covering the eye points padded by 10 px. It never
    extends above the highest point of rows 17-26 or below row 29, and is
    clipped to the image. May be empty.
  """
  height, width = image.shape[:2]
  eye_x, eye_y = points[36:48, 0], points[36:48, 1]
  upper_limit_y = np.min(points[17:27, 1])
  lower_limit_y = points[29][1]

  # Every bound is clamped to >= 0: landmarks can land outside the frame after
  # rotation, and a negative slice index would wrap around to the opposite
  # edge of the image instead of clipping.
  left = max(0, np.min(eye_x) - 10)
  top = max(0, upper_limit_y, np.min(eye_y) - 10)
  right = max(0, min(width, np.max(eye_x) + 10))
  bottom = max(0, min(height, lower_limit_y, np.max(eye_y) + 10))
  return image[top:bottom, left:right]

def get_nose_in_image(image, points):
  """Crop the nose region out of ``image``.

  Args:
    image: array indexed as ``image[row, column, ...]``.
    points: 2-D integer array of ``(x, y)`` landmarks; rows 27-35 are used as
      the nose points, rows 36-47 as the upper limit and rows 48-67 as the
      lower limit (dlib 68-point layout).

  Returns:
    The slice of ``image`` covering the nose points padded by 5 px. It never
    extends above the highest point of rows 36-47 or below the lowest point
    of rows 48-67, and is clipped to the image. May be empty.
  """
  height, width = image.shape[:2]
  nose_x, nose_y = points[27:36, 0], points[27:36, 1]
  upper_limit_y = np.min(points[36:48, 1])
  lower_limit_y = np.max(points[48:68, 1])

  # Every bound is clamped to >= 0: landmarks can land outside the frame after
  # rotation, and a negative slice index would wrap around to the opposite
  # edge of the image instead of clipping.
  left = max(0, np.min(nose_x) - 5)
  top = max(0, upper_limit_y, np.min(nose_y) - 5)
  right = max(0, min(width, np.max(nose_x) + 5))
  bottom = max(0, min(height, lower_limit_y, np.max(nose_y) + 5))
  return image[top:bottom, left:right]

def get_mouth_in_image(image, points):
  """Crop the mouth region out of ``image``.

  Args:
    image: array indexed as ``image[row, column, ...]``.
    points: 2-D integer array of ``(x, y)`` landmarks; rows 48-67 are used as
      the mouth points and rows 27-35 as the upper limit (dlib 68-point
      layout).

  Returns:
    The slice of ``image`` covering the mouth points padded by 10 px. It never
    extends above the highest point of rows 27-35 and is clipped to the
    image. May be empty.
  """
  height, width = image.shape[:2]
  mouth_x, mouth_y = points[48:68, 0], points[48:68, 1]
  upper_limit_y = np.min(points[27:36, 1])

  # Every bound is clamped to >= 0: landmarks can land outside the frame after
  # rotation, and a negative slice index would wrap around to the opposite
  # edge of the image instead of clipping.
  left = max(0, np.min(mouth_x) - 10)
  top = max(0, upper_limit_y, np.min(mouth_y) - 10)
  right = max(0, min(width, np.max(mouth_x) + 10))
  bottom = max(0, min(height, np.max(mouth_y) + 10))
  return image[top:bottom, left:right]

def align_face(image):
  """Find a face, level its eyes by rotation, and crop eyes, nose and mouth.

  Args:
    image: frame converted with ``cv2.COLOR_BGR2GRAY``, so BGR channel order
      is expected.

  Returns:
    ``(True, eyes_image, nose_image, mouth_image)`` for the first detected
    face whose three crops are all non-empty, otherwise
    ``(False, None, None, None)``.
  """
  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  h, w = image.shape[:2]
  for face in detector(gray):
    shape = shape_to_normal(predictor(gray, face))
    _nose, left_eye, right_eye = get_eyes_nose_dlib(shape)

    # Angle of the line joining the eye centres; rotating the frame by it
    # about their midpoint makes that line horizontal.
    delta_x = right_eye[0] - left_eye[0]
    delta_y = right_eye[1] - left_eye[1]
    angle = math.degrees(math.atan2(delta_y, delta_x))
    center_of_eyes = ((left_eye[0] + right_eye[0]) // 2, (left_eye[1] + right_eye[1]) // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center_of_eyes, angle, scale=1.0)

    aligned_image = cv2.warpAffine(image, rotation_matrix, (w, h), flags=cv2.INTER_LINEAR)
    # The landmarks go through the same transform so they index the rotated frame.
    rotated_points = rotate_points(shape, rotation_matrix)
    eyes_image = get_eyes_in_image(aligned_image, rotated_points)
    nose_image = get_nose_in_image(aligned_image, rotated_points)
    mouth_image = get_mouth_in_image(aligned_image, rotated_points)

    # A face with any empty crop is unusable; try the next detection.
    if eyes_image.size == 0 or nose_image.size == 0 or mouth_image.size == 0:
      continue

    return True, eyes_image, nose_image, mouth_image

  # Explicit placeholders: `_` only exists as IPython's last-output variable
  # and raises NameError when this code runs outside a notebook.
  return False, None, None, None

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [5]:
warnings.filterwarnings("ignore", category=UserWarning, message=".*InconsistentVersionWarning.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*Secure coding is automatically enabled.*")

In [None]:
# ResNet-50 from NVIDIA's torch.hub entry point, pre-trained on ImageNet
resnet50 = torch.hub.load(
    'NVIDIA/DeepLearningExamples:torchhub',
    'nvidia_resnet50',
    pretrained=True,
)
# swap the fully connected layer for a pass-through so the network outputs
# whatever is fed into that layer
resnet50.fc = torch.nn.Identity()
# inference mode for layers that behave differently in training, then move to the device
resnet50.eval()
resnet50.to(device)

# preprocessing applied to every crop: fixed 224x224 size, tensor conversion,
# per-channel normalisation
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

## Feature extraction

In [7]:
def extract_and_save_features(image):
  """Return the network's output for one image as a flat NumPy vector.

  The image is preprocessed with ``transform``, given a leading batch
  dimension, and passed through ``resnet50`` without gradient tracking.
  Despite the name, nothing is saved here; the vector is only returned.
  """
  batch = transform(image).unsqueeze(0)
  batch = batch.to(device)
  with torch.no_grad():
    embedding = resnet50(batch)
  flat = embedding.cpu().numpy().flatten()
  return flat

## Ethnicity, age and gender recognition

In [None]:
def main():
  """Show live age / ethnicity / gender predictions for the default webcam.

  One pickled model is loaded per (category, facial feature) pair. For each
  captured frame in which ``align_face`` finds a usable face, the eye, nose
  and mouth crops are turned into feature vectors, every model predicts from
  the matching vector, and the labels are drawn onto the frame. Every frame is
  displayed, with or without a face.

  There is no automatic timeout: the loop ends when 'q' is pressed in the
  window or the camera stops delivering frames.
  """
  folder_path = "/Users/valentinaemili/Desktop/biometric_systems_project/trained_model"
  window_name = 'Video Frame with Predictions'
  category = ["age", "ethnicity", "gender"]
  features = ["eyes", "mouth", "nose"]
  # Human-readable label for each predicted class index, per category.
  labels = {
    "age": ["around 3 yrs", "around 9 yrs", "around 17 yrs", "around 28 yrs", "around 43 yrs",  "around 63 yrs", "around 83 yrs", "around 104 yrs"],
    "ethnicity": ["White", "Black", "Asian", "Indian", "Hispanic/Latino/Middle Eastern"],
    "gender": ["male", "female"],
  }

  # Load models for each characteristic and feature combination.
  # NOTE: joblib.load unpickles its input and can execute arbitrary code;
  # only point folder_path at trusted files.
  models = {
    (c, feature): joblib.load(os.path.join(folder_path, c, f"svm_model_{feature}.pkl"))
    for c in category
    for feature in features
  }

  cv2.setUseOptimized(True)
  # WINDOW_NORMAL is the window-creation flag; WND_PROP_FULLSCREEN, used before,
  # is a property id for setWindowProperty that happens to share the value 0.
  cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
  cap = cv2.VideoCapture(0)
  try:
    while True:
      ret, img = cap.read() # ret is bool
      if not ret:
        # Camera missing or stream ended: stop instead of spinning forever.
        print("No frame received from the camera; stopping.")
        break

      found, eyes_image, nose_image, mouth_image = align_face(img)
      if found:
        crops = {"eyes": eyes_image, "nose": nose_image, "mouth": mouth_image}
        # NOTE(review): the crops come from an OpenCV (BGR) frame but are handed
        # to PIL without a channel swap. Left unchanged because the saved
        # models may have been trained on features from this exact pipeline -
        # confirm against the training code.
        feature_vectors = {
          name: extract_and_save_features(Image.fromarray(crop))
          for name, crop in crops.items()
        }

        y_offset = 30
        for c in category:
          for feature in features:
            prediction = models[(c, feature)].predict([feature_vectors[feature]])
            # predict() returns a one-element array; index it explicitly,
            # since int() on an array is deprecated in NumPy.
            label = labels[c][int(prediction[0])]
            cv2.putText(img, f"{feature} ({c}): {label}", (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
            y_offset += 30

      # Shown and polled on every frame so the window stays live and 'q'
      # still works while no face is detected.
      cv2.imshow(window_name, img)
      if cv2.waitKey(10) & 0xFF == ord('q'):
        break
  finally:
    # Release the camera and close the window even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)
if __name__ == '__main__':
  main()