In [1]:
import os
from PIL import Image
from torchvision import transforms
import torch
import cv2
import torch
from torch import nn
from torch.nn import functional as F
import math
from PIL import Image
import numpy as np
import dlib

# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. The chosen device is echoed so the cell output records it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

class CNN(nn.Module):
    """Six-convolution image classifier with a two-layer fully connected head.

    The network takes 3-channel images, runs them through three blocks of two
    3x3 convolutions (each followed by batch normalization and ReLU), applies
    four 2x2 max-pools along the way, and finishes with a 512-unit hidden
    layer and a linear output layer that returns raw scores (no softmax).

    Args:
        hid_1 ... hid_6: Output channel counts of the six convolutions.
        x, y: Spatial size of the final feature map. Only the product matters:
            the first linear layer expects ``hid_6 * x * y`` inputs.
        drop_1: Dropout probability for the channel-wise dropout applied to
            the last feature map.
        drop_2: Dropout probability applied after the hidden linear layer.
        out: Number of output scores.
    """

    def __init__(self, hid_1, hid_2, hid_3, hid_4, hid_5, hid_6, x, y, drop_1, drop_2, out):
        super().__init__()

        # Every convolution is 3x3 with stride 1 and padding 1, so it keeps
        # the spatial size; only the pooling layers shrink the feature map.
        conv_args = dict(kernel_size=3, stride=1, padding=1)

        # Block 1
        self.conv1 = nn.Conv2d(3, hid_1, **conv_args)
        self.bn1 = nn.BatchNorm2d(hid_1)
        self.conv2 = nn.Conv2d(hid_1, hid_2, **conv_args)
        self.bn2 = nn.BatchNorm2d(hid_2)

        # Block 2
        self.conv3 = nn.Conv2d(hid_2, hid_3, **conv_args)
        self.bn3 = nn.BatchNorm2d(hid_3)
        self.conv4 = nn.Conv2d(hid_3, hid_4, **conv_args)
        self.bn4 = nn.BatchNorm2d(hid_4)

        # Block 3
        self.conv5 = nn.Conv2d(hid_4, hid_5, **conv_args)
        self.bn5 = nn.BatchNorm2d(hid_5)
        self.conv6 = nn.Conv2d(hid_5, hid_6, **conv_args)
        self.bn6 = nn.BatchNorm2d(hid_6)

        # Classification head
        flat_features = hid_6 * x * y
        self.fc1 = nn.Linear(flat_features, 512)
        self.bn7 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, out)

        # Parameter-free layers shared across the forward pass
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout2d = nn.Dropout2d(drop_1)
        self.dropout = nn.Dropout(drop_2)

    def forward(self, x):
        """Return the raw class scores for a batch of images."""
        # Block 1: each convolution is followed by a 2x2 max-pool.
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))

        # Block 2: pool after the first convolution only.
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = F.relu(self.bn4(self.conv4(x)))

        # Block 3: pool after the second convolution only.
        x = F.relu(self.bn5(self.conv5(x)))
        x = self.pool(F.relu(self.bn6(self.conv6(x))))

        # Channel-wise dropout, then collapse everything but the batch axis.
        x = torch.flatten(self.dropout2d(x), 1)

        # Hidden layer with dropout, followed by the output layer.
        x = self.dropout(F.relu(self.bn7(self.fc1(x))))
        return self.fc2(x)

cpu


In [2]:

# Location of dlib's 68-point landmark model file. Set the SHAPE_PREDICTOR_PATH
# environment variable to run the notebook on another machine; when it is not
# set, the original local path is used. The default is written with a single
# leading slash (a leading "//" is implementation-defined on POSIX systems).
shape_predictor_path = os.environ.get(
    "SHAPE_PREDICTOR_PATH",
    "/Users/valentinaemili/Desktop/biometric_systems_project/shape_predictor_68_face_landmarks.dat",
)
detector = dlib.get_frontal_face_detector() # HOG-based detector
# Landmark predictor loaded from the model file above; called as predictor(gray_image, face).
predictor = dlib.shape_predictor(shape_predictor_path)

def shape_to_normal(shape):
  """Convert a landmark container into a plain Python list.

  Args:
    shape: object exposing ``num_parts`` and ``part(i)``, where each part has
      ``x`` and ``y`` attributes.

  Returns:
    A list with one ``(index, (x, y))`` tuple per part, in index order.
  """
  parts = (shape.part(idx) for idx in range(shape.num_parts))
  return [(idx, (part.x, part.y)) for idx, part in enumerate(parts)]

def get_eyes_nose_dlib(shape):
  """Extract the nose point and the centre of each eye from a landmark list.

  Args:
    shape: sequence of ``(index, (x, y))`` entries as produced by
      ``shape_to_normal``; entries 30, 36, 39, 42 and 45 are read.

  Returns:
    ``(nose, left_eye, right_eye)`` where ``nose`` is the coordinate pair of
    entry 30 and each eye is the floor-divided midpoint of its two corner
    entries (36/39 for the left eye, 42/45 for the right eye).
  """
  def corner_midpoint(first, second):
    # Midpoint of two landmarks, using floor division on each axis.
    (ax, ay), (bx, by) = shape[first][1], shape[second][1]
    return ((ax + bx) // 2, (ay + by) // 2)

  return shape[30][1], corner_midpoint(36, 39), corner_midpoint(42, 45)

def rotate_points(shape, rotation_matrix):
  """Apply an affine matrix to the first 68 landmarks.

  Args:
    shape: sequence of ``(index, (x, y))`` entries; entries 0-67 are used.
    rotation_matrix: affine matrix multiplied against homogeneous
      ``(x, y, 1)`` columns (a 2x3 matrix in this file's usage).

  Returns:
    A NumPy array with one ``[x, y]`` row per landmark; each coordinate is
    converted with ``int()``, i.e. truncated toward zero.
  """
  coords = (shape[idx][1] for idx in range(68))
  # Append a constant 1 so the translation column of the matrix is applied.
  homogeneous = np.array([(px, py, 1) for px, py in coords])
  transformed = np.dot(rotation_matrix, homogeneous.T).T
  return np.array([(int(row[0]), int(row[1])) for row in transformed])

def get_eyes_in_image(image, points):
  """Crop the eye region out of an aligned image.

  The crop spans landmarks 36-47 with a 10-pixel margin. Vertically it is
  further limited to lie between the highest point of landmarks 17-26 and
  landmark 29 (eyebrows and nose bridge in dlib's 68-point layout).

  Args:
    image: array indexed as ``image[row, column, ...]``.
    points: integer array of shape (68, 2) holding ``(x, y)`` per landmark.

  Returns:
    The cropped view of ``image``. It can be empty when the region lies
    entirely outside the image.
  """
  height, width = image.shape[:2]
  brow_top_y = np.min(points[17:27, 1])
  nose_bridge_y = points[29][1]
  eye_xs, eye_ys = points[36:48, 0], points[36:48, 1]

  # Lower bounds are clamped to 0: landmarks can fall outside the frame, and a
  # negative slice start would be read by NumPy as an offset from the end,
  # producing an empty or wrong crop.
  min_x = max(0, np.min(eye_xs) - 10)
  min_y = max(0, brow_top_y, np.min(eye_ys) - 10)
  max_x = min(width, np.max(eye_xs) + 10)
  max_y = min(height, nose_bridge_y, np.max(eye_ys) + 10)
  return image[min_y:max_y, min_x:max_x]

def get_nose_in_image(image, points):
  """Crop the nose region out of an aligned image.

  The crop spans landmarks 27-35 with a 5-pixel margin. Vertically it is
  further limited to lie between the highest point of landmarks 36-47 and the
  lowest point of landmarks 48-67 (eyes and mouth in dlib's 68-point layout).

  Args:
    image: array indexed as ``image[row, column, ...]``.
    points: integer array of shape (68, 2) holding ``(x, y)`` per landmark.

  Returns:
    The cropped view of ``image``. It can be empty when the region lies
    entirely outside the image.
  """
  height, width = image.shape[:2]
  eye_top_y = np.min(points[36:48, 1])
  mouth_bottom_y = np.max(points[48:68, 1])
  nose_xs, nose_ys = points[27:36, 0], points[27:36, 1]

  # Lower bounds are clamped to 0: landmarks can fall outside the frame, and a
  # negative slice start would be read by NumPy as an offset from the end,
  # producing an empty or wrong crop.
  min_x = max(0, np.min(nose_xs) - 5)
  min_y = max(0, eye_top_y, np.min(nose_ys) - 5)
  max_x = min(width, np.max(nose_xs) + 5)
  max_y = min(height, mouth_bottom_y, np.max(nose_ys) + 5)
  return image[min_y:max_y, min_x:max_x]

def get_mouth_in_image(image, points):
  """Crop the mouth region out of an aligned image.

  The crop spans landmarks 48-67 with a 10-pixel margin. Its top edge is not
  allowed to rise above the highest point of landmarks 27-35 (the nose in
  dlib's 68-point layout).

  Args:
    image: array indexed as ``image[row, column, ...]``.
    points: integer array of shape (68, 2) holding ``(x, y)`` per landmark.

  Returns:
    The cropped view of ``image``. It can be empty when the region lies
    entirely outside the image.
  """
  height, width = image.shape[:2]
  nose_top_y = np.min(points[27:36, 1])
  mouth_xs, mouth_ys = points[48:68, 0], points[48:68, 1]

  # Lower bounds are clamped to 0: landmarks can fall outside the frame, and a
  # negative slice start would be read by NumPy as an offset from the end,
  # producing an empty or wrong crop.
  min_x = max(0, np.min(mouth_xs) - 10)
  min_y = max(0, nose_top_y, np.min(mouth_ys) - 10)
  max_x = min(width, np.max(mouth_xs) + 10)
  max_y = min(height, np.max(mouth_ys) + 10)
  return image[min_y:max_y, min_x:max_x]

def align_face(image):
  """Detect a face, level its eyes by rotation, and crop eyes, nose and mouth.

  For each detected face the landmarks are predicted, the image is rotated
  around the midpoint between the eyes by the angle of the line joining them,
  the landmarks are transformed with the same matrix, and the three regions
  are cropped from the rotated image. The first face that yields three
  non-empty crops is used.

  Args:
    image: 3-channel image array.

  Returns:
    ``(eyes_image, nose_image, mouth_image)`` as arrays, or
    ``(False, False, False)`` when no face produces usable crops.
  """
  # NOTE(review): this conversion assumes BGR channel order, but main() in this
  # file passes a frame already converted to RGB, so the red and blue weights
  # are swapped. Confirm which order was used when the models were trained
  # before changing it.
  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  h, w = image.shape[:2]

  for face in detector(gray):
    shape = shape_to_normal(predictor(gray, face))
    _nose, left_eye, right_eye = get_eyes_nose_dlib(shape)

    # Angle (in degrees) of the line from the left eye to the right eye.
    delta_x = right_eye[0] - left_eye[0]
    delta_y = right_eye[1] - left_eye[1]
    angle = math.degrees(math.atan2(delta_y, delta_x))

    center_of_eyes = ((left_eye[0] + right_eye[0]) // 2, (left_eye[1] + right_eye[1]) // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center_of_eyes, angle, scale=1.0)
    aligned_image = cv2.warpAffine(image, rotation_matrix, (w, h), flags=cv2.INTER_LINEAR)
    rotated_points = rotate_points(shape, rotation_matrix)

    crops = (
      get_eyes_in_image(aligned_image, rotated_points),
      get_nose_in_image(aligned_image, rotated_points),
      get_mouth_in_image(aligned_image, rotated_points),
    )
    # A zero-size crop (landmarks outside the frame) has no pixels to resize,
    # so treat this face as unusable and try the next detection.
    if all(crop.size > 0 for crop in crops):
      return crops

  return (False, False, False)

In [3]:
# Preprocessing pipelines, one per facial region. Each pipeline resizes the
# crop to a fixed (height, width), converts it to a tensor and normalizes the
# three channels with region-specific mean/std values.
# NOTE(review): the statistics are presumably computed on the training crops;
# confirm against the training notebook.
def _make_transform(size, mean, std):
    return transforms.Compose([
        transforms.Resize(size),
        transforms.ToTensor(),
        transforms.Normalize(mean = mean, std = std)
    ])

# Eyes: 44 high, 266 wide
transform_eyes = _make_transform(
    (44, 266),
    [0.56551169, 0.4201522,  0.35519068],
    [0.22746577, 0.20109357, 0.19388984],
)

# Nose: 170 high, 108 wide
transform_nose = _make_transform(
    (170, 108),
    [0.70585869, 0.5230923,  0.44163547],
    [0.19556989, 0.19164655, 0.19488743],
)

# Mouth: 89 high, 193 wide
transform_mouth = _make_transform(
    (89, 193),
    [0.64867712, 0.4592991,  0.39606173],
    [0.2005014,  0.18597975, 0.18335514],
)

# Width (x) and height (y) of the final feature map for each region: the
# resized input above after the CNN's four 2x2 max-pools, i.e. each dimension
# floor-divided by 16.
e_x, e_y = 16, 2
n_x, n_y = 6, 10
m_x, m_y = 12, 5

# Channel counts of the six convolutions and the two dropout probabilities.
hid_1 = hid_2 = 64
hid_3 = hid_4 = hid_5 = hid_6 = 128
drop_1, drop_2 = 0.2, 0.5

# Number of classes per predicted characteristic.
out_age = 8
out_gender = 2
out_ethnicity = 5

def _make_cnn(x, y, out):
    # All nine networks share the same hyper-parameters; only the feature-map
    # size and the number of outputs differ.
    return CNN(hid_1, hid_2, hid_3, hid_4, hid_5, hid_6, x, y, drop_1, drop_2, out)

# One network per (region, characteristic) pair.
eyes_gender = _make_cnn(e_x, e_y, out_gender)
eyes_age = _make_cnn(e_x, e_y, out_age)
eyes_ethnicity = _make_cnn(e_x, e_y, out_ethnicity)
nose_gender = _make_cnn(n_x, n_y, out_gender)
nose_age = _make_cnn(n_x, n_y, out_age)
nose_ethnicity = _make_cnn(n_x, n_y, out_ethnicity)
mouth_gender = _make_cnn(m_x, m_y, out_gender)
mouth_age = _make_cnn(m_x, m_y, out_age)
mouth_ethnicity = _make_cnn(m_x, m_y, out_ethnicity)

In [4]:
def main():
  """Run live age/gender/ethnicity prediction on frames from the default camera.

  Loads the nine trained networks (one per characteristic and facial region),
  then reads frames from camera 0. For every frame in which a face is found,
  each network classifies its region and the nine predictions are drawn on the
  frame. Every frame is displayed, with or without predictions, so the window
  stays responsive; press ``q`` to quit. The loop also stops when the camera
  no longer delivers frames. The camera and the windows are always released.

  The directory holding the weights can be overridden with the
  TRAINED_MODEL_DIR environment variable; by default the original local path
  is used.
  """
  folder_path = os.environ.get(
    "TRAINED_MODEL_DIR",
    "/Users/valentinaemili/Desktop/biometric_systems_project/trained_model",
  )
  characteristics = ["age","gender","ethnicity"]
  features = ["eyes", "mouth", "nose"]
  # Class-index -> label tables, keyed by characteristic.
  labels = {
    "age": ["around 3 yrs", "around 9 yrs", "around 17 yrs", "around 28 yrs", "around 43 yrs", "around 63 yrs", "around 83 yrs", "around 104 yrs"],
    "ethnicity": ["White", "Black", "Asian", "Indian", "Hispanic/Latino/Middle Eastern"],
    "gender": ["male", "female"],
  }
  networks = {
    ("age", "eyes"): eyes_age,
    ("gender", "eyes"): eyes_gender,
    ("ethnicity", "eyes"): eyes_ethnicity,
    ("age", "mouth"): mouth_age,
    ("gender", "mouth"): mouth_gender,
    ("ethnicity", "mouth"): mouth_ethnicity,
    ("age", "nose"): nose_age,
    ("gender", "nose"): nose_gender,
    ("ethnicity", "nose"): nose_ethnicity,
  }
  region_transforms = {"eyes": transform_eyes, "mouth": transform_mouth, "nose": transform_nose}

  models = {}
  for c in characteristics:
    for feature in features:
      feature_path = os.path.join(folder_path, c, f"best_{feature}_{c}.pth")
      model = networks[(c, feature)]
      # NOTE(review): torch.load unpickles the file; only load checkpoints from
      # a trusted source (or pass weights_only=True where the installed
      # PyTorch version supports it).
      model.load_state_dict(torch.load(feature_path, map_location=torch.device('cpu')))
      # Inference mode is set once here instead of on every frame.
      model.eval()
      models[(c, feature)] = model

  cap = cv2.VideoCapture(0)
  try:
    while True:
      ret, img = cap.read() # ret is bool
      if not ret:
        # No frame available (camera missing or stream ended): stop instead
        # of spinning forever.
        print("Could not read a frame from the camera; stopping.")
        break

      img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
      eyes, nose, mouth = align_face(img_rgb)
      if eyes is not False:
        crops = {"eyes": eyes, "mouth": mouth, "nose": nose}
        # Preprocess each region once and add the batch dimension.
        inputs = {
          name: region_transforms[name](Image.fromarray(crop)).unsqueeze(0)
          for name, crop in crops.items()
        }

        prediction_lines = []
        # Gradients are not needed for inference.
        with torch.no_grad():
          for c in characteristics:
            for feature in features:
              y_pred = models[(c, feature)](inputs[feature])
              predicted_class = torch.argmax(y_pred, dim=1).item()  # Index of the highest score
              prediction_lines.append(f"{feature} ({c}): {labels[c][predicted_class]}")

        y_offset = 30  # y-coordinate of the first text line
        for line in prediction_lines:
          cv2.putText(img, line, (10, y_offset), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
          y_offset += 30  # Move down to draw the next line

      # Show every frame and poll the keyboard even when no face was found, so
      # the window keeps updating and 'q' always works.
      cv2.imshow('Video Frame with Predictions', img)
      if cv2.waitKey(3) & 0xFF == ord('q'):
        break
  finally:
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)
if __name__ == '__main__':
  main()