In [4]:
import mediapipe as mp
import cv2
import math
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from ultralytics import YOLO
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

def draw_box(image, box, color=(255, 0, 255), line_width=2):
    """Draw a rectangle on ``image`` in place.

    Args:
        image: Array-like image passed to ``cv2.rectangle``; its ``shape`` is
            only consulted when ``line_width`` is falsy.
        box: Indexable with at least four entries ``(x1, y1, x2, y2)``; each
            entry is converted with ``int()`` before drawing.
        color: Colour tuple forwarded to ``cv2.rectangle``.
        line_width: Outline thickness in pixels. Pass ``None`` (or ``0``) to
            derive the thickness from the image size instead.

    Returns:
        None. The rectangle is drawn directly onto ``image``.
    """
    # Adaptive fallback: 0.3% of half the summed image dimensions, at least 2.
    lw = line_width or max(round(sum(image.shape) / 2 * 0.003), 2)
    top_left = (int(box[0]), int(box[1]))
    bottom_right = (int(box[2]), int(box[3]))
    cv2.rectangle(image, top_left, bottom_right, color, thickness=lw, lineType=cv2.LINE_AA)

def norm_coordinates(normalized_x, normalized_y, image_width, image_height):
    """Scale fractional image coordinates to pixel indices.

    Each normalized value is multiplied by the matching image dimension and
    floored. The result is capped at the last valid index (``dimension - 1``);
    there is no lower cap, so negative inputs yield negative pixel values.

    Returns:
        Tuple ``(x_px, y_px)``.
    """
    col = math.floor(normalized_x * image_width)
    last_col = image_width - 1
    if col > last_col:
        col = last_col

    row = math.floor(normalized_y * image_height)
    last_row = image_height - 1
    if row > last_row:
        row = last_row

    return col, row

def get_box(fl, w, h, off_y):
    """Get the lip bounding box and the lip opening from face landmarks.

    Only the landmarks that are actually needed (the lip contour plus the two
    inner-lip points) are converted to pixels, and the conversion is done once.

    Args:
        fl: Landmark container exposing an indexable ``landmark`` sequence
            whose items have ``x`` and ``y`` attributes (normalized values).
        w: Width in pixels of the image the landmarks refer to.
        h: Height in pixels of the image the landmarks refer to.
        off_y: Vertical offset added to both inner-lip ``y`` values. It cancels
            out in the returned difference and does not affect the box.

    Returns:
        ``(box, diff)`` where ``box`` is ``(x_min, y_min, x_max, y_max)`` of
        the lip contour, clamped to ``[0, w - 1]`` / ``[0, h - 1]``, and
        ``diff`` is the lower-lip minus upper-lip ``y`` distance in pixels,
        never below 0.
    """
    lips = (61, 146, 91, 181, 84, 17, 314, 405, 321, 375,
            291, 409, 270, 269, 267, 0, 37, 39, 40, 185)
    upper_lip = 13
    lower_lip = 14

    def to_px(idx):
        # Pixel coordinates of a single landmark.
        landmark = fl.landmark[idx]
        return norm_coordinates(landmark.x, landmark.y, w, h)

    # One (20, 2) array of lip-contour pixels; columns are x and y.
    lip_px = np.asarray([to_px(idx) for idx in lips])
    x_min, y_min = lip_px.min(axis=0)
    x_max, y_max = lip_px.max(axis=0)

    upper_lip_y = off_y + to_px(upper_lip)[1]
    lower_lip_y = off_y + to_px(lower_lip)[1]
    diff = lower_lip_y - upper_lip_y

    box = (max(0, x_min), max(0, y_min), min(w - 1, x_max), min(h - 1, y_max))
    return box, 0 if diff < 0 else diff

class _PreprocessInput(torch.nn.Module):
    """Channel flip and per-channel constant subtraction for a (C, H, W) tensor.

    No custom constructor is needed: the module holds no state, so the
    inherited ``torch.nn.Module.__init__`` is sufficient.
    """

    def forward(self, x):
        x = x.to(torch.float32)
        # Reverse the channel order along dim 0; flip returns a new tensor,
        # so the in-place subtractions below do not touch the caller's data.
        x = torch.flip(x, dims=(0,))
        # NOTE(review): constants look like dataset channel means in the
        # flipped channel order — confirm against the model's training setup.
        x[0, :, :] -= 91.4953
        x[1, :, :] -= 103.8827
        x[2, :, :] -= 131.0912
        return x


def pth_processing(fp, device='cuda'):
    """Convert a PIL image into a model-ready batch tensor.

    The image is resized to 224x224 with nearest-neighbour resampling,
    converted to a tensor, passed through ``_PreprocessInput``, given a
    leading batch dimension and moved to ``device``.

    Args:
        fp: PIL image (must support ``resize``).
        device: Target device for the returned tensor. Defaults to ``'cuda'``,
            matching the previous hard-coded behaviour.

    Returns:
        Float tensor with a leading batch dimension of size 1 on ``device``.
    """
    to_model_input = transforms.Compose([
        transforms.PILToTensor(),
        _PreprocessInput(),
    ])
    img = fp.resize((224, 224), Image.Resampling.NEAREST)
    img = to_model_input(img)
    return torch.unsqueeze(img, 0).to(device)

def get_metadata(video_path, video_name, batch_size=70):
    """Extract per-frame face, mouth and emotion metadata from a video.

    Every frame is run through a YOLO face tracker. Each detected face crop
    (with padding) is passed to MediaPipe FaceMesh; when landmarks are found,
    the lip box and lip opening are computed and the crop is queued for the
    emotion model. A frame in which no face yields landmarks contributes one
    placeholder row (``None`` mouth values, all-zero input tensor). After the
    video is read, the queued crops are classified in batches.

    Relies on the module-level names ``mp_face_mesh`` and ``pth_model``.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        video_name: Value stored in the ``video_name`` column of every row.
        batch_size: Number of crops per emotion-model forward pass.

    Returns:
        ``pandas.DataFrame`` with one row per logged face (or placeholder).
        Note the column mapping: the tracker id (``-1`` when the tracker gave
        none) is stored under ``face_id`` and the detection's index within the
        frame under ``box_id``.
    """
    # NOTE(review): the detector is rebuilt on every call; presumably this
    # keeps the ``persist=True`` tracker state from leaking between videos —
    # confirm before hoisting it out of the function.
    model = YOLO('yolov11n-face.pt')

    DICT_EMO = {0: 'Neutral', 1: 'Happiness', 2: 'Sadness', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 6: 'Anger'}

    embeds = []      # metadata rows, aligned one-to-one with fr_all
    fr_all = []      # preprocessed crops (or zero placeholders)
    embeds_all = []  # metadata rows extended with probabilities and label

    cap = cv2.VideoCapture(video_path)
    try:
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        counter = 1  # 1-based frame number

        with mp_face_mesh.FaceMesh(
                static_image_mode=True,
                max_num_faces=1,
                refine_landmarks=True,
                min_detection_confidence=0.5,
            ) as face_mesh:

            while True:
                ret, im0 = cap.read()
                if not ret:
                    break

                detections = model.track(im0, persist=True, imgsz=640, conf=0.01, iou=0.5, augment=False, device='cuda', verbose=False)

                # Reset per frame; the placeholder row below reuses whatever
                # the last detection (if any) left in these variables.
                idx_face = idx_box = None
                startX = startY = endX = endY = None
                startX_mouth = startY_mouth = endX_mouth = endY_mouth = None
                diff = None

                for idx, det in enumerate(detections[0].boxes):
                    box = det.xyxy.int().cpu().tolist()[0]
                    # Compare against None explicitly instead of relying on
                    # the truthiness of the id tensor.
                    idx_box = det.id.int().cpu().tolist()[0] if det.id is not None else -1
                    idx_face = idx

                    startX, startY, endX, endY = box[0], box[1], box[2], box[3]

                    # NOTE(review): the padding is 10% of the box's top-left
                    # coordinate, not of the box size — confirm this is intended.
                    pad_x = int(startX * 0.1)
                    pad_y = int(startY * 0.1)
                    startX_new = max(0, startX - pad_x)
                    startY_new = max(0, startY - pad_y)
                    endX_new = min(w, endX + pad_x)
                    endY_new = min(h, endY + pad_y)

                    cur_fr = im0[startY_new: endY_new, startX_new: endX_new]
                    if cur_fr.shape[0] == 0 or cur_fr.shape[1] == 0:
                        continue

                    cur_fr = cv2.cvtColor(cur_fr, cv2.COLOR_BGR2RGB)
                    mesh = face_mesh.process(cur_fr)
                    if not mesh.multi_face_landmarks:
                        continue

                    face_landmarks = mesh.multi_face_landmarks[0]
                    box_lips, diff = get_box(face_landmarks, cur_fr.shape[1], cur_fr.shape[0], off_y=startY_new)
                    # Lip box is relative to the crop; shift by the crop origin
                    # to express it in full-frame coordinates.
                    startX_mouth = startX_new + box_lips[0]
                    startY_mouth = startY_new + box_lips[1]
                    endX_mouth = startX_new + box_lips[2]
                    endY_mouth = startY_new + box_lips[3]

                    fr_all.append(pth_processing(Image.fromarray(cur_fr)))
                    embeds.append([video_name, counter, idx_box, idx_face, startX, startY, endX, endY, startX_mouth, startY_mouth, endX_mouth, endY_mouth, diff])

                # diff stays None only when no face in this frame produced
                # landmarks; log a placeholder so the frame is still present.
                if diff is None:
                    embeds.append([video_name, counter, idx_box, idx_face, startX, startY, endX, endY, startX_mouth, startY_mouth, endX_mouth, endY_mouth, diff])
                    fr_all.append(torch.zeros((1, 3, 224, 224)).to('cuda'))
                counter += 1
    finally:
        # Always free the capture handle, even if processing fails midway.
        cap.release()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for start in range(0, len(fr_all), batch_size):
            end = start + batch_size
            batch = torch.cat(fr_all[start:end], dim=0)
            outputs = pth_model(batch)
            outputs = torch.nn.functional.softmax(outputs, dim=1).cpu().detach().numpy().tolist()
            # Pair each prediction with the metadata row of the SAME batch
            # slice (rows start..end), not with rows counted from 0.
            for row, probs in zip(embeds[start:end], outputs):
                label = DICT_EMO[int(np.argmax(probs))]
                embeds_all.append([*row, *probs, label])

    torch.cuda.empty_cache()

    df_segments = pd.DataFrame(embeds_all, columns=["video_name", "frame", "face_id", "box_id", "startX_face", "startY_face", "endX_face", "endY_face", "startX_mouth", "startY_mouth", "endX_mouth", "endY_mouth", "diff_lips", "neutral_prob", "happiness_prob", 'sadness_prob', 'surprise_prob', 'fear_prob', 'disgust_prob', 'anger_prob', 'pred_emotion'])

    return df_segments

In [None]:
# Driver cell: load the emotion model, then write one CSV of face metadata
# per dataset subset.
mp_face_mesh = mp.solutions.face_mesh

name = '0_66_37_wo_gl'
pth_model = torch.jit.load('torchscript_model_{0}.pth'.format(name)).to('cuda')
pth_model.eval()

# NOTE(review): this instance is not used by anything in this cell —
# get_metadata opens its own FaceMesh. Kept in case other cells rely on it;
# otherwise it can be removed.
face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=True,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5
)

corpus = "AFEW"
path = f"E:/Databases/9th_ABAW/{corpus}/Chunk"
subsets = ["dev", "train"]

combined_df = pd.DataFrame()

for subset in subsets:
    video_names = os.listdir(f'{path}/{subset}/')

    # Collect per-video frames for THIS subset only, so each CSV contains
    # just its own subset, and concatenate once instead of on every video.
    subset_dfs = []
    for video_name in tqdm(video_names):
        curr_path = f"{path}/{subset}/{video_name}"
        curr_df = get_metadata(curr_path, video_name)
        subset_dfs.append(curr_df)

    # pd.concat raises on an empty list; fall back to an empty frame.
    combined_df = pd.concat(subset_dfs, ignore_index=True) if subset_dfs else pd.DataFrame()

    combined_df.to_csv(os.path.join(path, f'{subset}_faces.csv'), index=False)

  2%|█▊                                                                                | 9/411 [00:18<11:58,  1.79s/it]