# BlinkLinMulT: Transformer-based Eye Blink Detection

In [None]:
!pip install blinklinmult torchvision pillow
!pip install mediapipe

Sử dụng MediaPipe để lấy các điểm landmark của mắt, cắt vùng ảnh của từng mắt dựa trên các điểm này, rồi đưa ảnh mắt đã cắt vào model để inference.

In [None]:
# infer_from_face_with_mediapipe.py
import mediapipe as mp
import cv2
from PIL import Image
import torch
from torchvision import transforms
from blinklinmult.models import DenseNet121
import time

In [None]:
# Path of the face image that is read with cv2.imread and analysed below.
# NOTE(review): the path starts with /content/drive/MyDrive, so it presumably
# requires Google Drive to be mounted in the runtime — confirm before running.
IMAGE_PATH = "/content/drive/MyDrive/TÀI LIỆU MÔN HỌC/Xử lý ảnh và ứng dụng/DoAn/test_image/closed_eye_face.jpg"

In [None]:
# Inference device: CUDA when torch reports it as available, CPU otherwise.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Face-mesh landmark indices whose bounding box delimits each eye region
# (16 points per eye).
# NOTE(review): MediaPipe's own FACEMESH_RIGHT_EYE appears to use 33, 7, 163, ...
# so "left"/"right" here presumably refer to the side of the image rather than
# the subject's own left/right — confirm against the MediaPipe connection sets.
LEFT_EYE_IDX = [
    33, 7, 163, 144,
    145, 153, 154, 155,
    133, 173, 157, 158,
    159, 160, 161, 246,
]
RIGHT_EYE_IDX = [
    263, 249, 390, 373,
    374, 380, 381, 382,
    362, 398, 384, 385,
    386, 387, 388, 466,
]

# Preprocessing pipeline applied to every eye crop before it is given to the
# model: resize to 64x64, convert to a tensor, then normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics —
# confirm they match what the pretrained weights expect.
preprocess = transforms.Compose(
    [
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

def crop_eye(img_rgb, landmarks, idxs, w, h, padding=5):
    """Crop the padded bounding box of a group of landmarks out of an image.

    Args:
        img_rgb: Image array sliced as ``[y_min:y_max, x_min:x_max]``
            (rows first, then columns).
        landmarks: Object exposing a ``landmark`` sequence whose items have
            ``x`` and ``y`` attributes. They are multiplied by ``w`` and ``h``,
            i.e. treated as coordinates normalised to the image size.
        idxs: Indices into ``landmarks.landmark`` that outline the region.
        w: Image width in pixels, used to scale ``x``.
        h: Image height in pixels, used to scale ``y``.
        padding: Margin in pixels added on every side of the bounding box.

    Returns:
        The cropped region as a PIL image converted to ``"RGB"``.

    Raises:
        ValueError: If the bounding box, once clamped to the image, is empty
            (for example when all selected landmarks lie outside the image).
    """
    points = [landmarks.landmark[i] for i in idxs]
    xs = [p.x for p in points]
    ys = [p.y for p in points]

    # Pad first, clamp afterwards. Clamping before subtracting the padding can
    # leave a negative start index, which array slicing interprets as "count
    # from the end" and which yields an empty or wrong crop near the borders.
    x_min = max(0, int(min(xs) * w - padding))
    x_max = min(w, int(max(xs) * w + padding))
    y_min = max(0, int(min(ys) * h - padding))
    y_max = min(h, int(max(ys) * h + padding))

    if x_max <= x_min or y_max <= y_min:
        raise ValueError(
            f"Empty eye crop: x=[{x_min}, {x_max}), y=[{y_min}, {y_max}) "
            f"for image size {w}x{h}"
        )

    crop = img_rgb[y_min:y_max, x_min:x_max]
    return Image.fromarray(crop).convert("RGB")

# Read the face image from disk; fail early when it cannot be loaded.
img_bgr = cv2.imread(IMAGE_PATH)
if img_bgr is None:
    raise FileNotFoundError(IMAGE_PATH)

# Convert from BGR to RGB channel order and record the image size in pixels.
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
h, w = img_bgr.shape[:2]

mp_face = mp.solutions.face_mesh

# Run face-mesh detection on the single still image and keep the first face.
with mp_face.FaceMesh(static_image_mode=True, max_num_faces=1) as fm:
    result = fm.process(img_rgb)
    if not result.multi_face_landmarks:
        raise RuntimeError("Không phát hiện khuôn mặt")
    lm = result.multi_face_landmarks[0]

# --- crop each eye ---
left_eye_img, right_eye_img = (
    crop_eye(img_rgb, lm, eye_idxs, w, h)
    for eye_idxs in (LEFT_EYE_IDX, RIGHT_EYE_IDX)
)

# --- preprocessing: add a leading batch dimension and move to DEVICE ---
x_left, x_right = (
    preprocess(eye_img).unsqueeze(0).to(DEVICE)
    for eye_img in (left_eye_img, right_eye_img)
)


In [None]:
# Build the DenseNet121 model with the "densenet121-union" weights, move it to
# DEVICE and switch it to evaluation mode.
model = DenseNet121(output_dim=1, weights="densenet121-union").to(DEVICE)
model.eval()

# --- inference + timing ---
# Each timed span covers the forward pass, the sigmoid and the copy of the
# resulting scalar back to the host (.cpu().item()).
with torch.no_grad():
    # LEFT
    start_left = time.perf_counter()
    prob_left = torch.sigmoid(model(x_left)).cpu().item()
    end_left = time.perf_counter()

    # RIGHT
    start_right = time.perf_counter()
    prob_right = torch.sigmoid(model(x_right)).cpu().item()
    end_right = time.perf_counter()

# time.perf_counter() returns seconds; convert to milliseconds so the *_ms
# variables and the printed unit actually agree.
time_left_ms = (end_left - start_left) * 1000.0
time_right_ms = (end_right - start_right) * 1000.0

print("Left eye probability (closed): ", prob_left)
print(f"Inference time (left eye):  {time_left_ms:.3f} ms")

print("Right eye probability (closed):", prob_right)
print(f"Inference time (right eye): {time_right_ms:.3f} ms")

Left eye probability (closed):  0.998917818069458
Inference time (left eye):  0.031545 ms
Right eye probability (closed): 0.9990187883377075
Inference time (right eye): 0.030 ms
