## Webcam

In [7]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import mediapipe as mp
import numpy as np
import torchvision
import cv2
import time
from PIL import Image
import os
from torchvision import datasets, models, transforms

labels = ['sleep', 'normal', 'yawn']

model_ft = models.mobilenet_v3_small(pretrained=True)
num_ftrs = model_ft.classifier[0].in_features
model_ft.classifier = nn.Sequential(
    nn.Dropout(0.3),
    nn.Linear(num_ftrs, num_ftrs),
    nn.BatchNorm1d(num_ftrs),
    nn.ReLU(),
    nn.Linear(num_ftrs, 2000),
    nn.BatchNorm1d(2000),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(2000, 1000),
    nn.BatchNorm1d(1000),
    nn.ReLU(),
    nn.Linear(1000, 3)
)

model_ft.load_state_dict(torch.load(r'best_model_params.pt', map_location=torch.device('cpu')))

model_ft.eval()
preprocess = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
def display_text(frame, text):
    # Add text to the frame
    cv2.putText(frame, text, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)


mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils
mp_eye_landmarks = mp.solutions.face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
eye_mouth = [2,206,212,211,171,400,431,432,426,6,119,143,156,52,55,168,295,276,340,357]
cv2.namedWindow('crop', cv2.WINDOW_NORMAL)
cv2.namedWindow('All', cv2.WINDOW_NORMAL)
A = ['normal']*16
video_path = r'video/dem.mp4'
cap = cv2.VideoCapture(video_path)
count_sleep=0
count_yawn=0
count_notice = 0
max_yawn = 2
current_state = 'normal'
pre_state = 'normal'
# cap = cv2.VideoCapture(0)
while True:
    ret, image = cap.read()  
    if not ret:
        break
    start = time.time()
    cv2.imshow("All",image)
    # Chuyển đổi ảnh sang không gian màu RGB
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Nhận diện landmarks của khuôn mặt trên ảnh
    results = mp_eye_landmarks.process(image_rgb)

    # Kiểm tra xem có landmarks nào được nhận diện không
    if results.multi_face_landmarks:
        # Lấy danh sách các landmarks của khuôn mặt đầu tiên trong ảnh
        face_landmarks = results.multi_face_landmarks[0]

        # Tạo danh sách tọa độ của các điểm LEFT_EYE
        eye_mouth_lanmarks = []
        eye_lanmarks = []
        min_x1 = 1000
        min_y1 = 1000
        max_x1 = 0
        max_y1 = 0
        for index in eye_mouth:
            landmark = face_landmarks.landmark[index]
            landmark_x = int(landmark.x * image.shape[1])
            landmark_y = int(landmark.y * image.shape[0])
            min_x1 = min(landmark_x,min_x1)
            min_y1 = min(landmark_y,min_y1)
            max_x1 = max(landmark_x,max_x1)
            max_y1 = max(landmark_y,max_y1)

            eye_mouth_lanmarks.append((landmark_x, landmark_y))
        # Tạo mask để làm nổi bật vùng khuôn mặt
        mask = np.zeros(image.shape[:2], dtype=np.uint8)
        cv2.fillPoly(mask, [np.array(eye_mouth_lanmarks[:9])], (255, 255, 255))
        cv2.fillPoly(mask, [np.array(eye_mouth_lanmarks[9:21])], (255, 255, 255))

        # Áp dụng mask để chỉ hiển thị vùng khuôn mặt
        result = cv2.bitwise_and(image, image, mask=mask)
        result = result[int(min_y1):int(max_y1), int(min_x1):int(max_x1)]
        input_image = Image.fromarray(result)

        input_tensor = preprocess(input_image)

        # Create a mini-batch as expected by the model
        input_batch = input_tensor.unsqueeze(0)

        # Move the input to CPU
        input_batch = input_batch.to('cpu')
        # # Make the prediction
        with torch.no_grad():
            output = model_ft(input_batch)
            _, preds = torch.max(abs(output), 1)
        cv2.imshow('crop', result)
        A.append(labels[preds])
        A.pop(0)


        if A.count('yawn') > 15:
            display_text(image,'Status: Yawn')
            current_state = 'yawn'
        elif A[3:16].count('sleep') >= 12:
            display_text(image,'Status: Sleep')
            current_state = 'sleep'
        else:
            display_text(image,'Status: Normal')
            current_state = 'normal'
            
        if pre_state != current_state and current_state =='yawn':
            count_yawn = count_yawn + 1

        if pre_state != current_state and current_state =='sleep':
            count_notice += 1

        if count_yawn > max_yawn:
            count_notice += 1
            count_yawn = 0

        pre_state = current_state

        end = time.time()
        totalTime = end - start
        fps = int(1.0 / totalTime)
        print(f"FPS:{fps}")
        
        # Hiển thị FPS trên khung hình
        cv2.putText(image, f"Yawn: {int(count_yawn)}", (10, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(image, f"Warning: {int(count_notice)}", (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(image, f"FPS: {int(fps)}", (10, 160), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.putText(image, f"Predict: {labels[preds]}", (10, 200), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.imshow('All', image)
        if cv2.waitKey(1) & 0xFF == ord('r'):
            count_sleep=0
            count_yawn=0
            count_notice = 0
    else:
        display_text(image,'No face')
        cv2.imshow('crop', cv2.resize(image,(224,224)))
        cv2.imshow('All', image)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


W0000 00:00:1720366792.330772   11625 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1720366792.343607   11625 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


FPS:21
FPS:28
FPS:24
FPS:23
FPS:29
FPS:27
FPS:30
FPS:28
FPS:28
FPS:27
FPS:20
FPS:25
FPS:28
FPS:27
FPS:26
FPS:30
FPS:31
FPS:31
FPS:27
FPS:26
FPS:24
FPS:22
FPS:23
FPS:27
FPS:28
FPS:25
FPS:33
FPS:29
FPS:36
FPS:33
FPS:25
FPS:21
FPS:24
FPS:23
FPS:24
FPS:31
FPS:32
FPS:30
FPS:32
FPS:36
FPS:32
FPS:32
FPS:31
FPS:35
FPS:37
FPS:33
FPS:27
FPS:24
FPS:27
FPS:27
FPS:22
FPS:27
FPS:28
FPS:35
FPS:30
FPS:29
FPS:33
FPS:34
FPS:31
FPS:30
FPS:28
FPS:28
FPS:29
FPS:30
FPS:36
FPS:30
FPS:34
FPS:34
FPS:30
FPS:28
FPS:26
FPS:29
FPS:28
FPS:28
FPS:29
FPS:27
FPS:32
FPS:26
FPS:32
FPS:31
FPS:28
FPS:30
FPS:34
FPS:33
FPS:31
FPS:36
FPS:31
FPS:39
FPS:31
FPS:34
FPS:31
FPS:30
FPS:32
FPS:34
FPS:34
FPS:36
FPS:30
FPS:33
FPS:34
FPS:34
FPS:26
FPS:39
FPS:35
FPS:28
FPS:30
FPS:30
FPS:28
FPS:31
FPS:34
FPS:29
FPS:29
FPS:30
FPS:34
FPS:31
FPS:30
FPS:32
FPS:36
FPS:32
FPS:32
FPS:31
FPS:33
FPS:31
FPS:31
FPS:34
FPS:34
FPS:25
FPS:25
FPS:35
FPS:30
FPS:33
FPS:24
FPS:37
FPS:32
FPS:37
FPS:40
FPS:35
FPS:31
FPS:33
FPS:34
FPS:37
FPS:35
FPS:36
FPS:30

ZeroDivisionError: division by zero

In [51]:
from PIL import Image

start_time = time.time()
# Load the image
input_image = Image.open("crop1/frame_0000.jpg")

# Define transformations
preprocess = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Preprocess the image
input_tensor = preprocess(input_image)

# Create a mini-batch as expected by the model
input_batch = input_tensor.unsqueeze(0)

# Move the input to CPU
input_batch = input_batch.to('cpu')
# Make the prediction
with torch.no_grad():
    output = model_ft(input_batch)
    _, preds = torch.max(abs(output), 1)

end_time = time.time()
fps = int(1/(end_time-start_time))
print(labels[preds])
print("FPS:",fps)

yawn
FPS: 30
