In [None]:
!pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->mediapipe)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->mediapipe)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->mediapipe)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->mediapipe)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1

In [None]:
# Quietly download the float16 face landmarker model and store it under the
# file name that the detector setup cell passes as model_asset_path.
!wget -O face_landmarker_v2_with_blendshapes.task -q https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task

In [None]:
import cv2
import mediapipe as mp
import numpy as np
from google.colab.patches import cv2_imshow
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# Indices into the per-face landmark list returned by the detector.
# Each list picks the points that gaze_detection passes to landmarks_detection.
# NOTE(review): whether "left"/"right" is from the subject's or the viewer's
# side is not visible here - confirm against the model documentation.

# Eye-region points: their bounding box is what count_displacement measures against.
RIGHT_EYE = [
    33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157,
    158, 159, 160, 161, 246, 30, 29, 28, 27, 56,
]
LEFT_EYE = [
    362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388,
    387, 386, 385, 384, 398, 286, 258, 257, 259, 260,
]

# Iris points: count_displacement only reads the first entry of each list.
RIGHT_IRIS = [468, 470, 469, 472, 471]
LEFT_IRIS = [473, 475, 474, 477, 476]

In [None]:
# Build the face landmarker from the model file downloaded earlier.
base_options = python.BaseOptions(
    model_asset_path='face_landmarker_v2_with_blendshapes.task'
)

# Track a single face; blendshapes and transformation matrices are requested
# in addition to the landmarks.
options = vision.FaceLandmarkerOptions(
    base_options=base_options,
    num_faces=1,
    output_face_blendshapes=True,
    output_facial_transformation_matrixes=True,
)

# Shared detector instance used by gaze_detection.
detector = vision.FaceLandmarker.create_from_options(options)

In [None]:
# Open a capture on the clip; the following cells read frame size and fps from it.
# NOTE(review): a later cell rebinds `cap` to a capture of '21.mp4' and no
# release() of this capture is visible - confirm this cell is still needed.
video_file = "22.MOV"
cap = cv2.VideoCapture(video_file)
# Horizontal dead zone: process_gaze reports "center" unless |x| exceeds this value.
threshold = 0.1

In [None]:
# Read the capture's geometry and frame rate through the named OpenCV
# property identifiers (they carry the values 3, 4 and 5 respectively).
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))  # fractional rates are truncated here

In [None]:
# Open an mp4v-encoded writer for 'output_video.mp4' sized to the capture read above.
# NOTE(review): a later cell assigns a new VideoWriter for the same file to `out`
# and no write()/release() on this one is visible - consider removing this cell.
out = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))

In [None]:
def landmarks_detection(img_width, img_height, face_landmarks, ind):
    """Scale selected landmarks to integer pixel coordinates.

    Args:
        img_width: Factor applied to each landmark's ``x`` attribute.
        img_height: Factor applied to each landmark's ``y`` attribute.
        face_landmarks: Indexable collection whose items expose ``x`` and ``y``.
        ind: Iterable of indices into ``face_landmarks``.

    Returns:
        A list of ``(x, y)`` integer tuples in the order given by ``ind``;
        the scaled values are truncated with ``int``.
    """
    pixel_points = []
    for landmark_index in ind:
        landmark = face_landmarks[landmark_index]
        pixel_x = int(landmark.x * img_width)
        pixel_y = int(landmark.y * img_height)
        pixel_points.append((pixel_x, pixel_y))
    return pixel_points

In [None]:
def count_displacement(eye_coords, iris_coords):
    """Express the iris position relative to the eye's bounding box.

    The bounding box is taken over all points in ``eye_coords``. Only the
    first point of ``iris_coords`` is used as the iris position.

    Args:
        eye_coords: Sequence of ``(x, y)`` points outlining the eye.
        iris_coords: Sequence of ``(x, y)`` points; element 0 is read.

    Returns:
        ``(percent_x, percent_y)`` where -1 corresponds to the minimum edge
        of the box, 0 to its middle and +1 to the maximum edge on each axis.

    Raises:
        ZeroDivisionError: If the bounding box has zero width or zero height.
    """
    xs = [point[0] for point in eye_coords]
    ys = [point[1] for point in eye_coords]
    lo_x, hi_x = min(xs), max(xs)
    lo_y, hi_y = min(ys), max(ys)

    box_w = hi_x - lo_x
    box_h = hi_y - lo_y

    iris_point = iris_coords[0]
    iris_x, iris_y = iris_point[0], iris_point[1]

    # Equivalent to 2 * (iris - lo) / size - 1, kept in the integer-numerator form.
    percent_x = (2 * iris_x - box_w - 2 * lo_x) / box_w
    percent_y = (2 * iris_y - box_h - 2 * lo_y) / box_h
    return percent_x, percent_y

In [None]:
def process_gaze(right_x, right_y, left_x, left_y, x_threshold=None,
                 down_threshold=0.45, up_threshold=0.2):
    """Turn per-eye iris displacements into a textual gaze label.

    The two eyes are averaged on each axis, then the vertical and horizontal
    components are classified independently and concatenated.

    Args:
        right_x, right_y: Displacement of the right iris (see count_displacement).
        left_x, left_y: Displacement of the left iris.
        x_threshold: Horizontal dead zone; the label is "center" unless the
            averaged |x| exceeds it. When None, the notebook-level
            ``threshold`` variable is used, so existing calls are unaffected.
        down_threshold: Averaged y above this value is labelled "down".
        up_threshold: Averaged y below this value is labelled "up".

    Returns:
        One of "left", "right", "center", optionally prefixed with
        "up " or "down " (for example "up center").
    """
    if x_threshold is None:
        # Resolved at call time so the function still follows the notebook setting.
        x_threshold = threshold

    x = (right_x + left_x) / 2
    y = (right_y + left_y) / 2

    if y > down_threshold:
        result = "down "
    elif y < up_threshold:
        result = "up "
    else:
        result = ""

    if abs(x) > x_threshold and x > 0:
        result += "right"
    elif abs(x) > x_threshold and x < 0:
        result += "left"
    else:
        result += "center"
    return result

In [None]:
def get_all_frames(video_path):
    """Read every frame of a video into memory as RGB images.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.

    Returns:
        List of frames converted from BGR to RGB, in reading order. The list
        is empty when no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # read() reports failure at end of stream or when nothing can be read.
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even if reading or colour conversion raises.
        cap.release()
    return frames

In [None]:
def calculate_emotion_percentage(emotion_list):
    """Compute how often each label occurs, as a percentage of all entries.

    Args:
        emotion_list: Sized iterable of hashable labels.

    Returns:
        Dict mapping each distinct label to its share of ``emotion_list`` in
        percent, keyed in order of first appearance. Empty input yields ``{}``.
    """
    total = len(emotion_list)

    occurrences = {}
    for label in emotion_list:
        occurrences[label] = occurrences.get(label, 0) + 1

    return {label: (hits / total) * 100 for label, hits in occurrences.items()}

In [None]:
def gaze_detection(frames):
    """Classify the gaze direction for every frame in which a face is found.

    Args:
        frames: Iterable of image arrays; each is wrapped in an ``mp.Image``
            with ``ImageFormat.SRGB`` and must expose ``shape``.

    Returns:
        List of labels from ``process_gaze``, one per successfully processed
        frame. Frames without a face, or whose processing fails, are skipped,
        so the list can be shorter than ``frames``.
    """
    result_list = []
    for frame_index, frame in enumerate(frames):
        image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
        results = detector.detect(image)

        # Image arrays are laid out as (rows, columns, ...): height comes first.
        frame_height, frame_width = frame.shape[:2]

        if not results.face_landmarks:
            print(f"frame {frame_index}: no face detected, skipped")
            continue

        try:
            face_landmarks = results.face_landmarks[0]
            left_iris_coords = landmarks_detection(frame_width, frame_height, face_landmarks, LEFT_IRIS)
            right_iris_coords = landmarks_detection(frame_width, frame_height, face_landmarks, RIGHT_IRIS)
            left_eye_coords = landmarks_detection(frame_width, frame_height, face_landmarks, LEFT_EYE)
            right_eye_coords = landmarks_detection(frame_width, frame_height, face_landmarks, RIGHT_EYE)
            right_x, right_y = count_displacement(right_eye_coords, right_iris_coords)
            left_x, left_y = count_displacement(left_eye_coords, left_iris_coords)
            result_list.append(process_gaze(right_x, right_y, left_x, left_y))
        except Exception as ex:
            # Best effort: one bad frame (e.g. a zero-size eye box) must not abort
            # the run. Formatting the exception itself is safe even when it
            # carries no args.
            print(f"frame {frame_index}: skipped ({ex})")
            continue
    return result_list

In [None]:
# Load the clip to analyse and derive the values the later cells rely on.
video_path = '21.mp4'
frames = get_all_frames(video_path)
total_frames = len(frames)

# A second capture is opened only to query the container's properties.
cap = cv2.VideoCapture(video_path)
try:
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
finally:
    cap.release()
cv2.destroyAllWindows()

# Fail with a clear message instead of a ZeroDivisionError further down.
if total_frames == 0 or fps <= 0:
    raise ValueError(
        f"Could not read video {video_path!r}: {total_frames} frames, fps={fps}"
    )

video_duration_sec = total_frames / fps
# Whole frames per second. Taking int(fps) directly avoids the float rounding of
# total_frames // (total_frames / fps), which can come out one frame short.
frames_per_second = int(fps)

In [None]:
def get_subarray(array, subset, ind):
    """Return up to ``subset`` consecutive items of ``array`` starting at ``ind``.

    Args:
        array: Sized, sliceable sequence.
        subset: Maximum number of items to return.
        ind: Start index of the slice.

    Returns:
        ``array[ind:stop]`` where ``stop`` is ``ind + subset`` capped at
        ``len(array)``.
    """
    stop = ind + subset
    length = len(array)
    if stop > length:
        stop = length
    return array[ind:stop]

In [None]:
# Collects one gaze-percentage dictionary per analysed window of the video.
result_all = []

In [None]:
# Start from an empty list every time this cell runs: appending to results left
# over from a previous run would shift the window indices used when annotating.
result_all = []

# Step through the video in windows of int(fps) * 10 frames (about ten seconds)
# and analyse the first frames_per_second frames (about one second) of each.
for i in range(0, total_frames, int(fps) * 10):
    second_frames = get_subarray(frames, frames_per_second, i)
    array = gaze_detection(second_frames)
    result_all.append(calculate_emotion_percentage(array))

In [None]:
# Display the per-window gaze percentages.
result_all

[{'center': 100.0},
 {'up center': 13.793103448275861, 'center': 86.20689655172413},
 {'center': 89.65517241379311,
  'right': 3.4482758620689653,
  'up center': 6.896551724137931}]

In [None]:
# Overlay text settings.
font = cv2.FONT_HERSHEY_COMPLEX
fontScale = 1
color = (255, 255, 255)
thickness = 2
x = 20   # left margin of the text, in pixels
y = 50   # vertical spacing between text lines, in pixels

# Query the source clip's geometry and frame rate, then release it right away:
# the frames themselves come from the in-memory `frames` list.
cap = cv2.VideoCapture(video_path)
try:
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    source_fps = cap.get(cv2.CAP_PROP_FPS)
finally:
    cap.release()

# Integer rate used only to map a frame index to its ten-second window.
fps = int(source_fps)
if fps <= 0:
    raise ValueError(
        f"Could not read a usable frame rate from {video_path!r} (got {source_fps})"
    )

# Write with the untruncated source rate so the output keeps the original
# duration (e.g. 29.97 fps must not become 29 fps).
out = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), source_fps, (frame_width, frame_height))
try:
    for ind in range(len(frames)):
        current_sec = ind // fps
        index = int(current_sec // 10)
        dictionary = result_all[index]

        # Frames are stored as RGB; the writer expects BGR.
        frame = cv2.cvtColor(frames[ind], cv2.COLOR_RGB2BGR)

        # One text line per label, stacked downwards from the top of the frame.
        for count, key in enumerate(dictionary, start=1):
            text = f"{key} - {dictionary[key]}"
            frame = cv2.putText(frame, text, (x, y * count), font, fontScale, color, thickness, cv2.LINE_AA)
        out.write(frame)
finally:
    # Finalise the file even if annotation fails part-way through.
    out.release()
cv2.destroyAllWindows()

In [None]:
from moviepy.editor import VideoFileClip
from IPython.display import display

# Load the annotated clip and show it inline at 640 px width.
# NOTE(review): this rebinds `video_path`, which earlier cells use for the input
# clip ('21.mp4'); re-running those cells afterwards would open the output file
# instead - consider a separate variable name here.
video_path = "output_video.mp4"
video_clip = VideoFileClip(video_path)
video_clip.ipython_display(width=640)

Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4






Moviepy - Done !
Moviepy - video ready __temp__.mp4
