In [None]:
!pip install mediapipe
!pip -q install facenet-pytorch
!pip install decord

In [None]:
import mediapipe as mp
import itertools
import cv2
import json
import os
import numpy as np
from tqdm import tqdm
import torch
from torchvision.models import resnet50, ResNet50_Weights, vgg16, VGG16_Weights
from torch import nn
from torch.utils.data import Dataset,DataLoader
from facenet_pytorch import InceptionResnetV1
import gc
import shutil

In [None]:
from decord import VideoReader

# Util

In [None]:
# Shared settings for the feature-extraction cells.
CFG = {
    # Number of dataset folds (the folds are named 'fold1' .. 'fold5').
    'fold_num': 5,
    # Presumably the output width of the face network — TODO confirm;
    # not read by the extraction code in this notebook.
    'vgg_embedding_features': 8631,
    # Side length (pixels) every face crop is resized to before the network.
    'vgg-input-shape': 224,
    # Target sequence length: frames are sampled per video and the feature
    # sequences are padded / truncated to this many steps.
    'n_frames': 30,
    # Use the first GPU when one is available, otherwise the CPU.
    'device': (
        torch.device("cuda:0")
        if torch.cuda.is_available()
        else torch.device("cpu")
    ),
}

In [None]:
# MediaPipe Face Mesh setup: two identically configured detectors and the
# landmark index lists of the facial regions used for feature extraction.
mp_face_mesh = mp.solutions.face_mesh

# Both detectors are built from the same options.
_FACE_MESH_OPTIONS = dict(
    static_image_mode=True,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
)
face_mesh = mp_face_mesh.FaceMesh(**_FACE_MESH_OPTIONS)
# Only referenced from commented-out code in this notebook; kept so any cell
# that still expects the name keeps working.
normalize_face_mesh = mp_face_mesh.FaceMesh(**_FACE_MESH_OPTIONS)


def _landmark_indexes(connections):
    """Flatten a MediaPipe connection collection into a list of distinct landmark ids.

    The list order is whatever iterating the intermediate set yields; it is
    intentionally NOT sorted, because the saved keypoint features follow
    this order.
    """
    return list(set(itertools.chain.from_iterable(connections)))


LEFT_EYE_INDEXES = _landmark_indexes(mp_face_mesh.FACEMESH_LEFT_EYE)
RIGHT_EYE_INDEXES = _landmark_indexes(mp_face_mesh.FACEMESH_RIGHT_EYE)
LIP_INDEXES = _landmark_indexes(mp_face_mesh.FACEMESH_LIPS)
FACE_INDEXES = _landmark_indexes(mp_face_mesh.FACEMESH_FACE_OVAL)

In [None]:
# Face feature network: facenet-pytorch's InceptionResnetV1 loaded with the
# 'vggface2' pretrained weights in classification mode (classify=True), placed
# on the configured device and switched to eval mode for inference.
resnet = InceptionResnetV1(pretrained='vggface2', classify=True)
resnet = resnet.to(CFG['device']).eval()

In [None]:
# Padding feature
def pad(array, target_shape):
    """Zero-pad a tensor along dim 0 so that it has ``target_shape[0]`` rows.

    Args:
        array: tensor whose first dimension is the one to extend.
        target_shape: desired shape; ``target_shape[0]`` is the target length
            and ``target_shape[1:]`` must match ``array.shape[1:]`` for the
            concatenation to succeed.

    Returns:
        ``array`` itself (untouched, not moved) when it already has the target
        length; otherwise a new tensor on ``CFG['device']`` made of ``array``
        followed by zero rows.

    Raises:
        RuntimeError: raised by torch when ``array`` is longer than the target
            (negative padding length) or the trailing dimensions differ.
    """
    missing = target_shape[0] - array.shape[0]
    if missing == 0:
        return array
    array = array.to(CFG['device'])
    # Allocate the padding directly on the target device and with the input's
    # dtype, instead of building a float32 CPU tensor and transferring it.
    padding = torch.zeros((missing, *target_shape[1:]), dtype=array.dtype, device=array.device)
    return torch.cat([array, padding], dim=0)

In [None]:
# Crop images and extract keypoint
def crop_and_extract(h, w, frame, INDEXES, result):
    """Crop the frame to the bounding box of the given landmarks.

    Args:
        h: frame height in pixels (scales the landmark ``y`` values).
        w: frame width in pixels (scales the landmark ``x`` values).
        frame: image array indexed as ``frame[row, col, ...]``.
        INDEXES: iterable of landmark indices into ``result.landmark``.
        result: object whose ``landmark[i]`` entries expose ``x``, ``y`` and
            ``z`` attributes (the caller passes one MediaPipe face result).

    Returns:
        ``(kpts, crop, top_left, bottom_right)`` where ``kpts`` is the flat
        list ``[x0, y0, z0, x1, y1, z1, ...]`` of raw landmark values,
        ``crop`` is ``frame[y_min:y_max, x_min:x_max]`` and the two corners
        are ``(x_min, y_min)`` / ``(x_max, y_max)`` in pixels, describing
        exactly the region that was cropped.
    """
    kpts = []
    cx_min, cy_min = w, h
    cx_max = cy_max = 0
    for point in INDEXES:
        landmark = result.landmark[point]
        kpts.extend((landmark.x, landmark.y, landmark.z))
        cx, cy = int(landmark.x * w), int(landmark.y * h)
        cx_min, cx_max = min(cx_min, cx), max(cx_max, cx)
        cy_min, cy_max = min(cy_min, cy), max(cy_max, cy)

    # Landmarks can fall outside the image. A negative bound would be treated
    # by numpy as a wrap-around index and yield a wrong or empty crop, so the
    # box is clamped to the frame before slicing.
    cx_min, cx_max = max(cx_min, 0), min(cx_max, w)
    cy_min, cy_max = max(cy_min, 0), min(cy_max, h)

    crop = frame[cy_min:cy_max, cx_min:cx_max]
    return kpts, crop, (cx_min, cy_min), (cx_max, cy_max)

# Express landmark positions relative to a crop box
def keypoint_preprocess(frame_h, frame_w, top_left, bottom_right, INDEXES, result):
    """Convert landmarks to coordinates relative to a bounding box.

    Each landmark's ``x`` / ``y`` is scaled to pixels with ``frame_w`` /
    ``frame_h``, shifted by the box's ``top_left`` corner and divided by the
    box width / height, so a point on the top-left corner maps to ``(0, 0)``
    and one on the bottom-right corner to ``(1, 1)``.

    Returns:
        Flat list ``[x0, y0, x1, y1, ...]`` following the order of ``INDEXES``.
    """
    normalized = []
    for idx in INDEXES:
        landmark = result.landmark[idx]
        x_px = landmark.x * frame_w
        y_px = landmark.y * frame_h
        normalized.extend((
            (x_px - top_left[0]) / (bottom_right[0] - top_left[0]),
            (y_px - top_left[1]) / (bottom_right[1] - top_left[1]),
        ))
    return normalized

In [None]:
# Load the dataset metadata. It is indexed as metadata[fold][label] and each
# entry is a list of video file names.
# JSON is read as UTF-8 explicitly instead of relying on the locale default.
with open('/kaggle/input/sust-ddd-metadata/dataset_metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Feature Extraction (Old)

In [None]:
class MyDataset(Dataset):
    """Turn each video of the selected folds into fixed-length feature sequences.

    Indexing an item opens the video with decord, samples frames at a regular
    stride, runs MediaPipe Face Mesh on every sampled frame and, for each
    frame with a detected face, collects:

    * the face crop, resized to ``CFG['vgg-input-shape']`` and later passed
      through the module-level ``resnet`` network, and
    * lip / left-eye / right-eye landmark coordinates expressed relative to
      the face crop box (see ``keypoint_preprocess``).

    Every sequence is zero-padded or truncated to ``CFG['n_frames']`` steps.

    Relies on objects defined elsewhere in the notebook: ``CFG``,
    ``face_mesh``, ``resnet``, ``pad``, ``crop_and_extract``,
    ``keypoint_preprocess`` and the ``*_INDEXES`` landmark lists.
    """

    def __init__(self, metadata, fold,
                 data_root=None,
                ):
        """Collect the videos of the requested folds.

        Args:
            metadata: mapping ``fold name -> {label name -> [video names]}``.
            fold: iterable of fold names to include.
            data_root: directory containing one sub-directory per label name.
                Must be set before items are read, because ``__getitem__``
                joins it with the label and video names.
        """
        super().__init__()
        videos = []
        labels = []
        label_names = []
        for fold_num in fold:
            for label, video_names in metadata[fold_num].items():
                videos += video_names
                # Binary target: 1 for 'drowsiness', 0 for every other label.
                labels += [int(label == 'drowsiness')] * len(video_names)
                label_names += [label] * len(video_names)

        self.data_root = data_root
        self.videos = videos
        self.labels = labels
        self.label_names = label_names

    def __len__(self):
        return len(self.videos)

    @staticmethod
    def _sample_indices(total_frames, n_frames):
        """Frame numbers to inspect: every ``total_frames // n_frames``-th frame.

        The stride is floored at 1 so videos shorter than ``n_frames`` are
        read frame by frame instead of producing a zero step, which ``range``
        rejects with ``ValueError``.
        """
        stride = max(1, total_frames // n_frames)
        return range(0, total_frames, stride)

    @staticmethod
    def _fit_length(feats, n_frames):
        """Zero-pad or truncate ``feats`` along dim 0 to exactly ``n_frames`` rows."""
        if feats.shape[0] < n_frames:
            return pad(feats, (n_frames, feats.shape[-1]))
        return feats[:n_frames, :]

    def __getitem__(self, index: int):
        """Extract the features of one video.

        Returns:
            ``(video_name, (face_feats, lip_feats, l_eye_feats, r_eye_feats),
            seq_len, label)`` where every feature tensor has
            ``CFG['n_frames']`` rows and ``seq_len`` is the number of rows that
            hold real (non-padding) data; or ``(video_name, None, 0, label)``
            when no usable face was found in any sampled frame.
        """
        label = self.labels[index]
        label_name = self.label_names[index]
        video_name = self.videos[index]
        vd_dir = os.path.join(self.data_root, label_name, video_name)

        n_frames = CFG['n_frames']
        input_size = CFG['vgg-input-shape']
        device = CFG['device']

        vr = VideoReader(vd_dir)

        # Per-frame keypoints and face crops, kept aligned index by index.
        lip_feats = []
        l_eye_feats = []
        r_eye_feats = []
        faces = []
        for fno in self._sample_indices(len(vr), n_frames):
            # Anything past n_frames would be truncated below, so stop early.
            if len(faces) >= n_frames:
                break
            # decord already decodes to RGB, the channel order MediaPipe
            # expects; the former BGR2RGB conversion swapped the channels to
            # BGR, so no colour conversion is applied here.
            frame = vr[fno].asnumpy()
            results = face_mesh.process(frame).multi_face_landmarks
            if results is None:
                continue
            result = results[0]
            h, w, _ = frame.shape
            _, face, top_left, bottom_right = crop_and_extract(h, w, frame, FACE_INDEXES, result)
            try:
                face = cv2.resize(face, dsize=(input_size, input_size), interpolation=cv2.INTER_CUBIC)
            except cv2.error:
                # Degenerate (empty) crop: skip this frame entirely.
                continue
            # HWC -> CHW, the layout the network consumes.
            faces.append(np.transpose(face, (2, 0, 1)))

            l_eye_feats.append(keypoint_preprocess(h, w, top_left, bottom_right, LEFT_EYE_INDEXES, result))
            r_eye_feats.append(keypoint_preprocess(h, w, top_left, bottom_right, RIGHT_EYE_INDEXES, result))
            lip_feats.append(keypoint_preprocess(h, w, top_left, bottom_right, LIP_INDEXES, result))

        if not faces:
            return video_name, None, 0, label

        seq_len = min(len(faces), n_frames)
        with torch.no_grad():
            # NOTE(review): pixels are fed as raw 0-255 floats; confirm whether
            # the pretrained network expects standardized input.
            inputs = torch.tensor(np.array(faces)).float().to(device)
            face_feats = resnet(inputs)
        lip_feats = torch.tensor(lip_feats).float().to(device)
        l_eye_feats = torch.tensor(l_eye_feats).float().to(device)
        r_eye_feats = torch.tensor(r_eye_feats).float().to(device)

        face_feats = self._fit_length(face_feats, n_frames)
        lip_feats = self._fit_length(lip_feats, n_frames)
        l_eye_feats = self._fit_length(l_eye_feats, n_frames)
        r_eye_feats = self._fit_length(r_eye_feats, n_frames)
        return video_name, (face_feats, lip_feats, l_eye_feats, r_eye_feats), seq_len, label

In [None]:
def save_data(name, save_path='/kaggle/working/data'):
    """Extract the features of one fold and write them under ``save_path/name``.

    Every video of fold ``name`` is run through ``MyDataset``; videos for
    which no features could be extracted are skipped. The following files are
    written:

    * ``X_face.pt``, ``X_lip.pt``, ``X_l_eye.pt``, ``X_r_eye.pt`` - the
      per-video feature tensors stacked along a new leading dimension,
    * ``seq_len.pt`` - number of real (non-padding) steps per video,
    * ``Y.pt`` - integer labels,
    * ``vid_names.json`` - video names, in the same order as the tensors.

    NOTE(review): the feature tensors are saved on whatever device the
    dataset produced them on; loading them on a machine without that device
    needs ``map_location``.

    Args:
        name: fold key in the module-level ``metadata`` (e.g. ``'fold1'``).
        save_path: root output directory.

    Raises:
        RuntimeError: if no video of the fold yielded any features.
    """
    out_dir = os.path.join(save_path, name)
    os.makedirs(out_dir, exist_ok=True)
    dataset = MyDataset(metadata=metadata, fold=[name], data_root='/kaggle/input/sust-ddd/SUST Driver Drowsiness Dataset')

    vid_names = []
    sequence_len = []
    X_face = []
    X_lip = []
    X_l_eye = []
    X_r_eye = []
    Y = []
    for i in tqdm(range(len(dataset)), total=len(dataset)):
        vid_name, data, seq_len, label = dataset[i]
        if data is None:
            # No face detected in any sampled frame of this video.
            continue
        face_feats, lip_feats, l_eye_feats, r_eye_feats = data
        X_face.append(face_feats)
        X_lip.append(lip_feats)
        X_l_eye.append(l_eye_feats)
        X_r_eye.append(r_eye_feats)
        Y.append(label)
        vid_names.append(vid_name)
        sequence_len.append(seq_len)

    if not vid_names:
        # Stacking an empty list fails with an unhelpful message; say why.
        raise RuntimeError(f"no features could be extracted for fold {name!r}; nothing to save")

    # torch.stack adds the leading per-video dimension.
    torch.save(torch.stack(X_face, dim=0), os.path.join(out_dir, 'X_face.pt'))
    torch.save(torch.stack(X_lip, dim=0), os.path.join(out_dir, 'X_lip.pt'))
    torch.save(torch.stack(X_l_eye, dim=0), os.path.join(out_dir, 'X_l_eye.pt'))
    torch.save(torch.stack(X_r_eye, dim=0), os.path.join(out_dir, 'X_r_eye.pt'))

    with open(os.path.join(out_dir, 'vid_names.json'), 'w') as f:
        json.dump(vid_names, f)
    torch.save(torch.tensor(sequence_len), os.path.join(out_dir, 'seq_len.pt'))
    torch.save(torch.tensor(Y), os.path.join(out_dir, 'Y.pt'))

In [None]:
# Extract and save the features of every fold: 'fold1' .. 'fold<fold_num>'.
for fold_idx in range(1, CFG['fold_num'] + 1):
    save_data(f'fold{fold_idx}')

In [None]:
# Zip the extracted features; the archive 'data_small.zip' is written to the
# current working directory.
shutil.make_archive(base_name='data_small', format='zip', root_dir='/kaggle/working/data')

In [None]:
import os
# Switch to the Kaggle working directory so that the relative path handed to
# FileLink in the following cell is resolved against it.
os.chdir(r'/kaggle/working')
from IPython.display import FileLink

In [None]:
# Show a link to the archive; the path is relative to the current working
# directory.
FileLink(r'data_small.zip')

# Feature Extraction (New)

In [None]:
# class MyDataset(Dataset):
#     def __init__(self, metadata, fold,
#                  data_root=None,
#                 ):
#         super().__init__()
#         videos = []
#         labels = []
#         label_names = []
#         for fold_num in fold:
#             for label, video_names in metadata[fold_num].items():
#                 videos += video_names
#                 labels += (np.ones(len(video_names)) if label=='drowsiness' else np.zeros(len(video_names))).astype(int).tolist()
#                 label_names += [label]*len(video_names)
                
#         self.data_root = data_root
#         self.videos = videos
#         self.labels = labels
#         self.label_names = label_names

#     def __len__(self):
#         return len(self.videos)

#     def __getitem__(self, index: int):

#         # get labels
#         label = self.labels[index]
#         label_name = self.label_names[index]
#         video_name = self.videos[index]
        
#         vd_dir = os.path.join(self.data_root, label_name, video_name)

#         # Loop through frames
#         vr = VideoReader(vd_dir)
#         total_frames = len(vr)
#         sample_rate = int(total_frames/CFG['n_frames'])
#         # Loop through frames
#         # Keypoints
#         lip_feats = []
#         l_eye_feats = []
#         r_eye_feats = []
#         # Images
#         faces = []
#         count = 0
#         for fno in (range(0, total_frames, sample_rate)):
#             frame = cv2.cvtColor(vr[fno].asnumpy(), cv2.COLOR_BGR2RGB)
#             results = face_mesh.process(frame).multi_face_landmarks
#             if results is not None:
#                 result = results[0]
#                 h, w, _ = frame.shape
#                 _, face, top_left, bottom_right = crop_and_extract(h, w, frame, FACE_INDEXES, result)
#                 try:
#                     face = cv2.resize(face, dsize=(CFG['vgg-input-shape'], CFG['vgg-input-shape']), interpolation=cv2.INTER_CUBIC)
#                 except:
#                     continue
#                 face = np.transpose(face, (2, 0, 1))
#                 faces.append(face)
                
#                 fl_eye_kpt = keypoint_preprocess(h, w, top_left, bottom_right, LEFT_EYE_INDEXES, result)
#                 l_eye_feats.append(fl_eye_kpt)

#                 fr_eye_kpt = keypoint_preprocess(h, w, top_left, bottom_right, RIGHT_EYE_INDEXES, result)
#                 r_eye_feats.append(fr_eye_kpt)

#                 f_lip_kpt = keypoint_preprocess(h, w, top_left, bottom_right, LIP_INDEXES, result)
#                 lip_feats.append(f_lip_kpt)             
                
#         if len(faces)!=0:
#             faces = np.array(faces)
#             lip_feats = np.array(lip_feats)
#             l_eye_feats = np.array(l_eye_feats)
#             r_eye_feats = np.array(r_eye_feats)
#             return video_name, (faces, lip_feats, l_eye_feats, r_eye_feats)   
#         else:
#             return video_name, None

In [None]:
# def save_data(name, save_dir='/kaggle/working/data'):
#     if not os.path.exists(os.path.join(save_dir, name)):
#         os.makedirs(os.path.join(save_dir, name))
#     dataset = MyDataset(metadata=metadata, fold=[name], data_root='/kaggle/input/sust-ddd/SUST Driver Drowsiness Dataset')
#     for i in tqdm(range(dataset.__len__()), total = dataset.__len__()):
#         vid_name, data = dataset[i]
#         if data!=None:
#             save_path = os.path.join(save_dir, name, vid_name)
#             os.makedirs(save_path)
#             np.save(os.path.join(save_path, 'face.npy'),data[0])
#             np.save(os.path.join(save_path, 'lip.npy'),data[1])
#             np.save(os.path.join(save_path, 'l_eye.npy'),data[2])
#             np.save(os.path.join(save_path, 'r_eye.npy'),data[3])
#             del data
#             gc.collect()
#     gc.collect()

In [None]:
# print('fold1')
# save_data('fold1')

# print('fold2')
# save_data('fold2')

# print('fold3')
# save_data('fold3')

# print('fold4')
# save_data('fold4')

# print('fold5')
# save_data('fold5')

# Example mediapipe

In [None]:
# img_path = '/kaggle/input/human-faces-object-detection/images/00000074.jpg'
# img = cv2.imread(img_path)
# result = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).multi_face_landmarks[0]

# fl_eye_feats = []
# fr_eye_feats = []
# f_lip_feats = []

# for point in LEFT_EYE_INDEXES:
#     mediapipe_x = result.landmark[point].x
#     mediapipe_y = result.landmark[point].y
#     fl_eye_feats.append(mediapipe_x)

# for point in RIGHT_EYE_INDEXES:
#     mediapipe_x = result.landmark[point].x
#     mediapipe_y = result.landmark[point].y
#     fr_eye_feats.append(mediapipe_x)


# for point in LIP_INDEXES:
#     mediapipe_x = result.landmark[point].x
#     mediapipe_y = result.landmark[point].y
#     f_lip_feats.append(mediapipe_x)

# print(len(LEFT_EYE_INDEXES), len(RIGHT_EYE_INDEXES), len(LIP_INDEXES))
# print(len(fl_eye_feats), len(fr_eye_feats), len(f_lip_feats))

In [None]:
# # Create a MediaPipe `Pose` object
# with mp_pose.Pose(static_image_mode=True,
#                   model_complexity=2,
#                   enable_segmentation=True) as pose:
        
#     # Read the file in and get dims
#     image = cv2.imread(img_path)

#     # Convert the BGR image to RGB and then process with the `Pose` object.
#     results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))