In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.9.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: mediapipe
Successfully installed mediapipe-0.9.0.1
[0mNote: you may need to restart the kernel to use updated packages.


In [10]:
import mediapipe as mp

In [11]:
ls /kaggle/input/include-50/

[0m[01;34mAdjectives[0m/  [01;34mClothes[0m/        [01;34mElectronics[0m/  [01;34mHome[0m/  [01;34mMeans_of_Transportation[0m/
[01;34mAnimals[0m/     [01;34mDays_and_Time[0m/  [01;34mGreetings[0m/    [01;34mJobs[0m/


In [12]:
ls

__notebook_source__.ipynb  [0m[01;34mkeypoints1[0m/


In [24]:
import os
import json
import multiprocessing
import argparse
import os.path
import cv2
import mediapipe as mp
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import numpy as np
import gc
import warnings

def process_landmarks(landmarks):
    x_list, y_list = [], []
    for landmark in landmarks.landmark:
        x_list.append(landmark.x)
        y_list.append(landmark.y)
    return x_list, y_list


def process_hand_keypoints(results):
    hand1_x, hand1_y, hand2_x, hand2_y = [], [], [], []

    if results.multi_hand_landmarks is not None:
        if len(results.multi_hand_landmarks) > 0:
            hand1 = results.multi_hand_landmarks[0]
            hand1_x, hand1_y = process_landmarks(hand1)

        if len(results.multi_hand_landmarks) > 1:
            hand2 = results.multi_hand_landmarks[1]
            hand2_x, hand2_y = process_landmarks(hand2)

    return hand1_x, hand1_y, hand2_x, hand2_y


def process_pose_keypoints(results):
    pose = results.pose_landmarks
    pose_x, pose_y = process_landmarks(pose)
    return pose_x, pose_y


def swap_hands(left_wrist, right_wrist, hand, input_hand):
    left_wrist_x, left_wrist_y = left_wrist
    right_wrist_x, right_wrist_y = right_wrist
    hand_x, hand_y = hand

    left_dist = (left_wrist_x - hand_x) ** 2 + (left_wrist_y - hand_y) ** 2
    right_dist = (right_wrist_x - hand_x) ** 2 + (right_wrist_y - hand_y) ** 2

    if left_dist < right_dist and input_hand == "h2":
        return True

    if right_dist < left_dist and input_hand == "h1":
        return True

    return False


def process_video(path, save_dir):
    hands = mp.solutions.hands.Hands(
        min_detection_confidence=0.5, min_tracking_confidence=0.5
    )
    pose = mp.solutions.pose.Pose(
        min_detection_confidence=0.5, min_tracking_confidence=0.5#, upper_body_only=True
    )

    pose_points_x, pose_points_y = [], []
    hand1_points_x, hand1_points_y = [], []
    hand2_points_x, hand2_points_y = [], []

    label = path.split("/")[-2]
    label = "".join([i for i in label if i.isalpha()]).lower()
    uid = os.path.splitext(os.path.basename(path))[0]
    uid = "_".join([label, uid])
    n_frames = 0
    if not os.path.isfile(path):
        warnings.warn(path + " file not found")
    cap = cv2.VideoCapture(path)
    while cap.isOpened():
        ret, image = cap.read()
        if not ret:
            break
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        hand_results = hands.process(image)
        pose_results = pose.process(image)

        hand1_x, hand1_y, hand2_x, hand2_y = process_hand_keypoints(hand_results)
        pose_x, pose_y = process_pose_keypoints(pose_results)

        ## Assign hands to correct positions
        if len(hand1_x) > 0 and len(hand2_x) == 0:
            if swap_hands(
                left_wrist=(pose_x[15], pose_y[15]),
                right_wrist=(pose_x[16], pose_y[16]),
                hand=(hand1_x[0], hand1_y[0]),
                input_hand="h1",
            ):
                hand1_x, hand1_y, hand2_x, hand2_y = hand2_x, hand2_y, hand1_x, hand1_y

        elif len(hand1_x) == 0 and len(hand2_x) > 0:
            if swap_hands(
                left_wrist=(pose_x[15], pose_y[15]),
                right_wrist=(pose_x[16], pose_y[16]),
                hand=(hand2_x[0], hand2_y[0]),
                input_hand="h2",
            ):
                hand1_x, hand1_y, hand2_x, hand2_y = hand2_x, hand2_y, hand1_x, hand1_y

        ## Set to nan so that values can be interpolated in dataloader
        pose_x = pose_x if pose_x else [np.nan] * 25
        pose_y = pose_y if pose_y else [np.nan] * 25

        hand1_x = hand1_x if hand1_x else [np.nan] * 21
        hand1_y = hand1_y if hand1_y else [np.nan] * 21
        hand2_x = hand2_x if hand2_x else [np.nan] * 21
        hand2_y = hand2_y if hand2_y else [np.nan] * 21

        pose_points_x.append(pose_x)
        pose_points_y.append(pose_y)
        hand1_points_x.append(hand1_x)
        hand1_points_y.append(hand1_y)
        hand2_points_x.append(hand2_x)
        hand2_points_y.append(hand2_y)

        n_frames += 1

    cap.release()

    ## Set to nan so that values can be interpolated in dataloader
    pose_points_x = pose_points_x if pose_points_x else [[np.nan] * 25]
    pose_points_y = pose_points_y if pose_points_y else [[np.nan] * 25]

    hand1_points_x = hand1_points_x if hand1_points_x else [[np.nan] * 21]
    hand1_points_y = hand1_points_y if hand1_points_y else [[np.nan] * 21]
    hand2_points_x = hand2_points_x if hand2_points_x else [[np.nan] * 21]
    hand2_points_y = hand2_points_y if hand2_points_y else [[np.nan] * 21]

    save_data = {
        "uid": uid,
        "label": label,
        "pose_x": pose_points_x,
        "pose_y": pose_points_y,
        "hand1_x": hand1_points_x,
        "hand1_y": hand1_points_y,
        "hand2_x": hand2_points_x,
        "hand2_y": hand2_points_y,
        "n_frames": n_frames,
    }
    with open(os.path.join(save_dir, f"{uid}.json"), "w") as f:
        json.dump(save_data, f)

    hands.close()
    pose.close()
    del hands, pose, save_data
    gc.collect()


def load_file(path, include_dir):
    with open(path, "r") as fp:
        data = fp.read()
        data = data.split("\n")
    data = list(map(lambda x: os.path.join(include_dir, x), data))
    return data


# def load_train_test_val_paths(args):
#     train_paths = load_file(
#         f"train_test_paths/{args.dataset}_train.txt", args.include_dir
#     )
#     val_paths = load_file(f"train_test_paths/{args.dataset}_val.txt", args.include_dir)
#     test_paths = load_file(
#         f"train_test_paths/{args.dataset}_test.txt", args.include_dir
#     )
#     return train_paths, val_paths, test_paths


def save_keypoints(dataset, file_paths, mode, args):
    save_dir = os.path.join(args['save_direc'], f"{dataset}_keypoints")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    Parallel(n_jobs=n_cores, backend="multiprocessing")(
        delayed(process_video)(path, save_dir)
        for path in tqdm(file_paths, desc=f"processing videos")
    )


#if __name__ == "__main__":
#    parser = argparse.ArgumentParser(description="Generate keypoints from Mediapipe")
args = {
    'include_dir':'/kaggle/input/include-50/',
    #/kaggle/input/include-50-2/
    'save_direc':'/kaggle/working/keypoints1/',
    'dataset':'include50'
}

# parser.add_argument(
#         "--include_dir",
#         default="",
#         type=str,
#         required=True,
#         help="path to the location of INCLUDE/INCLUDE50 videos",
#     )
#     parser.add_argument(
#         "--save_dir",
#         default="",
#         type=str,
#         required=True,
#         help="location to output json file",
#     )
#     parser.add_argument(
#         "--dataset", default="include", type=str, help="options: include or include50"
#     )
#     args = parser.parse_args()

n_cores = multiprocessing.cpu_count()
#     train_paths, val_paths, test_paths = load_train_test_val_paths(args)

#     save_keypoints('include50', val_paths, "val")
#     save_keypoints('include50', test_paths, "test")
save_keypoints('include50', include_dir, "train", args)
save_keypoints('include50', '/kaggle/input/include-50-2/', "train",args)

processing videos:   0%|          | 0/25 [00:00<?, ?it/s]

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
[ERROR:0] global /tmp/pip-req-build-jpmv6t9_/opencv/modules/videoio/src/cap.cpp (164) open VIDEOIO(CV_IMAGES): raised OpenCV exception:

OpenCV(4.5.4) /tmp/pip-req-build-jpmv6t9_/opencv/modules/videoio/src/cap_images.cpp:253: error: (-5:Bad argument) CAP_IMAGES: can't find starting number (in the name of file): / in function 'icvExtractPattern'


[ERROR:0] global /tmp/pip-req-build-jpmv6t9_/opencv/modules/videoio/src/cap.cpp (164) open VIDEOIO(CV_IMAGES): raised OpenCV exception:

OpenCV(4.5.4) /tmp/pip-req-build-jpmv6t9_/opencv/modules/videoio/src/cap_images.cpp:253: error: (-5:Bad argument) CAP_IMAGES: can't find starting number (in the name of file): / in function 'icvExtractPattern'




IndexError: list index out of range

To DO
1. Understand ai4bharat mediapipe code
2. debug it
3. use mediapipe from scratch for 1 vdo

In [None]:
cap = cv2.VideoCapture('/kaggle/input/include-50/Adjectives/1. loud/MVI_5177.MOV')
while cap.isOpened():
    ret, image = cap.read()
    if not ret:
        break
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


In [30]:
ls /kaggle/input/

[0m[01;34mhand-landmark[0m/  [01;34minclude-50[0m/  [01;34minclude-50-2[0m/


In [41]:
import mediapipe as mp

BaseOptions = mp.tasks.BaseOptions
HandLandmarker = mp.tasks.vision.HandLandmarker
HandLandmarkerOptions = mp.tasks.vision.HandLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Create a hand landmarker instance with the video mode:
options = HandLandmarkerOptions(
    base_options=BaseOptions(model_asset_path='/kaggle/input/hand-landmark/hand_landmarker.task'),
    running_mode=VisionRunningMode.VIDEO)
with HandLandmarker.create_from_options(options) as landmarker:
    
  # The landmarker is initialized. Use it here.
  # ...
    # Use OpenCV’s VideoCapture to load the input video.
    cap = cv2.VideoCapture('/kaggle/input/include-50/Adjectives/1. loud/MVI_5177.MOV')
    video_framerate = cap.get(cv2.CAP_PROP_FPS)
    print(video_framerate)
    while cap.isOpened():
        ret, image = cap.read() #read returns if frame_exits, current_frame
        if not ret:
            break
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) #when we load, its in blue green red, we convert it to RGB
    # Load the frame rate of the video using OpenCV’s CV_CAP_PROP_FPS
    # You’ll need it to calculate the timestamp for each frame.
    # Loop through each frame in the video using VideoCapture#read()
    # Convert the frame received from OpenCV to a MediaPipe’s Image object.
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
        # Perform hand landmarks detection on the provided single image.
        # The hand landmarker must be created with the video mode.
        frame_timestamp_ms = int(cap.get(cv2.CAP_PROP_POS_MSEC))
        print(type(frame_timestamp_ms))
        #case CV_FFMPEG_CAP_PROP_POS_MSEC:
        #return 1000.0*(double)frame_number/get_fps();
        hand_landmarker_result = landmarker.detect_for_video(mp_image, frame_timestamp_ms)
        print(hand_landmarker_result)
    
    

25.0
<class 'int'>
HandLandmarkerResult(handedness=[[Category(index=1, score=0.9402497410774231, display_name='Right', category_name='Right')]], hand_landmarks=[[NormalizedLandmark(x=0.6061415672302246, y=0.7957913279533386, z=1.194399459336637e-07, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.5919004678726196, y=0.810642421245575, z=-0.0041010091081261635, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.5862265229225159, y=0.8386191129684448, z=-0.00514757726341486, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.5840981602668762, y=0.8651171326637268, z=-0.0048139579594135284, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.583078145980835, y=0.8845532536506653, z=-0.003933663014322519, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.5981758832931519, y=0.8641141057014465, z=-0.007079826667904854, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.5916528105735779, y=0.9002236723899841, z=-0.00813490804284811, visibility=0.0, presence=0.0), Normali

ValueError: Input timestamp must be monotonically increasing.