# Extracting keypoints from videos

We will use the following models to extract keypoints:

* MoveNet (2D)
* MediaPipe Pose Landmarker (3D)

In [1]:
import tensorflow as tf
import tensorflow_hub as tf_hub
import mediapipe as mp
import numpy as np
import pandas as pd
import imageio
import matplotlib.pyplot as plt
from pathlib import Path

2026-02-06 14:20:30.599377: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-02-06 14:20:30.859415: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-06 14:20:32.599684: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
  from pkg_resources import parse_version


In [2]:
# Helper functions

# Joint name -> index along the keypoint axis of the MoveNet output.
# The position of each name in the tuple below is its index.
MOVENET_KEYPOINT_DICT = {
  name: index
  for index, name in enumerate((
    'nose',
    'left_eye', 'right_eye',
    'left_ear', 'right_ear',
    'left_shoulder', 'right_shoulder',
    'left_elbow', 'right_elbow',
    'left_wrist', 'right_wrist',
    'left_hip', 'right_hip',
    'left_knee', 'right_knee',
    'left_ankle', 'right_ankle',
  ))
}

# (joint name, landmark index) pairs for the MediaPipe landmarks that are
# exported; the list order defines the column order of the output tables.
MEDIAPIPE_JOINTS = list({
  'left_shoulder': 11, 'right_shoulder': 12,
  'left_elbow': 13, 'right_elbow': 14,
  'left_wrist': 15, 'right_wrist': 16,
  'left_hip': 23, 'right_hip': 24,
  'left_knee': 25, 'right_knee': 26,
  'left_ankle': 27, 'right_ankle': 28,
}.items())

# Cropping algorithm to improve detection accuracy (source: https://www.tensorflow.org/hub/tutorials/movenet)

# Keypoints scoring below this threshold are ignored when computing the crop.
MIN_CROP_KEYPOINT_SCORE = 0.2

def init_crop_region(image_height, image_width):
  """Return the default square crop region that contains the whole image.

  The region is expressed in coordinates normalized by the image size. The
  shorter image side is padded symmetrically, so the corresponding min
  coordinate is negative and the max coordinate exceeds 1.

  Args:
    image_height: Image height in pixels.
    image_width: Image width in pixels.

  Returns:
    Dict with keys 'y_min', 'x_min', 'y_max', 'x_max', 'height', 'width'.
  """
  if image_width > image_height:
    # Landscape: the box spans the full width and overshoots vertically.
    x_min, box_width = 0.0, 1.0
    box_height = image_width / image_height
    y_min = (image_height / 2 - image_width / 2) / image_height
  else:
    # Portrait or square: the box spans the full height.
    y_min, box_height = 0.0, 1.0
    box_width = image_height / image_width
    x_min = (image_width / 2 - image_height / 2) / image_width

  return dict(
    y_min=y_min,
    x_min=x_min,
    y_max=y_min + box_height,
    x_max=x_min + box_width,
    height=box_height,
    width=box_width,
  )

def torso_visible(keypoints):
  """Tell whether the torso is detected confidently enough to crop around it.

  The torso counts as visible when at least one hip and at least one shoulder
  have a score above MIN_CROP_KEYPOINT_SCORE.

  Args:
    keypoints: Array indexed as [0, 0, joint, 2] to read a joint's score.

  Returns:
    Truthy value if the torso is visible, falsy otherwise.
  """
  def confident(joint):
    return keypoints[0, 0, MOVENET_KEYPOINT_DICT[joint], 2] > MIN_CROP_KEYPOINT_SCORE

  hip_seen = confident('left_hip') or confident('right_hip')
  shoulder_seen = confident('left_shoulder') or confident('right_shoulder')
  return hip_seen and shoulder_seen

def determine_torso_and_body_range(keypoints, target_keypoints, center_y, center_x):
  """Measure how far the torso and the whole body extend from a center point.

  Args:
    keypoints: Array indexed as [0, 0, joint, 2] to read a joint's score.
    target_keypoints: Mapping of joint name to a [y, x] pair.
    center_y: Y coordinate of the center, in the units of target_keypoints.
    center_x: X coordinate of the center, in the units of target_keypoints.

  Returns:
    [max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange]:
    the largest absolute y/x distance from the center over the four torso
    joints, and over all joints scoring at least MIN_CROP_KEYPOINT_SCORE.
  """
  torso_y = 0.0
  torso_x = 0.0
  for joint in ('left_shoulder', 'right_shoulder', 'left_hip', 'right_hip'):
    torso_y = max(torso_y, abs(center_y - target_keypoints[joint][0]))
    torso_x = max(torso_x, abs(center_x - target_keypoints[joint][1]))

  body_y = 0.0
  body_x = 0.0
  for joint, index in MOVENET_KEYPOINT_DICT.items():
    # Low-confidence joints must not inflate the body extent.
    if keypoints[0, 0, index, 2] < MIN_CROP_KEYPOINT_SCORE:
      continue
    body_y = max(body_y, abs(center_y - target_keypoints[joint][0]))
    body_x = max(body_x, abs(center_x - target_keypoints[joint][1]))

  return [torso_y, torso_x, body_y, body_x]

def determine_crop_region(keypoints, image_height, image_width):
  """Choose the crop region for the next frame from the current detections.

  When the torso is visible, a square centered on the midpoint of the hips is
  returned, sized to cover the torso and the confidently detected joints.
  Otherwise, or when that square would exceed half of the longer image side,
  the default region from init_crop_region is returned.

  Args:
    keypoints: Array indexed as [0, 0, joint, (y, x, score)] with y and x
      normalized by the image size.
    image_height: Image height in pixels.
    image_width: Image width in pixels.

  Returns:
    Dict with keys 'y_min', 'x_min', 'y_max', 'x_max', 'height', 'width' in
    coordinates normalized by the image size.
  """
  # Keypoints in pixel units, as [y, x] pairs.
  pixel_keypoints = {
    joint: [
      keypoints[0, 0, index, 0] * image_height,
      keypoints[0, 0, index, 1] * image_width
    ]
    for joint, index in MOVENET_KEYPOINT_DICT.items()
  }

  if not torso_visible(keypoints):
    return init_crop_region(image_height, image_width)

  center_y = (pixel_keypoints['left_hip'][0] +
              pixel_keypoints['right_hip'][0]) / 2
  center_x = (pixel_keypoints['left_hip'][1] +
              pixel_keypoints['right_hip'][1]) / 2

  torso_y, torso_x, body_y, body_x = determine_torso_and_body_range(
      keypoints, pixel_keypoints, center_y, center_x)

  # Half side length: padded torso extent or padded body extent, whichever is larger ...
  half_side = np.amax([torso_x * 1.9, torso_y * 1.9, body_y * 1.2, body_x * 1.2])

  # ... capped by the distance from the center to the farthest image border.
  border_distances = np.array(
      [center_x, image_width - center_x, center_y, image_height - center_y])
  half_side = np.amin([half_side, np.amax(border_distances)])

  if half_side > max(image_width, image_height) / 2:
    return init_crop_region(image_height, image_width)

  top = center_y - half_side
  left = center_x - half_side
  side = half_side * 2
  return {
    'y_min': top / image_height,
    'x_min': left / image_width,
    'y_max': (top + side) / image_height,
    'x_max': (left + side) / image_width,
    'height': (top + side) / image_height - top / image_height,
    'width': (left + side) / image_width - left / image_width
  }

def crop_and_resize(image, crop_region, crop_size):
  """Crop the region out of the image and resize it to crop_size.

  Args:
    image: Image batch passed to tf.image.crop_and_resize; the box is applied
      to batch element 0.
    crop_region: Dict with 'y_min', 'x_min', 'y_max', 'x_max'.
    crop_size: Target size forwarded to tf.image.crop_and_resize.

  Returns:
    The result of tf.image.crop_and_resize.
  """
  box = [crop_region[key] for key in ('y_min', 'x_min', 'y_max', 'x_max')]
  return tf.image.crop_and_resize(
      image, box_indices=[0], boxes=[box], crop_size=crop_size)

def run_inference(movenet, image, crop_region, crop_size):
  """Run MoveNet on a crop of the image and map the keypoints back to it.

  Args:
    movenet: Callable model; its result is read under the key 'output_0'.
    image: Single image whose shape unpacks as (height, width, channels).
    crop_region: Dict with 'y_min', 'x_min', 'height', 'width' (plus the keys
      needed by crop_and_resize), normalized by the image size.
    crop_size: Size the crop is resized to before inference.

  Returns:
    The model output as a numpy array, with the y/x values of the 17 keypoints
    converted from crop coordinates to normalized full-image coordinates.
  """
  image_height, image_width, _ = image.shape

  batched_image = tf.expand_dims(image, axis=0)
  model_input = tf.cast(
    crop_and_resize(batched_image, crop_region, crop_size=crop_size),
    dtype=tf.int32)
  keypoints_with_scores = movenet(model_input)['output_0'].numpy()

  # Offset and scale of the crop in pixels of the full image.
  y_offset = crop_region['y_min'] * image_height
  y_scale = crop_region['height'] * image_height
  x_offset = crop_region['x_min'] * image_width
  x_scale = crop_region['width'] * image_width

  for joint in range(17):
    keypoints_with_scores[0, 0, joint, 0] = (
        y_offset + y_scale * keypoints_with_scores[0, 0, joint, 0]) / image_height
    keypoints_with_scores[0, 0, joint, 1] = (
        x_offset + x_scale * keypoints_with_scores[0, 0, joint, 1]) / image_width
  return keypoints_with_scores


def load_video_data(video_path):
  """Read all frames of a video together with their timestamps.

  Args:
    video_path: Path of the video file, passed to imageio.get_reader.

  Returns:
    (frames, timestamps): the list of decoded frames and a list with one
    integer timestamp in milliseconds per frame, computed from the frame
    index and the 'fps' entry of the video metadata.
  """
  frames = []
  timestamps = []

  # The context manager closes the reader once all frames are read, and also
  # when decoding fails part-way through.
  with imageio.get_reader(video_path) as reader:
    fps = reader.get_meta_data()['fps']
    for frame_index, frame in enumerate(reader):
      frames.append(frame)
      timestamps.append(int((frame_index / fps) * 1000))

  return frames, timestamps


def load_movenet_model(path):
  """Load a MoveNet model and return its 'serving_default' signature.

  Args:
    path: Model location passed to tf_hub.load.

  Returns:
    The callable stored under the 'serving_default' signature.
  """
  return tf_hub.load(path).signatures['serving_default']

def compute_movenet(frames, model, input_size=256):
  """Run MoveNet on every frame, tracking the person with an adaptive crop.

  The first frame is processed with the default crop region; each following
  frame uses the crop region derived from the previous frame's keypoints.

  Args:
    frames: Sequence of images that all share the shape of the first frame.
    model: MoveNet callable as returned by load_movenet_model.
    input_size: Side length in pixels the crop is resized to before inference.
      The default of 256 is the value previously hard-coded here.
      NOTE(review): other MoveNet variants may expect a different size (the
      TF Hub tutorial uses 192 for Lightning) - confirm before changing models.

  Returns:
    List with one keypoint array per frame (empty if there are no frames).
  """
  # Nothing to do for an empty video; frames[0] below would raise IndexError.
  if len(frames) == 0:
    return []

  frame_height, frame_width, _ = frames[0].shape
  crop_size = [input_size, input_size]
  crop_region = init_crop_region(frame_height, frame_width)

  keypoint_list = []
  for frame in frames:
    outputs = run_inference(model, frame, crop_region, crop_size=crop_size)
    keypoint_list.append(outputs)

    # The detections of this frame decide where to look in the next one.
    crop_region = determine_crop_region(outputs, frame_height, frame_width)

  return keypoint_list

def restructure_movenet(keypoint_list, timestamps):
  """Turn the per-frame MoveNet outputs into one table with a row per frame.

  Args:
    keypoint_list: List of arrays indexed as [batch, 0, joint, (y, x, score)],
      concatenated along the first axis.
    timestamps: One timestamp per frame; cast to int for the 'time' column.

  Returns:
    DataFrame with the columns 'frame' and 'time', followed by
    '<joint>_x', '<joint>_y' and '<joint>_confidence' for every joint whose
    MoveNet index is greater than 4 (the face keypoints are dropped).
  """
  # Keep the first entry of axis 1 and skip the five face keypoints.
  body = np.concatenate(keypoint_list, axis=0)[:, 0, 5:, :]

  names_by_index = sorted(MOVENET_KEYPOINT_DICT, key=MOVENET_KEYPOINT_DICT.get)
  body_joints = [name for name in names_by_index if MOVENET_KEYPOINT_DICT[name] > 4]

  columns = {
    'frame': np.arange(len(keypoint_list), dtype=int),
    'time': np.asarray(timestamps, dtype=int),
  }
  for position, joint in enumerate(body_joints):
    columns[f'{joint}_x'] = body[:, position, 1]
    columns[f'{joint}_y'] = body[:, position, 0]
    columns[f'{joint}_confidence'] = body[:, position, 2]

  return pd.DataFrame(columns)

def load_mediapipe_options(path):
  """Build pose landmarker options for video mode.

  Args:
    path: Path of the model asset file.

  Returns:
    PoseLandmarkerOptions using the given model and RunningMode.VIDEO.
  """
  return mp.tasks.vision.PoseLandmarkerOptions(
    base_options=mp.tasks.BaseOptions(model_asset_path=path),
    running_mode=mp.tasks.vision.RunningMode.VIDEO)

def compute_mediapipe(frames, timestamps_ms, options):
  """Run the MediaPipe pose landmarker on every frame of a video.

  Args:
    frames: Sequence of frames, each wrapped into an SRGB mp.Image.
    timestamps_ms: Timestamp of each frame, passed to detect_for_video.
    options: PoseLandmarkerOptions used to create the landmarker.

  Returns:
    List with one landmarker result per frame.
  """
  results = []
  with mp.tasks.vision.PoseLandmarker.create_from_options(options) as landmarker:
    for index, frame in enumerate(frames):
      mp_frame = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
      results.append(landmarker.detect_for_video(mp_frame, timestamps_ms[index]))
  return results

def restructure_mediapipe(landmarker_results, timestamps, world:bool):
  """Turn the per-frame MediaPipe results into one table with a row per frame.

  Only the first detected pose of each frame is used. Frames in which no pose
  was detected keep NaN in every landmark column instead of aborting the
  export with an IndexError.

  Args:
    landmarker_results: List of results as returned by compute_mediapipe.
    timestamps: One timestamp per frame; cast to int for the 'time' column.
    world: If True read 'pose_world_landmarks', otherwise 'pose_landmarks'.

  Returns:
    DataFrame with the columns 'frame' and 'time', followed by '<joint>_x',
    '<joint>_y', '<joint>_z', '<joint>_visibility' and '<joint>_presence' for
    every joint in MEDIAPIPE_JOINTS.
  """
  source = 'pose_world_landmarks' if world else 'pose_landmarks'
  fields = ('x', 'y', 'z', 'visibility', 'presence')
  frame_count = len(landmarker_results)

  # Pre-fill with NaN so that frames without a detection stay marked as missing.
  coordinates = {
    f'{joint_name}_{field}': np.full(frame_count, np.nan)
    for joint_name, _ in MEDIAPIPE_JOINTS
    for field in fields
  }

  for row, landmarker_result in enumerate(landmarker_results):
    poses = getattr(landmarker_result, source)
    if not poses:
      continue  # no pose detected in this frame
    landmarks = poses[0]
    for joint_name, joint_index in MEDIAPIPE_JOINTS:
      landmark = landmarks[joint_index]
      for field in fields:
        value = getattr(landmark, field)
        # A field without a value is stored as missing as well.
        coordinates[f'{joint_name}_{field}'][row] = np.nan if value is None else value

  frames = np.arange(frame_count, dtype=int)
  times = np.asarray(timestamps, dtype=int)

  df = pd.DataFrame({'frame': frames, 'time': times, **coordinates})
  return df


def process_video(video_path, movenet_model, mediapipe_options, out_movenet_path, out_mediapipe_norm_path, out_mediapipe_world_path):
  """Extract MoveNet and MediaPipe keypoints from one video and save them as CSV.

  Three tables are written: MoveNet keypoints, MediaPipe normalized landmarks
  and MediaPipe world landmarks. The 'time' column of every table holds the
  frame timestamp in seconds, including the fractional part.

  Args:
    video_path: Path of the video to process.
    movenet_model: MoveNet callable as returned by load_movenet_model.
    mediapipe_options: Options as returned by load_mediapipe_options.
    out_movenet_path: Destination CSV for the MoveNet keypoints.
    out_mediapipe_norm_path: Destination CSV for the normalized landmarks.
    out_mediapipe_world_path: Destination CSV for the world landmarks.
  """
  frames, timestamps_ms = load_video_data(video_path)
  # True division keeps the sub-second part; floor division would give every
  # frame within the same second an identical timestamp.
  timestamps_sec = [ts / 1000 for ts in timestamps_ms]

  movenet_result = compute_movenet(frames, movenet_model)
  movenet_structured = restructure_movenet(movenet_result, timestamps_sec)
  # restructure_* cast the timestamps to int, so the fractional seconds are
  # written into the 'time' column explicitly.
  movenet_structured['time'] = timestamps_sec
  movenet_structured.to_csv(out_movenet_path, sep=',', index=False)

  mediapipe_result = compute_mediapipe(frames, timestamps_ms, mediapipe_options)
  mediapipe_norm_structured = restructure_mediapipe(mediapipe_result, timestamps_sec, world=False)
  mediapipe_norm_structured['time'] = timestamps_sec
  mediapipe_norm_structured.to_csv(out_mediapipe_norm_path, sep=',', index=False)
  mediapipe_world_structured = restructure_mediapipe(mediapipe_result, timestamps_sec, world=True)
  mediapipe_world_structured['time'] = timestamps_sec
  mediapipe_world_structured.to_csv(out_mediapipe_world_path, sep=',', index=False)

  print(f'Video {video_path} processed successfully.')


def process_videos(video_dir, movenet_path, mediapipe_path, out_movenet_dir, out_mediapipe_norm_dir, out_mediapipe_world_dir):
  """Extract keypoints from every '*.MP4' file found below video_dir.

  The output directories are created if needed. For a video named <name>, the
  files '<name>_movenet.csv', '<name>_mediapipe_norm.csv' and
  '<name>_mediapipe_world.csv' are written to the respective directory.

  Args:
    video_dir: Directory searched recursively for '*.MP4' files.
    movenet_path: Location of the MoveNet model.
    mediapipe_path: Path of the MediaPipe model asset.
    out_movenet_dir: Directory for the MoveNet tables.
    out_mediapipe_norm_dir: Directory for the normalized landmark tables.
    out_mediapipe_world_dir: Directory for the world landmark tables.
  """
  movenet_dir = Path(out_movenet_dir)
  norm_dir = Path(out_mediapipe_norm_dir)
  world_dir = Path(out_mediapipe_world_dir)
  for directory in (movenet_dir, norm_dir, world_dir):
    directory.mkdir(parents=True, exist_ok=True)

  # Both models are loaded once and shared by all videos.
  movenet_model = load_movenet_model(movenet_path)
  mediapipe_options = load_mediapipe_options(mediapipe_path)

  for video in Path(video_dir).rglob('*.MP4'):
    stem = video.stem
    process_video(
      video,
      movenet_model,
      mediapipe_options,
      movenet_dir / f'{stem}_movenet.csv',
      norm_dir / f'{stem}_mediapipe_norm.csv',
      world_dir / f'{stem}_mediapipe_world.csv',
    )

In [3]:
# Inputs: the recordings to analyse and the two pose models.
VIDEO_DIR = '../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt'
MOVENET_PATH = 'https://tfhub.dev/google/movenet/singlepose/thunder/4'
MEDIAPIPE_PATH = '../models/pose_landmarker_heavy.task'

# Outputs: one directory per kind of keypoint table.
OUT_MOVENET_DIR, OUT_MEDIAPIPE_NORM_DIR, OUT_MEDIAPIPE_WORLD_DIR = (
  '../data/processed/keypoints/movenet',
  '../data/processed/keypoints/mediapipe_norm',
  '../data/processed/keypoints/mediapipe_world',
)

In [4]:
# Extract the keypoints of every recording below VIDEO_DIR with both models and
# write one CSV per recording into each of the three output directories.
process_videos(VIDEO_DIR, MOVENET_PATH, MEDIAPIPE_PATH, OUT_MOVENET_DIR, OUT_MEDIAPIPE_NORM_DIR, OUT_MEDIAPIPE_WORLD_DIR)

E0000 00:00:1770384078.748074   93336 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1770384078.753264   93336 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1770384132.044904   97500 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770384132.103445   97499 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support f

Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425092743_0028_D.MP4 processed successfully.


W0000 00:00:1770384303.691599   97574 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770384303.903837   97572 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425093100_0030_D.MP4 processed successfully.


W0000 00:00:1770384472.821514   97641 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770384473.478070   97641 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425104507_0045_D.MP4 processed successfully.


W0000 00:00:1770384598.923872   97691 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770384598.986862   97692 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425104804_0047_D.MP4 processed successfully.


W0000 00:00:1770384748.699047   97747 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770384748.745481   97748 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425112502_0059_D.MP4 processed successfully.


W0000 00:00:1770384887.196520   97802 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770384887.251077   97802 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425112749_0061_D.MP4 processed successfully.


W0000 00:00:1770385034.044623   97854 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770385034.106851   97854 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425120835_0074_D.MP4 processed successfully.


W0000 00:00:1770385148.102555   97907 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770385148.168087   97905 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425121226_0076_D.MP4 processed successfully.


W0000 00:00:1770385277.731349   97967 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770385277.847193   97967 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425125202_0091_D.MP4 processed successfully.


W0000 00:00:1770385379.268927   98026 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1770385379.429773   98026 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Video ../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt/DJI_20250425125448_0093_D.MP4 processed successfully.


In [4]:
def draw_keypoints_on_image(image, structured_keypoints, joint_names):
  """Render an image with the given joints drawn on top as green dots.

  Args:
    image: Image whose shape unpacks as (height, width, channels).
    structured_keypoints: Mapping with a '<joint>_x' and '<joint>_y' entry per
      joint; the values are scaled by the image width and height.
    joint_names: Names of the joints to draw.

  Returns:
    The rendered figure as an array with three color channels.
  """
  height, width, _ = image.shape

  # Figure with the aspect ratio of the image and no decorations around it.
  fig, ax = plt.subplots(figsize=(12 * (float(width) / height), 12))
  fig.tight_layout(pad=0)
  ax.margins(0)
  ax.set_yticklabels([])
  ax.set_xticklabels([])
  ax.axis('off')

  xs = np.array([structured_keypoints[f"{name}_x"] for name in joint_names]) * width
  ys = np.array([structured_keypoints[f"{name}_y"] for name in joint_names]) * height
  ax.imshow(image)
  ax.scatter(xs, ys, c="#00ff00")

  # Rasterize the figure and drop the leading alpha channel of the ARGB buffer.
  fig.canvas.draw()
  canvas_width, canvas_height = fig.canvas.get_width_height()
  argb = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
  rendered = argb.reshape((canvas_height, canvas_width) + (4,))[:, :, 1:4]
  plt.close(fig)

  return rendered

def draw_keypoints_on_video(frames, structured_keypoints, joint_names, outpath, fps=50):
  """Write a video of the frames with their keypoints drawn on top.

  Args:
    frames: Sequence of images.
    structured_keypoints: DataFrame whose i-th row (by position) holds the
      keypoints of the i-th frame.
    joint_names: Names of the joints to draw.
    outpath: Path of the video file to write.
    fps: Frame rate of the written video. Defaults to 50, the value that was
      previously fixed, and mirrors the parameter of draw_world_preview_video.
  """
  images = []
  for i in range(len(frames)):
    image = draw_keypoints_on_image(frames[i], structured_keypoints.iloc[i].to_dict(), joint_names)
    images.append(image)

  imageio.mimsave(Path(outpath), images, fps=fps)

# Skeleton edges between the exported joints, given as pairs of the landmark
# indices used in MEDIAPIPE_JOINTS. The edges are drawn in this order.
POSE_EDGES_12 = [
  # left arm
  (11, 13),
  (13, 15),
  # right arm
  (12, 14),
  (14, 16),
  # shoulders
  (11, 12),
  # torso
  (11, 23),
  (12, 24),
  # hips
  (23, 24),
  # left leg
  (23, 25),
  (25, 27),
  # right leg
  (24, 26),
  (26, 28),
]

def render_world_frame(world_row):
  """Render the skeleton of one frame as a 3D plot and return it as an image.

  Args:
    world_row: Mapping with '<joint>_x', '<joint>_y' and '<joint>_z' entries
      for every joint in MEDIAPIPE_JOINTS.

  Returns:
    The rendered figure as an array with three color channels.
  """
  names = [name for name, _ in MEDIAPIPE_JOINTS]
  xs = [world_row[f"{name}_x"] for name in names]
  depth = [world_row[f"{name}_z"] for name in names]
  up = [world_row[f"{name}_y"] for name in names]

  fig = plt.figure(figsize=(4, 4))
  ax = fig.add_subplot(111, projection="3d")

  ax.scatter(xs, depth, up, s=25)

  # Landmark index -> position of that joint in the coordinate lists above.
  position_of = {
    landmark_index: position
    for position, (_, landmark_index) in enumerate(MEDIAPIPE_JOINTS)
  }
  for start, end in POSE_EDGES_12:
    i, j = position_of[start], position_of[end]
    ax.plot(
      [xs[i], xs[j]],
      [depth[i], depth[j]],
      [up[i], up[j]],
      linewidth=2
    )

  ax.set_xlim(-0.5, 0.5)
  ax.set_ylim(-0.5, 0.5)
  ax.set_zlim(-1.0, 1.0)

  # The data's y is plotted on matplotlib's z axis and vice versa, so the
  # axis labels are swapped accordingly.
  ax.set_xlabel("X")
  ax.set_ylabel("Z")
  ax.set_zlabel("Y")
  ax.invert_zaxis()
  ax.set_aspect('equal')

  ax.view_init(elev=20, azim=225)

  # Rasterize the figure and drop the leading alpha channel of the ARGB buffer.
  fig.canvas.draw()
  canvas_width, canvas_height = fig.canvas.get_width_height()
  argb = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
  rendered = argb.reshape((canvas_height, canvas_width, 4))[:, :, 1:4]

  plt.close(fig)
  return rendered


def draw_world_preview_video(world_df, outpath, fps=50):
  """Write a video with one rendered 3D skeleton per row of world_df.

  Args:
    world_df: DataFrame whose rows are passed to render_world_frame.
    outpath: Path of the video file to write.
    fps: Frame rate of the written video.
  """
  images = [render_world_frame(world_df.iloc[row]) for row in range(len(world_df))]
  imageio.mimsave(outpath, images, fps=fps)

In [5]:
# Overlay the detected keypoints on a short segment of one processed recording.

# The preview inputs are derived from the configuration cell so that this cell
# also runs on a fresh kernel.
# NOTE(review): the first recording found is used by default - point VIDEO_PATH
# at the recording the frame range below was chosen for.
video_paths = sorted(Path(VIDEO_DIR).rglob('*.MP4'))
if not video_paths:
  raise FileNotFoundError(f'No .MP4 recordings found below {VIDEO_DIR}')
VIDEO_PATH = video_paths[0]

# Same file naming scheme as used by process_videos.
OUT_MOVENET_PATH = Path(OUT_MOVENET_DIR) / f'{VIDEO_PATH.stem}_movenet.csv'
OUT_MEDIAPIPE_NORM_PATH = Path(OUT_MEDIAPIPE_NORM_DIR) / f'{VIDEO_PATH.stem}_mediapipe_norm.csv'
OUT_MEDIAPIPE_WORLD_PATH = Path(OUT_MEDIAPIPE_WORLD_DIR) / f'{VIDEO_PATH.stem}_mediapipe_world.csv'

# The preview videos cannot be written unless their directory exists.
PREVIEW_DIR = Path('../data/processed/keypoints_on_video')
PREVIEW_DIR.mkdir(parents=True, exist_ok=True)

joint_names = [name for name, index in MEDIAPIPE_JOINTS]

frames, _ = load_video_data(VIDEO_PATH)

movenet_structured = pd.read_csv(OUT_MOVENET_PATH)
mediapipe_norm_structured = pd.read_csv(OUT_MEDIAPIPE_NORM_PATH)
mediapipe_world_structured = pd.read_csv(OUT_MEDIAPIPE_WORLD_PATH)

# Frame range of the previewed segment.
start_frame = 550
end_frame = 650

draw_keypoints_on_video(frames[start_frame:end_frame], movenet_structured.iloc[start_frame:end_frame], joint_names, PREVIEW_DIR / 'movenet.mp4')
draw_keypoints_on_video(frames[start_frame:end_frame], mediapipe_norm_structured.iloc[start_frame:end_frame], joint_names, PREVIEW_DIR / 'mediapipe_norm.mp4')
draw_world_preview_video(mediapipe_world_structured.iloc[start_frame:end_frame], PREVIEW_DIR / 'mediapipe_world.mp4')

