# Extracting keypoints from videos

We will use the following models to extract keypoints:

* MoveNet (2D)
* ML Kit Pose Detection (3D)

## MoveNet (2D)

In [1]:
import tensorflow as tf
import tensorflow_hub as tf_hub
import numpy as np
import pandas as pd
import imageio
import matplotlib.pyplot as plt
from pathlib import Path

2026-02-04 14:08:02.619069: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from pkg_resources import parse_version


In [2]:
# Helper functions

# Maps each keypoint name to its index along the keypoint axis of the model
# output (used as keypoints[0, 0, KEYPOINT_DICT[name], ...] in the helpers
# below). A name's index is simply its position in this list.
KEYPOINT_DICT = {
  name: index
  for index, name in enumerate([
    'nose',
    'left_eye', 'right_eye',
    'left_ear', 'right_ear',
    'left_shoulder', 'right_shoulder',
    'left_elbow', 'right_elbow',
    'left_wrist', 'right_wrist',
    'left_hip', 'right_hip',
    'left_knee', 'right_knee',
    'left_ankle', 'right_ankle',
  ])
}

# Cropping algorithm to improve detection accuracy (source: https://www.tensorflow.org/hub/tutorials/movenet)

# Score threshold used by the cropping helpers: torso_visible() requires hip
# and shoulder scores above it, and determine_torso_and_body_range() ignores
# keypoints whose score falls below it.
MIN_CROP_KEYPOINT_SCORE = 0.2

def init_crop_region(image_height, image_width):
  """Build the default square crop region that covers the whole image.

  The square's side equals the longer image side and it is centred on the
  shorter one, so along the shorter axis the region starts below 0 and ends
  above 1. All values are fractions of the image height (y) or width (x).

  Args:
    image_height: image height in pixels.
    image_width: image width in pixels.

  Returns:
    dict with keys 'y_min', 'x_min', 'y_max', 'x_max', 'height', 'width'.
  """
  if image_width > image_height:
    # Landscape: full width, vertical extent padded beyond the image.
    x_min, box_width = 0.0, 1.0
    box_height = image_width / image_height
    y_min = (image_height / 2 - image_width / 2) / image_height
  else:
    # Portrait or square: full height, horizontal extent padded.
    y_min, box_height = 0.0, 1.0
    box_width = image_height / image_width
    x_min = (image_width / 2 - image_height / 2) / image_width

  region = {'y_min': y_min, 'x_min': x_min}
  region['y_max'] = y_min + box_height
  region['x_max'] = x_min + box_width
  region['height'] = box_height
  region['width'] = box_width
  return region

def torso_visible(keypoints):
  """Check whether enough of the torso was detected to centre a crop on it.

  The result is truthy when at least one hip AND at least one shoulder have a
  score (last-axis index 2) above MIN_CROP_KEYPOINT_SCORE.

  Args:
    keypoints: array indexed as keypoints[0, 0, keypoint_index, 2] for scores.
  """
  def detected(joint):
    # Score of the named joint compared against the crop threshold.
    return keypoints[0, 0, KEYPOINT_DICT[joint], 2] > MIN_CROP_KEYPOINT_SCORE

  return ((detected('left_hip') or detected('right_hip')) and
          (detected('left_shoulder') or detected('right_shoulder')))

def determine_torso_and_body_range(keypoints, target_keypoints, center_y, center_x):
  """Measure how far the torso and the whole body extend from a centre point.

  Args:
    keypoints: array indexed as keypoints[0, 0, keypoint_index, 2] for scores.
    target_keypoints: dict mapping keypoint name to a [y, x] pair.
    center_y: y coordinate of the centre, in the same units as target_keypoints.
    center_x: x coordinate of the centre, in the same units as target_keypoints.

  Returns:
    [max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange]:
    the largest absolute y / x distance from the centre over the four torso
    joints, and over every keypoint whose score is not below
    MIN_CROP_KEYPOINT_SCORE. Each value is 0.0 when nothing exceeds it.
  """
  def offsets(joint):
    # (vertical, horizontal) absolute distance of one joint from the centre.
    joint_y, joint_x = target_keypoints[joint][0], target_keypoints[joint][1]
    return abs(center_y - joint_y), abs(center_x - joint_x)

  def furthest(pairs, axis):
    # Seeding with 0.0 keeps the "start at zero, replace only when strictly
    # greater" behaviour of a running maximum.
    return max([0.0] + [pair[axis] for pair in pairs])

  torso_offsets = [
    offsets(joint)
    for joint in ('left_shoulder', 'right_shoulder', 'left_hip', 'right_hip')
  ]
  # Torso joints are always measured; other keypoints only when confident.
  body_offsets = [
    offsets(joint)
    for joint, index in KEYPOINT_DICT.items()
    if not (keypoints[0, 0, index, 2] < MIN_CROP_KEYPOINT_SCORE)
  ]

  return [
    furthest(torso_offsets, 0), furthest(torso_offsets, 1),
    furthest(body_offsets, 0), furthest(body_offsets, 1)
  ]

def determine_crop_region(keypoints, image_height, image_width):
  """Choose the crop region for the next frame from this frame's keypoints.

  When the torso is visible, the region is a square centred on the midpoint of
  the two hips and sized from the torso and body extents. Otherwise, or when
  that square would be larger than the image's longer side, the default region
  from init_crop_region() is returned.

  Args:
    keypoints: array indexed as keypoints[0, 0, keypoint_index, c], where c is
      0 for y, 1 for x (both as fractions of the image) and 2 for the score.
    image_height: image height in pixels.
    image_width: image width in pixels.

  Returns:
    dict with keys 'y_min', 'x_min', 'y_max', 'x_max', 'height', 'width',
    expressed as fractions of the image height (y) and width (x).
  """
  if not torso_visible(keypoints):
    return init_crop_region(image_height, image_width)

  # Keypoint positions scaled to pixels, as [y, x] pairs.
  target_keypoints = {
    joint: [
      keypoints[0, 0, index, 0] * image_height,
      keypoints[0, 0, index, 1] * image_width
    ]
    for joint, index in KEYPOINT_DICT.items()
  }

  left_hip = target_keypoints['left_hip']
  right_hip = target_keypoints['right_hip']
  center_y = (left_hip[0] + right_hip[0]) / 2
  center_x = (left_hip[1] + right_hip[1]) / 2

  torso_y, torso_x, body_y, body_x = determine_torso_and_body_range(
      keypoints, target_keypoints, center_y, center_x)

  # Half side length: torso extents widened by 1.9 and body extents by 1.2,
  # whichever is largest ...
  crop_length_half = np.amax(
      [torso_x * 1.9, torso_y * 1.9, body_y * 1.2, body_x * 1.2])
  # ... but never more than the distance from the centre to the farthest
  # image edge.
  edge_distances = np.array(
      [center_x, image_width - center_x, center_y, image_height - center_y])
  crop_length_half = np.amin([crop_length_half, np.amax(edge_distances)])

  if crop_length_half > max(image_width, image_height) / 2:
    return init_crop_region(image_height, image_width)

  crop_length = crop_length_half * 2
  top = center_y - crop_length_half
  left = center_x - crop_length_half

  y_min = top / image_height
  x_min = left / image_width
  y_max = (top + crop_length) / image_height
  x_max = (left + crop_length) / image_width
  return {
    'y_min': y_min,
    'x_min': x_min,
    'y_max': y_max,
    'x_max': x_max,
    'height': y_max - y_min,
    'width': x_max - x_min
  }

def crop_and_resize(image, crop_region, crop_size):
  """Crop `crop_region` out of `image` and resize the crop to `crop_size`.

  Args:
    image: image tensor passed straight to tf.image.crop_and_resize; the
      caller in this file adds a leading batch axis before calling.
    crop_region: dict providing 'y_min', 'x_min', 'y_max' and 'x_max'.
    crop_size: output size forwarded to tf.image.crop_and_resize.

  Returns:
    The result of tf.image.crop_and_resize for that single box.
  """
  box = [crop_region[edge] for edge in ('y_min', 'x_min', 'y_max', 'x_max')]
  return tf.image.crop_and_resize(
      image, boxes=[box], box_indices=[0], crop_size=crop_size)

def run_inference(movenet, image, crop_region, crop_size):
  """Run the model on a cropped frame and map keypoints back to the full frame.

  Args:
    movenet: callable that takes the int32 input tensor and returns a mapping
      whose 'output_0' entry has a .numpy() method.
    image: frame array of shape (height, width, channels).
    crop_region: dict with 'y_min', 'x_min', 'height' and 'width' (plus the
      keys crop_and_resize needs), as fractions of the frame size.
    crop_size: size the crop is resized to before being fed to the model.

  Returns:
    The model's 'output_0' array with the y (index 0) and x (index 1) values
    of the first 17 keypoints rewritten from crop-relative to frame-relative
    fractions. Scores (index 2) are left untouched.
  """
  image_height, image_width, _ = image.shape
  batched_image = tf.expand_dims(image, axis=0)
  model_input = tf.cast(
    crop_and_resize(batched_image, crop_region, crop_size=crop_size),
    dtype=tf.int32)

  keypoints_with_scores = movenet(model_input)['output_0'].numpy()

  # Crop origin and extent in pixels; constant for every keypoint.
  y_offset = crop_region['y_min'] * image_height
  y_extent = crop_region['height'] * image_height
  x_offset = crop_region['x_min'] * image_width
  x_extent = crop_region['width'] * image_width

  points = keypoints_with_scores[0, 0]  # view: writes land in the output array
  for idx in range(17):
    points[idx, 0] = (y_offset + y_extent * points[idx, 0]) / image_height
    points[idx, 1] = (x_offset + x_extent * points[idx, 1]) / image_width
  return keypoints_with_scores


def load_video_data(video_path):
  """Read every frame of a video into memory together with a timestamp.

  Args:
    video_path: path of the video, passed to imageio.get_reader.

  Returns:
    (frames, timestamps): `frames` is a list with every frame yielded by the
    reader, in order; `timestamps` is a list of ints of the same length,
    where each entry is frame_index / fps truncated to whole seconds.
  """
  frames = []
  timestamps = []

  # The context manager closes the reader even if decoding fails part-way;
  # previously the reader was left open.
  with imageio.get_reader(video_path) as reader:
    fps = reader.get_meta_data()['fps']

    for frame_index, frame in enumerate(reader):
      frames.append(frame)
      # int() truncates, so all frames within the same second share a value.
      timestamps.append(int(frame_index / fps))

  return frames, timestamps

def compute_video_keypoints(frames, model, input_size=256):
  """Run keypoint detection on every frame, tracking the crop between frames.

  The first frame is processed with the default crop region; each later frame
  uses the region derived from the previous frame's keypoints.

  Args:
    frames: sequence of frame arrays of shape (height, width, channels). The
      first frame's height and width are used for all crop computations, so
      the frames are assumed to share one size.
    model: callable handed to run_inference for each frame.
    input_size: side length in pixels of the square crop fed to the model.
      Defaults to 256, the value that was previously hard-coded (the input
      resolution given for MoveNet Thunder in the MoveNet tutorial); pass a
      different value for a model with another input resolution.

  Returns:
    List with one run_inference output per frame, in frame order. Empty when
    `frames` is empty.
  """
  # len() rather than truthiness so an array of frames is accepted as well.
  if len(frames) == 0:
    return []

  frame_height, frame_width, _ = frames[0].shape
  crop_size = [input_size, input_size]
  crop_region = init_crop_region(frame_height, frame_width)

  keypoint_list = []
  for frame in frames:
    outputs = run_inference(model, frame, crop_region, crop_size=crop_size)
    keypoint_list.append(outputs)

    # The detected pose decides where the next frame is cropped.
    crop_region = determine_crop_region(outputs, frame_height, frame_width)

  return keypoint_list

def restructure_keypoints(keypoint_list, timestamps):
  """Flatten per-frame keypoint arrays into one DataFrame row per frame.

  Args:
    keypoint_list: list of per-frame arrays; they are concatenated along axis
      0 and indexed as [frame, 0, keypoint, component].
    timestamps: one value per frame, converted to int for the 'time' column.

  Returns:
    DataFrame with columns 'frame' (0-based position in keypoint_list), 'time',
    and for every keypoint with index above 4 (the face keypoints 0-4 are
    dropped) the columns '<joint>_x', '<joint>_y' and '<joint>_confidence',
    taken from components 1, 0 and 2 respectively.
  """
  per_frame = np.concatenate(keypoint_list, axis=0)[:, 0, :, :]
  body = per_frame[:, 5:, :]  # skip the five face keypoints

  # Joint names in index order, matching the columns left in `body`.
  ordered_names = sorted(KEYPOINT_DICT, key=KEYPOINT_DICT.get)
  body_joints = [name for name in ordered_names if KEYPOINT_DICT[name] > 4]

  columns = {
    'frame': np.arange(len(keypoint_list), dtype=int),
    'time': np.asarray(timestamps, dtype=int),
  }
  for position, name in enumerate(body_joints):
    columns[f'{name}_x'] = body[:, position, 1]
    columns[f'{name}_y'] = body[:, position, 0]
    columns[f'{name}_confidence'] = body[:, position, 2]

  return pd.DataFrame(columns)


def draw_keypoints_on_image(image, structured_keypoints):
  """Render a frame with its body keypoints overlaid and return the pixels.

  Args:
    image: frame array of shape (height, width, channels).
    structured_keypoints: mapping with '<joint>_x' and '<joint>_y' entries for
      every keypoint with index above 4, given as fractions of the frame
      width / height.

  Returns:
    uint8 array of shape (canvas_height, canvas_width, 3): the rendered
    figure's ARGB buffer with the alpha channel dropped.
  """
  img_height, img_width, _ = image.shape
  fig, ax = plt.subplots(figsize=(12 * (float(img_width) / img_height), 12))
  fig.tight_layout(pad=0)
  ax.margins(0)
  ax.set_yticklabels([])
  ax.set_xticklabels([])
  ax.axis('off')

  ordered_names = sorted(KEYPOINT_DICT, key=KEYPOINT_DICT.get)
  body_joints = [name for name in ordered_names if KEYPOINT_DICT[name] > 4]
  # Scale the fractional coordinates to pixel positions on the frame.
  xs = np.array([structured_keypoints[f"{name}_x"] for name in body_joints]) * img_width
  ys = np.array([structured_keypoints[f"{name}_y"] for name in body_joints]) * img_height

  ax.imshow(image)
  ax.scatter(xs, ys, c="#00ff00")
  fig.canvas.draw()

  # Read the drawn canvas back as (height, width, ARGB).
  raw = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
  canvas_width, canvas_height = fig.canvas.get_width_height()
  argb = raw.reshape((canvas_height, canvas_width, 4))
  plt.close(fig)  # one figure per call; close it so figures do not pile up

  return argb[:, :, 1:4]

def draw_keypoints_on_video(frames, structured_keypoints, outpath, fps=50):
  """Overlay keypoints on every frame and write the result as a video file.

  Args:
    frames: sequence of frame arrays.
    structured_keypoints: DataFrame whose i-th row (by position) holds the
      keypoints of frames[i]; it needs at least len(frames) rows.
    outpath: destination file path. Missing parent directories are created.
    fps: frame rate of the written video. Defaults to 50, the value that was
      previously hard-coded; pass the source video's frame rate to keep the
      original playback speed.
  """
  outpath = Path(outpath)
  # Create the output directory so the write does not fail just because the
  # directory is missing (e.g. on a fresh checkout).
  outpath.parent.mkdir(parents=True, exist_ok=True)

  images = [
    draw_keypoints_on_image(frame, structured_keypoints.iloc[i].to_dict())
    for i, frame in enumerate(frames)
  ]

  imageio.mimsave(outpath, images, fps=fps)

In [3]:
# Recording to analyse; the path is relative to the notebook's working directory.
VIDEO_DIR = Path('../data/Utvalda filminspelningar för IRAF analys/Dec 2025 sit-stå och stå-sitt')
VIDEO_PATH = VIDEO_DIR / 'DJI_20250425092743_0028_D.MP4'

# Load MoveNet single-pose Thunder (v4) from TF Hub and keep only its
# 'serving_default' signature, which is the callable run_inference invokes.
model = tf_hub.load("https://tfhub.dev/google/movenet/singlepose/thunder/4")
model = model.signatures['serving_default']

# Read all frames, detect keypoints on each frame, then flatten the per-frame
# outputs into a DataFrame with one row per frame.
frames, timestamps = load_video_data(VIDEO_PATH)
keypoint_list = compute_video_keypoints(frames, model)
structured_keypoints = restructure_keypoints(keypoint_list, timestamps)
# Last expression of the cell: shown as the cell output.
structured_keypoints

I0000 00:00:1770210486.730769   62962 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3886 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5
2026-02-04 14:08:29.746268: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91801


Unnamed: 0,frame,time,left_shoulder_x,left_shoulder_y,left_shoulder_confidence,right_shoulder_x,right_shoulder_y,right_shoulder_confidence,left_elbow_x,left_elbow_y,...,left_knee_confidence,right_knee_x,right_knee_y,right_knee_confidence,left_ankle_x,left_ankle_y,left_ankle_confidence,right_ankle_x,right_ankle_y,right_ankle_confidence
0,0,0,0.521092,0.519910,0.518110,0.487056,0.516722,0.474878,0.510511,0.588453,...,0.385356,0.433174,0.683465,0.330330,0.453917,0.836186,0.308938,0.436931,0.815531,0.253571
1,1,0,0.527350,0.528623,0.950296,0.494422,0.521833,0.817320,0.486309,0.586663,...,0.860696,0.410921,0.702311,0.876339,0.449583,0.847424,0.892238,0.428268,0.827503,0.912502
2,2,0,0.528538,0.527570,0.726030,0.490460,0.520787,0.901920,0.484894,0.584514,...,0.810998,0.410692,0.703591,0.842042,0.450361,0.850583,0.899512,0.428887,0.826975,0.884594
3,3,0,0.528784,0.527498,0.706572,0.491219,0.521669,0.858702,0.485602,0.585935,...,0.790453,0.410869,0.702297,0.823224,0.450788,0.848687,0.876097,0.429132,0.825626,0.831866
4,4,0,0.528488,0.527175,0.720821,0.490798,0.520651,0.911837,0.485270,0.585636,...,0.816951,0.410958,0.703541,0.858632,0.450401,0.849938,0.736147,0.429274,0.826565,0.899712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392,1392,27,0.496995,0.529126,0.775767,0.460909,0.527993,0.793917,0.503290,0.620198,...,0.355572,0.417221,0.704720,0.782718,0.450130,0.850940,0.886711,0.427603,0.829396,0.691001
1393,1393,27,0.497176,0.528483,0.683359,0.461329,0.527684,0.597838,0.503264,0.620914,...,0.435492,0.416934,0.703567,0.825288,0.450592,0.850619,0.916399,0.427746,0.830316,0.743930
1394,1394,27,0.497203,0.528441,0.751997,0.460787,0.527693,0.776782,0.503783,0.621019,...,0.537183,0.417114,0.704459,0.847842,0.450644,0.850071,0.862668,0.427844,0.829929,0.741679
1395,1395,27,0.497441,0.528786,0.727078,0.460570,0.528287,0.772776,0.503455,0.620536,...,0.460713,0.417113,0.704490,0.814719,0.449691,0.850503,0.838916,0.427294,0.830889,0.739813


In [4]:
# Render frames 550-649 with their keypoints overlaid and save the clip to disk.
draw_keypoints_on_video(frames[550:650], structured_keypoints.iloc[550:650], '../data/processed/keypoints_on_video/movenet.mp4')

