# Non-Blender MediaPipe Landmark Smoothing Pipeline

This Jupyter notebook is for those who are trying to get started and want a simple view of what we are doing. It removes all the Blender aspects of the project and is a space to test different algorithms. Before you begin, make sure to pip install:
- mediapipe
- opencv-python (imported as `cv2`)

## 1. Pre-clean (whole-sequence pass)
- Normalize all landmarks to a consistent coordinate space.
- Reject or drop extreme spikes (e.g., velocity z-score across the sequence).
- Fill gaps from dropped frames with linear or spline interpolation.

## 2. Temporal denoising (window/full sequence) [Try this one!]
- **Savitzky–Golay filter** (window 7–11 frames, polynomial order 2) — smooths while keeping peaks sharp. (https://eigenvector.com/wp-content/uploads/2020/01/SavitzkyGolay.pdf)
- Or apply a **low-pass Butterworth/IIR filter** to each joint trajectory.  
Both require larger windows or the full trajectory to avoid introducing lag.

## 3. Adaptive pass (optional)
- Run a **One-Euro filter** tuned on the cleaned signal.  
Since we have full data, apply it forward **and** backward (causal + anti-causal) to cancel lag.

## 4. Kalman smoothing (not just filtering)
- Instead of online Kalman, run a full **Rauch–Tung–Striebel smoother** per joint. (https://arxiv.org/pdf/1303.5237)
This backward pass bridges dropouts and reduces jitter more than the real-time Kalman filter.

## 5. Orientation smoothing
- Smooth orientations using **SLERP with EMA/One-Euro** across the whole sequence. (https://direction.bordeaux.inria.fr/~roussel/publications/2012-CHI-one-euro-filter.pdf)

## 6. Rig-side polish
- Add velocity/acceleration penalties during retarget optimization.
- After baking to the rig, apply a final **low-pass filter (e.g. 2nd-order Butterworth, cutoff 6–8 Hz @30 fps)** on Euler f-curves.
- Don't worry about this though.

## Globals and Imports

In [40]:
# Frame source selector: "video" reads VIDEO_PATH; any other value opens the webcam.
INPUT_SOURCE = "webcam" # options are webcam or video
# File opened when INPUT_SOURCE == "video".
VIDEO_PATH = "sample.mp4"
# Device index passed to cv2.VideoCapture for the webcam source.
WEBCAM_ID = 0 # IF YOU GET A WEBCAM ERROR SWITCH THIS NUMBER
# When True the run loop shows a live preview window and listens for 'q' to quit.
DRAW_2D = True # Turn off if you don't care about cv2 output (most helpful for video output).
# NOTE(review): not read anywhere in the visible notebook code; frames are
# always collected regardless of this flag - confirm intended use.
COLLECT_FRAMES = True
# Stop the run loop once more than this many frames have been read; None means no limit.
MAX_FRAMES = None
# Filename prefix for the saved outputs (<prefix>_raw.npz, <prefix>_smooth.npz, <prefix>_meta.json).
SAVE_PREFIX = "pose_run"

import json
import os
import time

import cv2
import mediapipe as mp
import numpy as np

## Main functions

In [45]:
# don't worry about this function.
def open_capture():
    """Open the configured frame source and report its frame rate.

    Reads the module-level INPUT_SOURCE, VIDEO_PATH and WEBCAM_ID settings.
    When INPUT_SOURCE is "video" the file at VIDEO_PATH is opened; any other
    value opens the webcam with index WEBCAM_ID and requests a 1280x720
    stream with a one-frame buffer.

    Returns:
        (cap, fps): the opened cv2.VideoCapture and its frame rate as a float.
        A missing/zero rate reported by the backend falls back to 30.0; for
        the webcam, rates below 1 also fall back to 30.0.

    Raises:
        FileNotFoundError: the video file could not be opened.
        RuntimeError: the webcam could not be opened.
    """
    if INPUT_SOURCE == "video":
        cap = cv2.VideoCapture(VIDEO_PATH)
        if not cap.isOpened():
            cap.release()
            raise FileNotFoundError(f"Could not open: {VIDEO_PATH}")
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    else:
        cap = cv2.VideoCapture(WEBCAM_ID)
        # Fail loudly here: an unopened capture would otherwise make the run
        # loop exit on its first read and silently save an empty recording.
        if not cap.isOpened():
            cap.release()
            raise RuntimeError(
                f"Could not open webcam {WEBCAM_ID}; try a different WEBCAM_ID"
            )
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        if fps < 1:
            fps = 30.0
    return cap, float(fps)

class PoseRunner:
    """Wrap a MediaPipe Pose instance and return its landmarks as NumPy arrays.

    Attributes:
        pose: the underlying mp.solutions.pose.Pose object.
        conn: mp.solutions.pose.POSE_CONNECTIONS, exposed for drawing bones.
        N: number of pose landmarks per frame (33).
    """

    def __init__(self):
        pose_module = mp.solutions.pose
        self.pose = pose_module.Pose(
            static_image_mode=False,
            model_complexity=2,
            smooth_landmarks=True,
            min_detection_confidence=0.6,
            min_tracking_confidence=0.75,
        )
        self.conn = pose_module.POSE_CONNECTIONS
        self.N = 33  # pose landmarks

    @staticmethod
    def _landmarks_to_array(landmark_list):
        """Return an (N, 4) float32 array of x, y, z, visibility, or None if absent."""
        if not landmark_list:
            return None
        rows = [
            (lm.x, lm.y, lm.z, lm.visibility)
            for lm in landmark_list.landmark
        ]
        return np.array(rows, dtype=np.float32)

    def process(self, bgr):
        """Run pose estimation on one BGR frame.

        Returns:
            (pts2d, pts3d): arrays built from `pose_landmarks` and
            `pose_world_landmarks` respectively, each with columns
            x, y, z, visibility; either is None when the result has no
            such landmarks.
        """
        # The frame is converted from BGR to RGB before being handed to MediaPipe.
        result = self.pose.process(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
        image_points = self._landmarks_to_array(result.pose_landmarks)
        world_points = self._landmarks_to_array(result.pose_world_landmarks)
        return image_points, world_points

    def close(self):
        """Close the underlying MediaPipe Pose object."""
        self.pose.close()


# This is where most of the changes are going to be implemented
class IdentitySmoother:
    """
    Placeholder smoother that passes landmarks through unchanged.

    Swap the body of `step` for a real filter (EMA / One-Euro / Kalman) later;
    the constructor and `step` signature are the stable API the runner uses.
    """

    def __init__(self):
        # Slot for per-instance filter state; the identity pass never touches it.
        self._last = None

    def step(self, t, pts: np.ndarray) -> np.ndarray:
        """Return `pts` as-is (the same object); the timestamp `t` is ignored."""
        smoothed = pts
        return smoothed

# No need to worry about this function this is just for a live view of the output.
def draw_points_and_bones(frame_bgr, pts_2d: np.ndarray, connections, color=(0,255,0)):
    """Draw landmark dots and connecting bones onto `frame_bgr` in place.

    `pts_2d` rows are (x, y, z, visibility); x and y are scaled by the frame
    width and height to get pixel positions. A bone is drawn only when both
    endpoints fall inside the frame and both have visibility above 0.2; a dot
    is drawn only for in-frame points with visibility above 0.2.

    Returns the same `frame_bgr` object (unchanged when `pts_2d` is None).
    """
    if pts_2d is None:
        return frame_bgr

    height, width = frame_bgr.shape[:2]
    min_visibility = 0.2

    def to_pixel(index):
        # Scale the landmark's x/y by the frame size and truncate to ints.
        return int(pts_2d[index, 0] * width), int(pts_2d[index, 1] * height)

    def inside(px, py):
        return 0 <= px < width and 0 <= py < height

    for start, end in (connections or ()):
        p_start, p_end = to_pixel(start), to_pixel(end)
        if not (inside(*p_start) and inside(*p_end)):
            continue
        if pts_2d[start, 3] > min_visibility and pts_2d[end, 3] > min_visibility:
            cv2.line(frame_bgr, p_start, p_end, color, 2, cv2.LINE_AA)

    for x, y, _depth, vis in pts_2d:
        if vis <= min_visibility:
            continue
        center = (int(x * width), int(y * height))
        if inside(*center):
            cv2.circle(frame_bgr, center, 3, color, -1, cv2.LINE_AA)

    return frame_bgr

## Runner Code

In [44]:
# Capture loop: read frames, run pose estimation, pass the landmarks through
# the smoothers, and keep per-frame raw/smoothed results plus timestamps.
cap, fps = open_capture()
pose = None  # created inside the try so the capture is released if model setup fails

raw2d_list, raw3d_list = [], []
sm2d_list,  sm3d_list  = [], []
times = []
frame_idx = 0

try:
    pose = PoseRunner()
    filt2d, filt3d = IdentitySmoother(), IdentitySmoother()

    t0 = time.time()
    print(f"Running at ~{fps:.1f} FPS — press 'q' to quit")

    while True:
        ok, bgr = cap.read()
        if not ok:
            break
        frame_idx += 1
        if (MAX_FRAMES is not None) and (frame_idx > MAX_FRAMES):
            break

        pts2d, pts3d = pose.process(bgr)
        # NOTE(review): timestamps are wall-clock seconds since t0, so for a
        # video file they reflect processing speed rather than media time.
        t = time.time() - t0

        # Frames with no detection are stored as None so the lists stay
        # aligned with `times`.
        raw2d_list.append(pts2d.copy() if pts2d is not None else None)
        raw3d_list.append(pts3d.copy() if pts3d is not None else None)

        sm2d = filt2d.step(t, pts2d) if pts2d is not None else None
        sm3d = filt3d.step(t, pts3d) if pts3d is not None else None
        sm2d_list.append(sm2d.copy() if sm2d is not None else None)
        sm3d_list.append(sm3d.copy() if sm3d is not None else None)
        times.append(t)

        if DRAW_2D:
            vis = bgr.copy()
            if sm2d is not None:
                vis = draw_points_and_bones(vis, sm2d, pose.conn, color=(0,255,0))
            cv2.putText(vis, f"Frame {frame_idx}", (12,26),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,255,255), 2, cv2.LINE_AA)
            cv2.imshow("MediaPipe Pose - Stored Raw & Filtered", vis)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and preview window, including when the kernel is
    # interrupted or a frame raises; otherwise the device stays locked.
    cap.release()
    if pose is not None:
        pose.close()
    cv2.destroyAllWindows()

T, N = len(times), pose.N
def stack_or_nan(seq, N):
    """Pack list[None or (N,4)] -> (len(seq),N,4) with NaNs for missing frames.

    Args:
        seq: per-frame entries, each either None or an array of shape (N, 4).
        N: number of landmarks expected per frame.

    Returns:
        A float32 array of shape (len(seq), N, 4). Frames whose entry is None,
        or whose shape is not exactly (N, 4), are left as NaN.
    """
    # Size the output from the sequence itself instead of a module-level frame
    # count, so the function works for any input length.
    out = np.full((len(seq), N, 4), np.nan, dtype=np.float32)
    for i, arr in enumerate(seq):
        if arr is None:
            continue
        if arr.shape == (N, 4):
            out[i] = arr
    return out

# Pack the per-frame lists into dense arrays (NaN where a frame had no pose).
raw2d = stack_or_nan(raw2d_list, N)
raw3d = stack_or_nan(raw3d_list, N)
sm2d  = stack_or_nan(sm2d_list,  N)
sm3d  = stack_or_nan(sm3d_list,  N)

print("Packed arrays:")
print(" raw2d:", raw2d.shape, " raw3d:", raw3d.shape, " sm2d:", sm2d.shape, " sm3d:", sm3d.shape)

# Both archives share the same timestamp vector, so build it once.
times_arr = np.array(times, dtype=np.float32)
np.savez_compressed(f"{SAVE_PREFIX}_raw.npz",
                    times=times_arr,
                    pose2d=raw2d, pose3d=raw3d)
np.savez_compressed(f"{SAVE_PREFIX}_smooth.npz",
                    times=times_arr,
                    pose2d=sm2d,  pose3d=sm3d)

# Sidecar metadata describing the run; for a video source the absolute file
# path is recorded instead of the literal "webcam".
meta = {
    "source": INPUT_SOURCE if INPUT_SOURCE=="webcam" else os.path.abspath(VIDEO_PATH),
    "fps": fps,
    "num_frames": T,
    "landmarks": "MediaPipe Pose (33 points, columns: x,y,z,visibility)",
    "shapes": {
        "raw2d":  list(raw2d.shape),
        "raw3d":  list(raw3d.shape),
        "sm2d":   list(sm2d.shape),
        "sm3d":   list(sm3d.shape)
    }
}
# Explicit encoding keeps the JSON file identical across platforms.
with open(f"{SAVE_PREFIX}_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print(f"Saved:\n  {SAVE_PREFIX}_raw.npz\n  {SAVE_PREFIX}_smooth.npz\n  {SAVE_PREFIX}_meta.json")


Running at ~10.0 FPS — press 'q' to quit


I0000 00:00:1757635477.540333   18979 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1757635477.542747   47686 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.2.8-1~bpo12+1pop1~1744225826~22.04~b077665), renderer: Mesa Intel(R) Graphics (ADL GT2)
W0000 00:00:1757635477.627206   47667 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757635477.768828   47665 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Packed arrays:
 raw2d: (103, 33, 4)  raw3d: (103, 33, 4)  sm2d: (103, 33, 4)  sm3d: (103, 33, 4)
Saved:
  pose_run_raw.npz
  pose_run_smooth.npz
  pose_run_meta.json
