# Data Augmentation of Keypoint CSV Data

- Three methods: **mirroring**, **rotation**, **stretching**
- Applied to MoveNet (2D) and MediaPipe (3D) keypoint CSVs

In [39]:
import numpy as np
import pandas as pd
from pathlib import Path
import time

In [40]:
# Input directories: one folder of keypoint CSVs per pose model / coordinate variant.
MOVENET_DIR = Path('../data/processed/keypoints', 'movenet')
MEDIAPIPE_NORM_DIR = Path('../data/processed/keypoints', 'mediapipe_norm')
MEDIAPIPE_WORLD_DIR = Path('../data/processed/keypoints', 'mediapipe_world')

# Output directories for the augmented .npz files (same sub-folder names as the inputs).
OUT_MOVENET_DIR = Path('../data/processed/keypoints_augmented', 'movenet')
OUT_MEDIAPIPE_NORM_DIR = Path('../data/processed/keypoints_augmented', 'mediapipe_norm')
OUT_MEDIAPIPE_WORLD_DIR = Path('../data/processed/keypoints_augmented', 'mediapipe_world')

# Joint name prefixes; coordinate columns are '<joint>_x', '<joint>_y' and, for 3D data, '<joint>_z'.
# Order is left/right pairs from shoulders down to ankles.
JOINTS = [
    f'{side}_{part}'
    for part in ('shoulder', 'elbow', 'wrist', 'hip', 'knee', 'ankle')
    for side in ('left', 'right')
]

# Rotation angles in radians.
# NOTE(review): run_augmentation hard-codes these same angles (and the stretch
# factors below) in its step list instead of reading these constants -- keep
# both places in sync when changing a value.
ROTATION_ANGLES = [np.pi / 7, -np.pi / 11, -np.pi / 13]
# Stretch factors along x and y (two independent lists with the same values).
STRETCH_FACTORS_X = [0.9, 1.1, 1.2]
STRETCH_FACTORS_Y = list(STRETCH_FACTORS_X)

In [41]:
def is_3d(df):
    """Return True if at least one column name of ``df`` ends with ``'_z'``.

    Every column is considered, not only the joints listed in ``JOINTS``.
    """
    return any(name.endswith('_z') for name in df.columns)


def get_columns(df, suffix):
    """List the ``<joint><suffix>`` column names that exist in ``df``.

    Args:
        df: DataFrame whose columns are inspected.
        suffix: Text appended to each joint name, e.g. ``'_x'``.

    Returns:
        Column names in ``JOINTS`` order; joints without a matching column
        are skipped, so the list may be empty.
    """
    candidates = (joint + suffix for joint in JOINTS)
    return [name for name in candidates if name in df.columns]

## Augmentation functions

### Mirroring
- Negate x, y, and/or z coordinates
- Mirror any axis combination

In [42]:
def mirror(df, mirror_x=True, mirror_y=False, mirror_z=False):
    """Return a copy of ``df`` with the selected joint coordinates negated.

    Args:
        df: Keypoint DataFrame; it is not modified.
        mirror_x: Negate the ``*_x`` joint columns (on by default).
        mirror_y: Negate the ``*_y`` joint columns.
        mirror_z: Negate the ``*_z`` joint columns; skipped when ``df`` has none.

    Returns:
        A new DataFrame. Columns that are not joint coordinates of a selected
        axis are left as copied.

    NOTE(review): only the values are negated (reflection through 0); the
    left_*/right_* column labels are not swapped -- confirm that is intended.
    """
    flipped = df.copy()

    # Collect the column groups to negate first; they are looked up on the
    # untouched input, so the order of application does not matter.
    targets = []
    if mirror_x:
        targets.append(get_columns(df, '_x'))
    if mirror_y:
        targets.append(get_columns(df, '_y'))
    if mirror_z:
        depth_cols = get_columns(df, '_z')
        if depth_cols:  # only present for 3D data
            targets.append(depth_cols)

    for cols in targets:
        flipped[cols] = flipped[cols] * -1

    return flipped

### Rotation
- **2D (MoveNet):** rotation matrix on (x, y)
- **3D (MediaPipe):** Y-axis rotation on (x, z)

In [43]:
def rotate(df, angle):
    """Return a copy of ``df`` with every joint in ``JOINTS`` rotated by ``angle``.

    When ``is_3d(df)`` is true the rotation is about the Y axis, so only the
    x and z columns change. Otherwise the rotation happens in the xy-plane.
    In both cases the rotation is about the coordinate origin.

    Args:
        df: Keypoint DataFrame; it is not modified.
        angle: Rotation angle in radians.

    Returns:
        A new DataFrame with the rotated joint columns.

    Raises:
        KeyError: if a joint column is missing. Unlike ``mirror`` and
            ``stretch``, the columns are indexed directly for every joint.
    """
    rotated = df.copy()
    cos_a = np.cos(angle)
    sin_a = np.sin(angle)

    # Both cases rotate x against one partner axis and differ only in which
    # axis that is and in the sign of the sine term:
    #   3D: x' = c*x + s*z,  z' = c*z - s*x
    #   2D: x' = c*x - s*y,  y' = c*y + s*x
    if is_3d(df):
        partner, k = '_z', sin_a
    else:
        partner, k = '_y', -sin_a

    for joint in JOINTS:
        x_col = joint + '_x'
        p_col = joint + partner
        # Read from the untouched input so the second formula sees the old x.
        x = df[x_col].values
        p = df[p_col].values
        rotated[x_col] = cos_a * x + k * p
        rotated[p_col] = cos_a * p - k * x

    return rotated

### Stretching
- Scale x, y, and/or z coordinates by a factor
- Z-stretch only applies to 3D data

In [44]:
def stretch(df, fx=1.0, fy=1.0, fz=1.0):
    """Return a copy of ``df`` with joint coordinates scaled per axis.

    Args:
        df: Keypoint DataFrame; it is not modified.
        fx: Multiplier for the ``*_x`` joint columns.
        fy: Multiplier for the ``*_y`` joint columns.
        fz: Multiplier for the ``*_z`` joint columns; ignored when ``df`` has none.

    Returns:
        A new DataFrame. An axis whose factor is exactly ``1.0`` is left as copied.
    """
    scaled = df.copy()

    for suffix, factor in (('_x', fx), ('_y', fy)):
        if factor != 1.0:
            cols = get_columns(df, suffix)
            scaled[cols] = scaled[cols] * factor

    if fz != 1.0:
        depth_cols = get_columns(df, '_z')
        if depth_cols:  # only present for 3D data
            scaled[depth_cols] = scaled[depth_cols] * fz

    return scaled

In [45]:
def run_augmentation(input_dir, output_dir):
    """Augment every ``*.csv`` in ``input_dir`` and write the variants as ``.npz``.

    The steps are cumulative: each step is applied to every frame produced so
    far (the original included) and the results are appended, so the batch
    doubles with every step. With the 10 steps below that is 2**10 = 1024
    frames per CSV; the unmodified original is not written, which leaves
    1023 output files per CSV.

    Each output file is named ``<csv stem><concatenated step suffixes>.npz``
    and holds two arrays: ``data`` (``df.values``) and ``columns`` (the column
    names).

    Side effects: ``output_dir`` is created if needed and every ``*.npz``
    already in it is deleted before new files are written. Progress and a
    final summary are printed.

    Args:
        input_dir: ``pathlib.Path`` of the directory holding the source CSVs.
        output_dir: ``pathlib.Path`` of the directory to write into.

    Raises:
        FileNotFoundError: if ``input_dir`` contains no ``*.csv`` files. This
            is checked before ``output_dir`` is touched, so a mistyped input
            path cannot wipe previously generated results.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during this long-running job.
    start = time.perf_counter()

    # Resolve inputs first (see Raises). Sorted because glob order is
    # arbitrary; this makes processing and log order reproducible.
    csv_files = sorted(input_dir.glob('*.csv'))
    if not csv_files:
        raise FileNotFoundError(f'No CSV files found in {input_dir}')

    # (filename suffix, key into funcs, keyword arguments).
    # Rotation suffixes are the angle in degrees rounded to one decimal.
    # The step order determines the suffix order in the output file names.
    steps = [
        ('_mirrored',    'mirror',  {}),
        ('_stretchX1.1', 'stretch', {'fx': 1.1}),
        ('_stretchX1.2', 'stretch', {'fx': 1.2}),
        ('_stretchX0.9', 'stretch', {'fx': 0.9}),
        ('_stretchY1.1', 'stretch', {'fy': 1.1}),
        ('_stretchY1.2', 'stretch', {'fy': 1.2}),
        ('_stretchY0.9', 'stretch', {'fy': 0.9}),
        ('_rot25.7',     'rotate',  {'angle': np.pi / 7}),
        ('_rot-16.4',    'rotate',  {'angle': -np.pi / 11}),
        ('_rot-13.8',    'rotate',  {'angle': -np.pi / 13}),
    ]

    funcs = {'mirror': mirror, 'stretch': stretch, 'rotate': rotate}

    # Start from a clean output directory so stale variants never mix with new ones.
    output_dir.mkdir(parents=True, exist_ok=True)
    for old in output_dir.glob('*.npz'):
        old.unlink()

    total = 0

    for i, csv_file in enumerate(csv_files, start=1):
        base_name = csv_file.stem
        # Entries are (frame, accumulated suffix); the original has an empty suffix.
        batch = [(pd.read_csv(csv_file), '')]

        for suffix, func_name, kwargs in steps:
            transform = funcs[func_name]
            # Build the new entries as a full list before extending, so the
            # step only sees the batch as it was when the step started.
            new = [(transform(df, **kwargs), s + suffix) for df, s in batch]
            batch.extend(new)

        count = 0
        for df, s in batch:
            if s == '':
                # The unmodified original is not an augmentation; skip it.
                continue
            np.savez_compressed(
                output_dir / f'{base_name}{s}.npz',
                data=df.values,
                columns=np.array(list(df.columns))
            )
            count += 1

        total += count
        # Drop the frames of this CSV before the next one is loaded.
        del batch
        print(f'  [{i}/{len(csv_files)}] {base_name}: {count} files')

    elapsed = time.perf_counter() - start
    print(f'Done: {input_dir.name} -> {total} files in {elapsed:.1f}s')

## Run augmentation
- Sequential approach (same as MATLAB): each step applied to ALL previous results
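- **1023 augmented variants per original file**: 10 steps, each doubling the batch, give $2^{10} = 1024$ frames; the unmodified original is not saved, leaving $2^{10} - 1 = 1023$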
- Output saved as `.npz` files

### MoveNet (2D)

In [46]:
# Augment the MoveNet CSVs. run_augmentation deletes every *.npz already in
# OUT_MOVENET_DIR before writing; the run recorded here took about 8 minutes.
run_augmentation(MOVENET_DIR, OUT_MOVENET_DIR)

  [1/10] DJI_20250425092743_0028_D_movenet: 1023 files
  [2/10] DJI_20250425121226_0076_D_movenet: 1023 files
  [3/10] DJI_20250425112502_0059_D_movenet: 1023 files
  [4/10] DJI_20250425125448_0093_D_movenet: 1023 files
  [5/10] DJI_20250425120835_0074_D_movenet: 1023 files
  [6/10] DJI_20250425112749_0061_D_movenet: 1023 files
  [7/10] DJI_20250425104507_0045_D_movenet: 1023 files
  [8/10] DJI_20250425125202_0091_D_movenet: 1023 files
  [9/10] DJI_20250425104804_0047_D_movenet: 1023 files
  [10/10] DJI_20250425093100_0030_D_movenet: 1023 files
Done: movenet -> 10230 files in 481.5s


### MediaPipe Normalized (3D)

In [47]:
# Augment the MediaPipe normalized CSVs. run_augmentation deletes every *.npz
# already in OUT_MEDIAPIPE_NORM_DIR before writing; the run recorded here took
# about 14.5 minutes.
run_augmentation(MEDIAPIPE_NORM_DIR, OUT_MEDIAPIPE_NORM_DIR)

  [1/10] DJI_20250425104507_0045_D_mediapipe_norm: 1023 files
  [2/10] DJI_20250425125202_0091_D_mediapipe_norm: 1023 files
  [3/10] DJI_20250425121226_0076_D_mediapipe_norm: 1023 files
  [4/10] DJI_20250425092743_0028_D_mediapipe_norm: 1023 files
  [5/10] DJI_20250425093100_0030_D_mediapipe_norm: 1023 files
  [6/10] DJI_20250425112502_0059_D_mediapipe_norm: 1023 files
  [7/10] DJI_20250425125448_0093_D_mediapipe_norm: 1023 files
  [8/10] DJI_20250425104804_0047_D_mediapipe_norm: 1023 files
  [9/10] DJI_20250425120835_0074_D_mediapipe_norm: 1023 files
  [10/10] DJI_20250425112749_0061_D_mediapipe_norm: 1023 files
Done: mediapipe_norm -> 10230 files in 867.1s


### MediaPipe World (3D)

In [48]:
# Augment the MediaPipe world-coordinate CSVs. run_augmentation deletes every
# *.npz already in OUT_MEDIAPIPE_WORLD_DIR before writing; the run recorded
# here took about 12 minutes.
run_augmentation(MEDIAPIPE_WORLD_DIR, OUT_MEDIAPIPE_WORLD_DIR)

  [1/10] DJI_20250425112749_0061_D_mediapipe_world: 1023 files
  [2/10] DJI_20250425093100_0030_D_mediapipe_world: 1023 files
  [3/10] DJI_20250425104507_0045_D_mediapipe_world: 1023 files
  [4/10] DJI_20250425120835_0074_D_mediapipe_world: 1023 files
  [5/10] DJI_20250425125202_0091_D_mediapipe_world: 1023 files
  [6/10] DJI_20250425112502_0059_D_mediapipe_world: 1023 files
  [7/10] DJI_20250425092743_0028_D_mediapipe_world: 1023 files
  [8/10] DJI_20250425121226_0076_D_mediapipe_world: 1023 files
  [9/10] DJI_20250425104804_0047_D_mediapipe_world: 1023 files
  [10/10] DJI_20250425125448_0093_D_mediapipe_world: 1023 files
Done: mediapipe_world -> 10230 files in 737.6s


## Summary

In [49]:
# Count the .npz files currently on disk per output directory; a directory
# that does not exist counts as 0.
models = [
    ('MoveNet', OUT_MOVENET_DIR),
    ('MediaPipe Norm', OUT_MEDIAPIPE_NORM_DIR),
    ('MediaPipe World', OUT_MEDIAPIPE_WORLD_DIR),
]

print("File counts:")
print()

grand_total = 0
for name, path in models:
    if path.exists():
        n = sum(1 for _ in path.glob('*.npz'))
    else:
        n = 0
    print(f'  {name}: {n} files')
    grand_total += n

print()
print(f' Total: {grand_total} files')

File counts:

  MoveNet: 10230 files
  MediaPipe Norm: 10230 files
  MediaPipe World: 10230 files

 Total: 30690 files
