# 01: Ingest & Normalize

Load raw IMU data and normalize it to a unified schema.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import re
sys.path.append('../../src')

from sledhead_imu.io.load_imu import load_imu_data
from sledhead_imu.ingest.normalize import normalize_imu_data

# Pipeline directories, relative to the notebook's working directory.
data_dir = Path('../data')
collect_dir = data_dir / '00_collect' / 'imu'
ingest_dir = data_dir / '01_ingest_normalize'

# Identifier columns that can be back-filled from the filename.
ID_COLUMNS = ('athlete_id', 'run_id')

# Filename layouts that carry athlete/run identifiers, tried in order:
#   1. ..._A<athlete>_R<run>...   (e.g. sample_imu_A002_R001.csv)
#   2. New-<athlete>-...(Run <run>)
# `[^\W_]` is `\w` without the underscore, so an ID can never swallow the
# `_R` separator or a trailing `_suffix`.
_FILENAME_ID_PATTERNS = (
    re.compile(r'A([^\W_]+)_R([^\W_]+)'),
    re.compile(r'New-(\d+)-.*?\(Run\s*(\d+)\)'),
)


def ids_from_filename(filename):
    """Return ``(athlete_id, run_id)`` parsed from *filename*, or ``None``.

    The returned IDs are prefixed with ``A`` and ``R`` respectively. ``None``
    means the name matches none of the known layouts.
    """
    for pattern in _FILENAME_ID_PATTERNS:
        match = pattern.search(filename)
        if match:
            return f"A{match.group(1)}", f"R{match.group(2)}"
    return None


def column_is_empty(df, column):
    """Return True when *column* is absent from *df* or holds only NaN."""
    return column not in df.columns or bool(df[column].isna().all())


# Load all sample IMU files; sorted so processing order (and the log) is
# reproducible -- glob order is otherwise filesystem-dependent.
imu_files = sorted(collect_dir.glob('sample_imu_*.csv'))
print(f"Found {len(imu_files)} IMU files")

if imu_files:
    # Process all files
    ingest_dir.mkdir(parents=True, exist_ok=True)

    for imu_file in imu_files:
        print(f"\nProcessing: {imu_file.name}")

        # Load raw data
        df_raw = load_imu_data(imu_file)
        print(f"  Raw shape: {df_raw.shape}")
        print(f"  Columns: {list(df_raw.columns)[:5]}...")

        # Standardize column names
        if 'Number of Symptoms' in df_raw.columns:
            df_raw = df_raw.rename(columns={'Number of Symptoms': 'num_symptoms'})

        # Back-fill athlete_id / run_id from the filename. Each column is
        # checked on its own so an ID already present in the data is never
        # overwritten, and a lone missing column is still filled.
        missing_ids = [col for col in ID_COLUMNS if column_is_empty(df_raw, col)]
        if missing_ids:
            parsed = ids_from_filename(imu_file.name)
            if parsed is None:
                print(f"  Warning: could not derive {', '.join(missing_ids)} from filename")
            else:
                filename_ids = dict(zip(ID_COLUMNS, parsed))
                for col in missing_ids:
                    df_raw[col] = filename_ids[col]
                filled = ', '.join(f"{col}={filename_ids[col]}" for col in missing_ids)
                print(f"  Extracted from filename: {filled}")

        # Normalize (second argument is passed as an empty dict, as before;
        # presumably a config/options mapping -- confirm against
        # normalize_imu_data).
        df_normalized = normalize_imu_data(df_raw, {})
        print(f"  Normalized shape: {df_normalized.shape}")

        # Save next to the other normalized outputs, keeping the source name.
        output_file = ingest_dir / f"normalized_{imu_file.name}"
        df_normalized.to_csv(output_file, index=False)
        print(f"  Saved: {output_file.name}")

    print(f"\n✓ Processed {len(imu_files)} files")
    print(f"✓ Saved to: {ingest_dir}")
else:
    print("No IMU files found.")


Current working directory: /Users/jeff/sledhead-imu/notebooks
Looking in: ../data/00_collect/imu
Files found: ['sample_imu_A002_R001.csv', 'sample_imu_A003_R001.csv', 'sample_imu_A003_R002.csv', 'sample_imu_A002_R002.csv', 'sample_imu_A002_R003.csv', 'sample_imu_A003_R003.csv', 'sample_imu_A001_R002.csv', 'sample_imu_A001_R003.csv', 'sample_imu_A001_R001.csv', 'sample_imu_data.csv']
Found 10 IMU files
Raw data shape: (1007, 14)
Columns: ['timestamp', 'athlete_id', 'run_id', 'accy', 'accz', 'gyrox', 'gyroy', 'gyroz', 't', 'x', 'y', 'z', 'r_gs', 'num_symptoms']
Normalized data shape: (1007, 15)
Sample data:
                timestamp athlete_id run_id  accy  accz  gyrox  gyroy  gyroz  \
0 2025-01-16 09:00:00.000       A002   R001  5129  4771   4959   5002   4994   
1 2025-01-16 09:00:00.001       A002   R001  5133  4770   4958   5005   4995   
2 2025-01-16 09:00:00.002       A002   R001  5131  4771   4961   5008   4994   
3 2025-01-16 09:00:00.003       A002   R001  5130  4775   4960   50