# 06: Extract Random Forest Features

Extract comprehensive features from IMU data for Random Forest classification.

**Pipeline flow:** 01 Ingest → 06 Features → 07 Labels → 08 Model-Ready

Features are extracted from normalized time-series data per run, then aggregated per athlete per day.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append('../../src')

from sledhead_imu.features.random_forest_features import (
    extract_all_runs, 
    aggregate_rf_features_daily
)

# Announce the start of this notebook's feature-extraction stage.
print("Extracting Random Forest features from IMU data...")


In [None]:
# Load the normalized IMU data written by the previous (01 ingest) step.
data_dir = Path('../data')
ingest_dir = data_dir / '01_ingest_normalize'

# Sort the matches: glob yields files in filesystem-dependent order, and a
# stable file order keeps the concatenated row order reproducible.
normalized_files = sorted(ingest_dir.glob('normalized_sample_imu_A*.csv'))
print(f"Found {len(normalized_files)} normalized files")

# Fail early with an actionable message; pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" ValueError.
if not normalized_files:
    raise FileNotFoundError(
        "No files matching 'normalized_sample_imu_A*.csv' found in "
        f"{ingest_dir.resolve()}; run the 01 ingest/normalize step first."
    )

# Read every file and stack them into one frame with a fresh 0..n-1 index.
df_all = [pd.read_csv(imu_file) for imu_file in normalized_files]
df_raw = pd.concat(df_all, ignore_index=True)

# Quick sanity report on what was loaded.
print(f"Total data shape: {df_raw.shape}")
print(f"Columns: {list(df_raw.columns)}")
print(f"\nUnique athletes: {df_raw['athlete_id'].unique()}")
print(f"Unique runs: {df_raw['run_id'].unique()}")


In [None]:
# Run per-run feature extraction over the combined IMU frame.
# NOTE(review): fs is presumably the sampling rate in Hz — confirm against
# extract_all_runs.
features_df = extract_all_runs(df_raw, fs=2000.0)

# Report how many runs and how many columns came back.
run_count = len(features_df)
column_names = list(features_df.columns)
print(f"Extracted features for {run_count} runs")
print(f"Feature columns: {len(column_names)}")

# Preview the first rows of the run-level table.
print("\nFeatures DataFrame:")
print(features_df.head())

# List every column name in full.
print("\n\nFeature columns:")
print(column_names)


In [None]:
# Aggregate run-level features to daily per-athlete
print("\nAggregating features per athlete per day...")

# Attach to each run the timestamp of its first row in df_raw.
# drop_duplicates(keep='first') keeps exactly the first row per
# (athlete_id, run_id) in df_raw order, and a single left merge replaces a
# per-run boolean scan of the whole raw frame. It also creates the
# 'timestamp' column when features_df has zero rows, which a row-by-row
# loop would never do.
run_keys = ['athlete_id', 'run_id']
first_timestamps = df_raw.drop_duplicates(subset=run_keys, keep='first')[
    run_keys + ['timestamp']
]
matched = features_df[run_keys].merge(first_timestamps, on=run_keys, how='left')

# The left merge yields one row per features_df row in the same order (the
# right-hand keys are unique), so assign positionally; to_numpy() sidesteps
# index alignment in case features_df does not carry a default index.
features_df['timestamp'] = matched['timestamp'].to_numpy()

# Derive the calendar date that the daily aggregation groups on.
features_df['timestamp'] = pd.to_datetime(features_df['timestamp'])
features_df['date'] = features_df['timestamp'].dt.date

daily_features_df = aggregate_rf_features_daily(features_df)

print(f"Aggregated to {len(daily_features_df)} athlete-days")
print("\nDaily features:")
print(daily_features_df.head())


In [None]:
# Print descriptive statistics for the daily feature table
print("Summary statistics of daily features:")

# Lift pandas' column-count and line-width display limits so the printed
# table is not truncated.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

daily_summary = daily_features_df.describe()
print(daily_summary)


In [None]:
# Persist both feature tables as CSV under the standard pipeline layout
output_dir = data_dir.joinpath('06_features_exposure_2g', 'exposure_data')
output_dir.mkdir(parents=True, exist_ok=True)

# Destination paths: the daily table and the run-level table (kept for reference)
output_file = output_dir / 'rf_features_daily.csv'
run_file = output_dir / 'rf_features_runs.csv'

# Daily aggregated features (matches pipeline structure)
daily_features_df.to_csv(output_file, index=False)
print(f"\nSaved daily features to: {output_file}")

# Run-level features
features_df.to_csv(run_file, index=False)
print(f"Saved run-level features to: {run_file}")
print("\nDaily features are ready for label merging in the next step.")
