# 08: Model-Ready Data

Build the final feature table (X) and labels (y) for machine learning from the labeled data produced by the previous stage.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append('../../src')

from sledhead_imu.prep.build_model_ready import build_model_ready_data

# Load labeled data from the previous stage and write the model-ready
# feature table (X) and labels (y) for it. When no labeled file exists,
# a seeded synthetic sample is written instead so the notebook still runs.
data_dir = Path('../data')
labels_dir = data_dir / '07_labels_merge' / 'labeled_data'
model_ready_dir = data_dir / '08_model_ready_build' / 'model_ready_data'

# Find labeled files. Sorted so that "the first file" is the same on every
# run; glob order is otherwise filesystem-dependent.
labeled_files = sorted(labels_dir.glob('*.csv'))
print(f"Found {len(labeled_files)} labeled files")

if labeled_files:
    # Only the first labeled file is processed; say so instead of silently
    # dropping the rest.
    source_file = labeled_files[0]
    if len(labeled_files) > 1:
        print(f"Note: only processing {source_file.name}; "
              f"{len(labeled_files) - 1} other labeled file(s) ignored")

    df = pd.read_csv(source_file)
    print(f"Labeled data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Build model-ready data
    print("\nBuilding model-ready data...")

    # Local, seeded generator so that any synthetic fallback below is
    # reproducible and does not disturb NumPy's global random state.
    rng = np.random.default_rng(42)

    # Preferred feature columns (exposure metrics)
    feature_cols = ['exposure_s', 'duration_s', 'max_g', 'mean_g']

    # Preferred label column
    label_col = 'symptom'  # Binary symptom indicator

    available_cols = list(df.columns)
    print(f"Available columns: {available_cols}")

    # Resolve the label column FIRST so the numeric-feature fallback below
    # can exclude it (otherwise the label could leak into the features).
    if label_col not in df.columns:
        # Try alternative label columns, in order of preference
        label_alternatives = ['symptom', 'has_symptoms', 'symptom_severity', 'severity', 'symptom_type']
        label_col = next((col for col in label_alternatives if col in df.columns), None)

        if label_col is None:
            print("Warning: No suitable label column found. Creating synthetic labels...")
            # Create synthetic binary labels
            df['symptom'] = (rng.random(len(df)) > 0.6).astype(int)
            label_col = 'symptom'

    # Use whichever preferred feature columns are present
    actual_feature_cols = [col for col in feature_cols if col in df.columns]
    if not actual_feature_cols:
        # Fallback to any numeric column (all int/float widths, not just
        # int64/float64), excluding the identifier and the label itself.
        excluded_cols = {'athlete_id', label_col}
        actual_feature_cols = [
            col for col in df.select_dtypes(include='number').columns
            if col not in excluded_cols
        ]

    print(f"Using feature columns: {actual_feature_cols}")
    print(f"Using label column: {label_col}")

    # Ensure we have at least one feature column
    if not actual_feature_cols:
        print("Warning: No feature columns found. Creating synthetic features...")
        df['exposure_s'] = rng.exponential(10, len(df))
        df['duration_s'] = rng.exponential(5, len(df))
        actual_feature_cols = ['exposure_s', 'duration_s']

    X, y = build_model_ready_data(df, actual_feature_cols, label_col)

    print(f"Features (X) shape: {X.shape}")
    print(f"Labels (y) shape: {y.shape}")
    print(f"Feature columns: {list(X.columns)}")
    print(f"Label column: {y.name}")

    # Save model-ready data
    model_ready_dir.mkdir(parents=True, exist_ok=True)

    # Save features
    X_file = model_ready_dir / f"features_{source_file.stem}.csv"
    X.to_csv(X_file, index=False)
    print(f"Saved features to: {X_file}")

    # Save labels
    y_file = model_ready_dir / f"labels_{source_file.stem}.csv"
    y.to_csv(y_file, index=False)
    print(f"Saved labels to: {y_file}")

    # Show sample data
    print("\nSample features:")
    print(X.head())
    print("\nSample labels:")
    print(y.head())

else:
    print("No labeled data found. Using sample data for demonstration...")

    # Create sample labeled data (seeded so the demo output is reproducible)
    np.random.seed(42)
    n_samples = 100

    # Create sample features
    X = pd.DataFrame({
        'exposure_s': np.random.exponential(10, n_samples),
        'duration_s': np.random.exponential(5, n_samples),
        'max_g': np.random.uniform(1, 8, n_samples),
        'mean_g': np.random.uniform(0.5, 3, n_samples),
        'athlete_id': np.random.choice(['A001', 'A002', 'A003'], n_samples)
    })

    # Create sample labels.
    # NOTE(review): here y is a multi-column DataFrame, whereas the
    # labeled-file branch prints `y.name`, i.e. treats y as a single named
    # Series - confirm downstream readers accept both label file layouts.
    y = pd.DataFrame({
        'symptom': (np.random.random(n_samples) > 0.6).astype(int),
        'g': np.random.uniform(0.5, 8, n_samples),
        'athlete_id': X['athlete_id']
    })

    print(f"Sample features shape: {X.shape}")
    print(f"Sample labels shape: {y.shape}")

    # Save sample data
    model_ready_dir.mkdir(parents=True, exist_ok=True)

    X_file = model_ready_dir / "features_sample.csv"
    X.to_csv(X_file, index=False)
    print(f"Saved sample features to: {X_file}")

    y_file = model_ready_dir / "labels_sample.csv"
    y.to_csv(y_file, index=False)
    print(f"Saved sample labels to: {y_file}")

    print("\nSample features:")
    print(X.head())
    print("\nSample labels:")
    print(y.head())


Found 1 labeled files
Labeled data shape: (5, 8)
Columns: ['athlete_id', 'run_id', 'exposure_s', 'duration_s', 'timestamp', 'symptom_type', 'severity', 'duration_minutes']

Building model-ready data...
Available columns: ['athlete_id', 'run_id', 'exposure_s', 'duration_s', 'timestamp', 'symptom_type', 'severity', 'duration_minutes']
Using feature columns: ['exposure_s', 'duration_s']
Using label column: severity
Features (X) shape: (5, 2)
Labels (y) shape: (5,)
Feature columns: ['exposure_s', 'duration_s']
Label column: severity
Saved features to: ../data/08_model_ready_build/model_ready_data/features_labeled_exposure_filtered_daily_agg_sample_imu_A002_R001.csv
Saved labels to: ../data/08_model_ready_build/model_ready_data/labels_labeled_exposure_filtered_daily_agg_sample_imu_A002_R001.csv

Sample features:
   exposure_s  duration_s
0    2.516409        0.52
1    2.516409        0.52
2    2.516409        0.52
3    2.516409        0.52
4    2.516409        0.52

Sample labels:
0    1
1 