# 07: Attach Labels (Merge Symptoms)

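Merge symptom records with the exposure metrics produced by stage 06 to build the labeled dataset used for model training. The result is written to `07_labels_merge/labeled_data`.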


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append('../../src')

from sledhead_imu.labeling.attach_labels import attach_labels

# Stage 07: join symptom records onto exposure metrics and save the result.
#
# Inputs  : first CSV (in sorted name order) under 06_features_exposure_2g/exposure_data
#           first CSV (in sorted name order) under 00_collect/symptoms
# Output  : 07_labels_merge/labeled_data/labeled_<exposure file stem>.csv
#
# All paths are relative to the notebook's working directory.
data_dir = Path('../data')
exposure_dir = data_dir / '06_features_exposure_2g' / 'exposure_data'
symptoms_dir = data_dir / '00_collect' / 'symptoms'
labels_dir = data_dir / '07_labels_merge' / 'labeled_data'

# Find exposure files.
# Path.glob yields entries in no guaranteed order, so sort to make the
# "first file" (and therefore the output filename) reproducible across runs.
exposure_files = sorted(exposure_dir.glob('*.csv'))
print(f"Found {len(exposure_files)} exposure files")

# Find symptom files (sorted for the same reason as above).
symptom_files = sorted(symptoms_dir.glob('*.csv'))
print(f"Found {len(symptom_files)} symptom files")

if exposure_files and symptom_files:
    exposure_file = exposure_files[0]
    symptom_file = symptom_files[0]

    # Only one file of each kind is processed; say so explicitly instead of
    # silently ignoring the rest.
    if len(exposure_files) > 1 or len(symptom_files) > 1:
        print(
            "Note: only the first file of each kind is processed "
            f"({exposure_file.name}, {symptom_file.name})"
        )

    # Load exposure data
    df_exposure = pd.read_csv(exposure_file)
    print(f"Exposure data shape: {df_exposure.shape}")
    print(f"Exposure columns: {list(df_exposure.columns)}")

    # Load symptom data
    df_symptoms = pd.read_csv(symptom_file)
    print(f"Symptom data shape: {df_symptoms.shape}")
    print(f"Symptom columns: {list(df_symptoms.columns)}")

    # Attach labels.
    # NOTE(review): the join keys and merge semantics live in
    # sledhead_imu.labeling.attach_labels and are not visible here - confirm
    # the resulting row count is what the model-training stage expects.
    print("\nAttaching labels...")
    df_labeled = attach_labels(df_exposure, df_symptoms)
    print(f"Labeled data shape: {df_labeled.shape}")
    print(f"Labeled columns: {list(df_labeled.columns)}")

    # Save labeled data, creating the output directory on first run.
    labels_dir.mkdir(parents=True, exist_ok=True)
    output_file = labels_dir / f"labeled_{exposure_file.stem}.csv"
    df_labeled.to_csv(output_file, index=False)
    print(f"Saved labeled data to: {output_file}")

    # Show sample of labeled data
    print("\nSample labeled data:")
    print(df_labeled.head())

else:
    # At least one input is missing; report what was found to aid debugging.
    print("Missing exposure or symptom data. Run previous notebooks first.")
    print("Available files:")
    print(f"  Exposure: {exposure_files}")
    print(f"  Symptoms: {symptom_files}")


Found 1 exposure files
Found 1 symptom files
Exposure data shape: (1, 4)
Exposure columns: ['athlete_id', 'run_id', 'exposure_s', 'duration_s']
Symptom data shape: (5, 5)
Symptom columns: ['timestamp', 'athlete_id', 'symptom_type', 'severity', 'duration_minutes']

Attaching labels...
Labeled data shape: (5, 8)
Labeled columns: ['athlete_id', 'run_id', 'exposure_s', 'duration_s', 'timestamp', 'symptom_type', 'severity', 'duration_minutes']
Saved labeled data to: ../data/07_labels_merge/labeled_data/labeled_exposure_filtered_daily_agg_sample_imu_A002_R001.csv

Sample labeled data:
  athlete_id run_id  exposure_s  duration_s                   timestamp  \
0       A001   R001    2.516409        0.52  2025-01-01 09:00:00.000000   
1       A001   R001    2.516409        0.52  2025-01-02 18:26:56.656198   
2       A001   R001    2.516409        0.52  2025-01-03 07:38:50.348499   
3       A001   R001    2.516409        0.52  2025-01-05 09:35:59.610289   
4       A001   R001    2.516409        