#**CELLA 1 - CONFIGURATION**

In [1]:
# ═══════════════════════════════════════════════════════════════════════════
# STATUS MODULE - FEATURE ENGINEERING v2.2
# ═══════════════════════════════════════════════════════════════════════════
#
# OBIETTIVO:
# Preparare dataset user-level per classificazione experience (Beginner/Intermediate/Advanced)
#
# INPUT:
#   - data/synth_set_level_v2/users_sampled.csv (510 users)
#   - data/synth_set_level_v2/workout_sets_sampled.csv (1.56M sets)
#   - data/synth_set_level_v2/workouts_sampled.csv (106k workouts)
#
# OUTPUT:
#   - models/status_preprocessed_v2.2.pkl (X_train, X_test, y_train, y_test, scaler)
#   - models/status_feature_metadata_v2.2.json (feature info per interpretability)
#
# FEATURE SELECTION v2.2 (7 features, leakage-free):
#   - reps_mean, rpe_mean, total_sets, acwr_mean, spike_weeks_count,
#     load_progression, skip_rate
#
# RIMOSSI per leakage/low discriminative power:
#   - consistency_score (correlation 1.000, deterministic)
#   - load_mean (correlation 0.803, borderline)
#   - observed_freq_weekly (Cohen's d 0.04, non discrimina)
#
# VERSIONE: 2.2
# AUTORE: Alessandro Ambrosio
# ═══════════════════════════════════════════════════════════════════════════

import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
from scipy import stats
from datetime import datetime

print("="*80)
print("STATUS MODULE - FEATURE ENGINEERING v2.2")
print("="*80)
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

# ═══════════════════════════════════════════════════════════════════════════
# SECTION 1: CONFIGURATION & PATHS
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 1: CONFIGURATION")
print("="*80)

# Paths
DATADIR = Path('data/synth_set_level_v2')
MODELDIR = Path('models')
MODELDIR.mkdir(exist_ok=True)

# Feature selection v2.2 (leakage-free)
SELECTED_FEATURES = [
    'reps_mean',
    'rpe_mean',
    'total_sets',
    'acwr_mean',
    'spike_weeks_count',
    'load_progression',
    'skip_rate'
]

# Seeds (reproducibility)
SEED_SPLIT = 42
SEED_CV = 42

# Target encoding
TARGET_ENCODING = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}

print(f"\n[OK] Data directory: {DATADIR}")
print(f"[OK] Model output directory: {MODELDIR}")
print(f"[OK] Selected features: {len(SELECTED_FEATURES)}")
print(f"[OK] Random seed: {SEED_SPLIT}")
print()



STATUS MODULE - FEATURE ENGINEERING v2.2
Timestamp: 2026-02-16 00:44:01

SECTION 1: CONFIGURATION

[OK] Data directory: data/synth_set_level_v2
[OK] Model output directory: models
[OK] Selected features: 7
[OK] Random seed: 42



#**CELLA 2 - LOAD RAW DATA**

In [2]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 2: LOAD RAW DATA
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 2: LOAD RAW DATA")
print("="*80)

# Load datasets
df_users = pd.read_csv(DATADIR / 'users_sampled.csv')
df_sets = pd.read_csv(DATADIR / 'workout_sets_sampled.csv')
df_workouts = pd.read_csv(DATADIR / 'workouts_sampled.csv')

# Parse dates
df_sets['date'] = pd.to_datetime(df_sets['date'])
df_workouts['date'] = pd.to_datetime(df_workouts['date'])

print(f"\n[OK] Users loaded: {len(df_users):,}")
print(f"[OK] Sets loaded: {len(df_sets):,}")
print(f"[OK] Workouts loaded: {len(df_workouts):,}")

# Target distribution
print("\n" + "-"*80)
print("TARGET DISTRIBUTION")
print("-"*80)
target_dist = df_users['experience_label'].value_counts().sort_index()
for label, count in target_dist.items():
    print(f"{label:12s}: {count:3d} ({count/len(df_users)*100:5.1f}%)")

print()



SECTION 2: LOAD RAW DATA

[OK] Users loaded: 510
[OK] Sets loaded: 1,566,944
[OK] Workouts loaded: 106,571

--------------------------------------------------------------------------------
TARGET DISTRIBUTION
--------------------------------------------------------------------------------
Advanced    : 170 ( 33.3%)
Beginner    : 170 ( 33.3%)
Intermediate: 170 ( 33.3%)



# **CELLA 3 - USER-LEVEL AGGREGATIONS**

In [3]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 3: USER-LEVEL AGGREGATIONS
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 3: USER-LEVEL AGGREGATIONS")
print("="*80)

print("\n[1/7] Basic workout statistics...")

# Basic stats
user_agg_basic = df_sets.groupby('user_id').agg({
    'load_done_kg': ['mean', 'std', 'min', 'max'],
    'reps_done': ['mean', 'std'],
    'rpe_done': ['mean', 'std'],
    'set_id': 'count'  # Total sets
}).reset_index()

# Flatten columns
user_agg_basic.columns = [
    'user_id', 'load_mean', 'load_std', 'load_min', 'load_max',
    'reps_mean', 'reps_std', 'rpe_mean', 'rpe_std', 'total_sets'
]

print(f"  [OK] Aggregated {len(user_agg_basic)} users")

# ────────────────────────────────────────────────────────────────────────────

print("[2/7] ACWR statistics...")

user_acwr = df_sets.groupby('user_id')['acwr'].mean().reset_index()
user_acwr.columns = ['user_id', 'acwr_mean']

print(f"  [OK] ACWR computed for {len(user_acwr)} users")

# ────────────────────────────────────────────────────────────────────────────

# [3/7] Spike weeks count

# Normalize week_type (avoid malformed values)
df_sets['week_type'] = (
    df_sets['week_type']
    .astype(str).str.strip().str.lower()
    .replace({'sp': 'spike'})
)

# Build a week identifier (choose a convention and keep it consistent with EDA)
df_sets['week_id'] = df_sets['date'].dt.to_period('W-SUN')

spike_counts = (
    df_sets[df_sets['week_type'].eq('spike')]
    .drop_duplicates(['user_id', 'week_id'])
    .groupby('user_id')
    .size()
    .reset_index(name='spike_weeks_count')
)

print(f"  [OK] Spike weeks counted for {len(spike_counts)} users")


# ────────────────────────────────────────────────────────────────────────────

print("[4/7] Workout frequency...")

# Count workouts per user
workout_counts = df_workouts.groupby('user_id').size().reset_index(name='total_workouts')

# Calculate duration
user_duration = df_workouts.groupby('user_id')['date'].agg(['min', 'max']).reset_index()
user_duration['duration_days'] = (user_duration['max'] - user_duration['min']).dt.days + 1
user_duration['duration_weeks'] = user_duration['duration_days'] / 7.0

# Observed frequency (workouts/week)
workout_counts = workout_counts.merge(user_duration[['user_id', 'duration_weeks']], on='user_id')
workout_counts['observed_freq_weekly'] = workout_counts['total_workouts'] / workout_counts['duration_weeks']

print(f"  [OK] Frequency computed for {len(workout_counts)} users")

# ────────────────────────────────────────────────────────────────────────────

print("[5/7] Skip rate...")

skip_stats = df_workouts.groupby('user_id')['workout_status'].apply(
    lambda x: (x == 'skipped').sum() / len(x) if len(x) > 0 else 0.0
).reset_index(name='skip_rate')

print(f"  [OK] Skip rate computed for {len(skip_stats)} users")

# ────────────────────────────────────────────────────────────────────────────

print("[6/7] Load progression (slope regression)...")

def calculate_load_progression(user_df):
    """
    Calculate slope of load over time using linear regression.
    Returns monthly progression rate (1.0 = no change, 1.1 = +10%/month).
    """
    if len(user_df) < 10:
        return 1.0

    user_df = user_df.sort_values('date').reset_index(drop=True)
    user_df = user_df.dropna(subset=['load_done_kg'])

    if len(user_df) < 10:
        return 1.0

    # Time index (days from start)
    user_df['days_from_start'] = (user_df['date'] - user_df['date'].min()).dt.days

    # Linear regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        user_df['days_from_start'],
        user_df['load_done_kg']
    )

    # Normalize: slope per 30 days (monthly rate)
    slope_monthly = slope * 30

    # Convert to multiplier (1.0 = no change, 1.1 = +10% per month)
    if intercept > 0:
        progression = 1.0 + (slope_monthly / intercept)
    else:
        progression = 1.0

    # Clip to realistic range [0.5, 2.0]
    return float(np.clip(progression, 0.5, 2.0))

load_progressions = []
for uid in user_agg_basic['user_id']:
    user_sets = df_sets[df_sets['user_id'] == uid]
    prog = calculate_load_progression(user_sets)
    load_progressions.append({'user_id': uid, 'load_progression': prog})

load_prog_df = pd.DataFrame(load_progressions)

print(f"  [OK] Load progression computed for {len(load_prog_df)} users")

# ────────────────────────────────────────────────────────────────────────────

print("[7/7] Merging all aggregations...")

# Merge all
user_agg = user_agg_basic.copy()
user_agg = user_agg.merge(user_acwr, on='user_id', how='left')
user_agg = user_agg.merge(spike_counts, on='user_id', how='left')
user_agg = user_agg.merge(workout_counts[['user_id', 'observed_freq_weekly']], on='user_id', how='left')
user_agg = user_agg.merge(skip_stats, on='user_id', how='left')
user_agg = user_agg.merge(load_prog_df, on='user_id', how='left')

# Fill missing spike_weeks_count with 0
user_agg['spike_weeks_count'] = user_agg['spike_weeks_count'].fillna(0).astype(int)

# Merge target
user_agg = user_agg.merge(
    df_users[['user_id', 'experience_label']],
    on='user_id',
    how='left'
)

print(f"  [OK] Final dataset: {user_agg.shape}")
print()



SECTION 3: USER-LEVEL AGGREGATIONS

[1/7] Basic workout statistics...
  [OK] Aggregated 510 users
[2/7] ACWR statistics...
  [OK] ACWR computed for 510 users
  [OK] Spike weeks counted for 508 users
[4/7] Workout frequency...
  [OK] Frequency computed for 510 users
[5/7] Skip rate...
  [OK] Skip rate computed for 510 users
[6/7] Load progression (slope regression)...
  [OK] Load progression computed for 510 users
[7/7] Merging all aggregations...
  [OK] Final dataset: (510, 16)



#**CELLA 4 - DERIVED FEATURES**

In [4]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 4: FEATURE ENGINEERING - DERIVED FEATURES (Optional)
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 4: DERIVED FEATURES (Optional, for exploration)")
print("="*80)

# Composite features (NOT used in final model, for reference only)
user_agg['volume_score'] = user_agg['total_sets'] * user_agg['observed_freq_weekly']
user_agg['intensity_score'] = user_agg['load_mean'] * user_agg['reps_mean']
user_agg['rpe_volume_interaction'] = user_agg['rpe_mean'] * user_agg['reps_mean']

# Ratio features
user_agg['load_variability'] = user_agg['load_std'] / user_agg['load_mean']
user_agg['rpe_variability'] = user_agg['rpe_std'] / user_agg['rpe_mean']

# Spike rate (normalized)
user_agg['spike_rate'] = user_agg['spike_weeks_count'] / (user_agg['total_sets'] / 10)  # per 10 sets

print(f"\n[OK] Derived features created: 6")
print("  Note: These are NOT used in final model v2.2 (for reference/exploration only)")
print()



SECTION 4: DERIVED FEATURES (Optional, for exploration)

[OK] Derived features created: 6
  Note: These are NOT used in final model v2.2 (for reference/exploration only)



#**CELLA 5 - FEATURE SELECTION & VALIDATION**

In [5]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 5: FEATURE SELECTION & VALIDATION
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 5: FEATURE SELECTION & VALIDATION")
print("="*80)

# Check all selected features exist
print("\n[1/3] Verifying selected features exist...")
missing_features = [f for f in SELECTED_FEATURES if f not in user_agg.columns]

if missing_features:
    print(f"  [!]  Missing features: {missing_features}")
    raise ValueError("Some selected features not found in aggregated dataset")
else:
    print(f"  [OK] All {len(SELECTED_FEATURES)} features present")

# ────────────────────────────────────────────────────────────────────────────

print("\n[2/3] Checking for missing values...")

missing_counts = user_agg[SELECTED_FEATURES].isnull().sum()
missing_total = missing_counts.sum()

if missing_total > 0:
    print(f"  [!]  Missing values detected:")
    for feat, count in missing_counts[missing_counts > 0].items():
        print(f"    - {feat}: {count} ({count/len(user_agg)*100:.1f}%)")

    print(f"\n  Filling with median imputation...")
    for feat in SELECTED_FEATURES:
        if user_agg[feat].isnull().any():
            median_val = user_agg[feat].median()
            user_agg[feat].fillna(median_val, inplace=True)
            print(f"    - {feat}: filled with {median_val:.3f}")
else:
    print(f"  [OK] No missing values")

# ────────────────────────────────────────────────────────────────────────────

print("\n[3/3] Feature statistics summary...")

print("\nDescriptive statistics (selected features):")
print(user_agg[SELECTED_FEATURES].describe().T[['mean', 'std', 'min', 'max']].round(2))

print()



SECTION 5: FEATURE SELECTION & VALIDATION

[1/3] Verifying selected features exist...
  [OK] All 7 features present

[2/3] Checking for missing values...
  [OK] No missing values

[3/3] Feature statistics summary...

Descriptive statistics (selected features):
                      mean      std     min       max
reps_mean             8.79     0.15    8.55      9.21
rpe_mean              4.81     0.31    4.04      5.65
total_sets         3072.44  2880.31  125.00  13527.00
acwr_mean             1.14     0.06    1.03      1.39
spike_weeks_count     8.70     4.80    0.00     25.00
load_progression      1.08     0.15    0.95      2.00
skip_rate             0.13     0.07    0.03      0.38



#**CELLA 6 - PREPARE X, y**

In [6]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 6: PREPARE X, y
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 6: PREPARE X, y")
print("="*80)

X = user_agg[SELECTED_FEATURES].copy()
y = user_agg['experience_label'].copy()

print(f"\n[OK] X shape: {X.shape}")
print(f"[OK] y shape: {y.shape}")
print(f"[OK] Feature count: {X.shape[1]}")

# Verify no missing
assert X.isnull().sum().sum() == 0, "X contains missing values!"
assert y.isnull().sum() == 0, "y contains missing values!"

print("[OK] No missing values in X, y")
print()



SECTION 6: PREPARE X, y

[OK] X shape: (510, 7)
[OK] y shape: (510,)
[OK] Feature count: 7
[OK] No missing values in X, y



#**CELLA 7 - TRAIN/TEST SPLIT**

In [7]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 7: TRAIN/TEST SPLIT (Stratified)
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 7: TRAIN/TEST SPLIT (80/20, Stratified)")
print("="*80)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=SEED_SPLIT,
    stratify=y
)

print(f"\n[OK] Train set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"[OK] Test set:  {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

# Verify stratification
print("\n" + "-"*80)
print("STRATIFICATION VERIFICATION")
print("-"*80)

train_dist = y_train.value_counts().sort_index()
test_dist = y_test.value_counts().sort_index()

print("\nTrain distribution:")
for label, count in train_dist.items():
    print(f"  {label:12s}: {count:3d} ({count/len(y_train)*100:5.1f}%)")

print("\nTest distribution:")
for label, count in test_dist.items():
    print(f"  {label:12s}: {count:3d} ({count/len(y_test)*100:5.1f}%)")

# Check balance
train_ratios = train_dist / len(y_train)
test_ratios = test_dist / len(y_test)
max_diff = (train_ratios - test_ratios).abs().max()

print(f"\n[OK] Max stratification difference: {max_diff:.3%}")
if max_diff < 0.02:
    print("  [OK] Excellent stratification (< 2%)")
elif max_diff < 0.05:
    print("  [OK] Good stratification (< 5%)")
else:
    print("  [!]  Moderate stratification (> 5%)")

print()



SECTION 7: TRAIN/TEST SPLIT (80/20, Stratified)

[OK] Train set: 408 samples (80.0%)
[OK] Test set:  102 samples (20.0%)

--------------------------------------------------------------------------------
STRATIFICATION VERIFICATION
--------------------------------------------------------------------------------

Train distribution:
  Advanced    : 136 ( 33.3%)
  Beginner    : 136 ( 33.3%)
  Intermediate: 136 ( 33.3%)

Test distribution:
  Advanced    :  34 ( 33.3%)
  Beginner    :  34 ( 33.3%)
  Intermediate:  34 ( 33.3%)

[OK] Max stratification difference: 0.000%
  [OK] Excellent stratification (< 2%)



#**CELLA 8 - FEATURE SCALING**

In [8]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 8: FEATURE SCALING (StandardScaler)
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 8: FEATURE SCALING")
print("="*80)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on train, transform both
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame (preserves feature names)
X_train_scaled = pd.DataFrame(
    X_train_scaled,
    columns=SELECTED_FEATURES,
    index=X_train.index
)

X_test_scaled = pd.DataFrame(
    X_test_scaled,
    columns=SELECTED_FEATURES,
    index=X_test.index
)

print(f"\n[OK] Scaler fitted on train set ({len(X_train)} samples)")
print(f"[OK] Train and test transformed")

# Verify scaling (mean≈0, std≈1 on train)
print("\n" + "-"*80)
print("SCALING VERIFICATION (Train Set)")
print("-"*80)
print("\nMean (should be ≈0):")
print(X_train_scaled.mean().round(6))
print("\nStd (should be ≈1):")
print(X_train_scaled.std().round(6))

# Check test set (mean/std will differ, expected)
print("\n" + "-"*80)
print("TEST SET STATISTICS (After Scaling)")
print("-"*80)
print("\nMean (may differ from 0):")
print(X_test_scaled.mean().round(3))
print("\nStd (may differ from 1):")
print(X_test_scaled.std().round(3))

print("\n[OK] Scaling complete")
print()



SECTION 8: FEATURE SCALING

[OK] Scaler fitted on train set (408 samples)
[OK] Train and test transformed

--------------------------------------------------------------------------------
SCALING VERIFICATION (Train Set)
--------------------------------------------------------------------------------

Mean (should be ≈0):
reps_mean           -0.0
rpe_mean             0.0
total_sets          -0.0
acwr_mean           -0.0
spike_weeks_count    0.0
load_progression     0.0
skip_rate            0.0
dtype: float64

Std (should be ≈1):
reps_mean            1.001228
rpe_mean             1.001228
total_sets           1.001228
acwr_mean            1.001228
spike_weeks_count    1.001228
load_progression     1.001228
skip_rate            1.001228
dtype: float64

--------------------------------------------------------------------------------
TEST SET STATISTICS (After Scaling)
--------------------------------------------------------------------------------

Mean (may differ from 0):
reps_mean     

#**CELLA 9 - SAVE PREPROCESSED DATA**

In [9]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 9: SAVE PREPROCESSED DATA
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("SECTION 9: SAVE PREPROCESSED DATA")
print("="*80)

# Prepare data bundle
preprocessed_data = {
    'X_train': X_train_scaled,
    'X_test': X_test_scaled,
    'y_train': y_train,
    'y_test': y_test,
    'scaler': scaler,
    'feature_names': SELECTED_FEATURES,
    'target_encoding': TARGET_ENCODING,
    'seed_split': SEED_SPLIT,
    'version': '2.2',
    'timestamp': datetime.now().isoformat()
}

# Save as pickle
output_path = MODELDIR / 'status_preprocessed_v2.2.pkl'

with open(output_path, 'wb') as f:
    pickle.dump(preprocessed_data, f)

print(f"\n[OK] Preprocessed data saved: {output_path}")
print(f"  File size: {output_path.stat().st_size / 1024:.1f} KB")

# ────────────────────────────────────────────────────────────────────────────

print("\n" + "-"*80)
print("SAVE FEATURE METADATA (for interpretability)")
print("-"*80)

# Feature metadata
feature_metadata = {
    'version': '2.2',
    'date_created': datetime.now().isoformat(),
    'n_features': len(SELECTED_FEATURES),
    'features': [],
    'removed_features': [
        {
            'name': 'consistency_score',
            'reason': 'Perfect leakage (correlation 1.000, deterministic)',
            'cohens_d': 'infinite',
            'correlation': 1.000
        },
        {
            'name': 'load_mean',
            'reason': 'Borderline leakage risk (correlation 0.803)',
            'cohens_d': 2.94,
            'correlation': 0.803
        },
        {
            'name': 'observed_freq_weekly',
            'reason': 'Low discriminative power (Cohen\'s d 0.04)',
            'cohens_d': 0.04,
            'correlation': 0.016
        }
    ],
    'train_test_split': {
        'test_size': 0.20,
        'random_state': SEED_SPLIT,
        'stratified': True,
        'n_train': len(X_train),
        'n_test': len(X_test)
    },
    'scaling': {
        'method': 'StandardScaler',
        'fit_on': 'train_set'
    }
}

# Add feature info (from EDA results, manually entered)
feature_info = [
    {'name': 'reps_mean', 'cohens_d': 3.30, 'correlation': -0.761, 'interpretation': 'Average reps per set (Beginner higher)'},
    {'name': 'rpe_mean', 'cohens_d': 2.21, 'correlation': -0.673, 'interpretation': 'RPE self-report (Beginner overestimate)'},
    {'name': 'total_sets', 'cohens_d': 2.61, 'correlation': 0.755, 'interpretation': 'Training history length'},
    {'name': 'acwr_mean', 'cohens_d': 2.16, 'correlation': -0.691, 'interpretation': 'Load management (Beginner higher ACWR)'},
    {'name': 'spike_weeks_count', 'cohens_d': 0.650, 'correlation': 0.247, 'interpretation': 'Frequency of overload weeks'},
    {'name': 'load_progression', 'cohens_d': 1.00, 'correlation': 0.432, 'interpretation': 'Monthly load growth rate'},
    {'name': 'skip_rate', 'cohens_d': 3.66, 'correlation': -0.839, 'interpretation': 'Workout dropout rate (fatigue-driven)'}
]

feature_metadata['features'] = feature_info

# Save metadata as JSON
metadata_path = MODELDIR / 'status_feature_metadata_v2.2.json'

with open(metadata_path, 'w') as f:
    json.dump(feature_metadata, f, indent=2)

print(f"\n[OK] Feature metadata saved: {metadata_path}")
print(f"  File size: {metadata_path.stat().st_size / 1024:.1f} KB")

print()



SECTION 9: SAVE PREPROCESSED DATA

[OK] Preprocessed data saved: models/status_preprocessed_v2.2.pkl
  File size: 43.1 KB

--------------------------------------------------------------------------------
SAVE FEATURE METADATA (for interpretability)
--------------------------------------------------------------------------------

[OK] Feature metadata saved: models/status_feature_metadata_v2.2.json
  File size: 1.9 KB



#**CELLA 10 - SUMMARY & NEXT STEPS**

In [10]:
# ═══════════════════════════════════════════════════════════════════════════
# SECTION 10: SUMMARY & NEXT STEPS
# ═══════════════════════════════════════════════════════════════════════════

print("="*80)
print("FEATURE ENGINEERING v2.2 - SUMMARY")
print("="*80)

print("\n" + "┌" + "─"*78 + "┐")
print("│" + " "*28 + "PREPROCESSING COMPLETE" + " "*28 + "│")
print("└" + "─"*78 + "┘")

print("\nDATASET STATISTICS")
print("-"*80)
print(f"Total users:          {len(user_agg):,}")
print(f"Train set:            {len(X_train):,} ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test set:             {len(X_test):,} ({len(X_test)/len(X)*100:.1f}%)")
print(f"Features selected:    {len(SELECTED_FEATURES)}")

print("\nTARGET DISTRIBUTION")
print("-"*80)
print("Train:")
for label, count in y_train.value_counts().sort_index().items():
    print(f"  {label:12s}: {count:3d} ({count/len(y_train)*100:5.1f}%)")
print("Test:")
for label, count in y_test.value_counts().sort_index().items():
    print(f"  {label:12s}: {count:3d} ({count/len(y_test)*100:5.1f}%)")

print("\nFEATURE ENGINEERING CHOICES")
print("-"*80)
print("Selected features (7):")
for i, feat in enumerate(SELECTED_FEATURES, 1):
    print(f"  {i}. {feat}")

print("\nRemoved features (3):")
print("  1. consistency_score (leakage: correlation 1.000)")
print("  2. load_mean (leakage risk: correlation 0.803)")
print("  3. observed_freq_weekly (low discriminative power)")

print("\nOUTPUT FILES")
print("-"*80)
print(f"[OK] {output_path}")
print(f"[OK] {metadata_path}")

print("\n NEXT STEPS")
print("-"*80)
print("1. Open STATUS_Modeling_v2.2.ipynb")
print("2. Load preprocessed data:")
print(f"   with open('{output_path}', 'rb') as f:")
print("       data = pickle.load(f)")
print("3. Train 6 models (Dummy, LR, DT, RF, GB, XGBoost)")
print("4. Evaluate on tests sets ")

print("\n" + "="*80)
print("FEATURE ENGINEERING v2.2 COMPLETE")
print("="*80)
print()


FEATURE ENGINEERING v2.2 - SUMMARY

┌──────────────────────────────────────────────────────────────────────────────┐
│                            PREPROCESSING COMPLETE                            │
└──────────────────────────────────────────────────────────────────────────────┘

DATASET STATISTICS
--------------------------------------------------------------------------------
Total users:          510
Train set:            408 (80.0%)
Test set:             102 (20.0%)
Features selected:    7

TARGET DISTRIBUTION
--------------------------------------------------------------------------------
Train:
  Advanced    : 136 ( 33.3%)
  Beginner    : 136 ( 33.3%)
  Intermediate: 136 ( 33.3%)
Test:
  Advanced    :  34 ( 33.3%)
  Beginner    :  34 ( 33.3%)
  Intermediate:  34 ( 33.3%)

FEATURE ENGINEERING CHOICES
--------------------------------------------------------------------------------
Selected features (7):
  1. reps_mean
  2. rpe_mean
  3. total_sets
  4. acwr_mean
  5. spike_weeks_cou