In [9]:
# 03_regime_classification.py - Final Version with Risk-On Fix
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

# --- Configuration ---
# Root data directory, resolved relative to the current working directory.
DATA_PATH = Path('./data')
# Sub-directory of DATA_PATH that holds processed parquet files.
PROCESSED_PATH = DATA_PATH / 'processed'

def load_and_merge_data() -> pd.DataFrame:
    """Load stock and macro data and attach macro indicators to each stock row.

    Reads ``full_dataset.parquet`` from ``PROCESSED_PATH`` and
    ``macro_data.parquet`` from ``DATA_PATH``. The macro columns ``vix`` and
    ``yield_curve`` are renamed to ``vixcls`` and ``t10y2y``, gap-filled, and
    joined onto the stock rows by nearest date within a 7-day tolerance.

    Returns:
        The stock rows sorted by ``date`` with ``vixcls`` and ``t10y2y``
        columns added. Rows without a macro observation within the tolerance
        are dropped.

    Raises:
        ValueError: If the merge yields no rows, or no row has macro data.
        Exception: Any other loading/merging error is re-raised after
            troubleshooting hints are printed.
    """
    try:
        # Load and validate stock data; unparseable dates become NaT and are dropped.
        stock_df = pd.read_parquet(PROCESSED_PATH / 'full_dataset.parquet')
        stock_df['date'] = pd.to_datetime(stock_df['date'], errors='coerce')
        stock_df = stock_df.dropna(subset=['date']).sort_values('date')

        # Load and process macro data.
        macro_df = pd.read_parquet(DATA_PATH / 'macro_data.parquet')
        macro_df['date'] = pd.to_datetime(macro_df['date'], errors='coerce')
        macro_df = (
            macro_df.rename(columns={
                'vix': 'vixcls',
                'yield_curve': 't10y2y'
            })
            .dropna(subset=['date'])
            # merge_asof requires the right-hand key to be sorted, and the
            # fills below are only meaningful in chronological order.
            .sort_values('date', kind='stable')
        )

        # Handle missing values in macro indicators (chronological order).
        macro_df['vixcls'] = macro_df['vixcls'].ffill().bfill()
        macro_df['t10y2y'] = macro_df['t10y2y'].interpolate().ffill()

        # Attach the nearest macro observation (at most 7 days away) to each row.
        merged_df = pd.merge_asof(
            stock_df,
            macro_df[['date', 'vixcls', 't10y2y']],
            on='date',
            direction='nearest',
            tolerance=pd.Timedelta('7D')
        )

        # Final data validation
        if merged_df.empty:
            raise ValueError("Merge resulted in empty dataframe")

        result = merged_df.dropna(subset=['vixcls', 't10y2y'])
        # The as-of merge keeps every stock row, so an unmatched merge only
        # shows up as an empty frame after the NaN rows are removed.
        if result.empty:
            raise ValueError(
                "No rows have macro data within the 7-day merge tolerance"
            )

        return result

    except Exception as e:
        print(f"Data processing error: {str(e)}")
        print("Verify that:")
        print("- All input files exist")
        print("- Date formats are consistent")
        print("- Required columns (vix, yield_curve) exist in macro_data")
        raise

def classify_regime(
    row: pd.Series,
    *,
    vix_risk_off: float = 25,
    spread_risk_off: float = -0.25,
    vix_risk_on: float = 15,
    spread_risk_on: float = 0.75,
    vix_moderate: float = 20,
    spread_moderate: float = 0.5,
) -> str:
    """Classify one observation into a market regime label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing ``.get``;
            the ``vixcls`` and ``t10y2y`` entries are read.
        vix_risk_off: VIX level above which the regime is 'Risk-Off'.
        spread_risk_off: Yield spread below which the regime is 'Risk-Off'.
        vix_risk_on: VIX must be below this for 'Risk-On'.
        spread_risk_on: Yield spread must be above this for 'Risk-On'.
        vix_moderate: VIX must be below this for 'Moderate-Risk-On'.
        spread_moderate: Yield spread must be above this for
            'Moderate-Risk-On'.

    Returns:
        One of 'Unknown', 'Risk-Off', 'Risk-On', 'Moderate-Risk-On' or
        'Neutral'. All comparisons are strict, so a value exactly on a
        threshold does not trigger that rule. A missing or NaN input
        yields 'Unknown'.
    """
    # ``.get`` with a default never raises KeyError, so an absent column is
    # handled by the NaN check below rather than by an exception handler.
    vix = row.get('vixcls', np.nan)
    yield_spread = row.get('t10y2y', np.nan)

    # Handle missing values first
    if pd.isna(vix) or pd.isna(yield_spread):
        return 'Unknown'

    # Risk-Off: either indicator alone is enough.
    if vix > vix_risk_off or yield_spread < spread_risk_off:
        return 'Risk-Off'

    # Risk-On: both indicators must agree.
    if vix < vix_risk_on and yield_spread > spread_risk_on:
        return 'Risk-On'

    # Moderate Risk-On: looser version of the Risk-On rule.
    if vix < vix_moderate and yield_spread > spread_moderate:
        return 'Moderate-Risk-On'

    return 'Neutral'

def main():
    """Run the regime pipeline: load, classify, save, and print a summary.

    Loads the merged dataset, labels every row with ``classify_regime``,
    writes the result to ``features_with_regime.parquet`` under
    ``PROCESSED_PATH`` and prints the date range, row count and regime
    distribution. Any failure is reported and re-raised; the total runtime
    is printed in all cases.
    """
    print("🏁 Starting market regime analysis...")
    started_at = datetime.now()

    try:
        # Build the dataset and label each row with its regime.
        frame = load_and_merge_data()
        frame['macro_regime'] = frame.apply(classify_regime, axis=1)

        # Persist the labelled dataset.
        destination = PROCESSED_PATH / 'features_with_regime.parquet'
        frame.to_parquet(destination)

        # Summary of what was written.
        print("\n✅ Analysis completed successfully!")
        dates = frame['date']
        print(f"📅 Date range: {dates.min()} to {dates.max()}")
        print(f"📦 Saved {len(frame)} records to {destination}")

        # Share of rows per regime, shown as percentages with one decimal.
        print("\nMarket Regime Distribution:")
        shares = frame['macro_regime'].value_counts(normalize=True)
        percentages = (shares * 100).round(1)
        print(percentages.astype(str) + '%')

    except Exception as error:
        print(f"❌ Analysis failed: {error}")
        raise
    finally:
        elapsed = datetime.now() - started_at
        print(f"\n⏱️ Total runtime: {elapsed}")

# Run the pipeline only when this module is executed directly, not on import.
if __name__ == '__main__':
    main()

🏁 Starting market regime analysis...

✅ Analysis completed successfully!
📅 Date range: 2022-01-03 00:00:00 to 2025-05-12 00:00:00
📦 Saved 359244 records to data/processed/features_with_regime.parquet

Market Regime Distribution:
Risk-Off            66.5%
Neutral             32.5%
Moderate-Risk-On     1.1%
Name: macro_regime, dtype: object

⏱️ Total runtime: 0:00:02.727993
