# Regime Detection and Analysis

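This notebook fits Gaussian Mixture Model (GMM) and Hidden Markov Model (HMM) regime detectors to engineered SPY features, selects the number of GMM regimes by BIC, compares and characterizes the detected regimes, and saves the regime labels for later strategy analysis.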

In [None]:
import sys
from pathlib import Path

sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.data_loader import DataLoader
from src.data.data_preprocessor import DataPreprocessor
from src.data.feature_engineer import FeatureEngineer
from src.regime_detection.gmm_detector import GMMDetector
from src.regime_detection.hmm_detector import HMMDetector
from src.regime_detection.dtw_clustering import DTWClustering
from src.utils.plotting import plot_regimes

# Global plotting configuration for every figure in this notebook.
# Apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet first ...
plt.style.use('seaborn-v0_8-darkgrid')
# ... then set seaborn's default colour palette to 'husl' (done after the style sheet is applied).
sns.set_palette('husl')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load and Prepare Data

In [None]:
# Data-selection parameters, named once so the instrument and date range
# can be changed in a single place.
TICKER = 'SPY'
START_DATE = '2015-01-01'
END_DATE = '2023-12-31'

# Load data
loader = DataLoader()
raw_data = loader.load_data(TICKER, start_date=START_DATE, end_date=END_DATE)

# Preprocess
preprocessor = DataPreprocessor()
clean_data = preprocessor.clean_data(raw_data)

# Engineer features
engineer = FeatureEngineer()
features = engineer.create_features(clean_data)

# Extract regime-relevant features
regime_features = engineer.extract_regime_features(features)

print(f"Data shape: {features.shape}")
print(f"Regime features shape: {regime_features.shape}")
print(f"\nRegime features: {list(regime_features.columns)}")

# Fail fast with a readable message instead of an obscure indexing error later:
# the cells below predict regime labels from `regime_features` and then use
# those labels as boolean masks on `features`, which only works when both
# frames have the same number of rows.
if len(features) == 0:
    raise ValueError(
        f"No feature rows were produced for {TICKER} between {START_DATE} and {END_DATE}"
    )
if len(regime_features) != len(features):
    raise ValueError(
        f"regime_features has {len(regime_features)} rows but features has "
        f"{len(features)}; regime labels could not be mapped back onto features"
    )

## 2. GMM Regime Detection

In [None]:
# Candidate regime counts to compare
n_regimes_options = [2, 3, 4, 5]
gmm_results = {}

# Fit one GMM per candidate count and record its BIC on the same features
for n in n_regimes_options:
    candidate = GMMDetector(n_regimes=n, random_state=42)
    candidate.fit(regime_features)
    bic_value = candidate.get_bic(regime_features)
    gmm_results[n] = {'detector': candidate, 'bic': bic_value}
    print(f"N={n}: BIC={bic_value:.2f}")

# BIC curve across the candidate counts, in the order they were tried
bic_scores = [gmm_results[n]['bic'] for n in n_regimes_options]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_regimes_options, bic_scores, marker='o', linewidth=2, markersize=8)
ax.set_xlabel('Number of Regimes', fontsize=12)
ax.set_ylabel('BIC Score', fontsize=12)
ax.set_title('GMM Model Selection (Lower BIC is Better)', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.show()

# The candidate with the smallest BIC wins (the first one tried wins a tie)
optimal_n = min(gmm_results, key=lambda k: gmm_results[k]['bic'])
print(f"\nOptimal number of regimes: {optimal_n}")

## 3. Detect and Visualize Regimes

In [None]:
# Retrieve the BIC-selected detector and label every observation with it
gmm_detector = gmm_results[optimal_n]['detector']
gmm_regimes = gmm_detector.predict(regime_features)

# Per-regime summary statistics as reported by the detector itself
regime_stats = gmm_detector.get_regime_statistics(
    regime_features,
    returns=features['returns'],
)

print("\nRegime Statistics:")
for regime_id, stats in regime_stats.items():
    print(f"\nRegime {regime_id}:")
    print(f"  Count: {stats['count']}")
    print(f"  Frequency: {stats['frequency']:.2%}")
    print(f"  Avg Return: {stats['mean_return']:.4f}")
    print(f"  Volatility: {stats['volatility']:.4f}")

# Two stacked panels sharing the date axis: price on top, returns below
fig, axes = plt.subplots(2, 1, figsize=(15, 10), sharex=True)
price_ax, returns_ax = axes

# (axis, column of `features` to plot, panel title, y-axis label)
panels = [
    (price_ax, 'close', 'Price with GMM Regimes', 'Price ($)'),
    (returns_ax, 'returns', 'Returns with GMM Regimes', 'Return'),
]

for ax, column, title, ylabel in panels:
    # One scatter series per regime so each regime gets its own colour and legend entry
    for regime in range(optimal_n):
        mask = gmm_regimes == regime
        ax.scatter(features.index[mask], features[column][mask],
                   label=f'Regime {regime}', alpha=0.6, s=2)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=12)

# Extras that only apply to the returns panel: x label and a zero reference line
returns_ax.set_xlabel('Date', fontsize=12)
returns_ax.axhline(y=0, color='black', linestyle='--', alpha=0.3)

for ax in axes:
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. HMM Regime Detection

In [None]:
# Number of HMM regimes, defined once so the model, the heatmap tick labels
# and the plotting loop below can never disagree.
n_hmm_regimes = 3

# Fit HMM on the same regime features used for the GMM
hmm_detector = HMMDetector(n_regimes=n_hmm_regimes, random_state=42)
hmm_detector.fit(regime_features)
hmm_regimes = hmm_detector.predict(regime_features)

print(f"HMM detected {hmm_regimes.nunique()} unique regimes")

# Get transition probabilities
trans_prob = hmm_detector.get_transition_probabilities()

print("\nTransition Probability Matrix:")
print(trans_prob)

# Visualize transition matrix; rows are labelled "From i" and columns "To i"
plt.figure(figsize=(8, 6))
sns.heatmap(trans_prob, annot=True, fmt='.3f', cmap='Blues',
            xticklabels=[f'To {i}' for i in range(n_hmm_regimes)],
            yticklabels=[f'From {i}' for i in range(n_hmm_regimes)],
            cbar_kws={'label': 'Probability'})
plt.title('HMM Regime Transition Probabilities', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Compare HMM vs GMM regimes on two stacked price panels sharing the date axis
fig, axes = plt.subplots(2, 1, figsize=(15, 8), sharex=True)

# (axis, panel title, regime labels, number of regimes to draw)
comparison_panels = [
    (axes[0], 'GMM Regimes', gmm_regimes, optimal_n),
    (axes[1], 'HMM Regimes', hmm_regimes, n_hmm_regimes),
]

for ax, title, labels, n_states in comparison_panels:
    # Faint price line as background; it has no label, so it stays out of the legend
    ax.plot(features.index, features['close'], color='lightgray', alpha=0.5, linewidth=0.5)
    for regime in range(n_states):
        mask = labels == regime
        ax.scatter(features.index[mask], features['close'][mask],
                   label=f'Regime {regime}', alpha=0.7, s=3)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel('Price ($)', fontsize=12)
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)

# Only the bottom panel needs the x label because the x axis is shared
axes[1].set_xlabel('Date', fontsize=12)

plt.tight_layout()
plt.show()

## 5. Regime Characteristics Analysis

In [None]:
# Shared per-regime helpers, computed once and reused by the table and all four plots
regime_masks = {regime: gmm_regimes == regime for regime in range(optimal_n)}
regime_tick_labels = [f'R{i}' for i in range(optimal_n)]
regime_palette = sns.color_palette('husl', optimal_n)

# Analyze characteristics of each GMM regime
regime_characteristics = {}

for regime, mask in regime_masks.items():
    regime_data = features[mask]

    regime_characteristics[regime] = {
        'avg_return': regime_data['returns'].mean(),
        'volatility': regime_data['returns'].std(),
        'avg_volume': regime_data['volume'].mean(),
        'periods': len(regime_data),
        'frequency': len(regime_data) / len(features)
    }

# Create comparison dataframe (one row per regime after the transpose)
comparison_df = pd.DataFrame(regime_characteristics).T
print("\nRegime Characteristics:")
display(comparison_df)

# Visualize regime characteristics
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Returns by regime
returns_by_regime = [features[mask]['returns'].dropna() for mask in regime_masks.values()]
# Tick labels are set on the axis rather than through boxplot's `labels=` keyword:
# that keyword was renamed to `tick_labels` in Matplotlib 3.9 and the old name is
# deprecated, whereas set_xticklabels works on every version.
axes[0, 0].boxplot(returns_by_regime)
axes[0, 0].set_xticklabels(regime_tick_labels)
axes[0, 0].set_title('Returns Distribution by Regime', fontsize=12, fontweight='bold')
axes[0, 0].set_ylabel('Return', fontsize=10)
axes[0, 0].axhline(y=0, color='r', linestyle='--', alpha=0.3)
axes[0, 0].grid(True, alpha=0.3)

# Volatility by regime (reuses the values already computed for the table above)
vol_by_regime = [regime_characteristics[r]['volatility'] for r in range(optimal_n)]
axes[0, 1].bar(range(optimal_n), vol_by_regime, color=regime_palette)
axes[0, 1].set_title('Volatility by Regime', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Regime', fontsize=10)
axes[0, 1].set_ylabel('Volatility', fontsize=10)
axes[0, 1].set_xticks(range(optimal_n))
axes[0, 1].set_xticklabels(regime_tick_labels)
axes[0, 1].grid(True, alpha=0.3)

# Regime duration: flag every row whose label differs from the previous row
# (the first row's diff is NaN, which compares != 0, so it also starts a run),
# number the runs with a cumulative sum, then count the rows in each run.
regime_changes = (gmm_regimes.diff() != 0).astype(int)
regime_durations = regime_changes.groupby(regime_changes.cumsum()).size()
mean_duration = regime_durations.mean()
axes[1, 0].hist(regime_durations, bins=30, edgecolor='black', alpha=0.7)
axes[1, 0].set_title('Regime Duration Distribution', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Duration (days)', fontsize=10)
axes[1, 0].set_ylabel('Frequency', fontsize=10)
axes[1, 0].axvline(x=mean_duration, color='r', linestyle='--',
                   label=f'Mean: {mean_duration:.1f}')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Regime frequency
regime_counts = gmm_regimes.value_counts().sort_index()
axes[1, 1].pie(regime_counts, labels=[f'Regime {i}' for i in regime_counts.index],
               autopct='%1.1f%%', startangle=90, colors=regime_palette)
axes[1, 1].set_title('Regime Frequency', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\nAverage regime duration: {mean_duration:.1f} days")
print(f"Median regime duration: {regime_durations.median():.1f} days")

## 6. Save Regime Labels

In [None]:
# Save regime labels for strategy analysis: one row per observation with its
# date and the GMM and HMM regime label.
regime_df = pd.DataFrame({
    'date': features.index,
    'gmm_regime': gmm_regimes,
    'hmm_regime': hmm_regimes
})

# Create the output directory if it is missing (e.g. on a fresh checkout) so
# the write does not fail; the path is relative to the notebook's directory.
output_path = Path('../data/processed/regime_labels.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the dates are already stored in the explicit 'date' column
regime_df.to_csv(output_path, index=False)
print("Regime labels saved to data/processed/regime_labels.csv")

## Next Steps

- Proceed to `03_strategy_comparison.ipynb` for strategy backtesting
- Regimes have been identified and characterized
- Ready for regime-adaptive strategy selection