In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from hmmlearn import hmm
import yfinance as yf
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Hidden Markov Model Modifiable Parameters
hidden_states = 3
em_iterations = 75

# Fixed training period 
train_start_date = "2018-01-01"
train_end_date = "2024-07-31"

# Date parameters for yfinance API - download data from 1995 to present
start_date = "1995-01-01"
end_date = datetime.today().strftime("%Y-%m-%d")

print(f"Downloading market data from {start_date} to {end_date}...")
# Download SPY and VIX data using yfinance
df_spy = yf.download('SPY', start=start_date, end=end_date, auto_adjust=True)
df_vix = yf.download('^VIX', start=start_date, end=end_date)

# Fix the multi-level column structure
df_spy.columns = df_spy.columns.droplevel(1) if len(df_spy.columns.names) > 1 else df_spy.columns
df_vix.columns = df_vix.columns.droplevel(1) if len(df_vix.columns.names) > 1 else df_vix.columns

# Reset index to make Date a column
df_spy = df_spy.reset_index()
df_vix = df_vix.reset_index()

# Keep only the Date and Close columns from VIX
df_vix = df_vix[['Date', 'Close']].rename(columns={'Close': 'VIX'})

# Merge SPY and VIX data
df = pd.merge(df_spy, df_vix, on='Date', how='left')

# Forward fill any missing VIX values
df['VIX'] = df['VIX'].fillna(method='ffill')

# Add log of VIX
df['LogVIX'] = np.log(df['VIX'])

# Display first few rows to check the structure
print("DataFrame structure:")
print("Data shape:", df.shape)
print("Data columns:", df.columns.tolist())
df.head()

Downloading market data from 1995-01-01 to 2025-04-13...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

YF.download() has changed argument auto_adjust default to True
DataFrame structure:
Data shape: (7621, 8)
Data columns: ['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'VIX', 'LogVIX']





Price,Date,Close,High,Low,Open,Volume,VIX,LogVIX
0,1995-01-03,26.815863,26.852471,26.76095,26.770102,324300,14.25,2.656757
1,1995-01-04,26.943991,26.943991,26.797556,26.934839,351800,13.53,2.604909
2,1995-01-05,26.943991,27.008056,26.916534,26.962295,89800,13.5,2.60269
3,1995-01-06,26.97146,27.090439,26.889091,26.998917,448400,13.13,2.5749
4,1995-01-09,26.998922,26.998922,26.944009,26.962314,36800,13.33,2.590017


In [3]:
# Function to calculate technical indicators
def calculate_indicators(data):
    # Create a copy of the dataframe to avoid modifying original
    df_copy = data.copy()
    
    # Volatility is computed by obtaining variance between current close and prices of past 10 days
    volatility = []
    # MA is the 10 day SMA
    ma = []
    # Return is the single-day percentage return
    returns = []
    ma_sum = 0
    
    # Normalize LogVIX relative to its recent history (10-day window)
    log_vix_ratio = []
    
    # Warming up data for calculations
    for i in range(0, 10):
        volatility.append(0)
        ma.append(0)
        returns.append(0)
        log_vix_ratio.append(0)
        ma_sum += df_copy['Close'].iloc[i]
    
    # Filling in data for return, moving average, and volatility
    for ind in range(len(df_copy)):
        if ind >= 10:
            # SPY indicators
            tail_close = df_copy['Close'].iloc[ind-10]
            prev_close = df_copy['Close'].iloc[ind-1]
            head_close = df_copy['Close'].iloc[ind]
            ma_sum = (ma_sum - tail_close + head_close)
            ma_curr = ma_sum/10
            ma.append(ma_curr)
            returns.append(((head_close-prev_close)/prev_close)*100)
            
            # Computing Volatility
            vol_sum = 0
            for i in range(0, 10):
                curr_vol = abs(ma_curr - df_copy['Close'].iloc[ind-i])
                vol_sum += (curr_vol ** 2)
            volatility.append(vol_sum/10)
            
            # LogVIX-based indicator: current LogVIX relative to 10-day average
            log_vix_10day_avg = sum(df_copy['LogVIX'].iloc[ind-10:ind]) / 10
            log_vix_ratio.append(df_copy['LogVIX'].iloc[ind] / log_vix_10day_avg if log_vix_10day_avg > 0 else 1)
    
    # Adding columns to dataframe
    df_copy['Volatility'] = volatility
    df_copy['MA'] = ma
    df_copy['Return'] = returns
    df_copy['LogVIX_Ratio'] = log_vix_ratio
    
    return df_copy

# Calculate indicators for the entire dataset
df = calculate_indicators(df)
df.tail(5)

Price,Date,Close,High,Low,Open,Volume,VIX,LogVIX,Volatility,MA,Return,LogVIX_Ratio
7616,2025-04-07,504.380005,523.169983,481.799988,489.190002,256611400,46.98,3.849722,599.447177,549.803006,-0.178118,1.238541
7617,2025-04-08,496.480011,524.97998,489.160004,521.859985,165816600,52.330002,3.95757,755.574688,541.905005,-1.566278,1.233988
7618,2025-04-09,548.619995,548.619995,493.049988,493.440002,241867300,33.619999,3.515121,684.886736,539.908002,10.50193,1.059188
7619,2025-04-10,524.580017,533.5,509.320007,532.169983,162331200,40.720001,3.706719,616.487105,535.658002,-4.3819,1.096872
7620,2025-04-11,533.940002,536.429993,520.070007,523.01001,97741700,37.560001,3.62594,572.0568,533.486005,1.784282,1.0488


In [4]:
# Function to train HMM model on specified period
def train_hmm_model(data, start_date, end_date, n_states=3, n_iter=75):
    # Filter data to training period
    training = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    print(f"Training model on data from {start_date} to {end_date}")
    print(f"Training data shape: {training.shape}")
    
    # Prepare observations for HMM (using Volatility, Return, and LogVIX_Ratio)
    obs = np.column_stack([
        training['Volatility'].values, 
        training['Return'].values,
        training['LogVIX_Ratio'].values
    ])
    
    # Create and train the HMM model
    model = hmm.GaussianHMM(n_components=n_states, covariance_type="full", n_iter=n_iter)
    model.fit(obs)
    
    # Get predictions for training data
    predictions = model.predict(obs)
    
    # Analyze regime characteristics
    regime_vol = [0] * n_states
    regime_ret = [0] * n_states
    regime_vix = [0] * n_states
    regime_logvix = [0] * n_states
    regime_count = [0] * n_states
    
    for i in range(len(predictions)):
        regime = predictions[i]
        regime_count[regime] += 1
        regime_vol[regime] += training['Volatility'].iloc[i]
        regime_ret[regime] += training['Return'].iloc[i]
        regime_vix[regime] += training['VIX'].iloc[i]
        regime_logvix[regime] += training['LogVIX'].iloc[i]
    
    # Calculate averages
    for i in range(n_states):
        if regime_count[i] > 0:  # Prevent division by zero
            regime_vol[i] = regime_vol[i] / regime_count[i]
            regime_ret[i] = regime_ret[i] / regime_count[i]
            regime_vix[i] = regime_vix[i] / regime_count[i]
            regime_logvix[i] = regime_logvix[i] / regime_count[i]
    
    # Assign labels to regimes (Bull, Bear, Neutral) based on characteristics
    regime_labels = [""] * n_states
    
    # Create a scoring system for each regime
    bull_scores = []
    bear_scores = []
    neutral_scores = []
    
    for i in range(n_states):
        # Bull regime: high returns, lower VIX
        bull_score = 0
        if regime_ret[i] > 0.05:
            bull_score += 2
        elif regime_ret[i] > 0:
            bull_score += 1
        
        if regime_logvix[i] < 2.8:  # Log(16.5) ≈ 2.8
            bull_score += 2
        elif regime_logvix[i] < 3.0:  # Log(20) ≈ 3.0
            bull_score += 1
        
        # Bear regime: negative returns, higher VIX
        bear_score = 0
        if regime_ret[i] < -0.1:
            bear_score += 2
        elif regime_ret[i] < 0:
            bear_score += 1
        
        if regime_logvix[i] > 3.2:  # Log(25) ≈ 3.2
            bear_score += 2
        elif regime_logvix[i] > 3.0:
            bear_score += 1
        
        # Neutral regime: modest returns, moderate VIX
        neutral_score = 0
        if -0.05 < regime_ret[i] < 0.05:
            neutral_score += 2
        elif -0.1 < regime_ret[i] < 0.1:
            neutral_score += 1
        
        if 2.8 <= regime_logvix[i] <= 3.2:
            neutral_score += 2
        elif 2.7 <= regime_logvix[i] <= 3.3:
            neutral_score += 1
        
        bull_scores.append(bull_score)
        bear_scores.append(bear_score)
        neutral_scores.append(neutral_score)
    
    # Assign labels based on highest score
    # We need to ensure each label is used only once
    labels_to_assign = ["Bull", "Bear", "Neutral"]
    scores = [
        (i, max(bull_scores[i], bear_scores[i], neutral_scores[i]), 
         "Bull" if bull_scores[i] >= max(bear_scores[i], neutral_scores[i]) else
         "Bear" if bear_scores[i] >= max(bull_scores[i], neutral_scores[i]) else
         "Neutral") 
        for i in range(n_states)
    ]
    
    # Sort regimes by their maximum score (descending)
    scores.sort(key=lambda x: x[1], reverse=True)
    
    # Assign labels, handling ties by prioritizing the regime with the most clear characteristics
    assigned_labels = set()
    for regime_idx, _, preferred_label in scores:
        if preferred_label not in assigned_labels:
            regime_labels[regime_idx] = preferred_label
            assigned_labels.add(preferred_label)
        else:
            # Find the next best label that hasn't been assigned
            for label in labels_to_assign:
                if label not in assigned_labels:
                    regime_labels[regime_idx] = label
                    assigned_labels.add(label)
                    break
    
    # Print regime characteristics with labels
    for i in range(n_states):
        print(f"Regime {i} [{regime_labels[i]}]")
        print(f"Avg Vol: {regime_vol[i]:.4f}")
        print(f"Avg Return: {regime_ret[i]:.4f}%")
        print(f"Avg VIX: {regime_vix[i]:.2f}")
        print(f"Avg LogVIX: {regime_logvix[i]:.4f}")
        print(f"Occurrence: {regime_count[i]} days")
        print(f"Classification scores: Bull={bull_scores[i]}, Bear={bear_scores[i]}, Neutral={neutral_scores[i]}\n")
    
    return model, training, predictions, regime_labels

# Train the model on the fixed period
model, training_data, train_predictions, regime_labels = train_hmm_model(df, train_start_date, train_end_date, 
                                                                        hidden_states, em_iterations)

Training model on data from 2018-01-01 to 2024-07-31
Training data shape: (1655, 12)
Regime 0 [Bull]
Avg Vol: 5.6103
Avg Return: 0.1097%
Avg VIX: 16.15
Avg LogVIX: 2.7581
Occurrence: 774 days
Classification scores: Bull=4, Bear=0, Neutral=1

Regime 1 [Bear]
Avg Vol: 100.1015
Avg Return: -0.2328%
Avg VIX: 32.46
Avg LogVIX: 3.4170
Occurrence: 167 days
Classification scores: Bull=0, Bear=4, Neutral=0

Regime 2 [Neutral]
Avg Vol: 26.4383
Avg Return: 0.0700%
Avg VIX: 21.25
Avg LogVIX: 3.0173
Occurrence: 714 days
Classification scores: Bull=2, Bear=1, Neutral=3



In [5]:
# Visualize the training data with regime classifications using Plotly
training_with_predictions = training_data.copy()
training_with_predictions['Regime'] = train_predictions
training_with_predictions['Regime_Label'] = [regime_labels[r] for r in train_predictions]

# Create a categorical color map for regimes
regime_colors = px.colors.qualitative.Set2[:hidden_states]

# Create subplot figures for SPY, VIX, and LogVIX
fig = make_subplots(rows=3, cols=1, 
                   shared_xaxes=True,
                   vertical_spacing=0.1,
                   subplot_titles=('SPY Close Price by Regime', 'VIX by Regime', 'LogVIX by Regime'))

# SPY price by regime
for i, regime in enumerate(range(hidden_states)):
    regime_data = training_with_predictions[training_with_predictions['Regime'] == regime]
    fig.add_trace(
        go.Scatter(
            x=regime_data['Date'], 
            y=regime_data['Close'],
            mode='markers',
            marker=dict(color=regime_colors[i], size=6),
            name=f'Regime {regime}: {regime_labels[regime]}',
            showlegend=True
        ),
        row=1, col=1
    )

# VIX by regime
for i, regime in enumerate(range(hidden_states)):
    regime_data = training_with_predictions[training_with_predictions['Regime'] == regime]
    fig.add_trace(
        go.Scatter(
            x=regime_data['Date'], 
            y=regime_data['VIX'],
            mode='markers',
            marker=dict(color=regime_colors[i], size=6),
            name=f'Regime {regime}: {regime_labels[regime]}',
            showlegend=False
        ),
        row=2, col=1
    )

# LogVIX by regime
for i, regime in enumerate(range(hidden_states)):
    regime_data = training_with_predictions[training_with_predictions['Regime'] == regime]
    fig.add_trace(
        go.Scatter(
            x=regime_data['Date'], 
            y=regime_data['LogVIX'],
            mode='markers',
            marker=dict(color=regime_colors[i], size=6),
            name=f'Regime {regime}: {regime_labels[regime]}',
            showlegend=False
        ),
        row=3, col=1
    )

fig.update_layout(
    height=1000,
    title_text=f'Market Regimes with SPY, VIX, and LogVIX (Training Period: {train_start_date} to {train_end_date})',
    template='plotly_white',
    legend_title='Regime',
    hovermode='closest'
)

fig.update_xaxes(title_text="Date", row=3, col=1)
fig.update_yaxes(title_text="SPY Price", row=1, col=1)
fig.update_yaxes(title_text="VIX", row=2, col=1)
fig.update_yaxes(title_text="LogVIX", row=3, col=1)

fig.show()

# Visualize transition probabilities with Plotly
transition_matrix = model.transmat_
regime_labels_with_numbers = [f"Regime {i}: {regime_labels[i]}" for i in range(hidden_states)]

# Create the heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=transition_matrix,
    x=regime_labels_with_numbers,
    y=regime_labels_with_numbers,
    colorscale='Blues',
    text=np.round(transition_matrix, 2),
    texttemplate="%{text:.2f}",
    textfont={"size": 14}
))

fig_heatmap.update_layout(
    title='Regime Transition Probabilities',
    xaxis_title='To Regime',
    yaxis_title='From Regime',
    width=700,
    height=600,
    template='plotly_white'
)

fig_heatmap.show()

# Show stationary distribution
stationary_dist = model.get_stationary_distribution()
print("\nStationary Distribution:")
for i in range(hidden_states):
    print(f"Regime {i} [{regime_labels[i]}]: {stationary_dist[i]*100:.2f}%")

# Create a pie chart for the stationary distribution
fig_pie = px.pie(
    values=stationary_dist * 100,
    names=regime_labels_with_numbers,
    title='Stationary Distribution of Regimes',
    color_discrete_sequence=regime_colors
)

fig_pie.update_traces(textinfo='percent+label', textposition='inside')
fig_pie.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig_pie.show()

# Create a scatter plot of Volatility vs Return colored by regime
fig_scatter = px.scatter(
    training_with_predictions,
    x='LogVIX',
    y='Return',
    color='Regime_Label',
    color_discrete_sequence=regime_colors,
    labels={'LogVIX': 'Log(VIX)', 'Return': 'Return (%)'},
    title='Return vs LogVIX by Market Regime',
    opacity=0.7,
    hover_data=['Date', 'Close', 'VIX']
)

fig_scatter.update_layout(
    height=600,
    width=800,
    template='plotly_white',
    legend_title='Market Regime'
)

fig_scatter.show()


Stationary Distribution:
Regime 0 [Bull]: 45.48%
Regime 1 [Bear]: 10.53%
Regime 2 [Neutral]: 43.99%


In [6]:
# Function to predict regimes for a specific date range
def predict_regimes(model, data, start_date, end_date, regime_labels):
    """
    Predict market regimes for a specific date range using the trained HMM model
    
    Parameters:
    -----------
    model : hmm.GaussianHMM
        The trained HMM model
    data : DataFrame
        The full dataset with calculated indicators
    start_date : str
        Start date for prediction period in 'YYYY-MM-DD' format
    end_date : str
        End date for prediction period in 'YYYY-MM-DD' format
    regime_labels : list
        Labels for each regime (e.g., ["Bull", "Bear", "Neutral"])
        
    Returns:
    --------
    DataFrame with date, close price, VIX, LogVIX, and predicted regime
    """
    # Filter data for prediction period
    pred_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    if len(pred_data) == 0:
        print(f"No data available for period {start_date} to {end_date}")
        if end_date > data['Date'].max().strftime('%Y-%m-%d'):
            print("NOTE: Prediction period extends into the future")
            # Generate future dates for forecasting
            last_date = data['Date'].max()
            future_end = datetime.strptime(end_date, '%Y-%m-%d')
            
            # Get the most recent 30 days of data for calculating indicators
            recent_data = data.tail(30).copy()
            
            # For future dates, we'll extend from the last value and assume zero returns
            # This is a simplification - in reality you might want to use a forecasting model
            current_date = last_date + timedelta(days=1)
            while current_date <= future_end:
                if current_date.weekday() < 5:  # Only include weekdays
                    new_row = {
                        'Date': current_date,
                        'Close': recent_data['Close'].iloc[-1],  # Use the last known close price
                        'High': recent_data['Close'].iloc[-1],
                        'Low': recent_data['Close'].iloc[-1],
                        'Open': recent_data['Close'].iloc[-1],
                        'Volume': recent_data['Volume'].mean(),  # Use average volume
                        'VIX': recent_data['VIX'].mean(),  # Use average VIX
                        'LogVIX': np.log(recent_data['VIX'].mean()),  # Calculate LogVIX from mean VIX
                        'Volatility': recent_data['Volatility'].mean(),  # Use average volatility
                        'MA': recent_data['MA'].iloc[-1],  # Use the last MA
                        'Return': 0,  # Assume zero returns for future dates
                        'LogVIX_Ratio': 1.0  # Assume neutral LogVIX ratio
                    }
                    recent_data = pd.concat([recent_data, pd.DataFrame([new_row])], ignore_index=True)
                current_date += timedelta(days=1)
            
            # Keep only the future dates we generated
            future_data = recent_data[recent_data['Date'] > last_date].copy()
            pred_data = future_data[(future_data['Date'] >= start_date) & (future_data['Date'] <= end_date)].copy()
    
    # Prepare observations for prediction
    obs = np.column_stack([
        pred_data['Volatility'].values, 
        pred_data['Return'].values,
        pred_data['LogVIX_Ratio'].values
    ])
    
    # Predict regimes
    predictions = model.predict(obs)
    
    # Add predictions to dataframe
    pred_data['Predicted_Regime'] = predictions
    pred_data['Regime_Label'] = [regime_labels[r] for r in predictions]
    
    print(f"Predicted regimes for period {start_date} to {end_date}")
    print(f"Data points: {len(pred_data)}")
    
    # Calculate regime distribution
    regime_counts = pd.Series(predictions).value_counts(normalize=True) * 100
    print("\nRegime Distribution:")
    for regime, percentage in regime_counts.items():
        print(f"Regime {regime} [{regime_labels[regime]}]: {percentage:.2f}%")
    
    return pred_data[['Date', 'Close', 'VIX', 'LogVIX', 'Volatility', 'Return', 'Predicted_Regime', 'Regime_Label']]

In [7]:
# Unified function for regime prediction and visualization
def analyze_market_regimes(start_date, end_date, title=None):
    """
    Comprehensive function to predict and visualize market regimes for any date range
    
    Parameters:
    -----------
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format
    title : str, optional
        Custom title for the plots
    
    Returns:
    --------
    DataFrame with prediction results
    """
    # Get predictions
    results = predict_regimes(model, df, start_date, end_date, regime_labels)
    
    if results is None or results.empty:
        print("No data available for the specified period")
        return None
    
    # Set plot title
    if title is None:
        title = f'Market Regimes from {start_date} to {end_date}'
    
    # Create subplot figures for SPY, VIX, and LogVIX
    fig = make_subplots(rows=3, cols=1, 
                       shared_xaxes=True,
                       vertical_spacing=0.1,
                       subplot_titles=('SPY Close Price by Regime', 'VIX by Regime', 'LogVIX by Regime'))
    
    # SPY price by regime with labels
    for i, regime in enumerate(sorted(results['Predicted_Regime'].unique())):
        regime_data = results[results['Predicted_Regime'] == regime]
        regime_label = regime_labels[regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'], 
                y=regime_data['Close'],
                mode='markers',
                marker=dict(color=regime_colors[i], size=6),
                name=f'Regime {regime}: {regime_label}',
                showlegend=True
            ),
            row=1, col=1
        )
    
    # Add a line for the price trend
    fig.add_trace(
        go.Scatter(
            x=results['Date'],
            y=results['Close'],
            mode='lines',
            line=dict(color='rgba(0,0,0,0.3)'),
            name='SPY Price',
            showlegend=True
        ),
        row=1, col=1
    )
    
    # VIX by regime with labels
    for i, regime in enumerate(sorted(results['Predicted_Regime'].unique())):
        regime_data = results[results['Predicted_Regime'] == regime]
        regime_label = regime_labels[regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'], 
                y=regime_data['VIX'],
                mode='markers',
                marker=dict(color=regime_colors[i], size=6),
                name=f'Regime {regime}: {regime_label}',
                showlegend=False
            ),
            row=2, col=1
        )
    
    # Add a line for the VIX trend
    fig.add_trace(
        go.Scatter(
            x=results['Date'],
            y=results['VIX'],
            mode='lines',
            line=dict(color='rgba(0,0,0,0.3)'),
            name='VIX',
            showlegend=True
        ),
        row=2, col=1
    )
    
    # LogVIX by regime with labels
    for i, regime in enumerate(sorted(results['Predicted_Regime'].unique())):
        regime_data = results[results['Predicted_Regime'] == regime]
        regime_label = regime_labels[regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'], 
                y=regime_data['LogVIX'],
                mode='markers',
                marker=dict(color=regime_colors[i], size=6),
                name=f'Regime {regime}: {regime_label}',
                showlegend=False
            ),
            row=3, col=1
        )
    
    # Add a line for the LogVIX trend
    fig.add_trace(
        go.Scatter(
            x=results['Date'],
            y=results['LogVIX'],
            mode='lines',
            line=dict(color='rgba(0,0,0,0.3)'),
            name='LogVIX',
            showlegend=True
        ),
        row=3, col=1
    )
    
    fig.update_layout(
        height=1000,
        title_text=title,
        template='plotly_white',
        legend_title='Regime',
        hovermode='closest'
    )
    
    fig.update_xaxes(title_text="Date", row=3, col=1)
    fig.update_yaxes(title_text="SPY Price", row=1, col=1)
    fig.update_yaxes(title_text="VIX", row=2, col=1)
    fig.update_yaxes(title_text="LogVIX", row=3, col=1)
    
    fig.show()
    
    # Distribution of regimes pie chart
    regime_percentage_data = []
    for regime, label in enumerate(regime_labels):
        if regime in results['Predicted_Regime'].values:
            percentage = (results['Predicted_Regime'] == regime).mean() * 100
            regime_percentage_data.append({
                'Regime': f'Regime {regime}: {label}',
                'Percentage': percentage
            })
    
    regime_df = pd.DataFrame(regime_percentage_data)
    
    fig_pie = px.pie(
        regime_df,
        values='Percentage',
        names='Regime',
        title=f'Percentage of Time in Each Regime ({start_date} to {end_date})',
        color_discrete_sequence=regime_colors
    )
    
    fig_pie.update_traces(textinfo='percent+label', textposition='inside')
    fig_pie.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
    fig_pie.show()
    
    # Scatter plot of LogVIX vs Return colored by regime with labels
    fig_scatter = px.scatter(
        results,
        x='LogVIX',
        y='Return',
        color='Regime_Label',
        color_discrete_sequence=regime_colors,
        title=f'LogVIX vs Return by Market Regime ({start_date} to {end_date})',
        labels={'LogVIX': 'Log(VIX)', 'Return': 'Return (%)', 'Regime_Label': 'Market Regime'},
        opacity=0.8,
        hover_data=['Date', 'Close', 'VIX']
    )
    
    fig_scatter.update_layout(
        legend_title='Market Regime',
        hovermode='closest',
        template='plotly_white',
        height=600,
        width=900
    )
    
    fig_scatter.update_xaxes(rangemode='tozero')
    fig_scatter.show()
    
    # Add a summary of time spent in each regime
    regime_summary = results.groupby('Regime_Label').agg({
        'Close': ['mean', 'min', 'max', 'std'],
        'VIX': ['mean', 'min', 'max', 'std'],
        'Return': ['mean', 'min', 'max', 'std'],
        'Predicted_Regime': 'count'
    }).reset_index()
    
    regime_summary.columns = ['Regime_Label', 'Avg_Price', 'Min_Price', 'Max_Price', 'Std_Price',
                             'Avg_VIX', 'Min_VIX', 'Max_VIX', 'Std_VIX',
                             'Avg_Return', 'Min_Return', 'Max_Return', 'Std_Return', 'Days']
    
    regime_summary['Percentage'] = regime_summary['Days'] / len(results) * 100
    
    print("\nRegime Summary Statistics:")
    display(regime_summary[['Regime_Label', 'Days', 'Percentage', 'Avg_Return', 'Avg_VIX', 'Avg_Price']])
    
    return results

# Example: Analyze the most recent 3 months
from datetime import datetime, timedelta

today = datetime.today()
three_months_ago = today - timedelta(days=360)

start_date = three_months_ago.strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

recent_results = analyze_market_regimes(
    start_date, 
    end_date, 
    title=f'Market Regimes in Recent 3 Months ({start_date} to {end_date})'
)

Predicted regimes for period 2024-04-18 to 2025-04-13
Data points: 247

Regime Distribution:
Regime 2 [Neutral]: 56.28%
Regime 1 [Bear]: 23.08%
Regime 0 [Bull]: 20.65%



Regime Summary Statistics:


Unnamed: 0,Regime_Label,Days,Percentage,Avg_Return,Avg_VIX,Avg_Price
0,Bear,57,23.076923,-0.188246,22.465439,556.424563
1,Bull,51,20.647773,0.113033,15.773137,565.904257
2,Neutral,139,56.275304,0.104733,16.191511,560.369938


#HPF

In [9]:
# %%
# Function to implement a high-pass filter as described in Matthew Wang's article
def apply_high_pass_filter(data, window_size=10):
    """
    Implements a high-pass filter by subtracting a moving average (low frequency component)
    from the original signal to emphasize significant market movements.
    
    Parameters:
    -----------
    data : DataFrame
        Input data with financial time series
    window_size : int, optional
        Size of the moving average window, default is 10 days
        
    Returns:
    --------
    DataFrame with additional filtered data columns
    """
    # Create a copy of the dataframe to avoid modifying the original
    filtered_data = data.copy()
    
    # Calculate moving averages for SPY returns
    filtered_data['Return_MA'] = filtered_data['Return'].rolling(window=window_size).mean()
    
    # Apply high-pass filter to returns by subtracting the moving average
    # This removes low-frequency components and keeps high-frequency components
    filtered_data['HPF_Return'] = filtered_data['Return'] - filtered_data['Return_MA']
    
    # Also apply to LogVIX for consistent filtering
    filtered_data['LogVIX_MA'] = filtered_data['LogVIX'].rolling(window=window_size).mean()
    filtered_data['HPF_LogVIX'] = filtered_data['LogVIX'] - filtered_data['LogVIX_MA']
    
    # Similarly for volatility
    filtered_data['Volatility_MA'] = filtered_data['Volatility'].rolling(window=window_size).mean()
    filtered_data['HPF_Volatility'] = filtered_data['Volatility'] - filtered_data['Volatility_MA']
    
    # Forward fill any NaN values created by the rolling window
    filtered_data = filtered_data.fillna(method='ffill')
    
    # Backward fill any remaining NaN values at the beginning
    filtered_data = filtered_data.fillna(method='bfill')
    
    return filtered_data

# Apply the high-pass filter to our dataset
df_filtered = apply_high_pass_filter(df)

# %%
# Function to train HMM model using high-pass filtered data
def train_hmm_model_with_hpf(data, start_date, end_date, n_states=3, n_iter=75):
    """
    Train HMM model using high-pass filtered features instead of raw features
    """
    # Filter data to training period
    training = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    print(f"Training HPF model on data from {start_date} to {end_date}")
    print(f"Training data shape: {training.shape}")
    
    # Prepare observations for HMM using high-pass filtered features
    obs = np.column_stack([
        training['HPF_Volatility'].values, 
        training['HPF_Return'].values,
        training['HPF_LogVIX'].values
    ])
    
    # Create and train the HMM model
    model = hmm.GaussianHMM(n_components=n_states, covariance_type="full", n_iter=n_iter)
    model.fit(obs)
    
    # Get predictions for training data
    predictions = model.predict(obs)
    
    # Analyze regime characteristics using the original (unfiltered) data
    # This makes interpretation easier while using the improved classifications
    regime_vol = [0] * n_states
    regime_ret = [0] * n_states
    regime_vix = [0] * n_states
    regime_logvix = [0] * n_states
    regime_count = [0] * n_states
    
    for i in range(len(predictions)):
        regime = predictions[i]
        regime_count[regime] += 1
        regime_vol[regime] += training['Volatility'].iloc[i]
        regime_ret[regime] += training['Return'].iloc[i]
        regime_vix[regime] += training['VIX'].iloc[i]
        regime_logvix[regime] += training['LogVIX'].iloc[i]
    
    # Calculate averages
    for i in range(n_states):
        if regime_count[i] > 0:  # Prevent division by zero
            regime_vol[i] = regime_vol[i] / regime_count[i]
            regime_ret[i] = regime_ret[i] / regime_count[i]
            regime_vix[i] = regime_vix[i] / regime_count[i]
            regime_logvix[i] = regime_logvix[i] / regime_count[i]
    
    # Same regime labeling logic as in your original function
    regime_labels = [""] * n_states
    bull_scores = []
    bear_scores = []
    neutral_scores = []
    
    for i in range(n_states):
        # Bull regime: high returns, lower VIX
        bull_score = 0
        if regime_ret[i] > 0.05:
            bull_score += 2
        elif regime_ret[i] > 0:
            bull_score += 1
        
        if regime_logvix[i] < 2.8:  # Log(16.5) ≈ 2.8
            bull_score += 2
        elif regime_logvix[i] < 3.0:  # Log(20) ≈ 3.0
            bull_score += 1
        
        # Bear regime: negative returns, higher VIX
        bear_score = 0
        if regime_ret[i] < -0.1:
            bear_score += 2
        elif regime_ret[i] < 0:
            bear_score += 1
        
        if regime_logvix[i] > 3.2:  # Log(25) ≈ 3.2
            bear_score += 2
        elif regime_logvix[i] > 3.0:
            bear_score += 1
        
        # Neutral regime: modest returns, moderate VIX
        neutral_score = 0
        if -0.05 < regime_ret[i] < 0.05:
            neutral_score += 2
        elif -0.1 < regime_ret[i] < 0.1:
            neutral_score += 1
        
        if 2.8 <= regime_logvix[i] <= 3.2:
            neutral_score += 2
        elif 2.7 <= regime_logvix[i] <= 3.3:
            neutral_score += 1
        
        bull_scores.append(bull_score)
        bear_scores.append(bear_score)
        neutral_scores.append(neutral_score)
    
    # Assign labels based on highest score
    labels_to_assign = ["Bull", "Bear", "Neutral"]
    scores = [
        (i, max(bull_scores[i], bear_scores[i], neutral_scores[i]), 
         "Bull" if bull_scores[i] >= max(bear_scores[i], neutral_scores[i]) else
         "Bear" if bear_scores[i] >= max(bull_scores[i], neutral_scores[i]) else
         "Neutral") 
        for i in range(n_states)
    ]
    
    # Sort regimes by their maximum score (descending)
    scores.sort(key=lambda x: x[1], reverse=True)
    
    # Assign labels, handling ties by prioritizing the regime with the most clear characteristics
    assigned_labels = set()
    for regime_idx, _, preferred_label in scores:
        if preferred_label not in assigned_labels:
            regime_labels[regime_idx] = preferred_label
            assigned_labels.add(preferred_label)
        else:
            # Find the next best label that hasn't been assigned
            for label in labels_to_assign:
                if label not in assigned_labels:
                    regime_labels[regime_idx] = label
                    assigned_labels.add(label)
                    break
    
    # Print regime characteristics with labels
    for i in range(n_states):
        print(f"Regime {i} [{regime_labels[i]}]")
        print(f"Avg Vol: {regime_vol[i]:.4f}")
        print(f"Avg Return: {regime_ret[i]:.4f}%")
        print(f"Avg VIX: {regime_vix[i]:.2f}")
        print(f"Avg LogVIX: {regime_logvix[i]:.4f}")
        print(f"Occurrence: {regime_count[i]} days")
        print(f"Classification scores: Bull={bull_scores[i]}, Bear={bear_scores[i]}, Neutral={neutral_scores[i]}\n")
    
    return model, training, predictions, regime_labels

# Train a new model using high-pass filtered data
hpf_model, hpf_training_data, hpf_train_predictions, hpf_regime_labels = train_hmm_model_with_hpf(
    df_filtered, train_start_date, train_end_date, hidden_states, em_iterations
)

# %%
# Function to predict regimes using HPF model
def predict_regimes_with_hpf(model, data, start_date, end_date, regime_labels):
    """
    Predict market regimes using the HPF-trained model
    """
    # Filter data for prediction period
    pred_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    if len(pred_data) == 0:
        print(f"No data available for period {start_date} to {end_date}")
        return None
    
    # Prepare observations using HPF features
    obs = np.column_stack([
        pred_data['HPF_Volatility'].values, 
        pred_data['HPF_Return'].values,
        pred_data['HPF_LogVIX'].values
    ])
    
    # Predict regimes
    predictions = model.predict(obs)
    
    # Add predictions to dataframe
    pred_data['Predicted_Regime'] = predictions
    pred_data['Regime_Label'] = [regime_labels[r] for r in predictions]
    
    print(f"Predicted regimes for period {start_date} to {end_date} using HPF model")
    print(f"Data points: {len(pred_data)}")
    
    # Calculate regime distribution
    regime_counts = pd.Series(predictions).value_counts(normalize=True) * 100
    print("\nRegime Distribution:")
    for regime, percentage in regime_counts.items():
        print(f"Regime {regime} [{regime_labels[regime]}]: {percentage:.2f}%")
    
    return pred_data[['Date', 'Close', 'VIX', 'LogVIX', 'Volatility', 'Return', 'HPF_Return', 'HPF_LogVIX', 'Predicted_Regime', 'Regime_Label']]

# %%
# Unified function for comparative regime analysis
def compare_regime_models(start_date, end_date, title=None):
    """
    Compare standard HMM and HPF-enhanced HMM regime predictions
    
    Parameters:
    -----------
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format
    title : str, optional
        Custom title for the plots
    
    Returns:
    --------
    Tuple of DataFrames with prediction results (standard, HPF)
    """
    # Get predictions from both models
    standard_results = predict_regimes(model, df, start_date, end_date, regime_labels)
    hpf_results = predict_regimes_with_hpf(hpf_model, df_filtered, start_date, end_date, hpf_regime_labels)
    
    if standard_results is None or standard_results.empty or hpf_results is None or hpf_results.empty:
        print("No data available for the specified period")
        return None, None
    
    # Set plot title
    if title is None:
        title = f'Comparison of Market Regime Models ({start_date} to {end_date})'
    
    # Create subplot figures for comparing the models
    fig = make_subplots(rows=2, cols=1, 
                       shared_xaxes=True,
                       vertical_spacing=0.1,
                       subplot_titles=('Standard HMM Model', 'High-Pass Filtered HMM Model'))
    
    # Plot for Standard HMM Model
    fig.add_trace(
        go.Scatter(
            x=standard_results['Date'],
            y=standard_results['Close'],
            mode='lines',
            line=dict(color='rgba(0,0,0,0.3)'),
            name='SPY Price',
            showlegend=True
        ),
        row=1, col=1
    )
    
    # Add regime classifications as colored markers
    for i, regime in enumerate(sorted(standard_results['Predicted_Regime'].unique())):
        regime_data = standard_results[standard_results['Predicted_Regime'] == regime]
        regime_label = regime_labels[regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'], 
                y=regime_data['Close'],
                mode='markers',
                marker=dict(color=regime_colors[i], size=6),
                name=f'Standard: {regime_label}',
                showlegend=True
            ),
            row=1, col=1
        )
    
    # Plot for HPF HMM Model
    fig.add_trace(
        go.Scatter(
            x=hpf_results['Date'],
            y=hpf_results['Close'],
            mode='lines',
            line=dict(color='rgba(0,0,0,0.3)'),
            name='SPY Price',
            showlegend=False
        ),
        row=2, col=1
    )
    
    # Add regime classifications as colored markers
    for i, regime in enumerate(sorted(hpf_results['Predicted_Regime'].unique())):
        regime_data = hpf_results[hpf_results['Predicted_Regime'] == regime]
        regime_label = hpf_regime_labels[regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'], 
                y=regime_data['Close'],
                mode='markers',
                marker=dict(color=regime_colors[i], size=6),
                name=f'HPF: {regime_label}',
                showlegend=True
            ),
            row=2, col=1
        )
    
    fig.update_layout(
        height=800,
        title_text=title,
        template='plotly_white',
        legend_title='Regime Models',
        hovermode='closest'
    )
    
    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="SPY Price (Standard HMM)", row=1, col=1)
    fig.update_yaxes(title_text="SPY Price (HPF HMM)", row=2, col=1)
    
    fig.show()
    
    # Create a DataFrame to compare regime persistence
    standard_regimes = standard_results['Regime_Label'].copy()
    hpf_regimes = hpf_results['Regime_Label'].copy()
    
    # Count regime changes (transitions)
    standard_changes = (standard_regimes != standard_regimes.shift(1)).sum()
    hpf_changes = (hpf_regimes != hpf_regimes.shift(1)).sum()
    
    # Calculate average regime duration
    standard_duration = len(standard_regimes) / (standard_changes if standard_changes > 0 else 1)
    hpf_duration = len(hpf_regimes) / (hpf_changes if hpf_changes > 0 else 1)
    
    # Print comparison statistics
    print("\nRegime Persistence Comparison:")
    print(f"{'Model':<15} {'Regime Changes':<15} {'Avg Days/Regime':<15}")
    print(f"{'-'*45}")
    print(f"{'Standard HMM':<15} {standard_changes:<15} {standard_duration:.2f}")
    print(f"{'HPF HMM':<15} {hpf_changes:<15} {hpf_duration:.2f}")
    
    # Comparison of regime distribution
    standard_dist = standard_results['Regime_Label'].value_counts(normalize=True) * 100
    hpf_dist = hpf_results['Regime_Label'].value_counts(normalize=True) * 100
    
    # Create a DataFrame for comparison
    comparison_df = pd.DataFrame({
        'Standard HMM (%)': standard_dist,
        'HPF HMM (%)': hpf_dist
    }).sort_index()
    
    print("\nRegime Distribution Comparison:")
    display(comparison_df)
    
    return standard_results, hpf_results

# %%
# Run a comparison for the most recent period
from datetime import datetime, timedelta

today = datetime.today()
three_months_ago = today - timedelta(days=90)  # Use 90 days for clearer visualization

start_date = three_months_ago.strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

standard_results, hpf_results = compare_regime_models(
    start_date, 
    end_date, 
    title=f'Standard vs HPF Regime Models ({start_date} to {end_date})'
)

# Visualize the HPF signal vs original signal
if hpf_results is not None:
    # Create plot to show the effect of high-pass filtering
    fig = make_subplots(rows=2, cols=1, 
                       shared_xaxes=True,
                       vertical_spacing=0.1,
                       subplot_titles=('Original Returns', 'High-Pass Filtered Returns'))
    
    # Original returns
    fig.add_trace(
        go.Scatter(
            x=hpf_results['Date'],
            y=hpf_results['Return'],
            mode='lines',
            line=dict(color='blue'),
            name='Original Returns'
        ),
        row=1, col=1
    )
    
    # High-pass filtered returns
    fig.add_trace(
        go.Scatter(
            x=hpf_results['Date'],
            y=hpf_results['HPF_Return'],
            mode='lines',
            line=dict(color='red'),
            name='HPF Returns'
        ),
        row=2, col=1
    )
    
    fig.update_layout(
        height=600,
        title_text='Effect of High-Pass Filtering on Returns',
        template='plotly_white',
        hovermode='closest'
    )
    
    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Original Return (%)", row=1, col=1)
    fig.update_yaxes(title_text="HPF Return", row=2, col=1)
    
    fig.show()


Training HPF model on data from 2018-01-01 to 2024-07-31
Training data shape: (1655, 18)
Regime 0 [Bull]
Avg Vol: 7.0630
Avg Return: 0.0639%
Avg VIX: 15.74
Avg LogVIX: 2.7360
Occurrence: 654 days
Classification scores: Bull=4, Bear=0, Neutral=2

Regime 1 [Bear]
Avg Vol: 68.5648
Avg Return: -0.1581%
Avg VIX: 29.57
Avg LogVIX: 3.3345
Occurrence: 292 days
Classification scores: Bull=0, Bear=4, Neutral=0

Regime 2 [Neutral]
Avg Vol: 21.5743
Avg Return: 0.1417%
Avg VIX: 19.98
Avg LogVIX: 2.9573
Occurrence: 709 days
Classification scores: Bull=3, Bear=0, Neutral=2

Predicted regimes for period 2025-01-13 to 2025-04-13
Data points: 63

Regime Distribution:
Regime 2 [Neutral]: 42.86%
Regime 1 [Bear]: 41.27%
Regime 0 [Bull]: 15.87%
Predicted regimes for period 2025-01-13 to 2025-04-13 using HPF model
Data points: 63

Regime Distribution:
Regime 1 [Bear]: 74.60%
Regime 2 [Neutral]: 14.29%
Regime 0 [Bull]: 11.11%



Regime Persistence Comparison:
Model           Regime Changes  Avg Days/Regime
---------------------------------------------
Standard HMM    9               7.00
HPF HMM         6               10.50

Regime Distribution Comparison:


Unnamed: 0_level_0,Standard HMM (%),HPF HMM (%)
Regime_Label,Unnamed: 1_level_1,Unnamed: 2_level_1
Bear,41.269841,74.603175
Bull,15.873016,11.111111
Neutral,42.857143,14.285714
