In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from hmmlearn import hmm
import yfinance as yf
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Hidden Markov Model modifiable parameters
hidden_states = 3    # number of latent regimes passed to the HMM
em_iterations = 75   # iteration cap passed to the HMM fit

# Fixed training period
train_start_date = "2018-01-01"
train_end_date = "2024-07-31"

# Date parameters for the yfinance API - download data from 1995 to present
start_date = "1995-01-01"
end_date = datetime.today().strftime("%Y-%m-%d")

print(f"Downloading market data from {start_date} to {end_date}...")
# Download SPY and VIX data using yfinance. auto_adjust is passed explicitly
# for both tickers so the result does not depend on the library default.
df_spy = yf.download('SPY', start=start_date, end=end_date, auto_adjust=True)
df_vix = yf.download('^VIX', start=start_date, end=end_date, auto_adjust=True)

# A failed download can come back as an empty frame; stop here with a clear
# message instead of failing later inside the merge or indicator code.
if df_spy.empty or df_vix.empty:
    raise RuntimeError("Market data download returned no rows for SPY and/or ^VIX")

# Flatten the (field, ticker) column MultiIndex when present
if df_spy.columns.nlevels > 1:
    df_spy.columns = df_spy.columns.droplevel(1)
if df_vix.columns.nlevels > 1:
    df_vix.columns = df_vix.columns.droplevel(1)

# Reset index to make Date a column
df_spy = df_spy.reset_index()
df_vix = df_vix.reset_index()

# Keep only the Date and Close columns from VIX
df_vix = df_vix[['Date', 'Close']].rename(columns={'Close': 'VIX'})

# Merge SPY and VIX data (left join: every SPY session is kept)
df = pd.merge(df_spy, df_vix, on='Date', how='left')

# Forward fill any missing VIX values.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
df['VIX'] = df['VIX'].ffill()

# Add log of VIX
df['LogVIX'] = np.log(df['VIX'])

# Display first few rows to check the structure
print("DataFrame structure:")
print("Data shape:", df.shape)
print("Data columns:", df.columns.tolist())
df.head()

Downloading market data from 1995-01-01 to 2025-04-13...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

YF.download() has changed argument auto_adjust default to True
DataFrame structure:
Data shape: (7621, 8)
Data columns: ['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'VIX', 'LogVIX']





Price,Date,Close,High,Low,Open,Volume,VIX,LogVIX
0,1995-01-03,26.815863,26.852471,26.76095,26.770102,324300,14.25,2.656757
1,1995-01-04,26.944002,26.944002,26.797567,26.93485,351800,13.53,2.604909
2,1995-01-05,26.944002,27.008067,26.916546,26.962307,89800,13.5,2.60269
3,1995-01-06,26.971443,27.090421,26.889074,26.9989,448400,13.13,2.5749
4,1995-01-09,26.998919,26.998919,26.944005,26.96231,36800,13.33,2.590017


In [2]:
# Function to calculate technical indicators
def calculate_indicators(data, window=10):
    """
    Add rolling technical indicators to a copy of ``data``.

    Parameters:
    -----------
    data : DataFrame
        Must contain 'Close' and 'LogVIX' columns, one row per session in
        chronological order.
    window : int, optional
        Look-back length in rows (default 10, the original fixed value).

    Returns:
    --------
    DataFrame
        Copy of ``data`` with four added columns:
        - 'Volatility': population variance of the last ``window`` closes
          (current row included)
        - 'MA': simple moving average of the last ``window`` closes
          (current row included)
        - 'Return': one-row percentage change of 'Close'
        - 'LogVIX_Ratio': current LogVIX divided by the mean LogVIX of the
          previous ``window`` rows (current row excluded); 1 when that mean
          is not positive or not available
        The first ``window`` rows are warm-up rows and hold 0 in all four
        columns. Frames shorter than ``window`` are therefore all zeros
        instead of raising.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # Create a copy of the dataframe to avoid modifying original
    df_copy = data.copy()

    close = df_copy['Close'].astype(float)
    log_vix = df_copy['LogVIX'].astype(float)

    # SPY indicators over the trailing window that ends on the current row
    ma = close.rolling(window).mean()
    volatility = close.rolling(window).var(ddof=0)

    # Single-day percentage return
    prev_close = close.shift(1)
    returns = (close - prev_close) / prev_close * 100

    # LogVIX relative to the average of the *previous* `window` rows; the
    # shift keeps the current row out of its own baseline.
    trailing_log_vix = log_vix.shift(1).rolling(window).mean()
    log_vix_ratio = (log_vix / trailing_log_vix).where(trailing_log_vix > 0, 1.0)

    # Warm-up rows carry 0, matching the original output convention
    warmup = np.arange(len(df_copy)) < window

    # Adding columns to dataframe
    df_copy['Volatility'] = volatility.mask(warmup, 0.0)
    df_copy['MA'] = ma.mask(warmup, 0.0)
    df_copy['Return'] = returns.mask(warmup, 0.0)
    df_copy['LogVIX_Ratio'] = log_vix_ratio.mask(warmup, 0.0)

    return df_copy

# Enrich the merged SPY/VIX frame with the derived indicator columns,
# then preview the most recent rows
df = calculate_indicators(data=df)
df.tail()

Price,Date,Close,High,Low,Open,Volume,VIX,LogVIX,Volatility,MA,Return,LogVIX_Ratio
7616,2025-04-07,504.380005,523.169983,481.799988,489.190002,256611400,46.98,3.849722,599.447177,549.803006,-0.178118,1.238541
7617,2025-04-08,496.480011,524.97998,489.160004,521.859985,165816600,52.330002,3.95757,755.574688,541.905005,-1.566278,1.233988
7618,2025-04-09,548.619995,548.619995,493.049988,493.440002,241867300,33.619999,3.515121,684.886736,539.908002,10.50193,1.059188
7619,2025-04-10,524.580017,533.5,509.320007,532.169983,162331200,40.720001,3.706719,616.487105,535.658002,-4.3819,1.096872
7620,2025-04-11,533.940002,536.429993,520.070007,523.01001,97741700,37.560001,3.62594,572.0568,533.486005,1.784282,1.0488


In [3]:
# Function to train HMM model on specified period
def _score_regime(avg_return, avg_logvix):
    """Return (bull, bear, neutral) heuristic scores for one regime's average daily return (%) and average LogVIX."""
    # Bull regime: high returns, lower VIX
    bull = 0
    if avg_return > 0.05:
        bull += 2
    elif avg_return > 0:
        bull += 1
    if avg_logvix < 2.8:      # ln(16.5) ~= 2.8
        bull += 2
    elif avg_logvix < 3.0:    # ln(20) ~= 3.0
        bull += 1

    # Bear regime: negative returns, higher VIX
    bear = 0
    if avg_return < -0.1:
        bear += 2
    elif avg_return < 0:
        bear += 1
    if avg_logvix > 3.2:      # ln(25) ~= 3.2
        bear += 2
    elif avg_logvix > 3.0:
        bear += 1

    # Neutral regime: modest returns, moderate VIX
    neutral = 0
    if -0.05 < avg_return < 0.05:
        neutral += 2
    elif -0.1 < avg_return < 0.1:
        neutral += 1
    if 2.8 <= avg_logvix <= 3.2:
        neutral += 2
    elif 2.7 <= avg_logvix <= 3.3:
        neutral += 1

    return bull, bear, neutral


def _assign_regime_labels(bull_scores, bear_scores, neutral_scores):
    """Give each regime a unique label: Bull/Bear/Neutral first, then 'Regime <i>' once those are used up."""
    labels_to_assign = ["Bull", "Bear", "Neutral"]

    ranked = []
    for idx, (bull, bear, neutral) in enumerate(zip(bull_scores, bear_scores, neutral_scores)):
        if bull >= max(bear, neutral):
            preferred = "Bull"
        elif bear >= max(bull, neutral):
            preferred = "Bear"
        else:
            preferred = "Neutral"
        ranked.append((idx, max(bull, bear, neutral), preferred))

    # Regimes with the clearest characteristics (highest top score) choose
    # first; the sort is stable, so ties keep regime-index order.
    ranked.sort(key=lambda item: item[1], reverse=True)

    labels = [""] * len(ranked)
    assigned = set()
    for idx, _, preferred in ranked:
        if preferred in assigned:
            # Next label not yet taken. With more than three regimes there is
            # none left, so use an explicit placeholder rather than "".
            preferred = next(
                (label for label in labels_to_assign if label not in assigned),
                f"Regime {idx}",
            )
        labels[idx] = preferred
        assigned.add(preferred)
    return labels


def train_hmm_model(data, start_date, end_date, n_states=3, n_iter=75, random_state=42):
    """
    Fit a Gaussian HMM on the training window and label its hidden states.

    Parameters:
    -----------
    data : DataFrame
        Full dataset with 'Date', 'Volatility', 'Return', 'LogVIX_Ratio',
        'VIX' and 'LogVIX' columns
    start_date, end_date : str
        Inclusive bounds of the training window, compared against 'Date'
    n_states : int
        Number of hidden states
    n_iter : int
        Iteration cap passed to the HMM
    random_state : int or None
        Seed passed to the HMM so the fit, and therefore the regime numbering
        and labels, is repeatable between runs. Pass None for an unseeded fit.

    Returns:
    --------
    (model, training, predictions, regime_labels)
        The fitted model, the filtered training frame, the decoded state per
        training row, and one label per state index.

    Raises:
    -------
    ValueError
        If no rows fall inside the training window.
    """
    # Filter data to training period
    training = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()

    print(f"Training model on data from {start_date} to {end_date}")
    print(f"Training data shape: {training.shape}")

    if training.empty:
        raise ValueError(f"No training data between {start_date} and {end_date}")

    # Prepare observations for HMM (using Volatility, Return, and LogVIX_Ratio)
    obs = training[['Volatility', 'Return', 'LogVIX_Ratio']].to_numpy(dtype=float)

    # Create and train the HMM model
    model = hmm.GaussianHMM(
        n_components=n_states,
        covariance_type="full",
        n_iter=n_iter,
        random_state=random_state,
    )
    model.fit(obs)

    # Get predictions for training data
    predictions = model.predict(obs)

    # Per-regime averages; a state that never occurs keeps zeros
    regime_count = np.bincount(predictions, minlength=n_states).tolist()
    regime_means = (
        training[['Volatility', 'Return', 'VIX', 'LogVIX']]
        .groupby(predictions)
        .mean()
        .reindex(range(n_states), fill_value=0.0)
    )
    regime_vol = regime_means['Volatility'].tolist()
    regime_ret = regime_means['Return'].tolist()
    regime_vix = regime_means['VIX'].tolist()
    regime_logvix = regime_means['LogVIX'].tolist()

    # Score every regime, then hand out labels so each is used at most once
    bull_scores, bear_scores, neutral_scores = [], [], []
    for i in range(n_states):
        bull, bear, neutral = _score_regime(regime_ret[i], regime_logvix[i])
        bull_scores.append(bull)
        bear_scores.append(bear)
        neutral_scores.append(neutral)

    regime_labels = _assign_regime_labels(bull_scores, bear_scores, neutral_scores)

    # Print regime characteristics with labels
    for i in range(n_states):
        print(f"Regime {i} [{regime_labels[i]}]")
        print(f"Avg Vol: {regime_vol[i]:.4f}")
        print(f"Avg Return: {regime_ret[i]:.4f}%")
        print(f"Avg VIX: {regime_vix[i]:.2f}")
        print(f"Avg LogVIX: {regime_logvix[i]:.4f}")
        print(f"Occurrence: {regime_count[i]} days")
        print(f"Classification scores: Bull={bull_scores[i]}, Bear={bear_scores[i]}, Neutral={neutral_scores[i]}\n")

    return model, training, predictions, regime_labels

# Fit the HMM on the fixed training window configured in the first cell
model, training_data, train_predictions, regime_labels = train_hmm_model(
    data=df,
    start_date=train_start_date,
    end_date=train_end_date,
    n_states=hidden_states,
    n_iter=em_iterations,
)

Training model on data from 2018-01-01 to 2024-07-31
Training data shape: (1655, 12)
Regime 0 [Bull]
Avg Vol: 4.7859
Avg Return: 0.0714%
Avg VIX: 15.96
Avg LogVIX: 2.7458
Occurrence: 674 days
Classification scores: Bull=4, Bear=0, Neutral=2

Regime 1 [Bear]
Avg Vol: 82.0140
Avg Return: -0.1289%
Avg VIX: 30.35
Avg LogVIX: 3.3577
Occurrence: 253 days
Classification scores: Bull=0, Bear=4, Neutral=0

Regime 2 [Neutral]
Avg Vol: 21.9245
Avg Return: 0.1106%
Avg VIX: 20.14
Avg LogVIX: 2.9664
Occurrence: 728 days
Classification scores: Bull=3, Bear=0, Neutral=2



In [4]:
# Visualize the training data with regime classifications using Plotly
training_with_predictions = training_data.copy()
training_with_predictions['Regime'] = train_predictions
training_with_predictions['Regime_Label'] = [regime_labels[r] for r in train_predictions]

# One fixed colour per regime index. The palette is cycled so that asking for
# more states than it has entries cannot raise IndexError.
_regime_palette = px.colors.qualitative.Set2
regime_colors = [_regime_palette[i % len(_regime_palette)] for i in range(hidden_states)]

# Create subplot figures for SPY, VIX, and LogVIX
fig = make_subplots(rows=3, cols=1,
                   shared_xaxes=True,
                   vertical_spacing=0.1,
                   subplot_titles=('SPY Close Price by Regime', 'VIX by Regime', 'LogVIX by Regime'))

# (column to plot, y-axis title) for subplot rows 1..3
regime_panels = [('Close', 'SPY Price'), ('VIX', 'VIX'), ('LogVIX', 'LogVIX')]

for panel_row, (panel_column, panel_axis_title) in enumerate(regime_panels, start=1):
    for regime in range(hidden_states):
        regime_data = training_with_predictions[training_with_predictions['Regime'] == regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'],
                y=regime_data[panel_column],
                mode='markers',
                marker=dict(color=regime_colors[regime], size=6),
                name=f'Regime {regime}: {regime_labels[regime]}',
                # Only the first panel feeds the legend, so each regime is listed once
                showlegend=(panel_row == 1)
            ),
            row=panel_row, col=1
        )
    fig.update_yaxes(title_text=panel_axis_title, row=panel_row, col=1)

fig.update_layout(
    height=1000,
    title_text=f'Market Regimes with SPY, VIX, and LogVIX (Training Period: {train_start_date} to {train_end_date})',
    template='plotly_white',
    legend_title='Regime',
    hovermode='closest'
)

fig.update_xaxes(title_text="Date", row=3, col=1)

fig.show()

# Heatmap of the fitted state-transition matrix (model.transmat_)
transition_matrix = model.transmat_
regime_labels_with_numbers = [
    f"Regime {state_idx}: {regime_labels[state_idx]}"
    for state_idx in range(hidden_states)
]

# Build the trace first, then wrap it in a figure
transition_heatmap = go.Heatmap(
    z=transition_matrix,
    x=regime_labels_with_numbers,
    y=regime_labels_with_numbers,
    colorscale='Blues',
    text=np.round(transition_matrix, 2),   # values printed inside each cell
    texttemplate="%{text:.2f}",
    textfont={"size": 14},
)
fig_heatmap = go.Figure(data=transition_heatmap)

fig_heatmap.update_layout(
    title='Regime Transition Probabilities',
    xaxis_title='To Regime',
    yaxis_title='From Regime',
    width=700,
    height=600,
    template='plotly_white',
)

fig_heatmap.show()

# Stationary distribution of the fitted model, printed and charted
stationary_dist = model.get_stationary_distribution()
print("\nStationary Distribution:")
for state_idx in range(hidden_states):
    state_label = regime_labels[state_idx]
    state_share = stationary_dist[state_idx] * 100
    print(f"Regime {state_idx} [{state_label}]: {state_share:.2f}%")

# Pie chart of the same shares, one slice per regime
fig_pie = px.pie(
    names=regime_labels_with_numbers,
    values=stationary_dist * 100,
    color_discrete_sequence=regime_colors,
    title='Stationary Distribution of Regimes',
)

fig_pie.update_traces(textposition='inside', textinfo='percent+label')
fig_pie.update_layout(uniformtext_mode='hide', uniformtext_minsize=12)
fig_pie.show()

# Create a scatter plot of Return vs LogVIX colored by regime.
# An explicit label -> colour map ties each label to the colour of its regime
# index. A bare colour sequence would be assigned in order of label
# appearance and could disagree with the regime-indexed charts.
regime_label_colors = {
    regime_labels[regime]: regime_colors[regime] for regime in range(hidden_states)
}

fig_scatter = px.scatter(
    training_with_predictions,
    x='LogVIX',
    y='Return',
    color='Regime_Label',
    color_discrete_map=regime_label_colors,
    labels={'LogVIX': 'Log(VIX)', 'Return': 'Return (%)'},
    title='Return vs LogVIX by Market Regime',
    opacity=0.7,
    hover_data=['Date', 'Close', 'VIX']
)

fig_scatter.update_layout(
    height=600,
    width=800,
    template='plotly_white',
    legend_title='Market Regime'
)

fig_scatter.show()


Stationary Distribution:
Regime 0 [Bull]: 39.88%
Regime 1 [Bear]: 15.61%
Regime 2 [Neutral]: 44.51%


In [5]:
# Function to predict regimes for a specific date range
def _build_flat_future_rows(data, start_date, end_date, lookback=30):
    """Synthesize one flat placeholder row per weekday after the last date in ``data``, limited to [start_date, end_date]."""
    recent = data.tail(lookback)
    last_date = data['Date'].max()
    last_close = recent['Close'].iloc[-1]
    mean_vix = recent['VIX'].mean()

    # Weekdays only; exchange holidays are not excluded
    future_days = pd.bdate_range(start=last_date + timedelta(days=1), end=end_date)

    # Every placeholder row is identical apart from its date: price frozen at
    # the last close, VIX/volume/volatility at their recent means, zero return
    # and a neutral LogVIX ratio. This is a simplification, not a forecast.
    future = pd.DataFrame({'Date': future_days}).assign(
        Close=last_close,
        High=last_close,
        Low=last_close,
        Open=last_close,
        Volume=recent['Volume'].mean(),
        VIX=mean_vix,
        LogVIX=np.log(mean_vix),
        Volatility=recent['Volatility'].mean(),
        MA=recent['MA'].iloc[-1],
        Return=0.0,
        LogVIX_Ratio=1.0,
    )
    in_window = (future['Date'] >= start_date) & (future['Date'] <= end_date)
    return future[in_window].copy()


def predict_regimes(model, data, start_date, end_date, regime_labels):
    """
    Predict market regimes for a specific date range using the trained HMM model

    Parameters:
    -----------
    model : hmm.GaussianHMM
        The trained HMM model (anything exposing ``predict(obs)``)
    data : DataFrame
        The full dataset with calculated indicators
    start_date : str
        Start date for prediction period in 'YYYY-MM-DD' format
    end_date : str
        End date for prediction period in 'YYYY-MM-DD' format
    regime_labels : list
        Labels for each regime (e.g., ["Bull", "Bear", "Neutral"])

    Returns:
    --------
    DataFrame with date, close price, VIX, LogVIX, volatility, return and
    predicted regime. When the period contains no rows and none can be
    synthesized, an empty DataFrame with the same columns is returned and the
    model is not called.

    Notes:
    ------
    If the period holds no historical rows but ends after the last date in
    ``data``, flat placeholder rows are generated for the future weekdays
    (see ``_build_flat_future_rows``) and regimes are predicted for those.
    """
    output_columns = ['Date', 'Close', 'VIX', 'LogVIX', 'Volatility', 'Return',
                      'Predicted_Regime', 'Regime_Label']

    # Filter data for prediction period
    pred_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()

    if pred_data.empty:
        print(f"No data available for period {start_date} to {end_date}")
        future_end = pd.Timestamp(end_date)
        if len(data) > 0 and future_end > data['Date'].max():
            print("NOTE: Prediction period extends into the future")
            pred_data = _build_flat_future_rows(data, start_date, future_end)

    # Still nothing to score (purely historical gap, or a future window with
    # no weekdays): return an empty result instead of passing an empty array
    # to the model. Callers already check for an empty result.
    if pred_data.empty:
        return pd.DataFrame(columns=output_columns)

    # Prepare observations for prediction (same feature order as training)
    obs = pred_data[['Volatility', 'Return', 'LogVIX_Ratio']].to_numpy(dtype=float)

    # Predict regimes
    predictions = model.predict(obs)

    # Add predictions to dataframe
    pred_data['Predicted_Regime'] = predictions
    pred_data['Regime_Label'] = [regime_labels[r] for r in predictions]

    print(f"Predicted regimes for period {start_date} to {end_date}")
    print(f"Data points: {len(pred_data)}")

    # Calculate regime distribution
    regime_counts = pd.Series(predictions).value_counts(normalize=True) * 100
    print("\nRegime Distribution:")
    for regime, percentage in regime_counts.items():
        print(f"Regime {regime} [{regime_labels[regime]}]: {percentage:.2f}%")

    return pred_data[output_columns]

In [6]:
# Unified function for regime prediction and visualization
def analyze_market_regimes(start_date, end_date, title=None):
    """
    Comprehensive function to predict and visualize market regimes for any date range

    Uses the notebook-level ``model``, ``df``, ``regime_labels`` and
    ``regime_colors``. Shows a three-panel regime chart (SPY close, VIX,
    LogVIX), a pie chart of time spent per regime and a LogVIX-vs-return
    scatter, then displays a per-regime summary table.

    Parameters:
    -----------
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format
    title : str, optional
        Custom title for the plots

    Returns:
    --------
    DataFrame with prediction results, or None when the period has no data
    """
    # Get predictions
    results = predict_regimes(model, df, start_date, end_date, regime_labels)

    if results is None or results.empty:
        print("No data available for the specified period")
        return None

    # Set plot title
    if title is None:
        title = f'Market Regimes from {start_date} to {end_date}'

    # Colours are keyed by regime index / label so a regime keeps its colour
    # even when another regime is absent from this period. Indexing by
    # enumeration position would shift the colours in that case.
    present_regimes = sorted(results['Predicted_Regime'].unique())
    color_by_label = dict(zip(regime_labels, regime_colors))
    color_by_name = {
        f'Regime {regime}: {label}': regime_colors[regime]
        for regime, label in enumerate(regime_labels)
    }

    # Create subplot figures for SPY, VIX, and LogVIX
    fig = make_subplots(rows=3, cols=1,
                       shared_xaxes=True,
                       vertical_spacing=0.1,
                       subplot_titles=('SPY Close Price by Regime', 'VIX by Regime', 'LogVIX by Regime'))

    # (column, trend-line legend name, y-axis title) for subplot rows 1..3
    panels = [
        ('Close', 'SPY Price', 'SPY Price'),
        ('VIX', 'VIX', 'VIX'),
        ('LogVIX', 'LogVIX', 'LogVIX'),
    ]

    for row, (column, line_name, axis_title) in enumerate(panels, start=1):
        # Regime-coloured markers
        for regime in present_regimes:
            regime_data = results[results['Predicted_Regime'] == regime]
            fig.add_trace(
                go.Scatter(
                    x=regime_data['Date'],
                    y=regime_data[column],
                    mode='markers',
                    marker=dict(color=regime_colors[regime], size=6),
                    name=f'Regime {regime}: {regime_labels[regime]}',
                    # Only the first panel feeds the legend, so each regime is listed once
                    showlegend=(row == 1)
                ),
                row=row, col=1
            )

        # Faint line through all points to show the trend
        fig.add_trace(
            go.Scatter(
                x=results['Date'],
                y=results[column],
                mode='lines',
                line=dict(color='rgba(0,0,0,0.3)'),
                name=line_name,
                showlegend=True
            ),
            row=row, col=1
        )
        fig.update_yaxes(title_text=axis_title, row=row, col=1)

    fig.update_layout(
        height=1000,
        title_text=title,
        template='plotly_white',
        legend_title='Regime',
        hovermode='closest'
    )

    fig.update_xaxes(title_text="Date", row=3, col=1)

    fig.show()

    # Distribution of regimes pie chart (only regimes that actually occur)
    regime_df = pd.DataFrame([
        {
            'Regime': f'Regime {regime}: {regime_labels[regime]}',
            'Percentage': (results['Predicted_Regime'] == regime).mean() * 100
        }
        for regime in present_regimes
    ])

    fig_pie = px.pie(
        regime_df,
        values='Percentage',
        names='Regime',
        color='Regime',
        color_discrete_map=color_by_name,
        title=f'Percentage of Time in Each Regime ({start_date} to {end_date})'
    )

    fig_pie.update_traces(textinfo='percent+label', textposition='inside')
    fig_pie.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
    fig_pie.show()

    # Scatter plot of LogVIX vs Return colored by regime with labels
    fig_scatter = px.scatter(
        results,
        x='LogVIX',
        y='Return',
        color='Regime_Label',
        color_discrete_map=color_by_label,
        title=f'LogVIX vs Return by Market Regime ({start_date} to {end_date})',
        labels={'LogVIX': 'Log(VIX)', 'Return': 'Return (%)', 'Regime_Label': 'Market Regime'},
        opacity=0.8,
        hover_data=['Date', 'Close', 'VIX']
    )

    fig_scatter.update_layout(
        legend_title='Market Regime',
        hovermode='closest',
        template='plotly_white',
        height=600,
        width=900
    )

    fig_scatter.update_xaxes(rangemode='tozero')
    fig_scatter.show()

    # Add a summary of time spent in each regime
    regime_summary = results.groupby('Regime_Label').agg({
        'Close': ['mean', 'min', 'max', 'std'],
        'VIX': ['mean', 'min', 'max', 'std'],
        'Return': ['mean', 'min', 'max', 'std'],
        'Predicted_Regime': 'count'
    }).reset_index()

    # Flatten the (column, statistic) MultiIndex; the order matches the agg spec above
    regime_summary.columns = ['Regime_Label', 'Avg_Price', 'Min_Price', 'Max_Price', 'Std_Price',
                             'Avg_VIX', 'Min_VIX', 'Max_VIX', 'Std_VIX',
                             'Avg_Return', 'Min_Return', 'Max_Return', 'Std_Return', 'Days']

    regime_summary['Percentage'] = regime_summary['Days'] / len(results) * 100

    print("\nRegime Summary Statistics:")
    display(regime_summary[['Regime_Label', 'Days', 'Percentage', 'Avg_Return', 'Avg_VIX', 'Avg_Price']])

    return results

# Example: analyze the trailing window that ends today.
# The window is 360 calendar days (about one year). The title is derived from
# this constant so the label cannot drift away from the window analyzed.
from datetime import datetime, timedelta

recent_lookback_days = 360

today = datetime.today()
recent_window_start = today - timedelta(days=recent_lookback_days)

# A dedicated name is used for the window start so the download-range
# `start_date` from the configuration cell is not overwritten.
recent_start_date = recent_window_start.strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

recent_results = analyze_market_regimes(
    recent_start_date,
    end_date,
    title=f'Market Regimes in the Last {recent_lookback_days} Days ({recent_start_date} to {end_date})'
)

Predicted regimes for period 2024-04-18 to 2025-04-13
Data points: 247

Regime Distribution:
Regime 2 [Neutral]: 51.82%
Regime 1 [Bear]: 30.77%
Regime 0 [Bull]: 17.41%



Regime Summary Statistics:


Unnamed: 0,Regime_Label,Days,Percentage,Avg_Return,Avg_VIX,Avg_Price
0,Bear,76,30.769231,-0.156073,21.392895,558.942797
1,Bull,43,17.408907,0.112358,16.010465,565.813948
2,Neutral,128,51.821862,0.129864,15.791172,559.836613


In [7]:
# Visualize market regime predictions for specific historical periods of interest

# Function to plot regime histogram for a specific time period
def plot_regime_distribution(start_date, end_date, title=None):
    """
    Plot distribution of regimes for a specific time period

    Uses the notebook-level ``model``, ``df``, ``regime_labels`` and
    ``regime_colors``.

    Parameters:
    -----------
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format
    title : str, optional
        Custom chart title

    Returns:
    --------
    DataFrame of per-regime mean/std for Close, Return, VIX and LogVIX plus
    the day count, or None when the period has no data
    """
    results = predict_regimes(model, df, start_date, end_date, regime_labels)

    if results is None or results.empty:
        print(f"No data available for period {start_date} to {end_date}")
        return

    # Count by regime label
    counts = results.groupby('Regime_Label')['Date'].count()
    percentages = counts / len(results) * 100

    # Create a dataframe for plotting
    plot_data = pd.DataFrame({
        'Regime': counts.index,
        'Count': counts.values,
        'Percentage': percentages.values
    })

    # groupby returns labels in alphabetical order, so a positional colour
    # sequence would give e.g. "Bear" the colour used for regime 0 elsewhere.
    # Map label -> colour explicitly to keep colours consistent across charts.
    color_by_label = dict(zip(regime_labels, regime_colors))

    # Plot
    fig = px.bar(
        plot_data,
        x='Regime',
        y='Percentage',
        color='Regime',
        color_discrete_map=color_by_label,
        text='Count',
        title=title or f'Regime Distribution ({start_date} to {end_date})'
    )

    fig.update_layout(
        height=500,
        width=800,
        template='plotly_white',
        showlegend=False,
        yaxis_title='Percentage of Days (%)',
        xaxis_title='Market Regime'
    )

    fig.update_traces(textposition='outside')
    fig.show()

    # Return summary statistics
    return results.groupby('Regime_Label').agg({
        'Close': ['mean', 'std'],
        'Return': ['mean', 'std'],
        'VIX': ['mean', 'std'],
        'LogVIX': ['mean', 'std'],
        'Predicted_Regime': 'count'
    })

# 1. COVID-19 Market Crash (February-April 2020)
print("====== COVID-19 Market Crash (February-April 2020) ======")
covid_crash = plot_regime_distribution("2020-02-01", "2020-04-30",
                                      "Market Regimes During COVID-19 Crash (Feb-Apr 2020)")

# 2. Post-COVID Recovery (May-December 2020)
print("\n====== Post-COVID Recovery (May-December 2020) ======")
covid_recovery = plot_regime_distribution("2020-05-01", "2020-12-31",
                                         "Market Regimes During Post-COVID Recovery (May-Dec 2020)")

# 3. 2022 Bear Market
print("\n====== 2022 Bear Market ======")
bear_2022 = plot_regime_distribution("2022-01-01", "2022-12-31",
                                    "Market Regimes During 2022 Bear Market")

# 4. 2023 Recovery
print("\n====== 2023 Recovery ======")
recovery_2023 = plot_regime_distribution("2023-01-01", "2023-12-31",
                                        "Market Regimes During 2023 Recovery")

# 5. 2024 to present.
# This window runs from the start of 2024 through `end_date`, so it extends
# past calendar year 2024 whenever the notebook is run later. The heading and
# title describe the actual span instead of calling it "2024".
latest_period_start = "2024-01-01"
print(f"\n====== {latest_period_start} to Present ======")
current_year = plot_regime_distribution(latest_period_start, end_date,
                                       f"Market Regimes Since {latest_period_start} (Through {end_date})")

Predicted regimes for period 2020-02-01 to 2020-04-30
Data points: 62

Regime Distribution:
Regime 1 [Bear]: 62.90%
Regime 2 [Neutral]: 29.03%
Regime 0 [Bull]: 8.06%



Predicted regimes for period 2020-05-01 to 2020-12-31
Data points: 170

Regime Distribution:
Regime 2 [Neutral]: 53.53%
Regime 0 [Bull]: 31.76%
Regime 1 [Bear]: 14.71%



Predicted regimes for period 2022-01-01 to 2022-12-31
Data points: 251

Regime Distribution:
Regime 1 [Bear]: 52.59%
Regime 2 [Neutral]: 45.02%
Regime 0 [Bull]: 2.39%



Predicted regimes for period 2023-01-01 to 2023-12-31
Data points: 250

Regime Distribution:
Regime 2 [Neutral]: 68.80%
Regime 0 [Bull]: 25.20%
Regime 1 [Bear]: 6.00%



Predicted regimes for period 2024-01-01 to 2025-04-13
Data points: 321

Regime Distribution:
Regime 2 [Neutral]: 54.83%
Regime 1 [Bear]: 23.99%
Regime 0 [Bull]: 21.18%


In [8]:
# Function to predict future market regimes
def predict_future_market_regimes(days=30):
    """
    Predict market regimes for the coming days

    Uses the notebook-level ``model``, ``df``, ``regime_labels`` and
    ``regime_colors``. The window runs from today to today + ``days``; the
    rows scored are whatever ``predict_regimes`` returns for that window.

    Parameters:
    -----------
    days : int
        Number of days to predict into the future

    Returns:
    --------
    DataFrame with prediction results (plus 'Day', 'Month' and 'Weekday'
    helper columns), or None when no predictions could be generated.
    A bar chart and a calendar-style scatter are shown as side effects.
    """
    today = datetime.today()
    future_date = today + timedelta(days=days)

    # Format dates
    start_date = today.strftime('%Y-%m-%d')
    end_date = future_date.strftime('%Y-%m-%d')

    print(f"Predicting market regimes from {start_date} to {end_date}")

    # Get predictions
    future_results = predict_regimes(model, df, start_date, end_date, regime_labels)

    if future_results is None or future_results.empty:
        print("Failed to generate predictions")
        return None

    # Work on an owned copy: predict_regimes returns a column selection, and
    # adding columns to it directly triggers pandas' SettingWithCopy path.
    future_results = future_results.copy()

    # Label -> colour map so each regime keeps the colour of its index even
    # when some regimes are missing from the prediction
    color_by_label = dict(zip(regime_labels, regime_colors))

    # Create regime breakdown barplot
    regime_counts = future_results.groupby('Regime_Label')['Predicted_Regime'].count().reset_index()
    regime_counts['Percentage'] = regime_counts['Predicted_Regime'] / len(future_results) * 100

    fig_bar = px.bar(
        regime_counts,
        x='Regime_Label',
        y='Percentage',
        color='Regime_Label',
        text='Predicted_Regime',
        color_discrete_map=color_by_label,
        title=f'Predicted Market Regime Distribution (Next {days} Days)'
    )

    fig_bar.update_layout(
        height=500,
        width=800,
        template='plotly_white',
        showlegend=False,
        xaxis_title='Market Regime',
        yaxis_title='Percentage of Days (%)'
    )

    fig_bar.update_traces(textposition='outside')
    fig_bar.show()

    # Create a calendar view of predicted regimes
    future_results['Day'] = future_results['Date'].dt.day
    future_results['Month'] = future_results['Date'].dt.month_name()
    future_results['Weekday'] = future_results['Date'].dt.day_name()

    # Calendar-style scatter. The x axis is day-of-month, so days from
    # different months can share a position; Month is included in the hover
    # data to tell them apart.
    fig_cal = px.scatter(
        future_results,
        x='Day',
        y='Weekday',
        color='Regime_Label',
        color_discrete_map=color_by_label,
        title=f'Calendar View of Predicted Regimes ({start_date} to {end_date})',
        hover_data=['Date', 'Month', 'Close', 'VIX', 'LogVIX'],
        size_max=15,
        size=[10] * len(future_results)  # Fixed size for all points
    )

    fig_cal.update_layout(
        xaxis_title='Day of Month',
        yaxis_title='Weekday',
        yaxis=dict(
            categoryorder='array',
            categoryarray=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
        ),
        template='plotly_white',
        height=600,
        width=900
    )

    fig_cal.show()

    # Calculate the dominant regime (looked up once)
    dominant_row = regime_counts.iloc[regime_counts['Predicted_Regime'].argmax()]
    dominant_regime = dominant_row['Regime_Label']
    dominant_percentage = dominant_row['Percentage']

    print("\nMarket Outlook Summary:")
    print(f"Dominant predicted regime: {dominant_regime} ({dominant_percentage:.2f}% of days)")

    # More detailed breakdown
    print("\nDetailed Regime Breakdown:")
    for _, row in regime_counts.iterrows():
        print(f"{row['Regime_Label']}: {row['Predicted_Regime']} days ({row['Percentage']:.2f}%)")

    return future_results

# Run the forward-looking regime prediction over a 30-day horizon
future_predictions = predict_future_market_regimes(days=30)

Predicting market regimes from 2025-04-13 to 2025-05-13
No data available for period 2025-04-13 to 2025-05-13
NOTE: Prediction period extends into the future
Predicted regimes for period 2025-04-13 to 2025-05-13
Data points: 22

Regime Distribution:
Regime 1 [Bear]: 95.45%
Regime 2 [Neutral]: 4.55%



Market Outlook Summary:
Dominant predicted regime: Bear (95.45% of days)

Detailed Regime Breakdown:
Bear: 21 days (95.45%)
Neutral: 1 days (4.55%)
