In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from hmmlearn import hmm
import yfinance as yf
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Hidden Markov Model Modifiable Parameters
hidden_states = 3
em_iterations = 75

# Fixed training period 
train_start_date = "2018-01-01"
train_end_date = "2024-12-31"

# Date parameters for yfinance API - download data from 1995 to present
start_date = "1995-01-01"
end_date = datetime.today().strftime("%Y-%m-%d")

print(f"Downloading market data from {start_date} to {end_date}...")
# Download SPY and VIX data using yfinance
df_spy = yf.download('SPY', start=start_date, end=end_date, auto_adjust=True)
df_vix = yf.download('^VIX', start=start_date, end=end_date)

# Fix the multi-level column structure
df_spy.columns = df_spy.columns.droplevel(1) if len(df_spy.columns.names) > 1 else df_spy.columns
df_vix.columns = df_vix.columns.droplevel(1) if len(df_vix.columns.names) > 1 else df_vix.columns

# Reset index to make Date a column
df_spy = df_spy.reset_index()
df_vix = df_vix.reset_index()

# Keep only the Date and Close columns from VIX
df_vix = df_vix[['Date', 'Close']].rename(columns={'Close': 'VIX'})

# Merge SPY and VIX data
df = pd.merge(df_spy, df_vix, on='Date', how='left')

# Forward fill any missing VIX values
df['VIX'] = df['VIX'].fillna(method='ffill')

# Add log of VIX
df['LogVIX'] = np.log(df['VIX'])

# Display first few rows to check the structure
print("DataFrame structure:")
print("Data shape:", df.shape)
print("Data columns:", df.columns.tolist())
df.head()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Downloading market data from 1995-01-01 to 2025-04-11...
DataFrame structure:
Data shape: (7620, 8)
Data columns: ['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'VIX', 'LogVIX']





Price,Date,Close,High,Low,Open,Volume,VIX,LogVIX
0,1995-01-03,26.815872,26.852481,26.760959,26.770111,324300,14.25,2.656757
1,1995-01-04,26.944002,26.944002,26.797567,26.93485,351800,13.53,2.604909
2,1995-01-05,26.944002,27.008067,26.916546,26.962307,89800,13.5,2.60269
3,1995-01-06,26.971464,27.090442,26.889095,26.998921,448400,13.13,2.5749
4,1995-01-09,26.998913,26.998913,26.944,26.962304,36800,13.33,2.590017


In [22]:
# Function to calculate technical indicators
def calculate_indicators(data):
    # Create a copy of the dataframe to avoid modifying original
    df_copy = data.copy()
    
    # Volatility is computed by obtaining variance between current close and prices of past 10 days
    volatility = []
    # MA is the 10 day SMA
    ma = []
    # Return is the single-day percentage return
    returns = []
    ma_sum = 0
    
    # Normalize LogVIX relative to its recent history (10-day window)
    log_vix_ratio = []
    
    # Warming up data for calculations
    for i in range(0, 10):
        volatility.append(0)
        ma.append(0)
        returns.append(0)
        log_vix_ratio.append(0)
        ma_sum += df_copy['Close'].iloc[i]
    
    # Filling in data for return, moving average, and volatility
    for ind in range(len(df_copy)):
        if ind >= 10:
            # SPY indicators
            tail_close = df_copy['Close'].iloc[ind-10]
            prev_close = df_copy['Close'].iloc[ind-1]
            head_close = df_copy['Close'].iloc[ind]
            ma_sum = (ma_sum - tail_close + head_close)
            ma_curr = ma_sum/10
            ma.append(ma_curr)
            returns.append(((head_close-prev_close)/prev_close)*100)
            
            # Computing Volatility
            vol_sum = 0
            for i in range(0, 10):
                curr_vol = abs(ma_curr - df_copy['Close'].iloc[ind-i])
                vol_sum += (curr_vol ** 2)
            volatility.append(vol_sum/10)
            
            # LogVIX-based indicator: current LogVIX relative to 10-day average
            log_vix_10day_avg = sum(df_copy['LogVIX'].iloc[ind-10:ind]) / 10
            log_vix_ratio.append(df_copy['LogVIX'].iloc[ind] / log_vix_10day_avg if log_vix_10day_avg > 0 else 1)
    
    # Adding columns to dataframe
    df_copy['Volatility'] = volatility
    df_copy['MA'] = ma
    df_copy['Return'] = returns
    df_copy['LogVIX_Ratio'] = log_vix_ratio
    
    return df_copy

# Calculate indicators for the entire dataset
df = calculate_indicators(df)
df.head(15)

Price,Date,Close,High,Low,Open,Volume,VIX,LogVIX,Volatility,MA,Return,LogVIX_Ratio
0,1995-01-03,26.815872,26.852481,26.760959,26.770111,324300,14.25,2.656757,0.0,0.0,0.0,0.0
1,1995-01-04,26.944002,26.944002,26.797567,26.93485,351800,13.53,2.604909,0.0,0.0,0.0,0.0
2,1995-01-05,26.944002,27.008067,26.916546,26.962307,89800,13.5,2.60269,0.0,0.0,0.0,0.0
3,1995-01-06,26.971464,27.090442,26.889095,26.998921,448400,13.13,2.5749,0.0,0.0,0.0,0.0
4,1995-01-09,26.998913,26.998913,26.944,26.962304,36800,13.33,2.590017,0.0,0.0,0.0,0.0
5,1995-01-10,27.026375,27.17281,27.026375,27.062984,229800,12.52,2.527327,0.0,0.0,0.0,0.0
6,1995-01-11,27.044676,27.117893,26.834176,27.117893,222400,12.15,2.497329,0.0,0.0,0.0,0.0
7,1995-01-12,27.053829,27.072134,26.962307,27.01722,40300,12.83,2.551786,0.0,0.0,0.0,0.0
8,1995-01-13,27.374159,27.374159,27.163659,27.200268,170600,11.1,2.406945,0.0,0.0,0.0,0.0
9,1995-01-16,27.538895,27.548047,27.365003,27.365003,105100,11.14,2.410542,0.0,0.0,0.0,0.0


In [23]:
# Function to train HMM model on specified period
def train_hmm_model(data, start_date, end_date, n_states=3, n_iter=75):
    # Filter data to training period
    training = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    print(f"Training model on data from {start_date} to {end_date}")
    print(f"Training data shape: {training.shape}")
    
    # Prepare observations for HMM (using Volatility, Return, and LogVIX_Ratio)
    obs = np.column_stack([
        training['Volatility'].values, 
        training['Return'].values,
        training['LogVIX_Ratio'].values
    ])
    
    # Create and train the HMM model
    model = hmm.GaussianHMM(n_components=n_states, covariance_type="full", n_iter=n_iter)
    model.fit(obs)
    
    # Get predictions for training data
    predictions = model.predict(obs)
    
    # Analyze regime characteristics
    regime_vol = [0] * n_states
    regime_ret = [0] * n_states
    regime_vix = [0] * n_states
    regime_logvix = [0] * n_states
    regime_count = [0] * n_states
    
    for i in range(len(predictions)):
        regime = predictions[i]
        regime_count[regime] += 1
        regime_vol[regime] += training['Volatility'].iloc[i]
        regime_ret[regime] += training['Return'].iloc[i]
        regime_vix[regime] += training['VIX'].iloc[i]
        regime_logvix[regime] += training['LogVIX'].iloc[i]
    
    # Calculate averages
    for i in range(n_states):
        if regime_count[i] > 0:  # Prevent division by zero
            regime_vol[i] = regime_vol[i] / regime_count[i]
            regime_ret[i] = regime_ret[i] / regime_count[i]
            regime_vix[i] = regime_vix[i] / regime_count[i]
            regime_logvix[i] = regime_logvix[i] / regime_count[i]
    
    # Print regime characteristics
    for i in range(n_states):
        print(f"Regime {i}")
        print(f"Avg Vol: {regime_vol[i]:.4f}")
        print(f"Avg Return: {regime_ret[i]:.4f}%")
        print(f"Avg VIX: {regime_vix[i]:.2f}")
        print(f"Avg LogVIX: {regime_logvix[i]:.4f}")
        print(f"Occurrence: {regime_count[i]} days\n")
    
    return model, training, predictions

# Train the model on the fixed period
model, training_data, train_predictions = train_hmm_model(df, train_start_date, train_end_date, 
                                                         hidden_states, em_iterations)

Training model on data from 2018-01-01 to 2024-12-31
Training data shape: (1761, 12)
Regime 0
Avg Vol: 4.7944
Avg Return: 0.0709%
Avg VIX: 16.02
Avg LogVIX: 2.7498
Occurrence: 696 days

Regime 1
Avg Vol: 80.5913
Avg Return: -0.1366%
Avg VIX: 28.65
Avg LogVIX: 3.2938
Occurrence: 307 days

Regime 2
Avg Vol: 21.8409
Avg Return: 0.1265%
Avg VIX: 19.85
Avg LogVIX: 2.9529
Occurrence: 758 days



In [24]:
# Visualize the training data with regime classifications using Plotly
training_with_predictions = training_data.copy()
training_with_predictions['Regime'] = train_predictions

# Create a categorical color map for regimes
regime_colors = px.colors.qualitative.Set2[:hidden_states]

# Create subplot figures for SPY, VIX, and LogVIX
fig = make_subplots(rows=3, cols=1, 
                   shared_xaxes=True,
                   vertical_spacing=0.1,
                   subplot_titles=('SPY Close Price by Regime', 'VIX by Regime', 'LogVIX by Regime'))

# SPY price by regime
for regime in range(hidden_states):
    regime_data = training_with_predictions[training_with_predictions['Regime'] == regime]
    fig.add_trace(
        go.Scatter(
            x=regime_data['Date'], 
            y=regime_data['Close'],
            mode='markers',
            marker=dict(color=regime_colors[regime], size=6),
            name=f'Regime {regime}',
            showlegend=True
        ),
        row=1, col=1
    )

# VIX by regime
for regime in range(hidden_states):
    regime_data = training_with_predictions[training_with_predictions['Regime'] == regime]
    fig.add_trace(
        go.Scatter(
            x=regime_data['Date'], 
            y=regime_data['VIX'],
            mode='markers',
            marker=dict(color=regime_colors[regime], size=6),
            name=f'Regime {regime}',
            showlegend=False
        ),
        row=2, col=1
    )

# LogVIX by regime
for regime in range(hidden_states):
    regime_data = training_with_predictions[training_with_predictions['Regime'] == regime]
    fig.add_trace(
        go.Scatter(
            x=regime_data['Date'], 
            y=regime_data['LogVIX'],
            mode='markers',
            marker=dict(color=regime_colors[regime], size=6),
            name=f'Regime {regime}',
            showlegend=False
        ),
        row=3, col=1
    )

fig.update_layout(
    height=1000,
    title_text=f'Market Regimes with SPY, VIX, and LogVIX (Training Period: {train_start_date} to {train_end_date})',
    template='plotly_white',
    legend_title='Regime',
    hovermode='closest'
)

fig.update_xaxes(title_text="Date", row=3, col=1)
fig.update_yaxes(title_text="SPY Price", row=1, col=1)
fig.update_yaxes(title_text="VIX", row=2, col=1)
fig.update_yaxes(title_text="LogVIX", row=3, col=1)

fig.show()

# Visualize transition probabilities with Plotly
transition_matrix = model.transmat_
regime_labels = [f"Regime {i}" for i in range(hidden_states)]

# Create the heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=transition_matrix,
    x=regime_labels,
    y=regime_labels,
    colorscale='Blues',
    text=np.round(transition_matrix, 2),
    texttemplate="%{text:.2f}",
    textfont={"size": 14}
))

fig_heatmap.update_layout(
    title='Regime Transition Probabilities',
    xaxis_title='To Regime',
    yaxis_title='From Regime',
    width=700,
    height=600,
    template='plotly_white'
)

fig_heatmap.show()

# Show stationary distribution
stationary_dist = model.get_stationary_distribution()
print("\nStationary Distribution:")
for i in range(hidden_states):
    print(f"Regime {i}: {stationary_dist[i]*100:.2f}%")

# Create a pie chart for the stationary distribution
fig_pie = px.pie(
    values=stationary_dist * 100,
    names=regime_labels,
    title='Stationary Distribution of Regimes',
    color_discrete_sequence=regime_colors
)

fig_pie.update_traces(textinfo='percent+label', textposition='inside')
fig_pie.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig_pie.show()


Stationary Distribution:
Regime 0: 38.51%
Regime 1: 17.81%
Regime 2: 43.68%


In [25]:
# Function to predict regimes for a specific date range
def predict_regimes(model, data, start_date, end_date):
    """
    Predict market regimes for a specific date range using the trained HMM model
    
    Parameters:
    -----------
    model : hmm.GaussianHMM
        The trained HMM model
    data : DataFrame
        The full dataset with calculated indicators
    start_date : str
        Start date for prediction period in 'YYYY-MM-DD' format
    end_date : str
        End date for prediction period in 'YYYY-MM-DD' format
        
    Returns:
    --------
    DataFrame with date, close price, VIX, LogVIX, and predicted regime
    """
    # Filter data for prediction period
    pred_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    if len(pred_data) == 0:
        print(f"No data available for period {start_date} to {end_date}")
        if end_date > data['Date'].max().strftime('%Y-%m-%d'):
            print("NOTE: Prediction period extends into the future")
            # Generate future dates for forecasting
            last_date = data['Date'].max()
            future_end = datetime.strptime(end_date, '%Y-%m-%d')
            
            # Get the most recent 30 days of data for calculating indicators
            recent_data = data.tail(30).copy()
            
            # For future dates, we'll extend from the last value and assume zero returns
            # This is a simplification - in reality you might want to use a forecasting model
            current_date = last_date + timedelta(days=1)
            while current_date <= future_end:
                if current_date.weekday() < 5:  # Only include weekdays
                    new_row = {
                        'Date': current_date,
                        'Close': recent_data['Close'].iloc[-1],  # Use the last known close price
                        'High': recent_data['Close'].iloc[-1],
                        'Low': recent_data['Close'].iloc[-1],
                        'Open': recent_data['Close'].iloc[-1],
                        'Volume': recent_data['Volume'].mean(),  # Use average volume
                        'VIX': recent_data['VIX'].mean(),  # Use average VIX
                        'LogVIX': np.log(recent_data['VIX'].mean()),  # Calculate LogVIX from mean VIX
                        'Volatility': recent_data['Volatility'].mean(),  # Use average volatility
                        'MA': recent_data['MA'].iloc[-1],  # Use the last MA
                        'Return': 0,  # Assume zero returns for future dates
                        'LogVIX_Ratio': 1.0  # Assume neutral LogVIX ratio
                    }
                    recent_data = pd.concat([recent_data, pd.DataFrame([new_row])], ignore_index=True)
                current_date += timedelta(days=1)
            
            # Keep only the future dates we generated
            future_data = recent_data[recent_data['Date'] > last_date].copy()
            pred_data = future_data[(future_data['Date'] >= start_date) & (future_data['Date'] <= end_date)].copy()
    
    # Prepare observations for prediction
    obs = np.column_stack([
        pred_data['Volatility'].values, 
        pred_data['Return'].values,
        pred_data['LogVIX_Ratio'].values
    ])
    
    # Predict regimes
    predictions = model.predict(obs)
    
    # Add predictions to dataframe
    pred_data['Predicted_Regime'] = predictions
    
    print(f"Predicted regimes for period {start_date} to {end_date}")
    print(f"Data points: {len(pred_data)}")
    
    # Calculate regime distribution
    regime_counts = pd.Series(predictions).value_counts(normalize=True) * 100
    print("\nRegime Distribution:")
    for regime, percentage in regime_counts.items():
        print(f"Regime {regime}: {percentage:.2f}%")
    
    return pred_data[['Date', 'Close', 'VIX', 'LogVIX', 'Volatility', 'Return', 'Predicted_Regime']]

In [26]:
# Unified function for regime prediction and visualization
def analyze_market_regimes(start_date, end_date, title=None):
    """
    Comprehensive function to predict and visualize market regimes for any date range
    
    Parameters:
    -----------
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format
    title : str, optional
        Custom title for the plots
    
    Returns:
    --------
    DataFrame with prediction results
    """
    # Get predictions
    results = predict_regimes(model, df, start_date, end_date)
    
    if results is None or results.empty:
        print("No data available for the specified period")
        return None
    
    # Set plot title
    if title is None:
        title = f'Market Regimes from {start_date} to {end_date}'
    
    # Create subplot figures for SPY, VIX, and LogVIX
    fig = make_subplots(rows=3, cols=1, 
                       shared_xaxes=True,
                       vertical_spacing=0.1,
                       subplot_titles=('SPY Close Price by Regime', 'VIX by Regime', 'LogVIX by Regime'))
    
    # SPY price by regime
    for regime in sorted(results['Predicted_Regime'].unique()):
        regime_data = results[results['Predicted_Regime'] == regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'], 
                y=regime_data['Close'],
                mode='markers',
                marker=dict(color=regime_colors[regime], size=6),
                name=f'Regime {regime}',
                showlegend=True
            ),
            row=1, col=1
        )
    
    # Add a line for the price trend
    fig.add_trace(
        go.Scatter(
            x=results['Date'],
            y=results['Close'],
            mode='lines',
            line=dict(color='rgba(0,0,0,0.3)'),
            name='SPY Price',
            showlegend=True
        ),
        row=1, col=1
    )
    
    # VIX by regime
    for regime in sorted(results['Predicted_Regime'].unique()):
        regime_data = results[results['Predicted_Regime'] == regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'], 
                y=regime_data['VIX'],
                mode='markers',
                marker=dict(color=regime_colors[regime], size=6),
                name=f'Regime {regime}',
                showlegend=False
            ),
            row=2, col=1
        )
    
    # Add a line for the VIX trend
    fig.add_trace(
        go.Scatter(
            x=results['Date'],
            y=results['VIX'],
            mode='lines',
            line=dict(color='rgba(0,0,0,0.3)'),
            name='VIX',
            showlegend=True
        ),
        row=2, col=1
    )
    
    # LogVIX by regime
    for regime in sorted(results['Predicted_Regime'].unique()):
        regime_data = results[results['Predicted_Regime'] == regime]
        fig.add_trace(
            go.Scatter(
                x=regime_data['Date'], 
                y=regime_data['LogVIX'],
                mode='markers',
                marker=dict(color=regime_colors[regime], size=6),
                name=f'Regime {regime}',
                showlegend=False
            ),
            row=3, col=1
        )
    
    # Add a line for the LogVIX trend
    fig.add_trace(
        go.Scatter(
            x=results['Date'],
            y=results['LogVIX'],
            mode='lines',
            line=dict(color='rgba(0,0,0,0.3)'),
            name='LogVIX',
            showlegend=True
        ),
        row=3, col=1
    )
    
    fig.update_layout(
        height=1000,
        title_text=title,
        template='plotly_white',
        legend_title='Regime',
        hovermode='closest'
    )
    
    fig.update_xaxes(title_text="Date", row=3, col=1)
    fig.update_yaxes(title_text="SPY Price", row=1, col=1)
    fig.update_yaxes(title_text="VIX", row=2, col=1)
    fig.update_yaxes(title_text="LogVIX", row=3, col=1)
    
    fig.show()
    
    # Distribution of regimes pie chart
    regime_percentages = results['Predicted_Regime'].value_counts(normalize=True) * 100
    labels = [f"Regime {i}" for i in sorted(regime_percentages.index)]
    
    fig_pie = px.pie(
        values=regime_percentages.values,
        names=labels,
        title=f'Percentage of Time in Each Regime ({start_date} to {end_date})',
        color_discrete_sequence=[regime_colors[i] for i in sorted(regime_percentages.index)]
    )
    
    fig_pie.update_traces(textinfo='percent+label', textposition='inside')
    fig_pie.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
    fig_pie.show()
    
    # Scatter plot of Volatility vs Return colored by regime
    fig_scatter = px.scatter(
        results,
        x='Volatility',
        y='Return',
        color='Predicted_Regime',
        color_discrete_sequence=regime_colors,
        size='LogVIX',  # Use LogVIX for point size
        size_max=15,
        title=f'Volatility vs Return by Regime ({start_date} to {end_date})',
        labels={'Volatility': 'Volatility', 'Return': 'Return (%)', 'LogVIX': 'Log(VIX)'},
        opacity=0.8
    )
    
    fig_scatter.update_layout(
        legend_title='Regime',
        hovermode='closest',
        template='plotly_white'
    )
    
    fig_scatter.update_xaxes(rangemode='tozero')
    fig_scatter.show()
    
    return results

# Example: Analyze the most recent 3 months
from datetime import datetime, timedelta

today = datetime.today()
three_months_ago = today - timedelta(days=90)

start_date = three_months_ago.strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

recent_results = analyze_market_regimes(
    start_date, 
    end_date, 
    title=f'Market Regimes in Recent 3 Months ({start_date} to {end_date})'
)

Predicted regimes for period 2025-01-11 to 2025-04-11
Data points: 62

Regime Distribution:
Regime 1: 45.16%
Regime 2: 43.55%
Regime 0: 11.29%
