In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from hmmlearn import hmm
import yfinance as yf
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Hidden Markov Model Modifiable Parameters
hidden_states = 3
em_iterations = 75 #75

# Fixed training period (2007-2017)
train_start_date = "2018-01-01"
train_end_date = "2024-12-31"

# Date parameters for yfinance API - download data from 1995 to present
start_date = "1995-01-01"
end_date = datetime.today().strftime("%Y-%m-%d")

print(f"Downloading SPY data from {start_date} to {end_date}...")
# Download data using yfinance
df = yf.download('SPY', start=start_date, end=end_date, auto_adjust=True)

# Fix the multi-level column structure
df.columns = df.columns.droplevel(1) if len(df.columns.names) > 1 else df.columns

# Reset index to make Date a column
df = df.reset_index()

# Display first few rows to check the structure
print("DataFrame structure:")
print("Data shape:", df.shape)
print("Data columns:", df.columns.tolist())
df.head()

[*********************100%***********************]  1 of 1 completed

Downloading SPY data from 1995-01-01 to 2025-04-11...
DataFrame structure:
Data shape: (7620, 6)
Data columns: ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']





Price,Date,Close,High,Low,Open,Volume
0,1995-01-03,26.815863,26.852471,26.76095,26.770102,324300
1,1995-01-04,26.944004,26.944004,26.797569,26.934852,351800
2,1995-01-05,26.944004,27.008069,26.916548,26.962308,89800
3,1995-01-06,26.971464,27.090442,26.889095,26.998921,448400
4,1995-01-09,26.99892,26.99892,26.944007,26.962312,36800


In [112]:
# Function to calculate technical indicators
def calculate_indicators(data):
    # Create a copy of the dataframe to avoid modifying original
    df_copy = data.copy()
    
    # Volatility is computed by obtaining variance between current close and prices of past 10 days
    volatility = []
    # MA is the 10 day SMA
    ma = []
    # Return is the single-day percentage return
    returns = []
    ma_sum = 0
    
    # Warming up data for moving average and volatility calculations
    for i in range(0, 10):
        volatility.append(0)
        ma.append(0)
        returns.append(0)
        ma_sum += df_copy['Close'].iloc[i]
    
    # Filling in data for return, moving average, and volatility
    for ind in range(len(df_copy)):
        if ind >= 10:
            tail_close = df_copy['Close'].iloc[ind-10]
            prev_close = df_copy['Close'].iloc[ind-1]
            head_close = df_copy['Close'].iloc[ind]
            ma_sum = (ma_sum - tail_close + head_close)
            ma_curr = ma_sum/10
            ma.append(ma_curr)
            returns.append(((head_close-prev_close)/prev_close)*100)
            # Computing Volatility
            vol_sum = 0
            for i in range(0, 10):
                curr_vol = abs(ma_curr - df_copy['Close'].iloc[ind-i])
                vol_sum += (curr_vol ** 2)
            volatility.append(vol_sum/10)
    
    # Adding columns to dataframe
    df_copy['Volatility'] = volatility
    df_copy['MA'] = ma
    df_copy['Return'] = returns
    
    return df_copy

# Calculate indicators for the entire dataset
df = calculate_indicators(df)
df.head(15)

Price,Date,Close,High,Low,Open,Volume,Volatility,MA,Return
0,1995-01-03,26.815863,26.852471,26.76095,26.770102,324300,0.0,0.0,0.0
1,1995-01-04,26.944004,26.944004,26.797569,26.934852,351800,0.0,0.0,0.0
2,1995-01-05,26.944004,27.008069,26.916548,26.962308,89800,0.0,0.0,0.0
3,1995-01-06,26.971464,27.090442,26.889095,26.998921,448400,0.0,0.0,0.0
4,1995-01-09,26.99892,26.99892,26.944007,26.962312,36800,0.0,0.0,0.0
5,1995-01-10,27.026375,27.17281,27.026375,27.062984,229800,0.0,0.0,0.0
6,1995-01-11,27.044676,27.117893,26.834176,27.117893,222400,0.0,0.0,0.0
7,1995-01-12,27.053829,27.072134,26.962307,27.01722,40300,0.0,0.0,0.0
8,1995-01-13,27.374155,27.374155,27.163655,27.200264,170600,0.0,0.0,0.0
9,1995-01-16,27.538897,27.548049,27.365005,27.365005,105100,0.0,0.0,0.0


In [113]:
# Function to train HMM model on specified period
def train_hmm_model(data, start_date, end_date, n_states=3, n_iter=75):
    # Filter data to training period
    training = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    print(f"Training model on data from {start_date} to {end_date}")
    print(f"Training data shape: {training.shape}")
    
    # Prepare observations for HMM (using Volatility and Return)
    obs = np.column_stack([training['Volatility'].values, training['Return'].values])
    
    # Create and train the HMM model
    model = hmm.GaussianHMM(n_components=n_states, covariance_type="full", n_iter=n_iter)
    model.fit(obs)
    
    # Get predictions for training data
    predictions = model.predict(obs)
    
    # Analyze regime characteristics
    regime_vol = [0] * n_states
    regime_ret = [0] * n_states
    regime_count = [0] * n_states
    return_map = {i: [] for i in range(n_states)}
    
    for i in range(len(predictions)):
        regime = predictions[i]
        regime_count[regime] += 1
        regime_vol[regime] += training['Volatility'].iloc[i]
        regime_ret[regime] += training['Return'].iloc[i]
        return_map[regime].append(training['Return'].iloc[i])
    
    # Calculate averages
    for i in range(n_states):
        if regime_count[i] > 0:  # Prevent division by zero
            regime_vol[i] = regime_vol[i] / regime_count[i]
            regime_ret[i] = regime_ret[i] / regime_count[i]
    
    # Print regime characteristics
    for i in range(n_states):
        print(f"Regime {i}")
        print(f"Avg Vol: {regime_vol[i]}")
        print(f"Avg Return: {regime_ret[i]}")
        print(f"Occurrence: {regime_count[i]}\n")
    
    return model, training, predictions

# Train the model on the fixed period (2007-2017)
model, training_data, train_predictions = train_hmm_model(df, train_start_date, train_end_date, 
                                                         hidden_states, em_iterations)

Training model on data from 2020-01-01 to 2024-12-31
Training data shape: (1258, 9)
Regime 0
Avg Vol: 9.178382289015254
Avg Return: 0.11619144311037989
Occurrence: 586

Regime 1
Avg Vol: 33.32243492762282
Avg Return: 0.05294329672172074
Occurrence: 528

Regime 2
Avg Vol: 120.87016427923929
Avg Return: -0.12134007176305242
Occurrence: 144



In [114]:
# Visualize the training data with regime classifications using Plotly
training_with_predictions = training_data.copy()
training_with_predictions['Regime'] = train_predictions

# Create a categorical color map for regimes
regime_colors = px.colors.qualitative.Set2[:hidden_states]

# Create the scatter plot with Plotly
fig = px.scatter(
    training_with_predictions, 
    x='Date', 
    y='Close', 
    color='Regime',
    color_discrete_sequence=regime_colors,
    title='SPY Close Price Colored by Regime (Training Period: 2007-2017)',
    labels={'Close': 'Price', 'Date': 'Date', 'Regime': 'Market Regime'},
    height=600,
    opacity=0.8,
    size_max=10
)

fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Price',
    legend_title='Regime',
    hovermode='closest',
    template='plotly_white'
)

fig.show()

# Visualize transition probabilities with Plotly
transition_matrix = model.transmat_
regime_labels = [f"Regime {i}" for i in range(hidden_states)]

# Create the heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=transition_matrix,
    x=regime_labels,
    y=regime_labels,
    colorscale='Blues',
    text=np.round(transition_matrix, 2),
    texttemplate="%{text:.2f}",
    textfont={"size": 14}
))

fig_heatmap.update_layout(
    title='Regime Transition Probabilities',
    xaxis_title='To Regime',
    yaxis_title='From Regime',
    width=700,
    height=600,
    template='plotly_white'
)

fig_heatmap.show()

# Show stationary distribution
stationary_dist = model.get_stationary_distribution()
print("\nStationary Distribution:")
for i in range(hidden_states):
    print(f"Regime {i}: {stationary_dist[i]*100:.2f}%")

# Create a pie chart for the stationary distribution
fig_pie = px.pie(
    values=stationary_dist * 100,
    names=regime_labels,
    title='Stationary Distribution of Regimes',
    color_discrete_sequence=regime_colors
)

fig_pie.update_traces(textinfo='percent+label', textposition='inside')
fig_pie.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig_pie.show()


Stationary Distribution:
Regime 0: 46.01%
Regime 1: 42.01%
Regime 2: 11.97%


In [115]:
# Function to predict regimes for a specific date range
def predict_regimes(model, data, start_date, end_date):
    """
    Predict market regimes for a specific date range using the trained HMM model
    
    Parameters:
    -----------
    model : hmm.GaussianHMM
        The trained HMM model
    data : DataFrame
        The full dataset with calculated indicators
    start_date : str
        Start date for prediction period in 'YYYY-MM-DD' format
    end_date : str
        End date for prediction period in 'YYYY-MM-DD' format
        
    Returns:
    --------
    DataFrame with date, close price, and predicted regime
    """
    # Filter data for prediction period
    pred_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    if len(pred_data) == 0:
        print(f"No data available for period {start_date} to {end_date}")
        if end_date > data['Date'].max().strftime('%Y-%m-%d'):
            print("NOTE: Prediction period extends into the future")
            # Generate future dates for forecasting
            last_date = data['Date'].max()
            future_end = datetime.strptime(end_date, '%Y-%m-%d')
            
            # Get the most recent 30 days of data for calculating indicators
            recent_data = data.tail(30).copy()
            
            # For future dates, we'll extend from the last value and assume zero returns
            # This is a simplification - in reality you might want to use a forecasting model
            current_date = last_date + timedelta(days=1)
            while current_date <= future_end:
                if current_date.weekday() < 5:  # Only include weekdays
                    new_row = {
                        'Date': current_date,
                        'Close': recent_data['Close'].iloc[-1],  # Use the last known close price
                        'High': recent_data['Close'].iloc[-1],
                        'Low': recent_data['Close'].iloc[-1],
                        'Open': recent_data['Close'].iloc[-1],
                        'Volume': recent_data['Volume'].mean(),  # Use average volume
                        'Volatility': recent_data['Volatility'].mean(),  # Use average volatility
                        'MA': recent_data['MA'].iloc[-1],  # Use the last MA
                        'Return': 0  # Assume zero returns for future dates
                    }
                    recent_data = pd.concat([recent_data, pd.DataFrame([new_row])], ignore_index=True)
                current_date += timedelta(days=1)
            
            # Keep only the future dates we generated
            future_data = recent_data[recent_data['Date'] > last_date].copy()
            pred_data = future_data[(future_data['Date'] >= start_date) & (future_data['Date'] <= end_date)].copy()
    
    # Prepare observations for prediction
    obs = np.column_stack([pred_data['Volatility'].values, pred_data['Return'].values])
    
    # Predict regimes
    predictions = model.predict(obs)
    
    # Add predictions to dataframe
    pred_data['Predicted_Regime'] = predictions
    
    print(f"Predicted regimes for period {start_date} to {end_date}")
    print(f"Data points: {len(pred_data)}")
    
    # Calculate regime distribution
    regime_counts = pd.Series(predictions).value_counts(normalize=True) * 100
    print("\nRegime Distribution:")
    for regime, percentage in regime_counts.items():
        print(f"Regime {regime}: {percentage:.2f}%")
    
    return pred_data[['Date', 'Close', 'Volatility', 'Return', 'Predicted_Regime']]

In [116]:
# Example 1: Predict regimes for a historical period (2018-2019)
results_2018_2019 = predict_regimes(model, df, "2018-01-01", "2019-12-31")

# Visualize the results with Plotly
fig = px.scatter(
    results_2018_2019, 
    x='Date', 
    y='Close', 
    color='Predicted_Regime',
    color_discrete_sequence=regime_colors,
    title='Market Regimes in 2018-2019',
    labels={'Close': 'Price', 'Date': 'Date', 'Predicted_Regime': 'Market Regime'},
    height=600,
    opacity=0.8,
    size_max=10
)

fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Price',
    legend_title='Regime',
    hovermode='closest',
    template='plotly_white'
)

fig.show()

# Show distribution of regimes over time with Plotly
regime_counts = results_2018_2019['Predicted_Regime'].value_counts().reset_index()
regime_counts.columns = ['Regime', 'Count']

fig_bar = px.bar(
    regime_counts, 
    x='Regime', 
    y='Count',
    color='Regime',
    color_discrete_sequence=regime_colors,
    title='Distribution of Market Regimes in 2018-2019',
    text='Count'
)

fig_bar.update_layout(
    xaxis_title='Regime',
    yaxis_title='Number of Days',
    template='plotly_white',
    showlegend=False
)

fig_bar.update_traces(textposition='outside')
fig_bar.show()

Predicted regimes for period 2018-01-01 to 2019-12-31
Data points: 503

Regime Distribution:
Regime 0: 86.68%
Regime 1: 12.52%
Regime 2: 0.80%


In [117]:
# Example 2: Predict regimes for the most recent 3 months
from datetime import datetime, timedelta

today = datetime.today()
three_months_ago = today - timedelta(days=90)

start_date = three_months_ago.strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

recent_results = predict_regimes(model, df, start_date, end_date)

# Visualize recent results with Plotly
fig = px.scatter(
    recent_results, 
    x='Date', 
    y='Close', 
    color='Predicted_Regime',
    color_discrete_sequence=regime_colors,
    title=f'Market Regimes in Recent 3 Months ({start_date} to {end_date})',
    labels={'Close': 'Price', 'Date': 'Date', 'Predicted_Regime': 'Market Regime'},
    height=600,
    opacity=0.8,
    size_max=10
)

fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Price',
    legend_title='Regime',
    hovermode='closest',
    template='plotly_white'
)

fig.show()

# Calculate percentage of time in each regime with Plotly
regime_percentages = recent_results['Predicted_Regime'].value_counts(normalize=True) * 100
labels = [f"Regime {i}" for i in regime_percentages.index]

fig_pie = px.pie(
    values=regime_percentages.values,
    names=labels,
    title='Percentage of Time in Each Regime (Recent 3 Months)',
    color_discrete_sequence=regime_colors
)

fig_pie.update_traces(textinfo='percent+label', textposition='inside')
fig_pie.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig_pie.show()

# Add a time series view to show how the regimes evolve over time
fig_time = px.line(
    recent_results, 
    x='Date', 
    y='Close',
    title=f'Price Evolution with Regime Changes (Recent 3 Months)'
)

# Add markers for each regime
for regime in recent_results['Predicted_Regime'].unique():
    regime_data = recent_results[recent_results['Predicted_Regime'] == regime]
    fig_time.add_scatter(
        x=regime_data['Date'],
        y=regime_data['Close'],
        mode='markers',
        name=f'Regime {regime}',
        marker=dict(color=regime_colors[regime], size=8),
        showlegend=True
    )

fig_time.update_layout(
    xaxis_title='Date',
    yaxis_title='Price',
    template='plotly_white',
    hovermode='closest'
)

fig_time.show()

Predicted regimes for period 2025-01-11 to 2025-04-11
Data points: 62

Regime Distribution:
Regime 2: 40.32%
Regime 1: 35.48%
Regime 0: 24.19%


In [118]:
# Example 3: Predict regimes for future dates (Q1 2025)
future_start = "2025-01-01"
future_end = "2025-03-31"

future_results = predict_regimes(model, df, future_start, future_end)

# Display predictions for future dates
if future_results is not None and not future_results.empty:
    print("\nPredicted regimes for Q1 2025:")
    display(future_results.head())

    # Visualize the distribution of predicted regimes for Q1 2025 with Plotly
    regime_counts = future_results['Predicted_Regime'].value_counts().reset_index()
    regime_counts.columns = ['Regime', 'Count']
    
    fig_bar = px.bar(
        regime_counts, 
        x='Regime', 
        y='Count',
        color='Regime',
        color_discrete_sequence=regime_colors,
        title='Predicted Market Regime Distribution for Q1 2025',
        text='Count'
    )
    
    fig_bar.update_layout(
        xaxis_title='Regime',
        yaxis_title='Number of Trading Days',
        template='plotly_white',
        showlegend=False
    )
    
    fig_bar.update_traces(textposition='outside')
    fig_bar.show()
    
    # Create a calendar heatmap to visualize regimes by day
    regime_cal_data = future_results.copy()
    regime_cal_data['Day'] = regime_cal_data['Date'].dt.day
    regime_cal_data['Month'] = regime_cal_data['Date'].dt.month
    regime_cal_data['MonthName'] = regime_cal_data['Date'].dt.strftime('%b')
    
    # Create a calendar heatmap
    fig_cal = px.scatter(
        regime_cal_data,
        x='Day', 
        y='MonthName',
        color='Predicted_Regime',
        color_discrete_sequence=regime_colors,
        title='Calendar View of Predicted Regimes for Q1 2025',
        size_max=15,
        size=[10] * len(regime_cal_data)  # Fixed size for all points
    )
    
    fig_cal.update_layout(
        xaxis_title='Day of Month',
        yaxis_title='Month',
        yaxis=dict(
            categoryorder='array',
            categoryarray=['Jan', 'Feb', 'Mar']
        ),
        template='plotly_white'
    )
    
    fig_cal.show()

Predicted regimes for period 2025-01-01 to 2025-03-31
Data points: 60

Regime Distribution:
Regime 1: 43.33%
Regime 2: 31.67%
Regime 0: 25.00%

Predicted regimes for Q1 2025:


Price,Date,Close,Volatility,Return,Predicted_Regime
7552,2025-01-02,582.886597,40.44466,-0.245698,0
7553,2025-01-03,590.174622,35.377226,1.250333,1
7554,2025-01-06,593.574402,29.428019,0.576063,1
7555,2025-01-07,586.864624,30.909914,-1.130402,1
7556,2025-01-08,587.721985,31.294495,0.146092,1


In [119]:
# Function to create a custom prediction for any date range with Plotly visualizations
def custom_regime_prediction(start_date, end_date):
    """
    Predict regimes for a custom date range
    
    Parameters:
    -----------
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format
    """
    results = predict_regimes(model, df, start_date, end_date)
    
    if results is None or results.empty:
        return None
    
    # Visualize the results with Plotly
    fig = px.scatter(
        results, 
        x='Date', 
        y='Close', 
        color='Predicted_Regime',
        color_discrete_sequence=regime_colors,
        title=f'Market Regimes from {start_date} to {end_date}',
        labels={'Close': 'Price', 'Date': 'Date', 'Predicted_Regime': 'Market Regime'},
        height=600,
        opacity=0.8
    )
    
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title='Price',
        legend_title='Regime',
        hovermode='closest',
        template='plotly_white'
    )
    
    fig.show()
    
    # Add additional visualizations
    
    # 1. Distribution of regimes
    regime_counts = results['Predicted_Regime'].value_counts().reset_index()
    regime_counts.columns = ['Regime', 'Count']
    
    fig_bar = px.bar(
        regime_counts, 
        x='Regime', 
        y='Count',
        color='Regime',
        color_discrete_sequence=regime_colors,
        title=f'Regime Distribution ({start_date} to {end_date})',
        text='Count'
    )
    
    fig_bar.update_layout(
        xaxis_title='Regime',
        yaxis_title='Number of Days',
        template='plotly_white',
        showlegend=False
    )
    
    fig_bar.update_traces(textposition='outside')
    fig_bar.show()
    
    # 2. Volatility vs Return colored by regime
    fig_vr = px.scatter(
        results,
        x='Volatility',
        y='Return',
        color='Predicted_Regime',
        color_discrete_sequence=regime_colors,
        title=f'Volatility vs Return by Regime ({start_date} to {end_date})',
        labels={'Volatility': 'Volatility', 'Return': 'Return (%)'},
        opacity=0.8
    )
    
    fig_vr.update_layout(
        legend_title='Regime',
        hovermode='closest',
        template='plotly_white'
    )
    
    fig_vr.update_xaxes(rangemode='tozero')
    fig_vr.show()
    
    return results

# Example usage - uncomment and modify dates as needed:
# custom_results = custom_regime_prediction("2020-03-01", "2020-06-30")  # COVID-19 market crash period

In [120]:
# Regime Characteristics Summary
regime_vol = [0] * hidden_states
regime_ret = [0] * hidden_states
regime_count = [0] * hidden_states

for i in range(len(train_predictions)):
    regime = train_predictions[i]
    regime_count[regime] += 1
    regime_vol[regime] += training_data['Volatility'].iloc[i]
    regime_ret[regime] += training_data['Return'].iloc[i]

for i in range(hidden_states):
    if regime_count[i] > 0:
        regime_vol[i] = regime_vol[i] / regime_count[i]
        regime_ret[i] = regime_ret[i] / regime_count[i]

# Create a summary dataframe
regime_summary = pd.DataFrame({
    'Regime': range(hidden_states),
    'Average_Volatility': regime_vol,
    'Average_Return': regime_ret,
    'Occurrence_Count': regime_count,
    'Occurrence_Percentage': [count/sum(regime_count)*100 for count in regime_count]
})

# Display the summary
print("HMM Market Regime Summary (Training Period: 2007-2017)")
display(regime_summary)

# Interpret the regimes based on their characteristics
print("\nMarket Regime Interpretation:")
regime_types = []
for i in range(hidden_states):
    if regime_ret[i] > 0.05:
        if regime_vol[i] < 1:
            regime_type = "Stable Growth (Low Volatility, Positive Returns)"
        else:
            regime_type = "Volatile Growth (High Volatility, Positive Returns)"
    elif regime_ret[i] < 0:
        if regime_vol[i] > 2:
            regime_type = "Bear Market (High Volatility, Negative Returns)"
        else:
            regime_type = "Correction (Moderate Volatility, Negative Returns)"
    else:
        regime_type = "Sideways Market (Moderate Volatility, Flat Returns)"
    
    regime_types.append(regime_type)
    print(f"Regime {i}: {regime_type}")

# Create interactive visualizations of regime characteristics
# 1. Radar chart for regime characteristics
categories = ['Volatility', 'Return', 'Frequency']
fig_radar = go.Figure()

# Scale values for better visualization
scaled_vol = [vol / max(regime_vol) * 100 for vol in regime_vol]
scaled_ret = [(ret - min(regime_ret)) / (max(regime_ret) - min(regime_ret)) * 100 if max(regime_ret) != min(regime_ret) else 50 for ret in regime_ret]
scaled_freq = [count / max(regime_count) * 100 for count in regime_count]

for i in range(hidden_states):
    fig_radar.add_trace(go.Scatterpolar(
        r=[scaled_vol[i], scaled_ret[i], scaled_freq[i]],
        theta=categories,
        fill='toself',
        name=f'Regime {i}',
        line_color=regime_colors[i]
    ))

fig_radar.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 100]
        )),
    title='Regime Characteristics Comparison',
    showlegend=True,
    template='plotly_white'
)

fig_radar.show()

# 2. Bubble chart showing regimes by volatility, return, and frequency
fig_bubble = px.scatter(
    x=regime_vol,
    y=regime_ret,
    size=[count for count in regime_count],
    color=range(hidden_states),
    color_discrete_sequence=regime_colors,
    labels={'x': 'Average Volatility', 'y': 'Average Return (%)', 'size': 'Occurrence Count'},
    title='Regime Characteristics: Volatility vs Return (size = frequency)',
    text=[f'Regime {i}' for i in range(hidden_states)]
)

fig_bubble.update_traces(textposition='top center')
fig_bubble.update_layout(
    xaxis_title='Average Volatility',
    yaxis_title='Average Return (%)',
    template='plotly_white'
)

fig_bubble.show()

HMM Market Regime Summary (Training Period: 2007-2017)


Unnamed: 0,Regime,Average_Volatility,Average_Return,Occurrence_Count,Occurrence_Percentage
0,0,9.178382,0.116191,586,46.581876
1,1,33.322435,0.052943,528,41.971383
2,2,120.870164,-0.12134,144,11.446741



Market Regime Interpretation:
Regime 0: Volatile Growth (High Volatility, Positive Returns)
Regime 1: Volatile Growth (High Volatility, Positive Returns)
Regime 2: Bear Market (High Volatility, Negative Returns)


In [121]:
# Add a dashboard-style visualization for a deeper analysis of the model
from IPython.display import HTML

# Create a dashboard layout with 4 subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Regime Transition Network", 
        "Training Data Regime Distribution", 
        "Regime Return Distribution", 
        "Regime Characteristics"
    ),
    specs=[
        [{"type": "scatter"}, {"type": "pie"}],
        [{"type": "box"}, {"type": "scatter"}]
    ],
    vertical_spacing=0.1,
    horizontal_spacing=0.05
)

# 1. Regime Transition Network (top left)
# Create node positions for a circular layout
import math
angles = np.linspace(0, 2*math.pi, hidden_states, endpoint=False).tolist()
node_x = [math.cos(angle) for angle in angles]
node_y = [math.sin(angle) for angle in angles]

# Add nodes
for i in range(hidden_states):
    fig.add_trace(
        go.Scatter(
            x=[node_x[i]], 
            y=[node_y[i]],
            mode='markers+text',
            marker=dict(size=30, color=regime_colors[i]),
            text=[f'{i}'],
            textposition="middle center",
            textfont=dict(color='white', size=14),
            name=f'Regime {i}',
            hoverinfo='text',
            hovertext=f'Regime {i}'
        ),
        row=1, col=1
    )

# Add edges for transitions
for i in range(hidden_states):
    for j in range(hidden_states):
        if transition_matrix[i, j] > 0.05:  # Only show significant transitions
            # Calculate edge curvature based on whether it's a self-loop
            curve = 0.2 if i != j else 0.5
            fig.add_trace(
                go.Scatter(
                    x=[node_x[i], None, node_x[j]],
                    y=[node_y[i], None, node_y[j]],
                    mode='lines',
                    line=dict(
                        width=transition_matrix[i, j] * 10,  # Line width based on probability
                        color='rgba(150,150,150,0.8)',
                    ),
                    hoverinfo='text',
                    hovertext=f'P({i}->{j}) = {transition_matrix[i, j]:.2f}',
                    showlegend=False
                ),
                row=1, col=1
            )

# 2. Training Data Regime Distribution (top right)
regime_counts_train = pd.Series(train_predictions).value_counts(normalize=True) * 100
labels = [f"Regime {i}" for i in range(hidden_states)]
values = [regime_counts_train.get(i, 0) for i in range(hidden_states)]

fig.add_trace(
    go.Pie(
        labels=labels,
        values=values,
        textinfo='percent',
        marker=dict(colors=regime_colors)
    ),
    row=1, col=2
)

# 3. Regime Return Distribution (bottom left)
# Prepare return data by regime
regime_returns = {i: [] for i in range(hidden_states)}
for i in range(len(train_predictions)):
    regime_returns[train_predictions[i]].append(training_data['Return'].iloc[i])

for i in range(hidden_states):
    if regime_returns[i]:  # Check if list is not empty
        fig.add_trace(
            go.Box(
                y=regime_returns[i],
                name=f'Regime {i}',
                marker_color=regime_colors[i],
                boxmean=True  # Show mean as a dashed line
            ),
            row=2, col=1
        )

# 4. Regime Characteristics (bottom right)
df_characteristics = pd.DataFrame({
    'Regime': range(hidden_states),
    'Volatility': regime_vol,
    'Return': regime_ret,
    'Frequency': [c / sum(regime_count) for c in regime_count]
})

fig.add_trace(
    go.Scatter(
        x=df_characteristics['Volatility'],
        y=df_characteristics['Return'],
        mode='markers',
        marker=dict(
            size=df_characteristics['Frequency'] * 1000,  # Scale size for visibility
            color=[regime_colors[i] for i in range(hidden_states)],
            line=dict(width=2, color='DarkSlateGrey')
        ),
        text=[f'Regime {i}' for i in range(hidden_states)],
        showlegend=False
    ),
    row=2, col=2
)

# Update layout
fig.update_layout(
    height=900,
    width=1200,
    title_text='HMM Market Regime Analysis Dashboard',
    template='plotly_white',
    showlegend=False
)

# Update axes for specific subplots
fig.update_xaxes(title_text="", showticklabels=False, row=1, col=1)
fig.update_yaxes(title_text="", showticklabels=False, row=1, col=1)
fig.update_xaxes(title_text="Volatility", row=2, col=2)
fig.update_yaxes(title_text="Return (%)", row=2, col=2)
fig.update_xaxes(title_text="Regime", row=2, col=1)
fig.update_yaxes(title_text="Return (%)", row=2, col=1)

# Show the dashboard
fig.show()

# Add an HTML section with model insights
insights_html = f"""
<div style="background-color: #f9f9f9; padding: 20px; border-radius: 10px; margin-top: 20px;">
    <h2 style="color: #333;">HMM Model Insights</h2>
    <p>The Hidden Markov Model has identified <b>{hidden_states} distinct market regimes</b> based on volatility and return patterns:</p>
    <ul>
"""

for i in range(hidden_states):
    insights_html += f"""
        <li style="margin-bottom: 10px;">
            <span style="color: {regime_colors[i]}; font-weight: bold;">Regime {i}:</span> 
            {regime_types[i]} 
            <ul>
                <li>Average Volatility: {regime_vol[i]:.4f}</li>
                <li>Average Return: {regime_ret[i]:.4f}%</li>
                <li>Frequency: {regime_count[i]} days ({regime_count[i]/sum(regime_count)*100:.2f}%)</li>
                <li>Most likely to transition to: Regime {np.argmax(transition_matrix[i])} (p={np.max(transition_matrix[i]):.2f})</li>
            </ul>
        </li>
    """

insights_html += """
    </ul>
    <p><b>Model Score:</b> {:.2f}</p>
    <p>This model has been trained on data from 2007-01-01 to 2017-12-31 and can be used to identify market regimes in any time period.</p>
</div>
""".format(model.score(np.column_stack([training_data['Volatility'].values, training_data['Return'].values])))

display(HTML(insights_html))