### Imports

In [1]:
# Import necessary libraries for data manipulation and visualization
import pandas as pd
import polars as pl
import numpy as np

from datetime import datetime

from hmmlearn.hmm import GaussianHMM
from pandas_datareader.data import DataReader

import matplotlib.pyplot as plt

# Verify all libraries are imported correctly
print("Libraries imported successfully")

Libraries imported successfully


### Structure Data

In [2]:
# Import necessary libraries
import yfinance as yf

# Set start and end dates for the data extraction
start_date = "2017-01-01"
end_date = "2024-07-01"
symbol = "SPY"

# Fetch data using yfinance
data = yf.download(symbol, start=start_date, end=end_date)
data.head(3)

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,225.039993,225.830002,223.880005,225.240005,198.560043,91366500
2017-01-04,225.619995,226.75,225.610001,226.580002,199.741333,78744400
2017-01-05,226.270004,226.580002,225.479996,226.399994,199.582626,78379000


In [3]:
# Convert to Polars DataFrame and include dates
import polars as pl

data.reset_index(inplace=True) # need to add the date as index otherwise it will drop the date during conversion
data = pl.DataFrame(data)

# Select relevant columns: Date, Open, High, Low, Adj Close
data = data.select(["Date", "Open", "High", "Low", "Adj Close"]) # date is not default added in polars

In [4]:

print(data.columns)
print(data.schema)

['Date', 'Open', 'High', 'Low', 'Adj Close']
OrderedDict({'Date': Datetime(time_unit='ns', time_zone=None), 'Open': Float64, 'High': Float64, 'Low': Float64, 'Adj Close': Float64})


In [5]:
data.head(3)

Date,Open,High,Low,Adj Close
datetime[ns],f64,f64,f64,f64
2017-01-03 00:00:00,225.039993,225.830002,223.880005,198.560043
2017-01-04 00:00:00,225.619995,226.75,225.610001,199.741333
2017-01-05 00:00:00,226.270004,226.580002,225.479996,199.582626


In [6]:
# Create a copy of the dataframe and add returns and range columns
df = data.clone()

# Calculate daily returns
df = df.with_columns((pl.col("Adj Close") / pl.col("Adj Close").shift(1) - 1).alias("Returns")) # the -1 returns the percent return in decimals

# Calculate daily range (volatility)
df = df.with_columns((pl.col("High") / pl.col("Low")-1).alias("Range")) # the -1 returns the percent return in decimals

# Drop NaN values created by the pct_change method
# In Polars, the drop_nulls method does not require the inplace=True parameter as it returns a new DataFrame by default. 
df = df.drop_nulls()

# Display the updated dataframe with new columns
print("Length of Dataframe: ", len(df))
df.head()

Length of Dataframe:  1883


Date,Open,High,Low,Adj Close,Returns,Range
datetime[ns],f64,f64,f64,f64,f64,f64
2017-01-04 00:00:00,225.619995,226.75,225.610001,199.741333,0.005949,0.005053
2017-01-05 00:00:00,226.270004,226.580002,225.479996,199.582626,-0.000795,0.004879
2017-01-06 00:00:00,226.529999,227.75,225.899994,200.296692,0.003578,0.008189
2017-01-09 00:00:00,226.910004,227.070007,226.419998,199.635544,-0.003301,0.002871
2017-01-10 00:00:00,226.479996,227.449997,226.009995,199.635544,0.0,0.006371


In [7]:
df.tail()

Date,Open,High,Low,Adj Close,Returns,Range
datetime[ns],f64,f64,f64,f64,f64,f64
2024-06-24 00:00:00,544.330017,546.950012,542.619995,542.73999,-0.003251,0.00798
2024-06-25 00:00:00,543.98999,545.200012,542.440002,544.830017,0.003851,0.005088
2024-06-26 00:00:00,543.690002,546.23999,543.030029,545.51001,0.001248,0.005911
2024-06-27 00:00:00,545.369995,546.960022,544.609985,546.369995,0.001576,0.004315
2024-06-28 00:00:00,547.159973,550.280029,542.950012,544.219971,-0.003935,0.0135


In [8]:
# Calculate moving averages
df = df.with_columns([
    pl.col('Adj Close').rolling_mean(window_size=12).alias('MA12'),
    pl.col('Adj Close').rolling_mean(window_size=21).alias('MA21')
])
df.head(2)


Date,Open,High,Low,Adj Close,Returns,Range,MA12,MA21
datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64
2017-01-04 00:00:00,225.619995,226.75,225.610001,199.741333,0.005949,0.005053,,
2017-01-05 00:00:00,226.270004,226.580002,225.479996,199.582626,-0.000795,0.004879,,


In [9]:
df.tail(2)

Date,Open,High,Low,Adj Close,Returns,Range,MA12,MA21
datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64
2024-06-27 00:00:00,545.369995,546.960022,544.609985,546.369995,0.001576,0.004315,543.15421,536.84654
2024-06-28 00:00:00,547.159973,550.280029,542.950012,544.219971,-0.003935,0.0135,543.903931,537.789958


In [10]:
# Structure Data
X_train = df.select(['Date','Returns', 'Range'])[:500] # in polars we use .select and we don't use iloc
# remember to add date since in polars index is not defualt

In [11]:
X_train.head(3)

Date,Returns,Range
datetime[ns],f64,f64
2017-01-04 00:00:00,0.005949,0.005053
2017-01-05 00:00:00,-0.000795,0.004879
2017-01-06 00:00:00,0.003578,0.008189


In [12]:
X_test = df.select(['Date','Returns', 'Range'])[500:]
save_df = df[500:]
# also can use
# X_train = df.select(['Returns', 'Range']).slice(0, 500)
# X_test = df.select(['Returns', 'Range']).slice(500)
# save_df = df.slice(500)


print("Train Length: ", len(X_train))
print("Test Length: ", len(X_test))

# Print the first row's date

print("\nX_train From: ", X_train['Date'][1])
print("X_train To: ", X_train['Date'][-1])
print("\nX_test From: ", X_test['Date'][1])
print("X_test To: ", X_test['Date'][-1])

Train Length:  500
Test Length:  1383

X_train From:  2017-01-05 00:00:00
X_train To:  2018-12-28 00:00:00

X_test From:  2019-01-02 00:00:00
X_test To:  2024-06-28 00:00:00


## Train HMM

In [13]:
#Data Preparation
import polars as pl
import numpy as np
from hmmlearn.hmm import GaussianHMM

# Convert Polars DataFrame to NumPy array
X_train_np = X_train.to_numpy()
print("Shape of X_train_np:", X_train_np.shape)

# Display first few rows of the data
print("\nFirst few rows of X_train_np:")
print(X_train_np[:5])

# Determine the number of features
n_features = X_train_np.shape[1]
print(f"\nNumber of features detected: {n_features}")

Shape of X_train_np: (500, 3)

First few rows of X_train_np:
[[ 1.48348800e+18  5.94928190e-03  5.05296479e-03]
 [ 1.48357440e+18 -7.94560959e-04  4.87850862e-03]
 [ 1.48366080e+18  3.57779414e-03  8.18949160e-03]
 [ 1.48392000e+18 -3.30084369e-03  2.87081159e-03]
 [ 1.48400640e+18  0.00000000e+00  6.37141045e-03]]

Number of features detected: 3


In [14]:
hmm_model = GaussianHMM(n_components=4, covariance_type="full", n_iter=100).fit(X_train)
print("Model Score:", hmm_model.score(X_train))

Model is not converging.  Current: -15390.092963449333 is not greater than -15387.555427799658. Delta is -2.537535649675192


Model Score: -15389.284041437482


In [15]:
# Check Results
hidden_states = hmm_model.predict(X_train)
print("First 20 hidden states:", hidden_states[:20])
print("\nUnique states:", np.unique(hidden_states))
print("\nState frequencies:")
for i in range(hmm_model.n_components):
    print(f"State {i}: {np.sum(hidden_states == i) / len(hidden_states):.2%}")

First 20 hidden states: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

Unique states: [0 1 2 3]

State frequencies:
State 0: 10.80%
State 1: 53.60%
State 2: 0.40%
State 3: 35.20%


In [22]:
#Make Predictions on Test Data

# Clone the DataFrame and drop specified columns
df_main = save_df.clone()
df_main = df_main.drop(["High", "Low"])

# Predict using HMM model
hmm_results = hmm_model.predict(X_test)

# Convert hmm_results to a Polars Series
hmm_series = pl.Series('HMM', hmm_results)

# Add the HMM results to the DataFrame
df_main = df_main.with_columns(hmm_series)

# Display the first row
print(df_main.head(1))

shape: (1, 8)
┌──────────────┬────────────┬────────────┬──────────┬──────────┬────────────┬────────────┬─────┐
│ Date         ┆ Open       ┆ Adj Close  ┆ Returns  ┆ Range    ┆ MA12       ┆ MA21       ┆ HMM │
│ ---          ┆ ---        ┆ ---        ┆ ---      ┆ ---      ┆ ---        ┆ ---        ┆ --- │
│ datetime[ns] ┆ f64        ┆ f64        ┆ f64      ┆ f64      ┆ f64        ┆ f64        ┆ i64 │
╞══════════════╪════════════╪════════════╪══════════╪══════════╪════════════╪════════════╪═════╡
│ 2018-12-31   ┆ 249.559998 ┆ 228.866135 ┆ 0.008759 ┆ 0.010991 ┆ 228.386873 ┆ 235.695219 ┆ 1   │
│ 00:00:00     ┆            ┆            ┆          ┆          ┆            ┆            ┆     │
└──────────────┴────────────┴────────────┴──────────┴──────────┴────────────┴────────────┴─────┘
