### Imports

In [9]:
# Notebook-wide imports: data handling, modelling, and plotting.
# pandas is the frame type the conversion cell starts from; polars is used
# for the transformations in later cells.
import pandas as pd
import polars as pl
import numpy as np

# Standard library (not referenced in the cells shown here)
from datetime import datetime

# GaussianHMM and DataReader are not referenced in the cells shown here;
# presumably used further down the notebook - TODO confirm.
from hmmlearn.hmm import GaussianHMM
from pandas_datareader.data import DataReader

# Plotting
import matplotlib.pyplot as plt

# Reaching this line means every import above succeeded; a failing import
# would have raised and stopped the cell before the message is printed.
print("Libraries imported successfully")

Libraries imported successfully


### Data Management

In [10]:
# Download daily price history for a single ticker.
import yfinance as yf

# Date window and ticker for the download
start_date = "2017-01-01"
end_date = "2022-06-01"
symbol = "SPY"

# auto_adjust=False is passed explicitly because a later cell selects the
# "Adj Close" column. Relying on the library default is fragile: yfinance
# releases differ on whether prices are adjusted in place, and in-place
# adjustment removes that column.
data = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False)

# Some yfinance releases label columns with a (field, ticker) MultiIndex even
# for one symbol; keep only the field level so names like "Open" stay plain.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# A failed download comes back as an empty frame rather than an exception;
# stop here with a clear message instead of failing in an unrelated cell.
if data.empty:
    raise RuntimeError(
        f"No price data returned for {symbol} ({start_date} to {end_date})"
    )


[*********************100%%**********************]  1 of 1 completed


In [11]:
# Preview the first five rows of the downloaded prices
data.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,225.039993,225.830002,223.880005,225.240005,198.560059,91366500
2017-01-04,225.619995,226.75,225.610001,226.580002,199.741287,78744400
2017-01-05,226.270004,226.580002,225.479996,226.399994,199.582626,78379000
2017-01-06,226.529999,227.75,225.899994,227.210007,200.296707,71559900
2017-01-09,226.910004,227.070007,226.419998,226.460007,199.635513,46939700


In [12]:
# Convert the pandas frame to Polars, keeping the dates as a regular column.
# reset_index() moves the Date index into a column; the index itself is not
# carried over by the conversion, so without this step the dates would be lost.
# The isinstance guard makes the cell safe to re-run: once `data` is already a
# Polars frame there is nothing left to convert (and Polars frames have no
# reset_index method). `pl` and `pd` come from the imports cell at the top.
if isinstance(data, pd.DataFrame):
    data = pl.from_pandas(data.reset_index())

# Keep only the columns used downstream: Date, Open, High, Low, Adj Close, Volume
data = data.select(["Date", "Open", "High", "Low", "Adj Close", "Volume"])

In [13]:

# Show the column names, then the column -> dtype mapping, one per line
print(data.columns, data.schema, sep="\n")

['Date', 'Open', 'High', 'Low', 'Adj Close', 'Volume']
OrderedDict({'Date': Datetime(time_unit='ns', time_zone=None), 'Open': Float64, 'High': Float64, 'Low': Float64, 'Adj Close': Float64, 'Volume': Int64})


In [14]:
# Preview the first three rows of the Polars frame
data.head(n=3)

Date,Open,High,Low,Adj Close,Volume
datetime[ns],f64,f64,f64,f64,i64
2017-01-03 00:00:00,225.039993,225.830002,223.880005,198.560059,91366500
2017-01-04 00:00:00,225.619995,226.75,225.610001,199.741287,78744400
2017-01-05 00:00:00,226.270004,226.580002,225.479996,199.582626,78379000


In [16]:
# Derive the Returns and Range columns from the price columns.
# with_columns returns a new frame, so `data` itself is left untouched.
df = (
    data.with_columns(
        [
            # Daily return as a decimal fraction: the adjusted close divided
            # by the previous row's adjusted close, minus 1
            (pl.col("Adj Close") / pl.col("Adj Close").shift(1) - 1).alias("Returns"),
            # Intraday range as a decimal fraction: High divided by Low, minus 1
            (pl.col("High") / pl.col("Low") - 1).alias("Range"),
        ]
    )
    # shift(1) leaves the first row without a previous value, so its Returns
    # is null; drop_nulls() returns a new frame without any row holding a null
    .drop_nulls()
)

# Preview the frame with the two new columns
df.head()

Date,Open,High,Low,Adj Close,Volume,Returns,Range
datetime[ns],f64,f64,f64,f64,i64,f64,f64
2017-01-04 00:00:00,225.619995,226.75,225.610001,199.741287,78744400,0.005949,0.005053
2017-01-05 00:00:00,226.270004,226.580002,225.479996,199.582626,78379000,-0.000794,0.004879
2017-01-06 00:00:00,226.529999,227.75,225.899994,200.296707,71559900,0.003578,0.008189
2017-01-09 00:00:00,226.910004,227.070007,226.419998,199.635513,46939700,-0.003301,0.002871
2017-01-10 00:00:00,226.479996,227.449997,226.009995,199.635513,63771900,0.0,0.006371


In [17]:
# Preview the last five rows to check the end of the date window
df.tail(n=5)

Date,Open,High,Low,Adj Close,Volume,Returns,Range
datetime[ns],f64,f64,f64,f64,i64,f64,f64
2022-05-24 00:00:00,392.559998,395.149994,386.959991,380.571136,91448800,-0.007634,0.021165
2022-05-25 00:00:00,392.309998,399.450012,391.890015,383.93335,91472900,0.008835,0.019291
2022-05-26 00:00:00,398.670013,407.040009,398.450012,391.604919,82168300,0.019982,0.021559
2022-05-27 00:00:00,407.910004,415.380005,407.700012,401.218506,84768700,0.024549,0.018837
2022-05-31 00:00:00,413.549988,416.459991,410.029999,398.967285,95937000,-0.005611,0.015682


## Structure Data

In [19]:
# Keep only the Returns and Range columns as the feature frame
X_train = df.select(pl.col(["Returns", "Range"]))

# Preview the first rows of the feature frame
X_train.head()

Returns,Range
f64,f64
0.005949,0.005053
-0.000794,0.004879
0.003578,0.008189
-0.003301,0.002871
0.0,0.006371
