### Imports

In [1]:
# Remove unwanted warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# Data Management
import polars as pl
import pandas as pd
from pandas_datareader.data import DataReader
from ta import add_all_ta_features
import yfinance as yf


# Statistics 
from statsmodels.tsa.stattools import adfuller

# Unsupervised Machine Learning
from sklearn.decomposition import PCA # Principle Component Analysis

# Supervised Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

# Reporting
import matplotlib.pyplot as plt



### Initial Data Extraction

In [2]:
# Data Extraction
import yfinance as yf

start_date = '2017-01-01'
end_date = '2022-06-01'
symbol = '^VIX'
df_pd = yf.download(symbol, start=start_date, end=end_date)
df_pd.head()


[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,14.07,14.07,12.85,12.85,12.85,0
2017-01-04,12.78,12.8,11.63,11.85,11.85,0
2017-01-05,11.96,12.09,11.4,11.67,11.67,0
2017-01-06,11.7,11.74,10.98,11.32,11.32,0
2017-01-09,11.71,12.08,11.46,11.56,11.56,0


In [3]:
# Convert to Polars DataFrame

# Reset the index to make the date a column
df_pd.reset_index(inplace=True)

df = pl.from_pandas(df_pd)
print(df.head)

<bound method DataFrame.head of shape: (1_362, 7)
┌─────────────────────┬───────────┬───────────┬───────────┬───────────┬───────────┬────────┐
│ Date                ┆ Open      ┆ High      ┆ Low       ┆ Close     ┆ Adj Close ┆ Volume │
│ ---                 ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---    │
│ datetime[ns]        ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ i64    │
╞═════════════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪════════╡
│ 2017-01-03 00:00:00 ┆ 14.07     ┆ 14.07     ┆ 12.85     ┆ 12.85     ┆ 12.85     ┆ 0      │
│ 2017-01-04 00:00:00 ┆ 12.78     ┆ 12.8      ┆ 11.63     ┆ 11.85     ┆ 11.85     ┆ 0      │
│ 2017-01-05 00:00:00 ┆ 11.96     ┆ 12.09     ┆ 11.4      ┆ 11.67     ┆ 11.67     ┆ 0      │
│ 2017-01-06 00:00:00 ┆ 11.7      ┆ 11.74     ┆ 10.98     ┆ 11.32     ┆ 11.32     ┆ 0      │
│ 2017-01-09 00:00:00 ┆ 11.71     ┆ 12.08     ┆ 11.46     ┆ 11.56     ┆ 11.56     ┆ 0      │
│ …                 

In [4]:
df.schema


OrderedDict([('Date', Datetime(time_unit='ns', time_zone=None)),
             ('Open', Float64),
             ('High', Float64),
             ('Low', Float64),
             ('Close', Float64),
             ('Adj Close', Float64),
             ('Volume', Int64)])

In [5]:
# Convert the 'Date' column to the desired timezone (New York)
df = df.with_columns(
    pl.col('Date').dt.replace_time_zone('America/New_York')
)

print(df.head())

shape: (5, 7)
┌────────────────────────────────┬───────┬───────┬───────┬───────┬───────────┬────────┐
│ Date                           ┆ Open  ┆ High  ┆ Low   ┆ Close ┆ Adj Close ┆ Volume │
│ ---                            ┆ ---   ┆ ---   ┆ ---   ┆ ---   ┆ ---       ┆ ---    │
│ datetime[ns, America/New_York] ┆ f64   ┆ f64   ┆ f64   ┆ f64   ┆ f64       ┆ i64    │
╞════════════════════════════════╪═══════╪═══════╪═══════╪═══════╪═══════════╪════════╡
│ 2017-01-03 00:00:00 EST        ┆ 14.07 ┆ 14.07 ┆ 12.85 ┆ 12.85 ┆ 12.85     ┆ 0      │
│ 2017-01-04 00:00:00 EST        ┆ 12.78 ┆ 12.8  ┆ 11.63 ┆ 11.85 ┆ 11.85     ┆ 0      │
│ 2017-01-05 00:00:00 EST        ┆ 11.96 ┆ 12.09 ┆ 11.4  ┆ 11.67 ┆ 11.67     ┆ 0      │
│ 2017-01-06 00:00:00 EST        ┆ 11.7  ┆ 11.74 ┆ 10.98 ┆ 11.32 ┆ 11.32     ┆ 0      │
│ 2017-01-09 00:00:00 EST        ┆ 11.71 ┆ 12.08 ┆ 11.46 ┆ 11.56 ┆ 11.56     ┆ 0      │
└────────────────────────────────┴───────┴───────┴───────┴───────┴───────────┴────────┘


In [6]:
# Add TA
# Convert back to Pandas DataFrame
df_pd = df.to_pandas()

# Add technical analysis features
df_pd = add_all_ta_features(df_pd, open='Open', high='High', low='Low', close='Adj Close', volume='Volume', fillna=True)

# Convert back to Polars Dataframe, remember need to have the pyarrow module installed
df = pl.from_pandas(df_pd)

In [33]:
df.head(3)

Date,Open,High,Low,Close,Adj Close,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,volume_sma_em,volume_vpt,volume_vwap,volume_mfi,volume_nvi,volatility_bbm,volatility_bbh,volatility_bbl,volatility_bbw,volatility_bbp,volatility_bbhi,volatility_bbli,volatility_kcc,volatility_kch,volatility_kcl,volatility_kcw,volatility_kcp,volatility_kchi,volatility_kcli,volatility_dcl,volatility_dch,volatility_dcm,volatility_dcw,volatility_dcp,volatility_atr,…,trend_ichimoku_a,trend_ichimoku_b,trend_stc,trend_adx,trend_adx_pos,trend_adx_neg,trend_cci,trend_visual_ichimoku_a,trend_visual_ichimoku_b,trend_aroon_up,trend_aroon_down,trend_aroon_ind,trend_psar_up,trend_psar_down,trend_psar_up_indicator,trend_psar_down_indicator,momentum_rsi,momentum_stoch_rsi,momentum_stoch_rsi_k,momentum_stoch_rsi_d,momentum_tsi,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,momentum_roc,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
"datetime[ns, America/New_York]",f64,f64,f64,f64,f64,i64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2017-01-03 00:00:00 EST,14.07,14.07,12.85,12.85,12.85,0,-0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,1000.0,12.85,12.85,12.85,0.0,0.0,0.0,0.0,13.256667,14.476666,12.036668,18.405823,0.333333,0.0,0.0,12.85,14.07,13.46,9.494158,0.0,0.0,…,13.46,13.46,0.0,0.0,0.0,0.0,0.0,20.47047,22.23312,0.0,0.0,0.0,10.94,14.07,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.85,0.0,0.0,0.0
2017-01-04 00:00:00 EST,12.78,12.8,11.63,11.85,11.85,0,-0.0,0,0.0,-0.0,0.0,0.0,0.0,0.0,50.0,1000.0,12.35,13.35,11.35,16.194331,0.25,0.0,0.0,12.675,13.87,11.48,18.856011,0.154812,0.0,0.0,11.63,14.07,12.85,19.757081,0.090164,0.0,…,12.85,12.85,0.0,0.0,0.0,0.0,-66.666667,20.47047,22.23312,0.0,4.0,-4.0,10.94,14.07,0.0,0.0,0.0,0.0,0.0,0.0,-100.0,9.016406,9.016406,4.508203,-90.983594,0.0,0.0,-0.624394,-0.124879,-0.499515,0.0,0.0,0.0,12.098375,-7.782101,-8.101594,-7.782101
2017-01-05 00:00:00 EST,11.96,12.09,11.4,11.67,11.67,0,-0.0,0,0.0,-0.0,0.0,0.0,0.0,0.0,50.0,1000.0,12.123334,13.161452,11.085215,17.125952,0.281656,0.0,0.0,12.356667,13.383333,11.33,16.61721,0.165584,0.0,0.0,11.4,14.07,12.735,22.023646,0.101124,0.0,…,12.735,12.735,0.0,0.0,0.0,0.0,-70.740755,20.47047,22.23312,0.0,8.0,-8.0,10.94,14.07,0.0,1.0,0.0,0.0,0.0,0.0,-100.0,15.654975,10.112376,6.376261,-89.887624,0.0,0.0,-1.226732,-0.345249,-0.881483,0.0,0.0,0.0,11.82578,-1.51899,-1.530645,-9.182881


## Data Processing -Stationarity 

In [18]:
# Convert back to Pandas DataFrame for ADF test
df_pd = df.to_pandas()

# Identify non-stationary columns
non_stationaries = []

# Loop through each column in the DataFrame
for col in df_pd.columns:
    if col != 'Date':  # Exclude the Date column
        series = df_pd[col].dropna().values  # Drop missing values
        if series.max() != series.min():  # Check if the series is not constant
            dftest = adfuller(series)  # Perform the Augmented Dickey-Fuller test
            p_value = dftest[1]  # Extract the p-value from the ADF test results
            t_test = dftest[0] < dftest[4]['1%']  # Check if the test statistic is less than the 1% critical value
            if p_value > 0.05 or not t_test:  # Check if the series is non-stationary
                non_stationaries.append(col)  # Add the column to the list of non-stationary columns


print(f"Non-Stationary Features Found: {len(non_stationaries)}")

# Convert the DataFrame back to Polars if needed for further processing
df = pl.from_pandas(df_pd)

Non-Stationary Features Found: 15


In [24]:
df_pd.dtypes

Date                 datetime64[ns, America/New_York]
Open                                          float64
High                                          float64
Low                                           float64
Close                                         float64
                                   ...               
momentum_pvo_hist                             float64
momentum_kama                                 float64
others_dr                                     float64
others_dlr                                    float64
others_cr                                     float64
Length: 93, dtype: object

In [29]:
from statsmodels.tsa.stattools import adfuller
import numpy as np
import pandas as pd

def analyze_column(col_name, values):
    if np.all(values == values[0]):
        return "constant"
    elif np.all(np.isnan(values)):
        return "all_nan"
    elif np.any(np.isnan(values)):
        return "contains_nan"
    else:
        try:
            dftest = adfuller(values, maxlag=1)
            p_value = dftest[1]
            t_test = dftest[0] < dftest[4]['1%']
            if p_value > 0.05 or not t_test:
                return "non_stationary"
            else:
                return "stationary"
        except Exception as e:
            return f"error: {str(e)}"

results = {}

for col in df_pd.columns:
    if col == 'Date':
        results[col] = "datetime"
    else:
        values = df_pd[col].values
        results[col] = analyze_column(col, values)

# Count occurrences of each result type
result_counts = pd.Series(results).value_counts()

print("Column Analysis Results:")
print(result_counts)
print("\nDetailed Results:")
for col, result in results.items():
    print(f"{col}: {result}")

Column Analysis Results:
stationary        62
non_stationary    16
constant          14
datetime           1
Name: count, dtype: int64

Detailed Results:
Date: datetime
Open: stationary
High: stationary
Low: stationary
Close: stationary
Adj Close: stationary
Volume: constant
volume_adi: constant
volume_obv: constant
volume_cmf: constant
volume_fi: constant
volume_em: constant
volume_sma_em: constant
volume_vpt: constant
volume_vwap: constant
volume_mfi: constant
volume_nvi: constant
volatility_bbm: stationary
volatility_bbh: non_stationary
volatility_bbl: stationary
volatility_bbw: stationary
volatility_bbp: stationary
volatility_bbhi: stationary
volatility_bbli: stationary
volatility_kcc: stationary
volatility_kch: stationary
volatility_kcl: non_stationary
volatility_kcw: stationary
volatility_kcp: stationary
volatility_kchi: stationary
volatility_kcli: stationary
volatility_dcl: non_stationary
volatility_dch: non_stationary
volatility_dcm: non_stationary
volatility_dcw: stationary
vo

In [30]:
from statsmodels.tsa.stattools import adfuller, kpss
import numpy as np
import pandas as pd

def analyze_column(col_name, values):
    if np.all(values == values[0]):
        return "constant"
    elif np.all(np.isnan(values)):
        return "all_nan"
    elif np.any(np.isnan(values)):
        return "contains_nan"
    else:
        try:
            # ADF Test
            adf_result = adfuller(values, maxlag=1)
            adf_p_value = adf_result[1]
            
            # KPSS Test
            kpss_result = kpss(values, regression='c', nlags="auto")
            kpss_p_value = kpss_result[1]
            
            # Combine results
            if adf_p_value > 0.05 and kpss_p_value <= 0.05:
                return "non_stationary"
            elif adf_p_value <= 0.05 and kpss_p_value > 0.05:
                return "stationary"
            elif adf_p_value <= 0.05 and kpss_p_value <= 0.05:
                return "conflicting"
            else:
                return "inconclusive"
        except Exception as e:
            return f"error: {str(e)}"

results = {}

for col in df_pd.columns:
    if col == 'Date':
        results[col] = "datetime"
    else:
        values = df_pd[col].values
        results[col] = analyze_column(col, values)

# Count occurrences of each result type
result_counts = pd.Series(results).value_counts()

print("Column Analysis Results:")
print(result_counts)
print("\nDetailed Results:")
for col, result in results.items():
    print(f"{col}: {result}")

# Count non-stationary and conflicting results
non_stationary_count = sum(1 for result in results.values() if result in ["non_stationary", "conflicting"])
print(f"\nTotal non-stationary or conflicting: {non_stationary_count}")

Column Analysis Results:
stationary        48
conflicting       18
constant          14
non_stationary    11
datetime           1
inconclusive       1
Name: count, dtype: int64

Detailed Results:
Date: datetime
Open: conflicting
High: conflicting
Low: conflicting
Close: conflicting
Adj Close: conflicting
Volume: constant
volume_adi: constant
volume_obv: constant
volume_cmf: constant
volume_fi: constant
volume_em: constant
volume_sma_em: constant
volume_vpt: constant
volume_vwap: constant
volume_mfi: constant
volume_nvi: constant
volatility_bbm: conflicting
volatility_bbh: conflicting
volatility_bbl: conflicting
volatility_bbw: stationary
volatility_bbp: stationary
volatility_bbhi: stationary
volatility_bbli: stationary
volatility_kcc: conflicting
volatility_kch: conflicting
volatility_kcl: conflicting
volatility_kcw: stationary
volatility_kcp: stationary
volatility_kchi: stationary
volatility_kcli: stationary
volatility_dcl: non_stationary
volatility_dch: non_stationary
volatility_dcm:

look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(values, regression='c', nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(values, regression='c', nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(values, regression='c', nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(values, regression='c', nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(values, regression='c', nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(values, regression='c', nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(values, regression='c', nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  k

In [34]:
import polars as pl

# Assuming df is your Polars DataFrame
# and non_stationaries is a list of column names that are non-stationary

# Create a copy of the DataFrame
df_stationary = df.clone()

# Convert non-stationary columns to stationary using percentage change
for col in non_stationaries:
    df_stationary = df_stationary.with_columns(
        pl.col(col).pct_change().alias(col)
    )

# Remove the first row which will contain nulls due to pct_change()
df_stationary = df_stationary.slice(1)

In [35]:
df_stationary.head(3)

Date,Open,High,Low,Close,Adj Close,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,volume_sma_em,volume_vpt,volume_vwap,volume_mfi,volume_nvi,volatility_bbm,volatility_bbh,volatility_bbl,volatility_bbw,volatility_bbp,volatility_bbhi,volatility_bbli,volatility_kcc,volatility_kch,volatility_kcl,volatility_kcw,volatility_kcp,volatility_kchi,volatility_kcli,volatility_dcl,volatility_dch,volatility_dcm,volatility_dcw,volatility_dcp,volatility_atr,…,trend_ichimoku_a,trend_ichimoku_b,trend_stc,trend_adx,trend_adx_pos,trend_adx_neg,trend_cci,trend_visual_ichimoku_a,trend_visual_ichimoku_b,trend_aroon_up,trend_aroon_down,trend_aroon_ind,trend_psar_up,trend_psar_down,trend_psar_up_indicator,trend_psar_down_indicator,momentum_rsi,momentum_stoch_rsi,momentum_stoch_rsi_k,momentum_stoch_rsi_d,momentum_tsi,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,momentum_roc,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
"datetime[ns, America/New_York]",f64,f64,f64,f64,f64,i64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2017-01-04 00:00:00 EST,12.78,12.8,-0.094942,11.85,11.85,0,-0.0,0,0.0,-0.0,0.0,0.0,0.0,0.0,50.0,1000.0,12.35,0.038911,-0.116732,16.194331,0.25,0.0,0.0,12.675,13.87,-0.046248,18.856011,0.154812,0.0,0.0,-0.094942,14.07,-0.045319,19.757081,0.090164,0.0,…,-0.045319,-0.045319,0.0,0.0,0.0,0.0,-66.666667,20.47047,0.0,0.0,4.0,-4.0,0.0,14.07,0.0,0.0,0.0,0.0,0.0,0.0,-100.0,9.016406,9.016406,4.508203,-90.983594,0.0,0.0,-0.624394,-0.124879,-0.499515,0.0,0.0,0.0,-0.058492,-7.782101,-8.101594,-7.782101
2017-01-05 00:00:00 EST,11.96,12.09,-0.019776,11.67,11.67,0,-0.0,0,0.0,-0.0,0.0,0.0,0.0,0.0,50.0,1000.0,12.123334,-0.014123,-0.023329,17.125952,0.281656,0.0,0.0,12.356667,13.383333,-0.013066,16.61721,0.165584,0.0,0.0,-0.019776,14.07,-0.008949,22.023646,0.101124,0.0,…,-0.008949,-0.008949,0.0,0.0,0.0,0.0,-70.740755,20.47047,0.0,0.0,8.0,-8.0,0.0,14.07,0.0,1.0,0.0,0.0,0.0,0.0,-100.0,15.654975,10.112376,6.376261,-89.887624,0.0,0.0,-1.226732,-0.345249,-0.881483,0.0,0.0,0.0,-0.022532,-1.51899,-1.530645,-9.182881
2017-01-06 00:00:00 EST,11.7,11.74,-0.036842,11.32,11.32,0,-0.0,0,0.0,-0.0,0.0,0.0,0.0,0.0,50.0,1000.0,11.9225,-0.007763,-0.027018,19.069543,0.234998,0.0,0.0,12.104167,13.064167,-0.016402,15.862307,0.09158,0.0,0.0,-0.036842,14.07,-0.01649,25.917384,0.110032,0.0,…,-0.01649,-0.01649,0.0,0.0,0.0,0.0,-87.635601,20.47047,0.0,0.0,12.0,-12.0,0.0,14.0166,0.0,0.0,0.0,0.0,0.0,0.0,-100.0,21.336782,11.003241,10.044008,-88.996759,0.0,0.0,-1.916831,-0.659566,-1.257265,0.0,0.0,0.0,-0.027227,-2.999146,-3.045041,-11.90662


In [44]:
import polars as pl

# Find columns with NaN values
na_counts = df_stationary.select([
    pl.col(column).is_null().sum().alias(column) for column in df_stationary.columns
])

# Filter columns with NaN values
na_list = [column for column in na_counts.columns if na_counts.select(column).to_series()[0] > 0]

# Drop rows with NaN values
if na_list:  # Ensure na_list is not empty
    df_stationary = df_stationary.drop_nulls(subset=na_list)

# Show the result
print(df_stationary)


shape: (1_361, 93)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date      ┆ Open      ┆ High      ┆ Low       ┆ … ┆ momentum_ ┆ others_dr ┆ others_dl ┆ others_c │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ kama      ┆ ---       ┆ r         ┆ r        │
│ datetime[ ┆ f64       ┆ f64       ┆ f64       ┆   ┆ ---       ┆ f64       ┆ ---       ┆ ---      │
│ ns, Ameri ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆ f64       ┆ f64      │
│ ca/New_Yo ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ rk]       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2017-01-0 ┆ 12.78     ┆ 12.8      ┆ -0.094942 ┆ … ┆ -0.058492 ┆ -7.782101 ┆ -8.101594 ┆ -7.78210 │
│ 4         ┆           ┆           ┆           ┆   ┆           ┆       

In [48]:
import polars as pl
import numpy as np

# Define a function to replace inf values with 0
def replace_inf_with_zero(column):
    return pl.col(column).map_elements(lambda x: 0 if np.isinf(x) else x, return_dtype=pl.Float64)

# Apply the function to each column
df_stationary = df_stationary.with_columns([
    replace_inf_with_zero(col).alias(col) for col in df_stationary.columns if df_stationary[col].dtype in [pl.Float64, pl.Float32]
])

# Show the result
print(df_stationary.head())


shape: (5, 93)
┌──────────────┬───────┬───────┬───────────┬───┬─────────────┬───────────┬────────────┬────────────┐
│ Date         ┆ Open  ┆ High  ┆ Low       ┆ … ┆ momentum_ka ┆ others_dr ┆ others_dlr ┆ others_cr  │
│ ---          ┆ ---   ┆ ---   ┆ ---       ┆   ┆ ma          ┆ ---       ┆ ---        ┆ ---        │
│ datetime[ns, ┆ f64   ┆ f64   ┆ f64       ┆   ┆ ---         ┆ f64       ┆ f64        ┆ f64        │
│ America/New_ ┆       ┆       ┆           ┆   ┆ f64         ┆           ┆            ┆            │
│ York]        ┆       ┆       ┆           ┆   ┆             ┆           ┆            ┆            │
╞══════════════╪═══════╪═══════╪═══════════╪═══╪═════════════╪═══════════╪════════════╪════════════╡
│ 2017-01-04   ┆ 12.78 ┆ 12.8  ┆ -0.094942 ┆ … ┆ -0.058492   ┆ -7.782101 ┆ -8.101594  ┆ -7.782101  │
│ 00:00:00 EST ┆       ┆       ┆           ┆   ┆             ┆           ┆            ┆            │
│ 2017-01-05   ┆ 11.96 ┆ 12.09 ┆ -0.019776 ┆ … ┆ -0.022532   ┆ -1.51899  ┆ -

## Data Preprocessing - Scaling and Target Setting

In [52]:
import polars as pl

# Data Preprocessing - Scaling and Target Setting

# Set initial TARGET column to -1
df_stationary = df_stationary.with_columns(pl.lit(-1).alias("TARGET"))

# Update TARGET column based on the condition
df_stationary = df_stationary.with_columns(
    # Check if the next day's Adj Close is greater than the current day's Adj Close
    pl.when(pl.col("Adj Close").shift(-1) > pl.col("Adj Close"))
    # If the condition is true, set TARGET to 1
    .then(1)
    # If the condition is false, retain the existing TARGET value
    .otherwise(pl.col("TARGET"))
    # Alias the result to the TARGET column
    .alias("TARGET")
)

df_stationary = df_stationary.drop_nulls() #  Polars operations do not modify the DataFrame in place. 
# Instead, Polars operations return a new DataFrame. 


print(df_stationary.head())


shape: (5, 94)
┌───────────────────┬───────┬───────┬───────────┬───┬───────────┬────────────┬────────────┬────────┐
│ Date              ┆ Open  ┆ High  ┆ Low       ┆ … ┆ others_dr ┆ others_dlr ┆ others_cr  ┆ TARGET │
│ ---               ┆ ---   ┆ ---   ┆ ---       ┆   ┆ ---       ┆ ---        ┆ ---        ┆ ---    │
│ datetime[ns,      ┆ f64   ┆ f64   ┆ f64       ┆   ┆ f64       ┆ f64        ┆ f64        ┆ i32    │
│ America/New_York] ┆       ┆       ┆           ┆   ┆           ┆            ┆            ┆        │
╞═══════════════════╪═══════╪═══════╪═══════════╪═══╪═══════════╪════════════╪════════════╪════════╡
│ 2017-01-04        ┆ 12.78 ┆ 12.8  ┆ -0.094942 ┆ … ┆ -7.782101 ┆ -8.101594  ┆ -7.782101  ┆ -1     │
│ 00:00:00 EST      ┆       ┆       ┆           ┆   ┆           ┆            ┆            ┆        │
│ 2017-01-05        ┆ 11.96 ┆ 12.09 ┆ -0.019776 ┆ … ┆ -1.51899  ┆ -1.530645  ┆ -9.182881  ┆ -1     │
│ 00:00:00 EST      ┆       ┆       ┆           ┆   ┆           ┆           

The code above performs data preprocessing by setting up a `TARGET` column for supervised machine learning. Initially, it sets all values in the `TARGET` column to -1. Then, it updates this column based on whether the adjusted close price of the next day is greater than that of the current day, encoding this future information as 1 in the `TARGET` column. This preprocessing step is essential for later stages where the model will use this target variable to learn and make predictions.

- `pl.col("Adj Close").shift(-1) > pl.col("Adj Close")`: This condition checks if the next day's `Adj Close` is greater than the current day's `Adj Close`. The `shift(-1)` function shifts the column values by one position backward, effectively looking one day ahead.
- `pl.when(...).then(1).otherwise(pl.col("TARGET"))`: This logic sets the `TARGET` column to 1 if the condition is met (i.e., if the next day's `Adj Close` is greater than the current day's). Otherwise, it retains the original `TARGET` value (-1).
- `alias("TARGET")`: This ensures the updated column retains the name `TARGET`.

---

In [53]:
# Split Target from Feature set

import polars as pl

# Split Target from Featureset
# Select all columns except the last one for X (features)
X = df_stationary.select(pl.col('*').exclude('TARGET'))

# Select only the last column for y (target)
y = df_stationary.select('TARGET')

# Show the results
print("Features (X):")
print(X)
print("\nTarget (y):")
print(y)


Features (X):
shape: (1_361, 93)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date      ┆ Open      ┆ High      ┆ Low       ┆ … ┆ momentum_ ┆ others_dr ┆ others_dl ┆ others_c │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ kama      ┆ ---       ┆ r         ┆ r        │
│ datetime[ ┆ f64       ┆ f64       ┆ f64       ┆   ┆ ---       ┆ f64       ┆ ---       ┆ ---      │
│ ns, Ameri ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆ f64       ┆ f64      │
│ ca/New_Yo ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ rk]       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2017-01-0 ┆ 12.78     ┆ 12.8      ┆ -0.094942 ┆ … ┆ -0.058492 ┆ -7.782101 ┆ -8.101594 ┆ -7.78210 │
│ 4         ┆           ┆           ┆           ┆   ┆     

In [56]:
# Feature Scaling

# Clone the original DataFrame to create a new one for scaling operations
df_sc = df_stationary.clone()  # df_sc stands for "dataframe scaled"

# Apply StandardScaler to the features (X) to normalize/standardize them
X_fs = StandardScaler().fit_transform(X)  # X_fs stands for "X feature scaled"


`StandardScaler()` is a function from the `sklearn.preprocessing` module that standardizes features by removing the mean and scaling to unit variance.

`.fit_transform(X)` first fits the scaler to the data (calculating the mean and standard deviation for each feature) and then transforms the data (scales the features accordingly).

The scaled features are stored in `X_fs`, which stands for "X feature scaled".

Standardizing features is a common preprocessing step in machine learning, especially for algorithms that are sensitive to the scale of the data, such as those involving distance calculations (e.g., k-NN, SVM, and neural networks).

The code creates a scaled version of the original DataFrame for feature scaling. It uses `StandardScaler` to standardize the features, ensuring they have a mean of 0 and a standard deviation of 1. This preprocessing step is crucial for many machine learning algorithms to perform well.


In [57]:
# Train Test Split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_fs, y, test_size=0.7, random_state=42)

The code splits the scaled feature set X_fs and the target variable y into training and testing subsets. By setting test_size=0.7, 70% of the data is allocated to the test set, and 30% is allocated to the training set. The random_state=42 parameter ensures that the data split is reproducible, meaning the same split will be obtained each time the code is run. This step is crucial for evaluating the performance of machine learning models, as it allows for testing the model on unseen data (the test set) after training it on the training set.

### Unsupervised ML - PCA Dimensionality Reduction