### Imports

In [1]:
# Remove Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# General numerical operations
import numpy as np

# Data Management libraries
import polars as pl  # For data manipulation using Polars DataFrame
import pandas as pd  # For data manipulation using Pandas DataFrame (if needed)
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets

# Machine Learning libraries
from xgboost import XGBClassifier  # For using the XGBoost classifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score  # For hyperparameter tuning and cross-validation
from sklearn.model_selection import RepeatedStratifiedKFold  # For creating repeated stratified K-Folds

# Evaluation metrics
from sklearn.metrics import precision_score  # For calculating precision score

# Reporting and visualization
import matplotlib.pyplot as plt  # For creating plots and visualizations


### Resolving Errors on macOS

1. **ModuleNotFoundError: No module named 'xgboost'**
    - **Solution**: Install the `xgboost` library from a notebook cell. Using `%pip` (rather than `!pip`) installs into the environment of the running kernel.
        ```python
        %pip install xgboost
        ```

2. **XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded**
    - **Solution**: Install the OpenMP runtime required by XGBoost.
        - Use Homebrew to install `libomp`:
            ```sh
            brew install libomp
            ```
        - If you don't have Homebrew installed, you can install it by running:
            ```sh
            /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
            ```


### Import Preprocessed Data

In [4]:
# Load the preprocessed BTC-USD data, make sure 'date' has a Datetime dtype,
# and order the rows chronologically, all in one chained pipeline.
df = (
    pl.read_csv("data/BTC-USD.csv")
    # The cast guarantees a Datetime dtype for date-based operations and sorting,
    # whether the column was read as text or was already a datetime.
    .with_columns(pl.col("date").cast(pl.Datetime))
    # descending=False places the oldest dates first.
    .sort("date", descending=False)
)

# Preview the first two rows of the sorted frame.
print(df.head(2))


shape: (2, 38)
┌──────────────┬─────┬──────────┬──────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ date         ┆ DOW ┆ open     ┆ high     ┆ … ┆ RSI_Ret_6 ┆ returns_7 ┆ range_7  ┆ RSI_Ret_7 │
│ ---          ┆ --- ┆ ---      ┆ ---      ┆   ┆ ---       ┆ ---       ┆ ---      ┆ ---       │
│ datetime[μs] ┆ i64 ┆ f64      ┆ f64      ┆   ┆ f64       ┆ f64       ┆ f64      ┆ f64       │
╞══════════════╪═════╪══════════╪══════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ 2017-01-31   ┆ 1   ┆ 0.000878 ┆ 0.053054 ┆ … ┆ 1.55891   ┆ -0.030754 ┆ 0.035705 ┆ 0.911807  │
│ 00:00:00     ┆     ┆          ┆          ┆   ┆           ┆           ┆          ┆           │
│ 2017-02-01   ┆ 2   ┆ 0.054272 ┆ 0.017587 ┆ … ┆ 0.984416  ┆ 0.009919  ┆ 0.01297  ┆ 1.55891   │
│ 00:00:00     ┆     ┆          ┆          ┆   ┆           ┆           ┆          ┆           │
└──────────────┴─────┴──────────┴──────────┴───┴───────────┴───────────┴──────────┴───────────┘


### Add Prediction Target

## For Bitcoin Price Prediction
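- Possible targets include the next-day price, the weekly return, or the price direction.
- The choice of target influences feature engineering and how the model is interpreted.
- The target should align with the specific trading strategy being pursued.
- This notebook uses a binary target: whether the next day's `range` exceeds the current `average range`.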

In [7]:
# List every column name so the available fields can be reviewed before the target is defined.
df.columns

['date',
 'DOW',
 'open',
 'high',
 'low',
 'close',
 'adj_close',
 'volume',
 'returns',
 'range',
 'Bench_C_Rets',
 'RSI',
 'RSI_Ret',
 'MA_12',
 'MA_21',
 'rolling returns',
 'average range',
 'returns_1',
 'range_1',
 'RSI_Ret_1',
 'returns_2',
 'range_2',
 'RSI_Ret_2',
 'returns_3',
 'range_3',
 'RSI_Ret_3',
 'returns_4',
 'range_4',
 'RSI_Ret_4',
 'returns_5',
 'range_5',
 'RSI_Ret_5',
 'returns_6',
 'range_6',
 'RSI_Ret_6',
 'returns_7',
 'range_7',
 'RSI_Ret_7']

In [9]:
# Build the binary label 'TARGET':
#   1 -> the next row's (next day's) 'range' is greater than this row's 'average range'
#   0 -> otherwise
# shift(-1) lines each row up with the following row's 'range'. The final row has no
# following row, so its comparison is null and it falls through to otherwise(0).
next_day_range = pl.col('range').shift(-1)
exceeds_average = next_day_range > pl.col('average range')

df = df.with_columns(
    pl.when(exceeds_average).then(1).otherwise(0).alias('TARGET')
)

# Preview the first two rows, now including the 'TARGET' column.
print(df.head(2))

shape: (2, 39)
┌─────────────────────┬─────┬──────────┬──────────┬───┬───────────┬──────────┬───────────┬────────┐
│ date                ┆ DOW ┆ open     ┆ high     ┆ … ┆ returns_7 ┆ range_7  ┆ RSI_Ret_7 ┆ TARGET │
│ ---                 ┆ --- ┆ ---      ┆ ---      ┆   ┆ ---       ┆ ---      ┆ ---       ┆ ---    │
│ datetime[μs]        ┆ i64 ┆ f64      ┆ f64      ┆   ┆ f64       ┆ f64      ┆ f64       ┆ i32    │
╞═════════════════════╪═════╪══════════╪══════════╪═══╪═══════════╪══════════╪═══════════╪════════╡
│ 2017-01-31 00:00:00 ┆ 1   ┆ 0.000878 ┆ 0.053054 ┆ … ┆ -0.030754 ┆ 0.035705 ┆ 0.911807  ┆ 0      │
│ 2017-02-01 00:00:00 ┆ 2   ┆ 0.054272 ┆ 0.017587 ┆ … ┆ 0.009919  ┆ 0.01297  ┆ 1.55891   ┆ 0      │
└─────────────────────┴─────┴──────────┴──────────┴───┴───────────┴──────────┴───────────┴────────┘


In [15]:
# Check for missing values (both nulls and floating-point NaNs).
# Polars treats null (missing data) and NaN (a regular float value) as different
# things, and is_null() on its own does not flag NaN. fill_nan(None) first turns
# NaN entries in float columns into null, so a single is_null() mask covers both.
missing_mask = df.fill_nan(None).select(pl.all().is_null()).to_numpy()

# .nonzero() returns a (row_indices, column_indices) tuple locating the True cells
# of the mask; two empty arrays mean nothing is missing.
nan_location = missing_mask.nonzero()

nan_location

(array([], dtype=int64), array([], dtype=int64))

In [16]:
# Fill missing TARGET values if needed ('df' is a Polars DataFrame).
# fill_null is the right call here: Polars represents missing data as null, whereas
# fill_nan only replaces floating-point NaN values and therefore never affects the
# integer 'TARGET' column.
df = df.with_columns(pl.col("TARGET").fill_null(0))
print(df.tail())

shape: (5, 39)
┌──────────────┬─────┬───────────┬───────────┬───┬───────────┬──────────┬───────────┬────────┐
│ date         ┆ DOW ┆ open      ┆ high      ┆ … ┆ returns_7 ┆ range_7  ┆ RSI_Ret_7 ┆ TARGET │
│ ---          ┆ --- ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---      ┆ ---       ┆ ---    │
│ datetime[μs] ┆ i64 ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64      ┆ f64       ┆ i32    │
╞══════════════╪═════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪═══════════╪════════╡
│ 2024-07-14   ┆ 6   ┆ 0.022734  ┆ 0.025799  ┆ … ┆ -0.042097 ┆ 0.046203 ┆ 0.931954  ┆ 1      │
│ 00:00:00     ┆     ┆           ┆           ┆   ┆           ┆          ┆           ┆        │
│ 2024-07-15   ┆ 0   ┆ 0.02685   ┆ 0.057731  ┆ … ┆ 0.015327  ┆ 0.070144 ┆ 1.25864   ┆ 1      │
│ 00:00:00     ┆     ┆           ┆           ┆   ┆           ┆          ┆           ┆        │
│ 2024-07-16   ┆ 1   ┆ 0.065262  ┆ 0.007464  ┆ … ┆ 0.022998  ┆ 0.034134 ┆ 0.981631  ┆ 0      │
│ 00:00:00     ┆     ┆           ┆ 

In [17]:
# Columns that are left out of the train/test frame.
columns_to_drop = ['close', 'Bench_C_Rets', 'open', 'high', 'low']
df_tts = df.drop(columns_to_drop)

# Preview the first two rows to confirm the columns are gone.
print(df_tts.head(2))

shape: (2, 34)
┌──────────────┬─────┬────────────┬───────────┬───┬───────────┬──────────┬───────────┬────────┐
│ date         ┆ DOW ┆ adj_close  ┆ volume    ┆ … ┆ returns_7 ┆ range_7  ┆ RSI_Ret_7 ┆ TARGET │
│ ---          ┆ --- ┆ ---        ┆ ---       ┆   ┆ ---       ┆ ---      ┆ ---       ┆ ---    │
│ datetime[μs] ┆ i64 ┆ f64        ┆ f64       ┆   ┆ f64       ┆ f64      ┆ f64       ┆ i32    │
╞══════════════╪═════╪════════════╪═══════════╪═══╪═══════════╪══════════╪═══════════╪════════╡
│ 2017-01-31   ┆ 1   ┆ 970.403015 ┆ 1.103895  ┆ … ┆ -0.030754 ┆ 0.035705 ┆ 0.911807  ┆ 0      │
│ 00:00:00     ┆     ┆            ┆           ┆   ┆           ┆          ┆           ┆        │
│ 2017-02-01   ┆ 2   ┆ 989.02301  ┆ -0.087932 ┆ … ┆ 0.009919  ┆ 0.01297  ┆ 1.55891   ┆ 0      │
│ 00:00:00     ┆     ┆            ┆           ┆   ┆           ┆          ┆           ┆        │
└──────────────┴─────┴────────────┴───────────┴───┴───────────┴──────────┴───────────┴────────┘


In [20]:
# Separate the learning data (X) from the target (y); the target is the last column of df_tts.
# NOTE(review): X keeps every other column, including 'date' and 'adj_close' — confirm that is intended.
target_column = df_tts.columns[-1]
feature_columns = df_tts.columns[:-1]

X = df_tts.select(feature_columns).to_numpy()  # 2-D array holding every column except the last
y = df_tts.get_column(target_column).to_numpy()  # 1-D array holding the last column



In [23]:
# Perform Train-Test Split
# The rows were sorted by date when loaded and the feature columns appear to hold
# lagged values, so a shuffled split would mix later observations into the training
# set and make the test score overly optimistic. shuffle=False keeps chronological
# order: the first 80% of rows are used for training and the most recent 20% are
# held out for testing. random_state is omitted because nothing is shuffled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Display the shapes of the resulting splits
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)

Shape of X_train:  (2180, 33)
Shape of y_train:  (2180,)
