### Imports

In [2]:
# Importing necessary libraries and modules

# General
import numpy as np  # Importing numpy for numerical operations

# Data Management
import pandas as pd  # Importing pandas for data manipulation and analysis
import polars as pl
from sklearn.model_selection import train_test_split  # Importing train_test_split for splitting data into training and testing sets

# Machine Learning
from xgboost import XGBClassifier  # Importing XGBClassifier for extreme gradient boosting classification
from sklearn.model_selection import RandomizedSearchCV, cross_val_score  # Importing RandomizedSearchCV for hyperparameter tuning, cross_val_score for cross-validation
from sklearn.model_selection import RepeatedStratifiedKFold  # Importing RepeatedStratifiedKFold for repeated stratified k-fold cross-validation

# Binary Classification Specific Metrics
# Alternative method for plotting the ROC curve
from sklearn.metrics import roc_curve, auc  # Importing roc_curve and auc for ROC curve and AUC calculation

# General Metrics
from sklearn.metrics import accuracy_score, classification_report  # Importing accuracy_score and classification_report for model evaluation
from sklearn.metrics import precision_score, confusion_matrix  # Importing precision_score and confusion_matrix for precision and confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay  # Importing ConfusionMatrixDisplay for displaying confusion matrix

# Reporting
import matplotlib.pyplot as plt  # Importing pyplot from matplotlib for plotting
from matplotlib.pylab import rcParams  # Importing rcParams from matplotlib for setting plot parameters
from xgboost import plot_tree  # Importing plot_tree from xgboost for visualizing decision trees

# Function to plot ROC curve
def plot_roc(model, X_test, y_test):
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

### Data Management

In [8]:
# Data Extraction

df = pl.read_csv("data/BTC-USD.csv") # Read the CSV file into a polars df


df = df.with_columns(pl.col("date").cast(pl.Datetime)) # now will have datetime dtype

# Sort the DataFrame by the 'date' column in ascending order, argument ensures oldest dates come first.
df = df.sort("date", descending=False)

### Add Target

In [9]:
# Specify Target
df = df.with_columns([
    (pl.when(pl.col('range').shift(-1) > pl.col('average range'))  # Compare the 'range' shift less one column with 'average range'
        .then(1)  # If the condition is true, set 'TARGET' to 1
        .otherwise(0))  # If the condition is false, set 'TARGET' to 0
        .alias('TARGET')  # Name the new column 'TARGET'
])
# This allows you to create a target variable based on whether the 'range' of the next day is greater than or less than
# the 'average range' of the current day.

In [17]:
# Check for NaNs
nan_location = df.select(pl.all().is_null()).to_numpy().nonzero()
# pl.all().is_null(): Creates a boolean mask for NaNs.
# .to_numpy(): Converts the mask to a NumPy array.
# .nonzero(): Retrieves the indices of the True values in the boolean mask, indicating the positions of NaNs.

nan_location

(array([], dtype=int64), array([], dtype=int64))

In [18]:
# Fill NA if needed
df = df.with_columns(pl.col("TARGET").fill_nan(0))
df.tail()

date,DOW,open,high,low,close,adj_close,volume,returns,range,Bench_C_Rets,RSI,RSI_Ret,MA_12,MA_21,rolling returns,average range,returns_1,range_1,RSI_Ret_1,returns_2,range_2,RSI_Ret_2,returns_3,range_3,RSI_Ret_3,returns_4,range_4,RSI_Ret_4,returns_5,range_5,RSI_Ret_5,returns_6,range_6,RSI_Ret_6,returns_7,range_7,RSI_Ret_7,TARGET
datetime[μs],i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
2024-07-14 00:00:00,6,0.022734,0.025799,0.024721,60787.792969,60787.792969,0.301132,0.026267,0.03553,0.163694,44.348685,0.982906,57973.966146,59473.93006,-0.075268,0.034931,0.023014,0.034442,1.060974,0.00967,0.034324,1.136591,-0.006885,0.038148,0.909442,-0.004598,0.038144,1.041948,0.022998,0.034134,0.981631,0.015327,0.070144,1.25864,-0.042097,0.046203,0.931954,1
2024-07-15 00:00:00,0,0.02685,0.057731,0.024984,64870.152344,64870.152344,0.714162,0.067158,0.068614,0.162156,54.890198,1.237696,58365.318685,59692.631882,-0.010836,0.03695,0.026267,0.03553,0.982906,0.023014,0.034442,1.060974,0.00967,0.034324,1.136591,-0.006885,0.038148,0.909442,-0.004598,0.038144,1.041948,0.022998,0.034134,0.981631,0.015327,0.070144,1.25864,1
2024-07-16 00:00:00,1,0.065262,0.007464,0.029372,65097.148438,65097.148438,0.092476,0.003499,0.045871,0.162061,57.655443,1.050378,59041.939128,59849.417969,-0.014105,0.038036,0.067158,0.068614,1.237696,0.026267,0.03553,0.982906,0.023014,0.034442,1.060974,0.00967,0.034324,1.136591,-0.006885,0.038148,0.909442,-0.004598,0.038144,1.041948,0.022998,0.034134,0.981631,0
2024-07-17 00:00:00,2,0.004745,0.0109,0.022534,64118.792969,64118.792969,-0.218473,-0.015029,0.033972,0.161807,60.293388,1.045754,59663.307292,60006.918713,-0.026902,0.038097,0.003499,0.045871,1.050378,0.067158,0.068614,1.237696,0.026267,0.03553,0.982906,0.023014,0.034442,1.060974,0.00967,0.034324,1.136591,-0.006885,0.038148,0.909442,-0.004598,0.038144,1.041948,0
2024-07-18 00:00:00,3,-0.015165,-0.014562,-0.010172,63974.066406,63974.066406,-0.162514,-0.002257,0.029385,0.161775,71.713449,1.189408,60135.851237,60119.740885,-0.008863,0.037781,-0.015029,0.033972,1.045754,0.003499,0.045871,1.050378,0.067158,0.068614,1.237696,0.026267,0.03553,0.982906,0.023014,0.034442,1.060974,0.00967,0.034324,1.136591,-0.006885,0.038148,0.909442,0


### Train Test Split

In [16]:
# Feature Selection. We are using the features identified as the top
df_tts = df.clone()

# Select the specified columns
df_tts = df_tts.select([
    "DOW", 
    "volume", 
    "returns", 
    "range", 
    "RSI", 
    "average range", 
    "range_1",
    "range_2", 
    "RSI_Ret_2", 
    "range_4", 
    "range_6",
    "TARGET"
])

# Display the first few rows of the new DataFrame
df_tts.tail(5)


DOW,volume,returns,range,RSI,average range,range_1,range_2,RSI_Ret_2,range_4,range_6,TARGET
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
6,0.301132,0.026267,0.03553,44.348685,0.034931,0.034442,0.034324,1.136591,0.038144,0.070144,1
0,0.714162,0.067158,0.068614,54.890198,0.03695,0.03553,0.034442,1.060974,0.038148,0.034134,1
1,0.092476,0.003499,0.045871,57.655443,0.038036,0.068614,0.03553,0.982906,0.034324,0.038144,0
2,-0.218473,-0.015029,0.033972,60.293388,0.038097,0.045871,0.068614,1.237696,0.034442,0.038148,0
3,-0.162514,-0.002257,0.029385,71.713449,0.037781,0.033972,0.045871,1.050378,0.03553,0.034324,0


In [23]:
# Split into Learning (X) and Target (y) Data

# Last column is the target column
target_column = "TARGET"

# Select feature columns and target column
feature_columns = [
    "DOW", 
    "volume", 
    "returns", 
    "range", 
    "RSI", 
    "average range", 
    "range_1",
    "range_2", 
    "RSI_Ret_2", 
    "range_4", 
    "range_6"
]

# Create X (features) and y (target), needs to be in numpy since sklearn uses numpy or pandas
X = df_tts.select(feature_columns).to_numpy()
y = df_tts.select([target_column]).to_numpy().ravel()  # Ensure y is a 1D array


# Display the first 3 rows of the numpy arrays X and y
print("First 3 rows of X:")
print(X[:3])

print(" ")

# Display the first 3 elements of the numpy array y_train
print("First 3 elements of y:")
print(y[:3])

First 3 rows of X:
[[ 1.00000000e+00  1.10389478e+00  5.43480805e-02  5.54422058e-02
   6.72213683e+01  5.79102306e-02  3.88697735e-03  4.64562778e-03
   9.84746588e-01  8.05486304e-03  1.29698003e-02]
 [ 2.00000000e+00 -8.79318516e-02  1.91878991e-02  1.89257384e-02
   7.86593444e+01  5.73809963e-02  5.54422058e-02  3.88697735e-03
   9.83884377e-01  4.45747739e-03  1.89553700e-02]
 [ 3.00000000e+00 -2.85724335e-02  2.30297751e-02  3.08160739e-02
   7.98249504e+01  5.76747094e-02  1.89257384e-02  5.54422058e-02
   9.41683925e-01  4.64562778e-03  8.05486304e-03]]
 
First 3 elements of y:
[0 0 0]


In [24]:
# Perform Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes of the train and test sets
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)

Shape of X_train:  (2180, 11)
Shape of y_train:  (2180,)
Shape of X_test:  (546, 11)
Shape of y_test:  (546,)


In [26]:
# Perform Train Test Split (Timeseries based method)
# Define the training size rate
train_size_rate = 0.7

# Calculate the number of training samples
train_size = int(len(X) * train_size_rate)

# Calculate the number of testing samples
test_size = len(X) - train_size

# Split the features (X) and target (y) into training and testing sets
# The first 'train_size' rows for training and the remaining rows for testing
X_train = X[:train_size]
y_train = y[:train_size]
X_test = X[train_size:]
y_test = y[train_size:]

# Validate that the sizes match
size_check = (len(y_test) + len(y_train)) == len(y)

# Print shapes of the training and testing sets
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)
print("Size Matches: ", size_check)



Shape of X_train:  (1908, 11)
Shape of y_train:  (1908,)
Shape of X_test:  (818, 11)
Shape of y_test:  (818,)
Size Matches:  True
