# XGBoost Recursive Algorithm

## Necessary Configuration

In [1]:
# XGBoost imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from skforecast.direct import ForecasterDirect
import matplotlib.pyplot as plt
import optuna
import plotly.io as pio
pio.renderers.default = 'notebook'  # or 'colab', 'iframe', 'vscode' depending on environment

from skforecast.plot import set_dark_theme
from datetime import datetime, timedelta
from skforecast.model_selection import backtesting_forecaster
from skforecast.model_selection import TimeSeriesFold
import sys

In [2]:
# Path imports
import os
from pathlib import Path
from IPython import get_ipython

# Set notebook directory as cwd
try:
    notebook_path = get_ipython().run_line_magic("pwd", "")  # Gets notebook's folder
    os.chdir(notebook_path)
    print("CWD set to notebook folder:", os.getcwd())
except Exception as e:
    print("Could not set CWD:", e)

CWD set to notebook folder: C:\Users\karth\Documents\StockBot\algo_training\xg_boost


## Obtain and configure dataset

In [3]:
# Ticker name
tick = "NVDA"

# Whether or not to use truncated data
truncated = False
# Load time-indexed stock price dataset
if truncated:
    total = pd.read_csv(f"../single_stock/{tick.lower()}_data/truncated.csv")
else:
    total = pd.read_csv(f"../single_stock/{tick.lower()}_data/full.csv")

total.set_index("date", inplace=True)
total.index = pd.to_datetime(total.index, errors="raise")

total

Unnamed: 0_level_0,open,close,high,low,volume,dividends,stock splits,lsa,luhn,textrank,lexrank
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-03-03,0.486664,0.478412,0.489415,0.463970,1.593704e+09,0.0,0.0,,,,
2011-03-04,0.478870,0.475890,0.483684,0.470618,9.597600e+08,0.0,0.0,,,,
2011-03-07,0.479329,0.469243,0.480933,0.457322,1.019140e+09,0.0,0.0,,,,
2011-03-08,0.474286,0.448153,0.476807,0.438296,1.947184e+09,0.0,0.0,,,,
2011-03-09,0.446778,0.438754,0.451133,0.434857,1.318976e+09,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2023-12-11,47.469876,46.606258,47.509856,45.809616,5.097280e+08,0.0,0.0,-5.583224,-5.583224,-5.583224,-5.583224
2023-12-12,46.025520,47.635803,47.644798,46.025520,3.723870e+08,0.0,0.0,-3.547586,-3.547586,-3.547586,-3.547586
2023-12-13,47.607821,48.066616,48.572392,47.586830,4.477920e+08,0.0,0.0,-0.665740,-0.665740,-0.665740,-0.665740
2023-12-14,48.368474,48.328491,48.648349,47.400906,3.912320e+08,0.0,0.0,3.056196,3.056196,3.056196,3.056196


## Train-test Split

In [4]:
# Sort in descending order
total.sort_index(ascending=False, inplace=True)

# Train-test split
train, test = None, None

if truncated:
    train, test = train_test_split(
        total, 
        train_size=0.85, 
        shuffle=False, 
        test_size=0.15
    )
else:
    # Get length of sentiment available days portion
    first_idx = total.loc[:, 'lsa'].first_valid_index()
    last_idx = total.loc[:, 'lsa'].last_valid_index()
    diff = last_idx - first_idx

    # Splice it by .85 to get train and test
    delta = timedelta((diff*.85).days)
    
    # if total.index.get_loc(first_idx) + 1 > total.shape[0] - total.index.get_loc(last_idx):
    mid_idx = first_idx + delta
    train = total.loc[:mid_idx]
    test = total.loc[mid_idx:]
    # else:
    #     mid_idx = last_idx - delta
    #     test = total.loc[:mid_idx]
    #     train = total.loc[mid_idx:]

In [5]:
print(mid_idx)

2021-12-22 00:00:00


## XGBoost and Indicator Hyperparameter Optimization with Optuna

In [6]:
# Importing a file
import sys
sys.path.insert(0, "../../")
from indicator import add_indicators

# Automating the hyperparameters
def objective(trial):
    #XGBoost Hyperparameters
    lags = trial.suggest_int("lags", 3, 20)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 6)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    gamma = trial.suggest_float("gamma", 0, 7)
    n_steps = trial.suggest_int("n_steps", 1, 30)

    # Technical indicators (tunable)
    sma = {
        trial.suggest_int("smap1", 3, 11),
        trial.suggest_int("smap2", 11, 21),
        trial.suggest_int("smap3", 21, 51),
        trial.suggest_int("smap4", 51, 201),
    }
    ema = {
        trial.suggest_int("emap1", 5, 13),
        trial.suggest_int("emap2", 13, 27),
        trial.suggest_int("emap3", 27, 51),
        trial.suggest_int("emap4", 51, 101),
    }
    macd_fast = trial.suggest_int("macd_fast",   10, 15)
    macd_slow = trial.suggest_int("macd_slow", 20, 30)
    macd_signal = trial.suggest_int("macd_signal", 7, 12)
    rsi_window = trial.suggest_int("rsi_window", 10, 20)
    srsi_window = trial.suggest_int("srsi_window", 10, 20)
    srsi_k = trial.suggest_int("srsi_k", 2, 5)
    srsi_d = trial.suggest_int("srsi_d", 2, 5)
    roc_window = trial.suggest_int("roc_window", 10, 20)
    mom_window = trial.suggest_int("mom_window", 5, 15)
    cmf_window = trial.suggest_int("cmf_window", 10, 30)
    mfi_window = trial.suggest_int("mfi_window", 10, 20)
    bollinger_window = trial.suggest_int("bollinger_window", 15, 30)
    bollinger_num_std = trial.suggest_int("bollinger_num_std", 1, 3)
    inc_obv = trial.suggest_categorical("inc_obv", [True, False])
    atr_window = trial.suggest_int("atr_window", 10, 20)
    donchian_window = trial.suggest_int("donchian_window", 15, 30)
    trix_window = trial.suggest_int("trix_window", 10, 20)
    inc_fib = trial.suggest_categorical("inc_fib", [True, False])
    pc_window = trial.suggest_int("pc_window", 15, 30)
    inc_fractals = trial.suggest_categorical("inc_fractals", [True, False])

    # This will allow indicators to be tuned, minimizing information loss when extracting patterns
    train_loc = add_indicators(
        train,
        smap=sma,
        emap=ema,
        macd_fast=macd_fast,
        macd_slow=macd_slow,
        macd_signal=macd_signal,
        rsi_window=rsi_window,
        srsi_window=srsi_window,
        srsi_k=srsi_k,
        srsi_d=srsi_d,
        roc_window=roc_window,
        mom_window=mom_window,
        cmf_window=cmf_window,
        mfi_window=mfi_window,
        bollinger_window=bollinger_window,
        bollinger_num_std=bollinger_num_std,
        inc_obv=inc_obv,
        atr_window=atr_window,
        donchian_window=donchian_window,
        trix_window=trix_window,
        inc_fib=inc_fib,
        pc_window=pc_window,
        inc_fractals=inc_fractals)

    test_loc = add_indicators(
        test,
        smap=sma,
        emap=ema,
        macd_fast=macd_fast,
        macd_slow=macd_slow,
        macd_signal=macd_signal,
        rsi_window=rsi_window,
        srsi_window=srsi_window,
        srsi_k=srsi_k,
        srsi_d=srsi_d,
        roc_window=roc_window,
        mom_window=mom_window,
        cmf_window=cmf_window,
        mfi_window=mfi_window,
        bollinger_window=bollinger_window,
        bollinger_num_std=bollinger_num_std,
        inc_obv=inc_obv,
        atr_window=atr_window,
        donchian_window=donchian_window,
        trix_window=trix_window,
        inc_fib=inc_fib,
        pc_window=pc_window,
        inc_fractals=inc_fractals)

    # Fill index
    train_loc.index = pd.to_datetime(train_loc.index, errors="raise")
    test_loc.index = pd.to_datetime(test_loc.index, errors="raise")

    # Make sure interval is set
    train_loc = train_loc.asfreq("D")
    test_loc = test_loc.asfreq("D")
    
    # Sort in ascending order
    train_loc.sort_index(ascending=True, inplace=True)
    test_loc.sort_index(ascending=True, inplace=True)

    # Fill missing
    train_loc.loc[:, "open":"lsa"] = train_loc.loc[:, "open":"lsa"].ffill()
    test_loc.loc[:, "open":"lsa"] = train_loc.loc[:, "open":"lsa"].ffill()

    # XGB Model
    model = XGBRegressor(
        tree_method="hist",
        predictor="predictor",
        device="cuda",
        verbosity=1, # Shows logs, to know if using cuda
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        random_state=42
    )

    # Adds forecasting
    forecaster = ForecasterDirect(
        regressor=model,
        lags=lags,
        steps=n_steps
    )

    # Train model
    exog_train = train_loc.drop(columns="close")
    exog_test = test_loc.drop(columns="close").drop(index=test_loc.index[0])

    y_train = train_loc["close"]
    y_test = test_loc["close"]


    # assert len(exog_test) == len(y_test)
    # exog_test = exog_test.reindex(y_test.index)

    # train_loc.drop(index=train_loc.index[-1], inplace=True)
    # test_loc.drop(index=test_loc.index[-1], inplace=True)

    # forecaster.fit(
    #     y=y_train,
    #     exog=exog_train
    # )
    
    # Test model
    # try:
    cv = TimeSeriesFold(
        initial_train_size      = len(train),
        fixed_train_size        = True,
        steps                   = n_steps,
        refit                   = True,
        allow_incomplete_fold   = False
    )

    y_full = pd.concat([y_train, y_test]).dropna()
    exog_full = pd.concat([exog_train, exog_test])
    
    if not y_full.index.is_unique:
        y_full = y_full[~y_full.index.duplicated(keep='first')]
    y_full = y_full.asfreq('D', fill_value=np.nan).ffill()
    
    if not exog_full.index.is_unique:
        exog_full = exog_full[~exog_full.index.duplicated(keep='first')]
    exog_full = exog_full.asfreq('D', fill_value=np.nan)

    exog_full = exog_full.reindex(y_full.index)
    assert len(exog_full) == len(y_full), f"""MY ASSERTION FAIEDðŸ˜­ðŸ˜­ðŸ˜­ðŸ˜­ðŸ˜­ðŸ˜­ðŸ˜­ðŸ˜­ðŸ˜­ðŸ˜­!!!!!!
exog: {len(exog_full)}
y: {len(y_full)}
    """
    
    results, predictions = backtesting_forecaster(
        forecaster          = forecaster,
        y                   = y_full,  # full series
        cv                  = cv,
        exog                = exog_full,
        metric              = 'root_mean_squared_error',
        show_progress       = False
    )

    # results_df will have one row per fold, with a column named â€˜root_mean_squared_scaled_errorâ€™
    return results['root_mean_squared_error'].iloc[0]
    # except Exception:
    #     print("TERRIBLE, HORRIBLE")
    #     return sys.maxint

## Execution of model

In [7]:
import warnings
warnings.simplefilter("ignore")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50, show_progress_bar = True)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)

[I 2025-07-31 23:27:20,090] A new study created in memory with name: no-name-a8d142bd-7ff6-4f4e-82bc-4f52aa0365e7


  0%|          | 0/600 [00:00<?, ?it/s]

[I 2025-07-31 23:29:04,702] Trial 0 finished with value: 5.162846344478583 and parameters: {'lags': 19, 'n_estimators': 242, 'max_depth': 5, 'learning_rate': 0.019653974106645197, 'subsample': 0.885406499558496, 'colsample_bytree': 0.6635839703651267, 'gamma': 1.4150408978861553, 'n_steps': 21, 'smap1': 9, 'smap2': 11, 'smap3': 30, 'smap4': 121, 'emap1': 12, 'emap2': 20, 'emap3': 42, 'emap4': 88, 'macd_fast': 10, 'macd_slow': 30, 'macd_signal': 7, 'rsi_window': 18, 'srsi_window': 20, 'srsi_k': 4, 'srsi_d': 2, 'roc_window': 15, 'mom_window': 10, 'cmf_window': 12, 'mfi_window': 20, 'bollinger_window': 25, 'bollinger_num_std': 1, 'inc_obv': True, 'atr_window': 10, 'donchian_window': 24, 'trix_window': 17, 'inc_fib': False, 'pc_window': 17, 'inc_fractals': True}. Best is trial 0 with value: 5.162846344478583.
[I 2025-07-31 23:29:40,857] Trial 1 finished with value: 3.6960380524132628 and parameters: {'lags': 18, 'n_estimators': 80, 'max_depth': 6, 'learning_rate': 0.08443515952562924, 'sub

## Showing Results

In [8]:
print("Best trial:")
trial = study.best_trial
print(f"    RMSE: {trial.value}")
print("    Params:")
for key, value in trial.params.items():
    print(f"        {key}: {value}")

Best trial:
    RMSSE: 1.4830756861071281
    Params:
        lags: 3
        n_estimators: 231
        max_depth: 5
        learning_rate: 0.26993578453433226
        subsample: 0.7008410761451417
        colsample_bytree: 0.8023893379198315
        gamma: 0.9053595826559675
        n_steps: 1
        smap1: 10
        smap2: 15
        smap3: 45
        smap4: 163
        emap1: 6
        emap2: 18
        emap3: 31
        emap4: 52
        macd_fast: 15
        macd_slow: 25
        macd_signal: 8
        rsi_window: 10
        srsi_window: 10
        srsi_k: 5
        srsi_d: 4
        roc_window: 15
        mom_window: 5
        cmf_window: 30
        mfi_window: 14
        bollinger_window: 20
        bollinger_num_std: 1
        inc_obv: False
        atr_window: 11
        donchian_window: 19
        trix_window: 17
        inc_fib: False
        pc_window: 18
        inc_fractals: True


In [None]:

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
optuna.visualization.plot_slice(study, params=["emap1", "lags", "learning_rate"]).update_layout(
    font=dict(size=18),  # Change this to your desired size
    title_font=dict(size=20),
    xaxis_title_font=dict(size=16),
    yaxis_title_font=dict(size=16)
)

In [None]:
optuna.visualization.plot_param_importances(study).update_layout(
    font=dict(size=18),  # Change this to your desired size
    title_font=dict(size=20),
    xaxis_title_font=dict(size=16),
    yaxis_title_font=dict(size=16)
)

In [None]:
# Plot

# from sklearn.linear_model import LogisticRegression


# 1. Initialize something to store results in
results = {}

optuna.visualization.plot_optimization_history(study)

# plt.plot(train[-100:], label="Train")
# plt.plot(test.index, test, label="Test")
# plt.plot(test.index, predictions, label="Prediction")
# plt.legend()
# plt.title("XGBoost with Skforecast")
# plt.show()