In [162]:
# load the BTC long term dataset
import pandas as pd

btc_5y_df = pd.read_csv('../backend/data/btcusdt_1d.csv', index_col=0, parse_dates=True)

print(btc_5y_df.head().to_markdown())

| timestamp           | symbol   |    open |    high |   low |   close |   volume |
|:--------------------|:---------|--------:|--------:|------:|--------:|---------:|
| 2020-04-20 00:00:00 | BTCUSDT  | 7121.4  | 7220    |  6751 | 6826.83 |  90149.5 |
| 2020-04-21 00:00:00 | BTCUSDT  | 6828.98 | 6940    |  6762 | 6841.37 |  60109.7 |
| 2020-04-22 00:00:00 | BTCUSDT  | 6841.36 | 7156.38 |  6818 | 7125.14 |  61486.4 |
| 2020-04-23 00:00:00 | BTCUSDT  | 7125.12 | 7738    |  7020 | 7482.39 | 102774   |
| 2020-04-24 00:00:00 | BTCUSDT  | 7483.96 | 7615.96 |  7388 | 7505    |  60182.1 |


In [163]:
# close prices
btc_5y_close_df = btc_5y_df.drop("symbol", axis=1) #.loc[:, 'close']

print("\n--- Setting up Data Split ---")

# Ensure data is sorted by time
# btc_5y_close_df = btc_5y_close_df.sort_index()
# btc_5y_close_df = btc_5y_close_df.to_frame()

# Create lag features
for lag in range(1, 3):  # Lags from 1 to 2 days
    btc_5y_close_df[f'lag_{lag}'] = btc_5y_close_df['close'].shift(lag)

# Drop NaN values caused by lagging
btc_5y_close_df = btc_5y_close_df.dropna()

# Restore the 'close' column name
# btc_5y_close_df = btc_5y_close_df.rename(columns={'close': 'close'})
btc_5y_close_df.head()


--- Setting up Data Split ---


Unnamed: 0_level_0,open,high,low,close,volume,lag_1,lag_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-22,6841.36,7156.38,6818.0,7125.14,61486.377334,6841.37,6826.83
2020-04-23,7125.12,7738.0,7020.0,7482.39,102773.569561,7125.14,6841.37
2020-04-24,7483.96,7615.96,7388.0,7505.0,60182.119939,7482.39,7125.14
2020-04-25,7505.0,7705.0,7431.07,7538.67,43874.427726,7505.0,7482.39
2020-04-26,7539.03,7700.0,7480.0,7693.1,50522.616209,7538.67,7505.0


In [164]:
# Train-Test Split (last 365 days as test set)
split_date = btc_5y_close_df.index[-30]  
btc_train = btc_5y_close_df.loc[btc_5y_close_df.index <= split_date].copy()
btc_test = btc_5y_close_df.loc[btc_5y_close_df.index > split_date].copy()

btc_train.head()


Unnamed: 0_level_0,open,high,low,close,volume,lag_1,lag_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-22,6841.36,7156.38,6818.0,7125.14,61486.377334,6841.37,6826.83
2020-04-23,7125.12,7738.0,7020.0,7482.39,102773.569561,7125.14,6841.37
2020-04-24,7483.96,7615.96,7388.0,7505.0,60182.119939,7482.39,7125.14
2020-04-25,7505.0,7705.0,7431.07,7538.67,43874.427726,7505.0,7482.39
2020-04-26,7539.03,7700.0,7480.0,7693.1,50522.616209,7538.67,7505.0


In [165]:
import pandas as pd
from pycaret.regression import *

# PyCaret Regression Setup
xgb_exp = RegressionExperiment().setup(
    data=btc_train, 
    target="close",
    session_id=123, 
    fold=3,  # K-fold cross-validation
    data_split_shuffle=False,  # **Important: Keeps time-series order**
    fold_strategy="timeseries",  # Ensures time-series split,
)

# Train XGBoost Model
xgb_model = xgb_exp.create_model('xgboost')



Unnamed: 0,Description,Value
0,Session id,123
1,Target,close
2,Target type,Regression
3,Original data shape,"(1794, 7)"
4,Transformed data shape,"(1794, 7)"
5,Transformed train set shape,"(1255, 7)"
6,Transformed test set shape,"(539, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1779.0249,6253585.0,2500.7168,0.9303,0.0477,0.0349
1,603.7984,611918.8125,782.2524,0.9935,0.0298,0.0224
2,510.3609,458376.5312,677.0351,0.9772,0.0263,0.0203
Mean,964.3947,2441293.4479,1320.0014,0.967,0.0346,0.0259
Std,577.2922,2696425.9019,835.9961,0.0268,0.0094,0.0064


In [166]:
# check the performance of the baseline model using the train set (not the validation data held off in the earlier step)

y_predict = xgb_exp.predict_model(xgb_model)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,8698.0732,246416928.0,15697.6738,0.4318,0.1982,0.0969


In [167]:
import plotly.express as px

fig = px.line(y_predict, y=['close', 'prediction_label'], template='plotly_dark', labels={"value" : "close price $"})

fig.show()

# fig.show()

In [168]:
tuned_xgb = xgb_exp.tune_model(xgb_model)

y_pred_tuned = xgb_exp.predict_model(tuned_xgb)

fig = px.line(y_pred_tuned, y=['close', 'prediction_label'], template='plotly_dark', labels={"value" : "close price $"})

fig.show()


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1601.0255,5546777.0,2355.1597,0.9381,0.043,0.031
1,591.5455,619886.625,787.3288,0.9934,0.0307,0.0227
2,436.8259,337884.125,581.278,0.9832,0.0228,0.0175
Mean,876.4656,2168182.5833,1241.2555,0.9716,0.0322,0.0237
Std,516.2201,2391799.3961,792.1284,0.024,0.0083,0.0056


Fitting 3 folds for each of 10 candidates, totalling 30 fits


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,8472.5996,237284176.0,15404.0322,0.4528,0.1935,0.0942


In [169]:
# Finalize Model and Make Predictions
final_xgb = xgb_exp.finalize_model(tuned_xgb)
# predictions = predict_model(final_xgb, data=btc_test)
y_pred_final = xgb_exp.predict_model(final_xgb)

fig = px.line(y_pred_final, y=['close', 'prediction_label'], template='plotly_dark', labels={"value" : "close price $"})

fig.show()
# pull()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,79.9392,11237.4629,106.0069,1.0,0.0019,0.0014


In [170]:
# compare with furture data N.B there's no need to drop!! see next cell
y_pred_future = xgb_exp.predict_model(final_xgb, data=btc_test.drop('close', axis=1))

print(xgb_exp.pull())

train_forecast_df = pd.concat([y_pred_final, y_pred_future])
# Insert the last 30 days of the test set into the close column of train_forecast_df
train_forecast_df.loc[btc_test.index, 'close'] = btc_test['close']
fig = px.line(train_forecast_df, y=['close', 'prediction_label'], template='plotly_dark', labels={"value" : "close price $"})
fig.add_vrect(x0=btc_test.index[0], x1=btc_test.index[-1], fillcolor="grey", opacity=0.25, line_width=0)

fig.show()

                       Model        MAE           MSE        RMSE   R2  \
0  Extreme Gradient Boosting  79.939201  11237.462891  106.006897  1.0   

    RMSLE    MAPE  
0  0.0019  0.0014  


In [171]:
# compare with furture data
y_pred_future = xgb_exp.predict_model(final_xgb, data=btc_test)

train_forecast_df = pd.concat([y_pred_final, y_pred_future])
# Insert the last 30 days of the test set into the close column of train_forecast_df
# train_forecast_df.loc[btc_test.index, 'close'] = btc_test['close']

fig = px.line(train_forecast_df, y=['close', 'prediction_label'], template='plotly_dark', labels={"value" : "close price $"})
fig.add_vrect(x0=btc_test.index[0], x1=btc_test.index[-1], fillcolor="grey", opacity=0.25, line_width=0)


fig.show()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,1450.7524,3410671.25,1846.8002,0.4829,0.0224,0.0174


# transforming the data?

from the prior time series decomposition, attemping to transform the data didn't make the data more normally distributed. 


# observations

making predictions with lagged values will be require recursive forecasting where exogenous future variables are not available. In the case of lagged features, we can use the last predicted value and the next lagged value as our exogenous variables for prediciton.

In [None]:
def recursive_forecast(experiment: RegressionExperiment, model, data: pd.DataFrame, fh=30, lags=2):
    # get the last known values from the dataset based on the lags
    if len(data) > lags:
        last_known_df = list(data['close'].values[-lags:])
    else:
        last_known_df = list(data['close'].values)
    dates = []
    predictions = []
    
    # get the latest date (should be a timestamp so that timedelta works!)
    last_date = data.index[-1]
    # for window in forecast horizon - last_known_df will be the rolling window
    for window in range(fh):
        # prepare prediction input with correct lag structure i.e last known close data (reversed), columns are now lags, index is timedelta days + window
        input_df = pd.DataFrame([last_known_df[-lags:][::-1]], columns=[f"lag_{j+1}" for j in range(lags)], index=[last_date + pd.Timedelta(days=window+1)])
        print(input_df.to_markdown())
        # make prediction and get first value
        prediction = experiment.predict_model(model, data=input_df)
        prediction = prediction['prediction_label'].values[0]
        # append predictions, update history and dates (with the index of the latest input_df)
        predictions.append(prediction)
        last_known_df.append(prediction)
        dates.append(input_df.index[0])
    # return the prediction df {predictions:prediciton} index=dates
    predictions_df = pd.DataFrame(data=dict(prediction_label=predictions), index=dates)
    return predictions_df
    

def predict_and_plot(model, data:pd.DataFrame, plot=True):
    recursive_forecast(model, data)
    # combine true values and predictions for vis
    # y_pred_final_diff = y_predict_original.rename(columns={"y_pred": "close"})
    y_true_pred_df = pd.concat([btc_5y_close_df['close'].iloc[-len(y_predict):].to_frame(),
                                        y_predict], axis=0, copy=True)
   
    if plot:
        fig = px.line(y_true_pred_df, y=['close', 'prediction_label'], template='plotly_dark', labels={"value": "close price $"})
        fig.add_vrect(x0=data.index[0], x1=data.index[-1], fillcolor="grey", opacity=0.25, line_width=0)
        fig.show()



In [182]:
# recursive_forecast(xgb_exp, final_xgb, data=btc_5y_close_df.drop(['open', 'high', 'low', 'volume'],axis=1).iloc[-2:], fh=5)


# observations

using a recusrsive window for lagged features, the model is not seeing enough dynamic change and is therefore converging on the same value quickly.

To make this better, this will likely require feature engineering i.e 

- Time-based: day_of_week, day_of_month, month
- more time lags, 7, 30 days. But with the high autocorrelation we may end up with the same issue.
- Rolling stats: rolling_mean_3, rolling_std_5

TODO - Cite this!

# Future forecasting

## Direct multi-step forecasting

using PyCaret to optimise the model and sklearn's MultiOutput regressor to predict multiple future steps at once.

In [183]:
# utilise existing code for lagged features, consider 30 day lags

btc_5y_close_df_30_lag = btc_5y_close_df.copy()

# significant time periods (useful for the user) and hopefully saves memory+processing
for i in [1, 3, 5, 7, 14, 30]:
    btc_5y_close_df_30_lag[f'lag_{i}'] = btc_5y_close_df_30_lag['close'].shift(i)
    
btc_5y_close_df_30_lag.dropna(inplace=True)

btc_5y_close_df_30_lag.head()


Unnamed: 0_level_0,open,high,low,close,volume,lag_1,lag_2,lag_3,lag_5,lag_7,lag_14,lag_30
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-05-22,9067.51,9271.0,8933.52,9170.0,58943.131024,9068.65,9511.43,9775.53,9680.04,9316.42,9800.01,7125.14
2020-05-23,9170.0,9307.85,9070.0,9179.15,43526.296966,9170.0,9068.65,9511.43,9733.93,9381.27,9539.4,7482.39
2020-05-24,9179.01,9298.0,8700.0,8720.34,70379.86645,9179.15,9170.0,9068.65,9775.53,9680.04,8722.77,7505.0
2020-05-25,8718.14,8979.66,8642.72,8900.35,62833.910949,8720.34,9179.15,9170.0,9511.43,9733.93,8561.52,7538.67
2020-05-26,8900.35,9017.67,8700.0,8841.18,58299.770138,8900.35,8720.34,9179.15,9068.65,9775.53,8810.79,7693.1


In [216]:
import numpy as np

def create_sequences(data, window=60, horizon=30, step=1):
    """Create sequences while tracking corresponding dates"""
    x = []
    y = []
    target_dates = []  # Track dates for target windows
    
    # Extract the target values
    target_col = "original_close" if "original_close" in data.columns else 'close'
    target = data[target_col].values
    
    # Get the feature columns
    feature_cols = [col for col in data.columns if col != target_col]
    
    # Extract the feature values
    features = data.loc[:, feature_cols].values
    
    # Loop over the dataset
    for i in range(0, len(data) - window - horizon + 1, step):
        # Get input window
        x_i = features[i:i+window]
        
        # Get target sequence
        y_i = target[i+window:i+window+horizon]
        
        # Get target dates
        y_dates = data.index[i+window:i+window+horizon]
        
        # Make sure we have complete sequences
        if len(x_i) == window and len(y_i) == horizon:
            x.append(x_i.flatten())
            y.append(y_i)
            target_dates.append(y_dates)
    
    return np.array(x), np.array(y), target_dates

# test
X, y, dates = create_sequences(btc_5y_close_df_30_lag)
print("x shape", X.shape)
print("y shape", y.shape)
print("target date first window: ", dates[0])

x shape (1704, 660)
y shape (1704, 30)
target date first window:  DatetimeIndex(['2020-07-21', '2020-07-22', '2020-07-23', '2020-07-24',
               '2020-07-25', '2020-07-26', '2020-07-27', '2020-07-28',
               '2020-07-29', '2020-07-30', '2020-07-31', '2020-08-01',
               '2020-08-02', '2020-08-03', '2020-08-04', '2020-08-05',
               '2020-08-06', '2020-08-07', '2020-08-08', '2020-08-09',
               '2020-08-10', '2020-08-11', '2020-08-12', '2020-08-13',
               '2020-08-14', '2020-08-15', '2020-08-16', '2020-08-17',
               '2020-08-18', '2020-08-19'],
              dtype='datetime64[ns]', name='timestamp', freq=None)


In [185]:
# perfrom train test split

# Step 2: Split data for training/testing
# Use 80% for training, 20% for testing
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
test_dates = dates[split_idx:]

print("x shape", X.shape)
print("y shape", y.shape)
print("X_test shape: ", X_test.shape)
print("y test shape :", y_test.shape)

x shape (1704, 660)
y shape (1704, 30)
X_test shape:  (341, 660)
y test shape : (341, 30)


In [186]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
# train and tune xgboost with pycaret using single step prediciton

# for this we need to ensure that the target data is only a one-step target.

def optimise_model(experiment: RegressionExperiment, custom_params:dict):
    final_xgb = experiment.create_model('xgboost',
                                        **custom_params)
    
    return final_xgb

def train_ts_xgboost_multi_step(data: pd.DataFrame, X: np.array, y: np.array): 
    experiment = RegressionExperiment().setup(
        data=data,
        target='close',
        data_split_shuffle=False,
        fold=5,
        fold_strategy='timeseries',
        session_id=456,
        train_size=0.8
    )
    
    xgb = experiment.create_model("xgboost")
    
    xgb_tuned = experiment.tune_model(xgb)
    xgb_params = xgb_tuned.get_params()
    
    # key hyperparameters
    # # each tree sees more data, reducing variance
    # xgb_params['subsample'] = 0.8
    # # deeper trees to increase model complexity and ability to fit.
    # xgb_params['max_depth'] = 4
    # # reduce learning rate to increase regularisation
    # xgb_params['learning_rate'] = 0.05
        
    # optimise the final model - mainly for scoring
    _ = optimise_model(experiment, xgb_params)
    
    X = X.astype(np.float32)
    y = y.astype(np.float32)

    # base_model = XGBRegressor(**xgb_params)
    
    print("\n--- Training multi output XGB regressor ---")
    print("...")
    multi_model = MultiOutputRegressor(xgb_tuned).fit(X, y)
    
    return multi_model, xgb_params
    
    
# test
multi_step_xgb, params = train_ts_xgboost_multi_step(btc_5y_close_df_30_lag, X_train, y_train)


Unnamed: 0,Description,Value
0,Session id,456
1,Target,close
2,Target type,Regression
3,Original data shape,"(1793, 12)"
4,Transformed data shape,"(1793, 12)"
5,Transformed train set shape,"(1434, 12)"
6,Transformed test set shape,"(359, 12)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7554.0928,104873568.0,10240.7793,-0.201,0.217,0.1445
1,1101.4668,2308812.25,1519.4777,0.9671,0.0291,0.0223
2,834.0945,1174539.875,1083.762,0.9312,0.0483,0.0394
3,389.6188,272835.625,522.3367,0.9766,0.0203,0.0149
4,1163.2349,4855050.0,2203.4177,0.9769,0.0343,0.0203
Mean,2208.5016,22696961.15,3113.9547,0.7301,0.0698,0.0483
Std,2686.6627,41117055.2096,3605.5854,0.4659,0.0742,0.0488


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7362.6987,101762184.0,10087.7246,-0.1654,0.2131,0.1401
1,845.1459,1337644.75,1156.5659,0.9809,0.0223,0.0173
2,882.8544,1280846.375,1131.7449,0.925,0.053,0.0433
3,364.954,243300.375,493.2549,0.9791,0.0191,0.014
4,973.2068,3614181.25,1901.1,0.9828,0.0293,0.017
Mean,2085.772,21647631.35,2954.0781,0.7405,0.0673,0.0463
Std,2646.9279,40072418.792,3594.5869,0.4535,0.0738,0.0481


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7362.6987,101762184.0,10087.7246,-0.1654,0.2131,0.1401
1,845.1459,1337644.75,1156.5659,0.9809,0.0223,0.0173
2,882.8544,1280846.375,1131.7449,0.925,0.053,0.0433
3,364.954,243300.375,493.2549,0.9791,0.0191,0.014
4,973.2068,3614181.25,1901.1,0.9828,0.0293,0.017
Mean,2085.772,21647631.35,2954.0781,0.7405,0.0673,0.0463
Std,2646.9279,40072418.792,3594.5869,0.4535,0.0738,0.0481



--- Training multi output XGB regressor ---
...


# Evaluate the multi-step regressor

In [187]:
# wrap tuned model with MultiOutputRegressor to predict multiple steps at once
y_pred = multi_step_xgb.predict(X_test)
# evaluate model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

# this gives aggregate metrics across all test sequences and all prediction days
print(f"multi step model MAE: {mean_absolute_error(y_true= y_test[:,0], y_pred=y_pred[:, 0]):.2f}")
print(f"multi step model MSE: {mean_squared_error(y_true= y_test[:,0], y_pred=y_pred[:, 0]):.2f}")
print(f"multi step model R2: {r2_score(y_true= y_test[:,0], y_pred=y_pred[:, 0]):.2f}")

# eval_df = btc_5y_close_df_30_lag.loc[btc_5y_close_df_30_lag.index[-30:], ['close']]

# get the first day prediction for every sample
eval_data = []
for i in range(len(y_test)):
    if i < len(test_dates) and len(test_dates[i]) > 0:
        eval_data.append({
            'date': test_dates[i][0],  # First day date
            'close': y_test[i, 0],     # First day actual
            'predictions': y_pred[i, 0]  # First day prediction
        })

    eval_df = pd.DataFrame(eval_data)
    eval_df = eval_df.set_index('date')
    
    # Create plot with correctly aligned dates
    fig = px.line(
        eval_df, 
        y=['close', 'predictions'], 
        template='plotly_dark', 
        labels={"value": "close price $", "date": "Date"},
        title="BTC"
    )

fig.show()



multi step model MAE: 12812.09
multi step model MSE: 335767028.95
multi step model R2: -0.29


# finalise the multi-step regressor

In [188]:
final_multi_xgb = multi_step_xgb.fit(X_test, y_test)

In [242]:
# forecast - use most recent 30 day window, generate predictions for next 30 days
def forecast(data: pd.DataFrame, model: MultiOutputRegressor, window=60, horizon=30) -> pd.DataFrame:
    """
    Remember, in this case we are predicting one multi-step interval, which requires
    one input sample.
    Therefore, we take the latest window of data (matching how the model was trained).
    Then, flatten these values to achieve a 1D array.
    Because scikit models require a 2d array (n_samples, n_features), we reshape.
    (1,-1) whcih means one row, automatically determine the column dimension aka features
    
    n.b there is an interdepenency with the create_sequences function, if the window
    or horizon is changed, the data sequences will also need to match this pattern.
    """
    target_col = "original_close" if "original_close" in data.columns else 'close'
    latest_window = data[-window:].drop(target_col, axis=1).values.flatten().reshape(1,-1)
    forecast = model.predict(latest_window)[0] 
    last_date = data.index[-1]
    forecast_dates = pd.date_range(last_date + pd.Timedelta(days=1), periods=horizon)
    
    # create forecast df
    forecast_df = pd.DataFrame(
        {
            'timestamp': forecast_dates,
            'forecast': forecast
        }
    ).set_index('timestamp', drop=True)
    
    return forecast_df
# test
forecasts = forecast(btc_5y_close_df_30_lag, final_multi_xgb)

print(forecasts.head().to_markdown())

| timestamp           |   forecast |
|:--------------------|-----------:|
| 2025-04-19 00:00:00 |    87144.9 |
| 2025-04-20 00:00:00 |    86515.1 |
| 2025-04-21 00:00:00 |    85860.1 |
| 2025-04-22 00:00:00 |    86387.4 |
| 2025-04-23 00:00:00 |    88447.1 |


In [190]:
# plot most recent 30 days with 30 day forecast
last_month = btc_5y_close_df_30_lag.loc[btc_5y_close_df_30_lag.index[-60:], ['close']]

last_month_pred = pd.concat([last_month, forecasts])
# last_month_pred

fig = px.line(last_month_pred, x=last_month_pred.index, y=['close', 'forecast'], template='plotly_dark', labels={"value": "close price $"})
fig.add_vline(x=last_month.index[-1])
fig.show()

In [191]:
print("--- Tuned XGB Hyperparameters ---")
for key, value in params.items():
    print("{}:\n\t{}".format(key, value))

--- Tuned XGB Hyperparameters ---
objective:
	reg:squarederror
base_score:
	None
booster:
	gbtree
callbacks:
	None
colsample_bylevel:
	None
colsample_bynode:
	None
colsample_bytree:
	1
device:
	cpu
early_stopping_rounds:
	None
enable_categorical:
	False
eval_metric:
	None
feature_types:
	None
gamma:
	None
grow_policy:
	None
importance_type:
	None
interaction_constraints:
	None
learning_rate:
	0.05
max_bin:
	None
max_cat_threshold:
	None
max_cat_to_onehot:
	None
max_delta_step:
	None
max_depth:
	10
max_leaves:
	None
min_child_weight:
	2
missing:
	nan
monotone_constraints:
	None
multi_strategy:
	None
n_estimators:
	200
n_jobs:
	-1
num_parallel_tree:
	None
random_state:
	456
reg_alpha:
	0.0001
reg_lambda:
	0.4
sampling_method:
	None
scale_pos_weight:
	28.400000000000006
subsample:
	0.7
tree_method:
	auto
validate_parameters:
	None
verbosity:
	0


# validate against other 5 yr coin data (selected coins from clusters)

In [193]:
selected_coins = [
    'BTCUSDT', 'BNBUSDT', 'ETHUSDT', 'LTCUSDT'
]

In [255]:
import joblib
from datetime import date
from sklearn.preprocessing import RobustScaler

def add_features(df, scale=True):
    """Add technical indicators and features to the dataframe"""
    # Create a copy of the dataframe
    data = df.copy()
    original_close = data['close'].copy()

    # Add lagged features
    for lag in [1, 2, 3, 5, 7, 14, 30]:
        data[f'close_lag_{lag}'] = data['close'].shift(lag)

    # Add rolling stats
    data['ma7'] = data['close'].rolling(window=7).mean()
    data['ma30'] = data['close'].rolling(window=30).mean()
    data['std7'] = data['close'].rolling(window=7).std()

    # Add price changes
    data['price_change_1d'] = data['close'].pct_change(1)
    data['price_change_7d'] = data['close'].pct_change(7)

    # Calculate RSI (14-period)
    delta = data['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    data['rsi'] = 100 - (100 / (1 + rs))

    # ffill where possible
    data = data.ffill()
    # Drop rows with NaN values
    data = data.dropna()
    
    # scale derived features
    features = data.columns
    if scale:
        scaler = RobustScaler()
        valid_data = data[features]
        scaler.fit(valid_data)
        # apply to all derived features
        data[features] = scaler.transform(data[features])
    else:
        scaler = None
    # restore original close values for target
    data['original_close'] = original_close
    
    # ffill where possible
    data = data.ffill()
    # Drop rows with NaN values
    data = data.dropna()
    
    return data, scaler

def xgboost_train_pipeline_full(coin: str, df: pd.DataFrame, params: dict):
    # Step - drop columns
    df.drop(['symbol', 'open', 'volume'], axis=1, inplace=True)
    # step - feature engineering
    df, scaler = add_features(df)
    
    joblib.dump(scaler, f"../backend/models/{coin}-scaler-{date.today()}.pkl")
    
    train_split = 0.8
    
    experiment = RegressionExperiment().setup(
        data=df,
        target='original_close',
        data_split_shuffle=False,
        fold=5,
        fold_strategy='timeseries',
        session_id=456,
        train_size=train_split,
        remove_multicollinearity=True,
        feature_selection=True,
        feature_selection_estimator='rf',
    )
    
    xgb = experiment.create_model("xgboost")
    xgb_tuned = experiment.tune_model(xgb)
        
    print(experiment.dataset_transformed.tail(2).to_markdown())
    
    # Create multi-step sequences of data (30 day prediction window)
    X, y, target_dates = create_sequences(experiment.dataset_transformed)
    
    feature_cols = [col for col in experiment.dataset_transformed if col != 'close' and col != 'original_close']
    
    # Step 3: Split data for training/testing
    # Use 80% for training, 20% for testing
    split_idx = int(len(X) * train_split)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    test_dates = target_dates[split_idx:]

    print("x shape", X.shape)
    print("y shape", y.shape)
        
    # step 4:
    print("\n--- Training multi output XGB regressor ---")
    print("...")
    # in the application, just finalise the model on the test set since the model has already been trained
    multi_model = MultiOutputRegressor(xgb_tuned).fit(X_train, y_train)
    
    joblib.dump(multi_model, f"../backend/models/{coin}-{date.today()}.pkl")
    joblib.dump(feature_cols, f"../backend/models/{coin}-features-{date.today()}.pkl")
    
    # step 5
    print("\n---Evaluating model ---")
    print(f"--- {coin} ---")
    y_pred = multi_model.predict(X_test)

    print(f"multi step model MAE: {mean_absolute_error(y_true=y_test[:,0], y_pred=y_pred[:, 0]):.2f}")
    print(f"multi step model MSE: {mean_squared_error(y_true=y_test[:,0], y_pred=y_pred[:, 0]):.2f}")
    print(f"multi step model RMSE: {root_mean_squared_error(y_true=y_test[:,0], y_pred=y_pred[:, 0]):.2f}")
    print(f"multi step model R2: {r2_score(y_true=y_test[:,0], y_pred=y_pred[:, 0]):.2f}")

    eval_data = []
    # get the first day prediction for every sample
    for i in range(len(y_test)):
        if i < len(test_dates) and len(test_dates[i]) > 0:
            eval_data.append({
                'date': test_dates[i][0],  # First day date
                'close': y_test[i, 0],     # First day actual
                'predictions': y_pred[i, 0]  # First day prediction
            })
    
    eval_df = pd.DataFrame(eval_data)
    eval_df = eval_df.set_index('date')
    
    # Create plot with correctly aligned dates
    fig = px.line(
        eval_df, 
        y=['close', 'predictions'], 
        template='plotly_dark', 
        labels={"value": "close price $", "date": "Date"},
        title=coin
    )

    fig.show()
        


In [256]:
for coin in selected_coins:
    df_filename = coin.lower() + "_1d.csv"
    df = pd.read_csv("../backend/data/"+df_filename, parse_dates=True, index_col=0)
    # use the params trained from the initial model on BTC 5 year lagged close prices.
    xgboost_train_pipeline_full(coin, df, params)
    

Unnamed: 0,Description,Value
0,Session id,456
1,Target,original_close
2,Target type,Regression
3,Original data shape,"(1795, 17)"
4,Transformed data shape,"(1795, 4)"
5,Transformed train set shape,"(1436, 4)"
6,Transformed test set shape,"(359, 4)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6960.48,93696824.0,9679.7119,-0.073,0.2028,0.1318
1,386.6485,411210.5312,641.257,0.9941,0.0118,0.0079
2,465.5753,492812.1875,702.0059,0.9712,0.0327,0.0214
3,366.1445,294543.8125,542.7189,0.9747,0.021,0.014
4,513.7538,1318918.375,1148.4418,0.9937,0.0175,0.0089
Mean,1738.5204,19242861.7812,2542.8271,0.7721,0.0572,0.0368
Std,2611.5236,37228737.3939,3574.4779,0.4227,0.0731,0.0477


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7131.9673,98697424.0,9934.6582,-0.1303,0.2091,0.1346
1,316.7551,349646.1562,591.3088,0.995,0.0103,0.0063
2,480.8962,509422.9375,713.7387,0.9702,0.0339,0.0225
3,357.434,343890.4375,586.4218,0.9704,0.0222,0.0134
4,563.7047,1798902.125,1341.2316,0.9914,0.0202,0.0094
Mean,1770.1515,20339857.1312,2633.4718,0.7594,0.0591,0.0372
Std,2682.3457,39182569.1789,3661.2409,0.4449,0.0753,0.049


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
| timestamp           |   close |   price_change_1d |        rsi |   original_close |
|:--------------------|--------:|------------------:|-----------:|-----------------:|
| 2025-04-17 00:00:00 | 1.35055 |          0.36793  |  0.0520172 |          84947.9 |
| 2025-04-18 00:00:00 | 1.34021 |         -0.158463 | -0.0211681 |          84586.3 |
x shape (1706, 180)
y shape (1706, 30)

--- Training multi output XGB regressor ---
...

---Evaluating model ---
--- BTCUSDT ---
multi step model MAE: 13010.52
multi step model MSE: 338653216.00
multi step model RMSE: 18402.53
multi step model R2: -0.30


Unnamed: 0,Description,Value
0,Session id,456
1,Target,original_close
2,Target type,Regression
3,Original data shape,"(1795, 17)"
4,Transformed data shape,"(1795, 4)"
5,Transformed train set shape,"(1436, 4)"
6,Transformed test set shape,"(359, 4)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,293.6546,109209.5234,330.4686,-3.7388,1.9776,0.7889
1,4.3172,37.9595,6.1611,0.9943,0.0126,0.009
2,2.3692,22.3552,4.7281,0.9752,0.0234,0.0093
3,1.0505,3.2432,1.8009,0.9977,0.0078,0.004
4,4.6933,200.7289,14.1679,0.9886,0.0812,0.0201
Mean,61.2169,21894.762,71.4653,0.0434,0.4205,0.1662
Std,116.2263,43657.4375,129.5665,1.8911,0.779,0.3114


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,293.9461,109394.2891,330.7481,-3.7468,1.9842,0.7899
1,3.4932,20.6486,4.5441,0.9969,0.0101,0.0077
2,2.92,27.8827,5.2804,0.9691,0.0247,0.0118
3,0.7359,1.2285,1.1084,0.9991,0.0046,0.0028
4,4.5179,178.7164,13.3685,0.9899,0.0756,0.0197
Mean,61.1226,21924.553,71.0099,0.0416,0.4198,0.1664
Std,116.4183,43734.914,129.9313,1.8943,0.7826,0.3118


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
| timestamp           |    close |      std7 |   price_change_7d |   original_close |
|:--------------------|---------:|----------:|------------------:|-----------------:|
| 2025-04-17 00:00:00 | 0.951574 | -0.264471 |          0.168022 |           590.49 |
| 2025-04-18 00:00:00 | 0.951746 | -0.249305 |          0.03656  |           590.54 |
x shape (1706, 180)
y shape (1706, 30)

--- Training multi output XGB regressor ---
...

---Evaluating model ---
--- BNBUSDT ---
multi step model MAE: 42.44
multi step model MSE: 2977.98
multi step model RMSE: 54.57
multi step model R2: 0.15


Unnamed: 0,Description,Value
0,Session id,456
1,Target,original_close
2,Target type,Regression
3,Original data shape,"(1795, 17)"
4,Transformed data shape,"(1795, 4)"
5,Transformed train set shape,"(1436, 4)"
6,Transformed test set shape,"(359, 4)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1070.9741,1638698.0,1280.1165,-2.3052,0.6377,0.4075
1,80.0422,25233.5,158.8506,0.9378,0.0374,0.02
2,25.4496,1677.9698,40.963,0.9789,0.0356,0.0201
3,5.4057,50.449,7.1027,0.9981,0.0042,0.0031
4,10.6564,240.5438,15.5095,0.9995,0.0061,0.0042
Mean,238.5056,333180.0925,300.5085,0.3218,0.1442,0.091
Std,417.0751,652828.5947,492.8233,1.3137,0.2472,0.1584


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1090.891,1680248.75,1296.2441,-2.389,0.6514,0.4168
1,83.251,30130.7344,173.5821,0.9257,0.0406,0.02
2,24.8415,1597.223,39.9653,0.98,0.0353,0.0198
3,4.4261,32.6266,5.712,0.9988,0.0033,0.0025
4,8.1157,139.6994,11.8194,0.9997,0.0045,0.0032
Mean,242.3051,342429.8067,305.4646,0.303,0.147,0.0925
Std,425.2335,669007.5384,499.1204,1.3463,0.2527,0.1624


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
| timestamp           |     close |      std7 |   price_change_7d |   original_close |
|:--------------------|----------:|----------:|------------------:|-----------------:|
| 2025-04-17 00:00:00 | -0.278703 | -0.460517 |         0.304342  |          1583.62 |
| 2025-04-18 00:00:00 | -0.275452 | -0.495825 |         0.0797074 |          1588.27 |
x shape (1706, 180)
y shape (1706, 30)

--- Training multi output XGB regressor ---
...

---Evaluating model ---
--- ETHUSDT ---
multi step model MAE: 102.06
multi step model MSE: 18230.96
multi step model RMSE: 135.02
multi step model R2: 0.93


Unnamed: 0,Description,Value
0,Session id,456
1,Target,original_close
2,Target type,Regression
3,Original data shape,"(1795, 17)"
4,Transformed data shape,"(1795, 4)"
5,Transformed train set shape,"(1436, 4)"
6,Transformed test set shape,"(359, 4)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.7004,2627.6028,51.2601,0.1285,0.2197,0.1096
1,1.2683,4.0727,2.0181,0.9976,0.0174,0.0096
2,0.8295,1.1171,1.0569,0.9837,0.0173,0.0137
3,1.1467,2.3152,1.5216,0.9706,0.0169,0.0129
4,0.2946,0.1459,0.382,0.9987,0.0052,0.004
Mean,6.2479,527.0507,11.2477,0.8158,0.0553,0.03
Std,10.7315,1050.2768,20.0135,0.3438,0.0823,0.04


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28.4682,2774.3809,52.6724,0.0798,0.2279,0.1114
1,1.0099,2.3077,1.5191,0.9986,0.0131,0.0078
2,0.569,0.6496,0.806,0.9905,0.0125,0.0092
3,1.0634,2.0841,1.4436,0.9736,0.016,0.0119
4,0.2058,0.081,0.2847,0.9993,0.0038,0.0028
Mean,6.2633,555.9006,11.3452,0.8084,0.0546,0.0286
Std,11.1069,1109.2404,20.6685,0.3644,0.0867,0.0415


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
| timestamp           |     close |   price_change_7d |       rsi |   original_close |
|:--------------------|----------:|------------------:|----------:|-----------------:|
| 2025-04-17 00:00:00 | -0.154562 |         0.115947  | -0.510033 |            74.95 |
| 2025-04-18 00:00:00 | -0.142701 |        -0.0715699 | -0.533326 |            75.6  |
x shape (1706, 180)
y shape (1706, 30)

--- Training multi output XGB regressor ---
...

---Evaluating model ---
--- LTCUSDT ---
multi step model MAE: 3.26
multi step model MSE: 25.40
multi step model RMSE: 5.04
multi step model R2: 0.94


In [None]:
# forecast

df_filename = "ethusdt" + "_1d.csv"
df = pd.read_csv("../backend/data/"+df_filename, parse_dates=True, index_col=0)
df.drop(['symbol'], axis=1, inplace=True)

features = list(joblib.load("../backend/models/ETHUSDT-features-2025-05-07.pkl"))
eth = joblib.load("../backend/models/ETHUSDT-2025-05-07.pkl")
eth_scaler = joblib.load(f"../backend/models/ETHUSDT-scaler-2025-05-07.pkl")

df, _ = add_features(df)
# original_close_eth = df['original_close']

# restore original close values for target
df = df.loc[:,features+['original_close', 'close']]
# df['original_close'] = original_close_eth

print(df.tail(3).to_markdown())

X, y, eth_dates = create_sequences(df)

split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
test_dates = eth_dates[split_idx:]

print(X_test.shape)

# finalise the trained model on the unseen data
eth.fit(X, y)

future_df = forecast(df, eth)

future_df.head()

| timestamp           |      std7 |   price_change_7d |   original_close |     close |
|:--------------------|----------:|------------------:|-----------------:|----------:|
| 2025-04-16 00:00:00 | -0.314001 |        -0.502003  |          1577.14 | -0.283233 |
| 2025-04-17 00:00:00 | -0.460517 |         0.304342  |          1583.62 | -0.278703 |
| 2025-04-18 00:00:00 | -0.495825 |         0.0797074 |          1588.27 | -0.275452 |
(342, 180)


Unnamed: 0_level_0,forecast
timestamp,Unnamed: 1_level_1
2025-04-19,2112.987061
2025-04-20,1939.834717
2025-04-21,1968.380005
2025-04-22,2094.924561
2025-04-23,2119.055664


# Score comparisons

BTC baseline score


## close, 1-lag, ma7, ma30, std7, 1d change, 7d change, RSI

--- BTCUSDT ---

multi step model MAE: 13740.86

multi step model MSE: 384630252.17

multi step model RMSE: 19611.99

multi step model R2: -0.45

--- BNBUSDT ---

multi step model MAE: 53.35

multi step model MSE: 4863.66

multi step model RMSE: 69.74

multi step model R2: -0.37

--- ETHUSDT ---

multi step model MAE: 98.91

multi step model MSE: 17331.24

multi step model RMSE: 131.65

multi step model R2: 0.94

--- LTCUSDT ---

multi step model MAE: 3.30

multi step model MSE: 27.25

multi step model RMSE: 5.22

multi step model R2: 0.94

## high, low, close, 1-lag, ma7, ma30, std7, 1d change, 7d change, RSI

--- BTCUSDT ---

multi step model MAE: 11825.89

multi step model MSE: 299467039.17

multi step model RMSE: 17305.12

multi step model R2: -0.15


--- BNBUSDT ---

multi step model MAE: 46.33

multi step model MSE: 3901.85

multi step model RMSE: 62.46

multi step model R2: -0.12

--- ETHUSDT ---

multi step model MAE: 94.42

multi step model MSE: 16009.03

multi step model RMSE: 126.53

multi step model R2: 0.94

## drop multicolinearity

--- BTCUSDT ---

multi step model MAE: 12746.87

multi step model MSE: 334476832.00

multi step model RMSE: 18288.71

multi step model R2: -0.28

--- BNBUSDT ---

multi step model MAE: 53.05

multi step model MSE: 4672.18

multi step model RMSE: 68.35

multi step model R2: -0.33

## multicolinearity removal, select important features (random forest estimator)

--- BTCUSDT ---

multi step model MAE: 13516.84

multi step model MSE: 318438752.00

multi step model RMSE: 17844.85

multi step model R2: -0.22

--- BNBUSDT ---

multi step model MAE: 47.64

multi step model MSE: 3968.34

multi step model RMSE: 62.99

multi step model R2: -0.13

--- ETHUSDT ---

multi step model MAE: 100.68

multi step model MSE: 17724.36

multi step model RMSE: 133.13

multi step model R2: 0.93

--- LTCUSDT ---

multi step model MAE: 3.71

multi step model MSE: 31.51

multi step model RMSE: 5.61

multi step model R2: 0.93

# as above, with RobustScaler

--- BTCUSDT ---

multi step model MAE: 13010.52

multi step model MSE: 338653216.00

multi step model RMSE: 18402.53

multi step model R2: -0.30

--- BNBUSDT ---

multi step model MAE: 42.44

multi step model MSE: 2977.98

multi step model RMSE: 54.57

multi step model R2: 0.15

--- ETHUSDT ---

multi step model MAE: 102.06

multi step model MSE: 18230.96

multi step model RMSE: 135.02

multi step model R2: 0.93

--- LTCUSDT ---

multi step model MAE: 3.26

multi step model MSE: 25.40

multi step model RMSE: 5.04

multi step model R2: 0.94

## PCA, drop multicolinearity, feature selection (as above)
exited early, worst performance so far.


# with a solid performing model.

time to deploy...

This model will be loaded in the application and fine tuned on recent data.

This should hopefully reduce training time during application run time, making the app more responsive.