In [1]:
# load the BTC long term dataset
import pandas as pd

btc_5y_df = pd.read_csv('../backend/data/BTC-past-5y-ohlc.csv', index_col=0, parse_dates=True)

print(btc_5y_df.head().to_markdown())

| timestamp           | symbol   |    open |    high |     low |   close |   volume |
|:--------------------|:---------|--------:|--------:|--------:|--------:|---------:|
| 2020-03-13 00:00:00 | BTCUSDT  | 4800.01 | 5955    | 3782.13 | 5578.6  |   402202 |
| 2020-03-14 00:00:00 | BTCUSDT  | 5576.05 | 5640.52 | 5055.13 | 5172.06 |   136910 |
| 2020-03-15 00:00:00 | BTCUSDT  | 5172.48 | 5940    | 5093.1  | 5361.3  |   139916 |
| 2020-03-16 00:00:00 | BTCUSDT  | 5360.33 | 5365.42 | 4442.12 | 5028.97 |   227277 |
| 2020-03-17 00:00:00 | BTCUSDT  | 5028.86 | 5525    | 4921.45 | 5312.64 |   150090 |


In [52]:
# xgboost_experiment = setup(btc_5y_df.loc[:,'close'], fh = 3, fold = 5, session_id = 123)
# close prices
btc_5y_close_df = btc_5y_df.loc[:, 'close']

print("\n--- Setting up Data Split ---")

# Ensure data is sorted by time
btc_5y_close_df = btc_5y_close_df.sort_index()
btc_5y_close_df = btc_5y_close_df.to_frame()

# Create lag features
for lag in range(1, 3):  # Lags from 1 to 7 days
    btc_5y_close_df[f'lag_{lag}'] = btc_5y_close_df['close'].shift(lag)

# Drop NaN values caused by lagging
btc_5y_close_df = btc_5y_close_df.dropna()

# Restore the 'close' column name
# btc_5y_close_df = btc_5y_close_df.rename(columns={'close': 'close'})
btc_5y_close_df.head()


--- Setting up Data Split ---


Unnamed: 0_level_0,close,lag_1,lag_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-15,5361.3,5172.06,5578.6
2020-03-16,5028.97,5361.3,5172.06
2020-03-17,5312.64,5028.97,5361.3
2020-03-18,5393.04,5312.64,5028.97
2020-03-19,6162.37,5393.04,5312.64


In [53]:
# Train-Test Split (last 365 days as test set)
split_date = btc_5y_close_df.index[-30]  
btc_train = btc_5y_close_df.loc[btc_5y_close_df.index <= split_date].copy()
btc_test = btc_5y_close_df.loc[btc_5y_close_df.index > split_date].copy()

btc_train.head()


Unnamed: 0_level_0,close,lag_1,lag_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-15,5361.3,5172.06,5578.6
2020-03-16,5028.97,5361.3,5172.06
2020-03-17,5312.64,5028.97,5361.3
2020-03-18,5393.04,5312.64,5028.97
2020-03-19,6162.37,5393.04,5312.64


In [None]:
import pandas as pd
from pycaret.regression import *

# PyCaret Regression Setup (Ensuring No Shuffle)
s = setup(
    data=btc_train, 
    target="close",
    session_id=123, 
    fold=5,  # K-fold cross-validation
    data_split_shuffle=False,  # **Important: Keeps time-series order**
    fold_strategy="timeseries",  # Ensures time-series split
)

# Train XGBoost Model
xgb_model = create_model('xgboost')



Unnamed: 0,Description,Value
0,Session id,123
1,Target,close
2,Target type,Regression
3,Original data shape,"(1794, 3)"
4,Transformed data shape,"(1794, 3)"
5,Transformed train set shape,"(1255, 3)"
6,Transformed test set shape,"(539, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,24661.7383,902940160.0,30048.9629,-2.0506,1.1325,0.5664
1,2233.697,7868927.0,2805.1609,0.9256,0.0615,0.0487
2,1467.7898,3527109.75,1878.0601,0.9438,0.0511,0.0397
3,582.2435,707434.5,841.0912,0.8651,0.0408,0.0289
4,904.2732,1333014.625,1154.5625,0.8194,0.0429,0.0331
Mean,5969.9484,183275329.175,7345.5675,0.3007,0.2658,0.1434
Std,9362.6967,359841170.2478,11371.8059,1.1765,0.4334,0.2116


In [88]:
tuned_xgb = tune_model(xgb_model)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,24803.873,910425472.0,30173.2578,-2.0759,1.1438,0.5712
1,1865.6396,5832058.5,2414.9656,0.9449,0.051,0.0405
2,1452.0278,3657831.0,1912.5457,0.9417,0.0523,0.0397
3,598.5771,617396.875,785.746,0.8823,0.04,0.0309
4,802.7949,1072960.25,1035.8379,0.8546,0.039,0.0296
Mean,5904.5825,184321143.725,7264.4706,0.3095,0.2652,0.1424
Std,9460.4801,363057032.7104,11469.4644,1.1932,0.4393,0.2145


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [89]:
# # Finalize Model and Make Predictions
final_xgb = finalize_model(tuned_xgb)
predictions = predict_model(final_xgb, data=btc_test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,3066.2986,16990076.0,4121.9019,0.5218,0.047,0.0354


In [90]:
predictions.head()

Unnamed: 0_level_0,lag_1,lag_2,close,prediction_label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-02-11,97430.820312,96462.75,95778.203125,96930.125
2025-02-12,95778.203125,97430.820312,97869.992188,95890.671875
2025-02-13,97869.992188,95778.203125,96608.140625,97828.578125
2025-02-14,96608.140625,97869.992188,97500.476562,96717.085938
2025-02-15,97500.476562,96608.140625,97569.65625,96523.085938


In [91]:
print("\n--- Plot against valdation set ---")

import plotly.express as px

# create a dataframe for visualisation
btc_vis = pd.DataFrame(
    {
        "date" : predictions.index,
        "actual" : predictions.close,
        "predicted" : predictions.prediction_label,
    }
)

# print(len(btc_test[-30:].index))

# print(len(btc_test[-30:].values))
# print(len(predictions.values.flatten()))
# melt the dataframe for easier handling of multiple series for pyplot
btc_vis = btc_vis.melt(id_vars=['date'], var_name="type", value_name='close')

# plot validation set with predictions
fig = px.line(
    btc_vis,
    x='date',
    y='close',
    color='type',
    title="XGBoost Forecast vs Actual",
    labels=dict(close='close price', date='date')
)

fig.show()


--- Plot against valdation set ---


In [95]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("\n--- Model Evaluation ---")
# Calculate evaluation metrics

# y_true = btc_test
# y_pred = predictions
mae = mean_absolute_error(predictions.close,predictions.prediction_label)
mse = mean_squared_error(predictions.close,predictions.prediction_label)
r2 = r2_score(predictions.close,predictions.prediction_label)

print(f"Mean squared error: {mse:.2f}")
print(f"Mean absolute error: {mae:.2f}")
print(f"R-squared score : {r2:.2f}") 


--- Model Evaluation ---
Mean squared error: 16990076.00
Mean absolute error: 3066.30
R-squared score : 0.52


In [93]:
# TODO - make the data stationary
# TODO - experiment with log transform to smooth the trend.