In [16]:
from pycaret.time_series import *
import pandas as pd

In [17]:
fig_kwargs = {
    "renderer": "png",
    "width": 1000,
    "height": 700,
}

In [18]:
# load the BTC long term dataset

btc_5y_df = pd.read_csv('../backend/data/BTC-past-5y-ohlc.csv', index_col=0, parse_dates=True)

print(btc_5y_df.tail().to_markdown())

| timestamp           | symbol   |    open |    high |     low |   close |   volume |
|:--------------------|:---------|--------:|--------:|--------:|--------:|---------:|
| 2025-04-14 00:00:00 | BTCUSDT  | 83760   | 85800   | 83678   | 84591.6 | 28659.1  |
| 2025-04-15 00:00:00 | BTCUSDT  | 84591.6 | 86496.4 | 83600   | 83644   | 20911    |
| 2025-04-16 00:00:00 | BTCUSDT  | 83644   | 85500   | 83111.6 | 84030.4 | 20867.2  |
| 2025-04-17 00:00:00 | BTCUSDT  | 84030.4 | 85470   | 83736.3 | 84947.9 | 13728.8  |
| 2025-04-18 00:00:00 | BTCUSDT  | 84947.9 | 85132.1 | 84413.2 | 84606.4 |  3574.41 |


In [19]:
# arima_experiment = setup(btc_5y_df.loc[:,'close'], fh = 3, fold = 5, session_id = 123)

# close prices
btc_5y_close_df = btc_5y_df.loc[:, 'close']

type(btc_5y_close_df.info())

<class 'pandas.core.series.Series'>
DatetimeIndex: 1825 entries, 2020-04-20 to 2025-04-18
Series name: close
Non-Null Count  Dtype  
--------------  -----  
1825 non-null   float64
dtypes: float64(1)
memory usage: 28.5 KB


NoneType

In [20]:
print("\n--- Setting up Data Split ---")

# hold off last year for testing
split_frame = btc_5y_close_df.index[-30-1]
btc_train = btc_5y_close_df.loc[btc_5y_close_df.index <= split_frame].copy()
btc_test = btc_5y_close_df.loc[btc_5y_close_df.index > split_frame].copy()
print(f"Training data: {btc_train.shape[0]} days")
print(f"Testing data: {btc_test.shape[0]} days")



--- Setting up Data Split ---
Training data: 1795 days
Testing data: 30 days


In [21]:
print("\n--- Setting up PyCaret Environment ---")
# Setup PyCaret environment
s = setup(data=btc_5y_close_df, target='close', transform_target='box-cox',
          fh=30, 
          seasonal_period='D', fold=5, session_id=123, #fig_kwargs=fig_kwargs
         )


--- Setting up PyCaret Environment ---


Unnamed: 0,Description,Value
0,session_id,123
1,Target,close
2,Approach,Univariate
3,Exogenous Variables,Not Present
4,Original data shape,"(1825, 1)"
5,Transformed data shape,"(1825, 1)"
6,Transformed train set shape,"(1795, 1)"
7,Transformed test set shape,"(30, 1)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


In [22]:

s.plot_model(plot='decomp')
s.plot_model(plot='decomp', data_kwargs={'type': 'multiplicative'})
s.plot_model(plot='acf')
s.plot_model(plot='pacf')

In [23]:
# Compare models
best_model = compare_models(
    include=['rf_cds_dt', 'ada_cds_dt', 'auto_arima', 'exp_smooth', 'arima', 'prophet', 'naive'],
    sort='R2'
    )

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)
naive,Naive Forecaster,2.5327,2.1194,6056.3326,7501.5005,0.0669,0.0678,-1.6501,0.018
ada_cds_dt,AdaBoost w/ Cond. Deseasonalize & Detrending,2.6866,2.1923,6451.2172,7790.0061,0.0722,0.071,-1.9247,0.382
auto_arima,Auto ARIMA,2.6645,2.216,6389.6435,7865.994,0.0708,0.0704,-2.0378,1.668
exp_smooth,Exponential Smoothing,2.674,2.2224,6412.8554,7889.1502,0.071,0.0706,-2.0772,0.084
rf_cds_dt,Random Forest w/ Cond. Deseasonalize & Detrending,2.9986,2.4268,7183.246,8600.3892,0.08,0.0802,-2.4607,1.006
arima,ARIMA,2.8555,2.3083,6860.5286,8222.2546,0.076,0.0741,-3.1024,0.036
prophet,Prophet,6.7363,4.9985,16263.52,17906.3268,0.1788,0.1674,-22.3559,0.178


In [24]:
print("\n --- Create ARIMA model --- ")

arima_model = create_model('arima')

print("--- model parameters: ---")
print(arima_model)


 --- Create ARIMA model --- 


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2024-10-20,3.0582,2.6195,7079.2885,8943.8435,0.0867,0.0907,0.0851
1,2024-11-19,1.6307,1.3463,3881.8503,4791.6326,0.0387,0.0398,-1.1058
2,2024-12-19,2.2324,1.8158,5358.3798,6496.9287,0.0561,0.0539,-3.4999
3,2025-01-18,3.544,2.772,8643.0199,10035.1364,0.0881,0.0831,-8.0552
4,2025-02-17,3.8122,2.988,9340.1043,10843.7319,0.1106,0.1028,-2.9361
Mean,NaT,2.8555,2.3083,6860.5286,8222.2546,0.076,0.0741,-3.1024
SD,NaT,0.8147,0.6232,2024.9643,2255.0074,0.0255,0.0235,2.7879


--- model parameters: ---
ARIMA(seasonal_order=(0, 1, 0, 7))


In [25]:
# Tune the ARIMA model
print("\n--- Tuning ARIMA model... ---")
tuned_arima = tune_model(arima_model)
print("Tuned ARIMA model parameters:")
print(tuned_arima)



--- Tuning ARIMA model... ---


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2024-10-20,3.4948,3.3581,8090.1048,11465.6697,0.0952,0.1043,-0.5035
1,2024-11-19,2.5998,1.9668,6188.7519,7000.1392,0.0618,0.0643,-3.4944
2,2024-12-19,1.0553,0.8607,2533.0574,3079.6001,0.0261,0.026,-0.0111
3,2025-01-18,1.9556,1.5661,4769.3213,5669.392,0.0488,0.0471,-1.8902
4,2025-02-17,3.416,2.6613,8369.4661,9658.2404,0.0991,0.0929,-2.1225
Mean,NaT,2.5043,2.0826,5990.1403,7374.6083,0.0662,0.0669,-1.6043
SD,NaT,0.9189,0.8643,2170.3599,2947.7735,0.0278,0.0288,1.2388


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 out of  50 | elapsed:    4.5s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.3s finished


Tuned ARIMA model parameters:
ARIMA(order=(1, 0, 1), seasonal_order=(1, 1, 1, 14), with_intercept=False)


In [26]:
print("\n--- Finalizing Model and Making Predictions ---")
final_arima = finalize_model(tuned_arima)

# Generate predictions for the test set
predictions = predict_model(final_arima) #fh=90)

print("Prediction results (last 5 rows):")
display(predictions.tail())

predictions_df = predictions.to_timestamp()

plot_model(tuned_arima, plot='forecast')

print(pull().to_markdown(index=False))




--- Finalizing Model and Making Predictions ---
Prediction results (last 5 rows):


Unnamed: 0,y_pred
2025-05-14,84861.2584
2025-05-15,84750.1407
2025-05-16,84604.8829
2025-05-17,84748.4379
2025-05-18,84559.5244


| Model   |   MASE |   RMSSE |     MAE |    RMSE |   MAPE |   SMAPE |      R2 |
|:--------|-------:|--------:|--------:|--------:|-------:|--------:|--------:|
| ARIMA   | 1.2824 |  1.0809 | 3192.87 | 3989.93 | 0.0391 |  0.0379 | -1.4911 |


In [27]:
plot_model(final_arima, plot='forecast')

In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("\n--- Model Test Scoring ---")

# Calculate evaluation metrics
# mae = mean_absolute_error(btc_test, predictions)
# mse = mean_squared_error(btc_test, predictions)
# r2 = r2_score(btc_test, predictions)

# print(f"Mean squared error: {mse:.2f}")
# print(f"Mean absolute error: {mae:.2f}")
# print(f"R-squared score : {r2:.2f}") 

# print(btc_test.index)
# print(predictions_df.index)
print(pull())



--- Model Test Scoring ---
   Model
0  ARIMA


In [29]:
# print("\n--- Plot against valdation set ---")

# import plotly.express as px

# # create a dataframe for visualisation
# btc_vis = pd.DataFrame(
#     {
#         "date" : btc_test.index,
#         "actual" : btc_test.values,
#         "date": predictions_df.index,
#         "predicted" : predictions.values.flatten(),
#     }
# )

# # print(len(btc_test[-30:].index))

# # print(len(btc_test[-30:].values))
# # print(len(predictions.values.flatten()))
# # melt the dataframe for easier handling of multiple series for pyplot
# btc_vis = btc_vis.melt(id_vars=['date'], var_name="type", value_name='close')

# # plot validation set with predictions
# fig = px.line(
#     btc_vis,
#     x='date',
#     y='close',
#     color='type',
#     title="ARIMA Forecast vs Actual",
#     labels=dict(close='close price', date='date')
# )

# fig.show()

In [30]:
from IPython.display import Markdown, display

display(Markdown(s.check_stats().to_markdown()))


|    | Test         | Test Name   | Data        | Property            | Setting                  |           Value |
|---:|:-------------|:------------|:------------|:--------------------|:-------------------------|----------------:|
|  0 | Summary      | Statistics  | Transformed | Length              |                          |  1825           |
|  1 | Summary      | Statistics  | Transformed | # Missing Values    |                          |     0           |
|  2 | Summary      | Statistics  | Transformed | Mean                |                          |    70.1962      |
|  3 | Summary      | Statistics  | Transformed | Median              |                          |    70.6838      |
|  4 | Summary      | Statistics  | Transformed | Standard Deviation  |                          |    13.1375      |
|  5 | Summary      | Statistics  | Transformed | Variance            |                          |   172.594       |
|  6 | Summary      | Statistics  | Transformed | Kurtosis            |                          |    -0.723124    |
|  7 | Summary      | Statistics  | Transformed | Skewness            |                          |    -0.0460849   |
|  8 | Summary      | Statistics  | Transformed | # Distinct Values   |                          |  1825           |
|  9 | White Noise  | Ljung-Box   | Transformed | Test Statictic      | {'alpha': 0.05, 'K': 24} | 40696.8         |
| 10 | White Noise  | Ljung-Box   | Transformed | Test Statictic      | {'alpha': 0.05, 'K': 48} | 75309.9         |
| 11 | White Noise  | Ljung-Box   | Transformed | p-value             | {'alpha': 0.05, 'K': 24} |     0           |
| 12 | White Noise  | Ljung-Box   | Transformed | p-value             | {'alpha': 0.05, 'K': 48} |     0           |
| 13 | White Noise  | Ljung-Box   | Transformed | White Noise         | {'alpha': 0.05, 'K': 24} |     0           |
| 14 | White Noise  | Ljung-Box   | Transformed | White Noise         | {'alpha': 0.05, 'K': 48} |     0           |
| 15 | Stationarity | ADF         | Transformed | Stationarity        | {'alpha': 0.05}          |     0           |
| 16 | Stationarity | ADF         | Transformed | p-value             | {'alpha': 0.05}          |     0.480925    |
| 17 | Stationarity | ADF         | Transformed | Test Statistic      | {'alpha': 0.05}          |    -1.60531     |
| 18 | Stationarity | ADF         | Transformed | Critical Value 1%   | {'alpha': 0.05}          |    -3.43394     |
| 19 | Stationarity | ADF         | Transformed | Critical Value 5%   | {'alpha': 0.05}          |    -2.86313     |
| 20 | Stationarity | ADF         | Transformed | Critical Value 10%  | {'alpha': 0.05}          |    -2.56761     |
| 21 | Stationarity | KPSS        | Transformed | Trend Stationarity  | {'alpha': 0.05}          |     0           |
| 22 | Stationarity | KPSS        | Transformed | p-value             | {'alpha': 0.05}          |     0.01        |
| 23 | Stationarity | KPSS        | Transformed | Test Statistic      | {'alpha': 0.05}          |     0.701502    |
| 24 | Stationarity | KPSS        | Transformed | Critical Value 10%  | {'alpha': 0.05}          |     0.119       |
| 25 | Stationarity | KPSS        | Transformed | Critical Value 5%   | {'alpha': 0.05}          |     0.146       |
| 26 | Stationarity | KPSS        | Transformed | Critical Value 2.5% | {'alpha': 0.05}          |     0.176       |
| 27 | Stationarity | KPSS        | Transformed | Critical Value 1%   | {'alpha': 0.05}          |     0.216       |
| 28 | Normality    | Shapiro     | Transformed | Normality           | {'alpha': 0.05}          |     0           |
| 29 | Normality    | Shapiro     | Transformed | p-value             | {'alpha': 0.05}          |     1.10809e-14 |

# observations

Arima expects seasonaility. As such, it tries to emulate this in it's forecast. The bitcoin data has no true statistical seasonality.