In [1]:
from pycaret.time_series import *
import pandas as pd

In [2]:
# load the BTC long term dataset

btc_5y_df = pd.read_csv('../backend/data/BTC-past-5y-ohlc.csv', index_col=0, parse_dates=True)

print(btc_5y_df.tail().to_markdown())

| timestamp           | symbol   |    open |    high |     low |   close |   volume |
|:--------------------|:---------|--------:|--------:|--------:|--------:|---------:|
| 2025-03-07 00:00:00 | BTCUSDT  | 89931.9 | 91283   | 84667   | 86801.8 |  57980.4 |
| 2025-03-08 00:00:00 | BTCUSDT  | 86801.7 | 86897.2 | 85218.5 | 86222.4 |  12989.2 |
| 2025-03-09 00:00:00 | BTCUSDT  | 86222.5 | 86500   | 80000   | 80734.4 |  26115.4 |
| 2025-03-10 00:00:00 | BTCUSDT  | 80734.5 | 84123.5 | 77459.9 | 78595.9 |  47633.4 |
| 2025-03-11 00:00:00 | BTCUSDT  | 78595.9 | 82225.2 | 76606   | 81289.9 |  37174.5 |


In [3]:
# arima_experiment = setup(btc_5y_df.loc[:,'close'], fh = 3, fold = 5, session_id = 123)

# close prices
btc_5y_close_df = btc_5y_df.loc[:, 'close']

type(btc_5y_close_df.info())

<class 'pandas.core.series.Series'>
DatetimeIndex: 1825 entries, 2020-03-13 to 2025-03-11
Series name: close
Non-Null Count  Dtype  
--------------  -----  
1825 non-null   float64
dtypes: float64(1)
memory usage: 28.5 KB


NoneType

In [4]:
print("\n--- Setting up Data Split ---")

# hold off last year for testing
split_frame = btc_5y_close_df.index[-30-1]
btc_train = btc_5y_close_df.loc[btc_5y_close_df.index <= split_frame].copy()
btc_test = btc_5y_close_df.loc[btc_5y_close_df.index > split_frame].copy()
print(f"Training data: {btc_train.shape[0]} days")
print(f"Testing data: {btc_test.shape[0]} days")



--- Setting up Data Split ---
Training data: 1795 days
Testing data: 30 days


In [5]:
print("\n--- Setting up PyCaret Environment ---")
# Setup PyCaret environment
s = setup(data=btc_5y_close_df, target='close', transform_target='box-cox',
          fh=30, 
          seasonal_period='D', fold=5, session_id=123)


--- Setting up PyCaret Environment ---


Unnamed: 0,Description,Value
0,session_id,123
1,Target,close
2,Approach,Univariate
3,Exogenous Variables,Not Present
4,Original data shape,"(1825, 1)"
5,Transformed data shape,"(1825, 1)"
6,Transformed train set shape,"(1795, 1)"
7,Transformed test set shape,"(30, 1)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


In [6]:

s.plot_model(plot='decomp')
s.plot_model(plot='decomp', data_kwargs={'type': 'multiplicative'})
s.plot_model(plot='acf')
s.plot_model(plot='pacf')

In [7]:
# Compare models
best_model = compare_models(
    include=['rf_cds_dt', 'ada_cds_dt', 'auto_arima', 'exp_smooth', 'arima', 'prophet', 'naive'],
    sort='R2'
    )

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2,TT (Sec)
exp_smooth,Exponential Smoothing,2.2296,1.7549,5141.4853,6047.7005,0.0605,0.0625,-2.0171,0.102
auto_arima,Auto ARIMA,2.2762,1.7885,5248.8319,6163.7168,0.0619,0.0639,-2.1453,7.304
rf_cds_dt,Random Forest w/ Cond. Deseasonalize & Detrending,2.3254,1.844,5401.7601,6398.665,0.0608,0.0621,-2.3513,0.866
arima,ARIMA,2.2357,1.7733,5158.5394,6117.3788,0.0619,0.0639,-2.4309,0.04
ada_cds_dt,AdaBoost w/ Cond. Deseasonalize & Detrending,2.5309,1.9397,5864.0775,6711.945,0.0662,0.0684,-2.451,0.366
naive,Naive Forecaster,2.4503,1.8991,5645.9681,6537.5191,0.067,0.0699,-2.6558,0.206
prophet,Prophet,3.698,2.7546,8554.8902,9528.178,0.0921,0.1003,-9.5424,0.47


In [8]:
print("\n --- Create ARIMA model --- ")

arima_model = create_model('arima')

print("--- model parameters: ---")
print(arima_model)


 --- Create ARIMA model --- 


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2024-09-12,1.8518,1.4363,4164.8632,4832.6565,0.0659,0.0689,-5.4068
1,2024-10-12,3.0422,2.4792,6874.4036,8339.1091,0.0939,0.1003,-1.7198
2,2024-11-11,1.5843,1.1912,3638.0145,4081.0269,0.038,0.0379,-0.1527
3,2024-12-11,2.3993,1.8503,5639.3803,6534.1562,0.0584,0.0566,-2.2304
4,2025-01-10,2.3009,1.9092,5476.0354,6799.9453,0.0535,0.0556,-2.645
Mean,NaT,2.2357,1.7733,5158.5394,6117.3788,0.0619,0.0639,-2.4309
SD,NaT,0.5006,0.4416,1146.7859,1508.1728,0.0184,0.0207,1.711


--- model parameters: ---
ARIMA(seasonal_order=(0, 1, 0, 7))


In [9]:
# Tune the ARIMA model
print("\n--- Tuning ARIMA model... ---")
tuned_arima = tune_model(arima_model)
print("Tuned ARIMA model parameters:")
print(tuned_arima)



--- Tuning ARIMA model... ---


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2024-09-12,1.8079,1.3351,4066.0022,4492.1652,0.0644,0.067,-4.5358
1,2024-10-12,3.0385,2.5031,6865.8777,8419.2902,0.0935,0.1,-1.7723
2,2024-11-11,2.4921,1.8907,5722.5375,6477.6006,0.0589,0.0612,-1.9042
3,2024-12-11,2.4189,1.8225,5685.4233,6435.7513,0.0591,0.0571,-2.1339
4,2025-01-10,1.9609,1.6153,4666.8019,5753.2827,0.0453,0.0469,-1.6092
Mean,NaT,2.3436,1.8333,5401.3285,6315.618,0.0642,0.0664,-2.3911
SD,NaT,0.4345,0.3867,964.5118,1273.3663,0.0159,0.018,1.086


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.2s finished


Tuned ARIMA model parameters:
ARIMA(seasonal_order=(0, 1, 0, 7))


In [26]:
print("\n--- Finalizing Model and Making Predictions ---")
final_arima = finalize_model(tuned_arima)

# Generate predictions for the test set
predictions = predict_model(final_arima) #fh=90)

print("Prediction results (last 5 rows):")
display(predictions.tail())

predictions_df = predictions.to_timestamp()

plot_model(tuned_arima, plot='forecast')

print(pull().to_markdown(index=False))




--- Finalizing Model and Making Predictions ---
Prediction results (last 5 rows):


Unnamed: 0,y_pred
2025-04-06,78641.399
2025-04-07,77148.1959
2025-04-08,80343.7265
2025-04-09,84909.9043
2025-04-10,85486.7377


| Model   |   MASE |   RMSSE |     MAE |    RMSE |   MAPE |   SMAPE |      R2 |
|:--------|-------:|--------:|--------:|--------:|-------:|--------:|--------:|
| ARIMA   | 2.9585 |  2.6739 | 7137.07 | 9619.59 | 0.0833 |  0.0773 | -1.5931 |


In [28]:
plot_model(final_arima, plot='forecast')

In [25]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("\n--- Model Test Scoring ---")

# Calculate evaluation metrics
# mae = mean_absolute_error(btc_test, predictions)
# mse = mean_squared_error(btc_test, predictions)
# r2 = r2_score(btc_test, predictions)

# print(f"Mean squared error: {mse:.2f}")
# print(f"Mean absolute error: {mae:.2f}")
# print(f"R-squared score : {r2:.2f}") 

# print(btc_test.index)
# print(predictions_df.index)
print(pull())



--- Model Test Scoring ---
   Model
0  ARIMA


In [12]:
# print("\n--- Plot against valdation set ---")

# import plotly.express as px

# # create a dataframe for visualisation
# btc_vis = pd.DataFrame(
#     {
#         "date" : btc_test.index,
#         "actual" : btc_test.values,
#         "date": predictions_df.index,
#         "predicted" : predictions.values.flatten(),
#     }
# )

# # print(len(btc_test[-30:].index))

# # print(len(btc_test[-30:].values))
# # print(len(predictions.values.flatten()))
# # melt the dataframe for easier handling of multiple series for pyplot
# btc_vis = btc_vis.melt(id_vars=['date'], var_name="type", value_name='close')

# # plot validation set with predictions
# fig = px.line(
#     btc_vis,
#     x='date',
#     y='close',
#     color='type',
#     title="ARIMA Forecast vs Actual",
#     labels=dict(close='close price', date='date')
# )

# fig.show()

In [30]:
from IPython.display import Markdown, display

display(Markdown(s.check_stats().to_markdown()))


|    | Test         | Test Name   | Data        | Property            | Setting                  |           Value |
|---:|:-------------|:------------|:------------|:--------------------|:-------------------------|----------------:|
|  0 | Summary      | Statistics  | Transformed | Length              |                          |  1825           |
|  1 | Summary      | Statistics  | Transformed | # Missing Values    |                          |     0           |
|  2 | Summary      | Statistics  | Transformed | Mean                |                          |    92.2656      |
|  3 | Summary      | Statistics  | Transformed | Median              |                          |    92.9094      |
|  4 | Summary      | Statistics  | Transformed | Standard Deviation  |                          |    19.8052      |
|  5 | Summary      | Statistics  | Transformed | Variance            |                          |   392.246       |
|  6 | Summary      | Statistics  | Transformed | Kurtosis            |                          |    -0.635729    |
|  7 | Summary      | Statistics  | Transformed | Skewness            |                          |    -0.0492704   |
|  8 | Summary      | Statistics  | Transformed | # Distinct Values   |                          |  1825           |
|  9 | White Noise  | Ljung-Box   | Transformed | Test Statictic      | {'alpha': 0.05, 'K': 24} | 40439.6         |
| 10 | White Noise  | Ljung-Box   | Transformed | Test Statictic      | {'alpha': 0.05, 'K': 48} | 74024           |
| 11 | White Noise  | Ljung-Box   | Transformed | p-value             | {'alpha': 0.05, 'K': 24} |     0           |
| 12 | White Noise  | Ljung-Box   | Transformed | p-value             | {'alpha': 0.05, 'K': 48} |     0           |
| 13 | White Noise  | Ljung-Box   | Transformed | White Noise         | {'alpha': 0.05, 'K': 24} |     0           |
| 14 | White Noise  | Ljung-Box   | Transformed | White Noise         | {'alpha': 0.05, 'K': 48} |     0           |
| 15 | Stationarity | ADF         | Transformed | Stationarity        | {'alpha': 0.05}          |     0           |
| 16 | Stationarity | ADF         | Transformed | p-value             | {'alpha': 0.05}          |     0.430857    |
| 17 | Stationarity | ADF         | Transformed | Test Statistic      | {'alpha': 0.05}          |    -1.7005      |
| 18 | Stationarity | ADF         | Transformed | Critical Value 1%   | {'alpha': 0.05}          |    -3.43394     |
| 19 | Stationarity | ADF         | Transformed | Critical Value 5%   | {'alpha': 0.05}          |    -2.86313     |
| 20 | Stationarity | ADF         | Transformed | Critical Value 10%  | {'alpha': 0.05}          |    -2.56761     |
| 21 | Stationarity | KPSS        | Transformed | Trend Stationarity  | {'alpha': 0.05}          |     0           |
| 22 | Stationarity | KPSS        | Transformed | p-value             | {'alpha': 0.05}          |     0.01        |
| 23 | Stationarity | KPSS        | Transformed | Test Statistic      | {'alpha': 0.05}          |     0.68502     |
| 24 | Stationarity | KPSS        | Transformed | Critical Value 10%  | {'alpha': 0.05}          |     0.119       |
| 25 | Stationarity | KPSS        | Transformed | Critical Value 5%   | {'alpha': 0.05}          |     0.146       |
| 26 | Stationarity | KPSS        | Transformed | Critical Value 2.5% | {'alpha': 0.05}          |     0.176       |
| 27 | Stationarity | KPSS        | Transformed | Critical Value 1%   | {'alpha': 0.05}          |     0.216       |
| 28 | Normality    | Shapiro     | Transformed | Normality           | {'alpha': 0.05}          |     0           |
| 29 | Normality    | Shapiro     | Transformed | p-value             | {'alpha': 0.05}          |     2.22191e-13 |