## General data preprocessing for the baseline models

## Preprocessing for SARIMA and HWES: both need a fixed time interval

In [6]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from gluonts.dataset.common import ListDataset
import json
import os
import math

In [7]:
# Load the cleaned dataset.
df = pd.read_parquet('../data/processed/cleaned.parquet')

# Keep only the USD price: every other "price_*" COLUMN is dropped.
price_cols_to_drop = [
    col for col in df.columns
    if col.startswith("price_") and col != "price_USD"
]
# Non-mutating drop: the frame is rebound instead of being modified in place.
df = df.drop(columns=price_cols_to_drop)

# Only the last expression of a cell is rendered, so preview the tail here.
df.tail()

Unnamed: 0_level_0,mempool_blocks_blockSize,mempool_blocks_blockVSize,mempool_blocks_nTx,mempool_blocks_totalFees,mempool_blocks_medianFee,recommended_fee_fastestFee,recommended_fee_halfHourFee,recommended_fee_hourFee,recommended_fee_economyFee,recommended_fee_minimumFee,...,mempool_fee_histogram_bin_70_75,mempool_fee_histogram_bin_75_80,mempool_fee_histogram_bin_80_85,mempool_fee_histogram_bin_85_90,mempool_fee_histogram_bin_90_95,mempool_fee_histogram_bin_95_100,mempool_fee_histogram_bin_100_150,mempool_fee_histogram_bin_150_200,mempool_fee_histogram_bin_200_250,mempool_fee_histogram_bin_250_300
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-05-12 19:39:42,1720245.0,997979.25,3160.0,11873272.0,6.011708,7.0,6.0,5.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-12 19:44:42,1472851.0,997948.75,2737.0,5418173.0,4.453501,5.0,5.0,4.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-12 19:49:41,1630060.0,997908.25,3586.0,9119569.0,5.021792,6.0,6.0,5.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-12 19:54:41,1738488.0,997979.25,2613.0,5505858.0,2.128916,3.0,3.0,3.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-12 19:59:41,1898235.0,997976.25,1779.0,3884022.0,1.10286,2.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Check whether all time intervals are fixed.

In [8]:
# Gap between each observation and the one before it.
time_diffs = df.index.to_series().diff()

# Frequency table of the observed gaps.
print("Unique time intervals:", time_diffs.value_counts(), sep="\n")

# The most frequent gap (first entry of the mode).
most_common_interval = time_diffs.mode().iloc[0]
print(f"\nMost common interval: {most_common_interval}")


Unique time intervals:
timestamp
0 days 00:05:00    7420
0 days 00:05:01    5786
0 days 00:04:59    5773
0 days 00:04:56     215
0 days 00:05:04     208
0 days 00:05:03     184
0 days 00:04:57     175
0 days 00:04:55       6
0 days 00:04:58       4
0 days 00:05:05       2
0 days 00:01:15       1
0 days 00:06:29       1
0 days 00:03:32       1
0 days 00:06:23       1
0 days 00:03:19       1
0 days 00:03:36       1
0 days 00:06:42       1
0 days 00:01:21       1
0 days 00:05:02       1
0 days 00:05:09       1
0 days 00:05:08       1
0 days 00:08:46       1
0 days 00:02:28       1
0 days 00:08:38       1
0 days 00:03:03       1
0 days 00:07:00       1
0 days 00:04:07       1
0 days 00:05:57       1
0 days 00:02:57       1
0 days 00:07:03       1
0 days 00:04:44       1
0 days 00:05:13       1
0 days 00:02:43       1
0 days 00:07:17       1
0 days 00:04:52       1
Name: count, dtype: int64

Most common interval: 0 days 00:05:00


In [9]:
# Put the slightly irregular samples onto fixed grids by averaging every
# observation that falls inside each bin: 15 minutes for the main frame and
# 5 minutes for the HWES frame.
# There is no real missing data, only timestamps that are not aligned exactly
# with the grid, so any bin left empty by the mean is filled by linear
# interpolation between its neighbours.
df_resampled = df.resample("15min").mean().interpolate(method='linear')
df_resampled_hwes = df.resample("5min").mean().interpolate(method='linear')

In [5]:
# SARIMA needs a stationarity check: run an Augmented Dickey-Fuller test on
# the target series of the 15-minute frame.
ts = df_resampled['recommended_fee_fastestFee']
adf_result = adfuller(ts.dropna())

# The first two entries of the result are the test statistic and the p-value.
adf_statistic, adf_p_value = adf_result[:2]
print(f"ADF Statistic: {adf_statistic}")
print(f"p-value: {adf_p_value}")


ADF Statistic: -15.966378902122305
p-value: 7.084466687138094e-29


The p-value is far below 0.05, so the ADF test rejects the unit-root null hypothesis, which strongly indicates the series is stationary.
This means no non-seasonal differencing (d = 0) is needed before fitting ARIMA or SARIMA.

In [6]:
# Persist the 15-minute frame for SARIMA and the 5-minute frame for HWES.
for frame, target_path in (
    (df_resampled, "../data/processed/preprocessed_sarima.parquet"),
    (df_resampled_hwes, "../data/processed/preprocessed_hwes.parquet"),
):
    frame.to_parquet(target_path)

In [7]:
# The Prophet input is the same 15-minute resampled frame, stored under its own name.
df_resampled.to_parquet(path="../data/processed/preprocessed_prophet.parquet")

## Preprocessing for XGBoost

In [8]:
# XGBoost needs lagged copies of the target as features. At 15-minute resolution
# each hour has 4 data points (60 ÷ 15), so 48 hours = 192 lag steps.
def create_lag_features_fast(df, target_col, lags):
    """Return ``df`` extended with one shifted copy of ``target_col`` per lag.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    target_col : str
        Name of the column to lag.
    lags : iterable of int
        Shift amounts. Each value ``k`` adds a column named
        ``f"{target_col}_lag_{k}"`` holding ``df[target_col].shift(k)``.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the lag columns, in the order of
        ``lags``. All lag columns are built first and concatenated in a
        single call rather than inserted one at a time.
    """
    target = df[target_col]
    lagged_columns = [
        target.shift(step).rename(f'{target_col}_lag_{step}')
        for step in lags
    ]
    return pd.concat([df, *lagged_columns], axis=1)
# 48 hours of 15-minute intervals at 4 points per hour -> lags 1..192.
lags = range(1, 48 * 4 + 1)
df_xgboost = create_lag_features_fast(
    df=df_resampled,
    target_col='recommended_fee_fastestFee',
    lags=lags,
)
df_xgboost.head()

Unnamed: 0_level_0,mempool_blocks_blockSize,mempool_blocks_blockVSize,mempool_blocks_nTx,mempool_blocks_totalFees,mempool_blocks_medianFee,recommended_fee_fastestFee,recommended_fee_halfHourFee,recommended_fee_hourFee,recommended_fee_economyFee,recommended_fee_minimumFee,...,recommended_fee_fastestFee_lag_183,recommended_fee_fastestFee_lag_184,recommended_fee_fastestFee_lag_185,recommended_fee_fastestFee_lag_186,recommended_fee_fastestFee_lag_187,recommended_fee_fastestFee_lag_188,recommended_fee_fastestFee_lag_189,recommended_fee_fastestFee_lag_190,recommended_fee_fastestFee_lag_191,recommended_fee_fastestFee_lag_192
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-05 02:00:00,2170952.0,997895.375,793.0,1846754.0,1.300111,2.0,2.0,2.0,2.0,2.0,...,,,,,,,,,,
2025-03-05 02:15:00,1801526.0,997925.916667,2627.0,3407913.0,2.47396,3.0,2.666667,2.666667,2.666667,2.0,...,,,,,,,,,,
2025-03-05 02:30:00,2055350.0,997965.916667,922.0,1855358.0,1.355898,2.0,2.0,2.0,2.0,2.0,...,,,,,,,,,,
2025-03-05 02:45:00,2006800.0,997970.083333,1296.666667,2167251.0,1.536092,2.333333,2.333333,2.333333,2.333333,2.0,...,,,,,,,,,,
2025-03-05 03:00:00,1773658.0,997960.333333,2725.333333,3462741.0,2.467659,3.333333,3.0,3.0,3.0,2.0,...,,,,,,,,,,


In [9]:
# Persist the frame with the lag features for the XGBoost model.
df_xgboost.to_parquet(path="../data/processed/preprocessed_xgboost.parquet")

## Advanced-deepar

In [10]:
# Make sure the index is datetime-typed, then take a separate copy so the
# DeepAR-specific columns are added to df_deepar rather than df_resampled.
df_resampled.index = pd.to_datetime(df_resampled.index)
df_deepar = df_resampled.copy(deep=True)
df_deepar.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6600 entries, 2025-03-05 02:00:00 to 2025-05-12 19:45:00
Freq: 15min
Data columns (total 61 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   mempool_blocks_blockSize                  6600 non-null   float64
 1   mempool_blocks_blockVSize                 6600 non-null   float64
 2   mempool_blocks_nTx                        6600 non-null   float64
 3   mempool_blocks_totalFees                  6600 non-null   float64
 4   mempool_blocks_medianFee                  6600 non-null   float64
 5   recommended_fee_fastestFee                6600 non-null   float64
 6   recommended_fee_halfHourFee               6600 non-null   float64
 7   recommended_fee_hourFee                   6600 non-null   float64
 8   recommended_fee_economyFee                6600 non-null   float64
 9   recommended_fee_minimumFee                6600 non-null   f

In [11]:
# Step 1: Extract the target values and the first timestamp
target_values = df_deepar["recommended_fee_fastestFee"].to_numpy().tolist()
start_timestamp = df_deepar.index[0]  # assumes index is datetime
start_timestamp

Timestamp('2025-03-05 02:00:00')

In [13]:
# Step 2: add calendar features (day of week, time of day)

# 1. Day of week (one-hot)
df_deepar['day_of_week'] = df_deepar.index.dayofweek
day_of_week_ohe = pd.get_dummies(df_deepar['day_of_week'], prefix='dow')

# 2. Time of day (cyclical features): the minute of the day is mapped onto
#    the unit circle so that 23:45 and 00:00 end up next to each other.
df_deepar['hour'] = df_deepar.index.hour
df_deepar['minute'] = df_deepar.index.minute
df_deepar['minute_of_day'] = df_deepar['hour'] * 60 + df_deepar['minute']
df_deepar['tod_sin'] = np.sin(2 * np.pi * df_deepar['minute_of_day'] / 1440)
df_deepar['tod_cos'] = np.cos(2 * np.pi * df_deepar['minute_of_day'] / 1440)

# 3. Merge all. assign() overwrites a column that already exists, so running
#    this cell again does not append a second set of dow_* columns (which a
#    plain concat would do, leaving duplicate column names in the frame).
df_deepar = df_deepar.assign(
    **{col: day_of_week_ohe[col] for col in day_of_week_ohe.columns}
)

# Preview
df_deepar.head()


Unnamed: 0_level_0,mempool_blocks_blockSize,mempool_blocks_blockVSize,mempool_blocks_nTx,mempool_blocks_totalFees,mempool_blocks_medianFee,recommended_fee_fastestFee,recommended_fee_halfHourFee,recommended_fee_hourFee,recommended_fee_economyFee,recommended_fee_minimumFee,...,minute_of_day,tod_sin,tod_cos,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-05 02:00:00,2170952.0,997895.375,793.0,1846754.0,1.300111,2.0,2.0,2.0,2.0,2.0,...,120,0.5,0.866025,False,False,True,False,False,False,False
2025-03-05 02:15:00,1801526.0,997925.916667,2627.0,3407913.0,2.47396,3.0,2.666667,2.666667,2.666667,2.0,...,135,0.55557,0.83147,False,False,True,False,False,False,False
2025-03-05 02:30:00,2055350.0,997965.916667,922.0,1855358.0,1.355898,2.0,2.0,2.0,2.0,2.0,...,150,0.608761,0.793353,False,False,True,False,False,False,False
2025-03-05 02:45:00,2006800.0,997970.083333,1296.666667,2167251.0,1.536092,2.333333,2.333333,2.333333,2.333333,2.0,...,165,0.659346,0.75184,False,False,True,False,False,False,False
2025-03-05 03:00:00,1773658.0,997960.333333,2725.333333,3462741.0,2.467659,3.333333,3.0,3.0,3.0,2.0,...,180,0.707107,0.707107,False,False,True,False,False,False,False


In [15]:
# Step 3: reshape the fee columns from wide to long
# (one row per timestamp and fee type).
fee_columns = [
    'recommended_fee_fastestFee',
    'recommended_fee_halfHourFee',
    'recommended_fee_hourFee',
    'recommended_fee_economyFee',
    'recommended_fee_minimumFee'
]

# Take only the fee columns, expose the timestamp as a regular column, and
# melt so that each fee becomes its own block of rows.
df_long = (
    df_deepar[fee_columns]
    .assign(timestamp=df_resampled.index)
    .melt(
        id_vars=["timestamp"],
        value_vars=fee_columns,
        var_name="fee_type",
        value_name="fee_value",
    )
)

# Attach every column of df_deepar to each long row.
# NOTE(review): "timestamp" is presumably resolved against df_deepar's index,
# which is named "timestamp" in the displayed outputs — confirm.
df_merged = df_long.merge(df_deepar, on="timestamp", how="left")

# Group the rows per fee type, chronologically inside each group.
df_merged_sorted = (
    df_merged
    .sort_values(by=["fee_type", "timestamp"])
    .reset_index(drop=True)
)
df_merged_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 77 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   timestamp                                 33000 non-null  datetime64[ns]
 1   fee_type                                  33000 non-null  object        
 2   fee_value                                 33000 non-null  float64       
 3   mempool_blocks_blockSize                  33000 non-null  float64       
 4   mempool_blocks_blockVSize                 33000 non-null  float64       
 5   mempool_blocks_nTx                        33000 non-null  float64       
 6   mempool_blocks_totalFees                  33000 non-null  float64       
 7   mempool_blocks_medianFee                  33000 non-null  float64       
 8   recommended_fee_fastestFee                33000 non-null  float64       
 9   recommended_fee_halfHourFee 

In [16]:
#Step 4: Create list dataset for deepar
print(type(df_merged_sorted["timestamp"].iloc[0]))

# Integer id per fee type, used as the static categorical feature.
fee_type_to_id = {ft: i for i, ft in enumerate(df_merged_sorted["fee_type"].unique())}

# Covariate columns: everything except identifiers, the long-format target,
# AND the wide fee columns. Every wide fee column is the target of one of the
# series, so keeping it as a dynamic feature would hand the model its own
# target (target leakage). All series must share the same feature rows, so
# the five fee columns are excluded for every series.
non_feature_cols = ["timestamp", "fee_type", "fee_value", *fee_columns]
external_cols = df_merged_sorted.columns.difference(non_feature_cols)

series_list = []

for fee_type, group in df_merged_sorted.groupby("fee_type"):
    # Keep each series in chronological order; "start" is its first timestamp.
    group = group.sort_values("timestamp")

    series = {
        "start": group["timestamp"].iloc[0].strftime("%Y-%m-%d %H:%M:%S"),
        "target": group["fee_value"].tolist(),
        # Transposed to (num_features, series_length): one row per covariate.
        "feat_dynamic_real": group[external_cols].astype(float).T.values.tolist(),
        "feat_static_cat": [int(fee_type_to_id[fee_type])]
    }

    series_list.append(series)



<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [18]:
# Names of the covariate columns as a plain list (so it can be dumped to JSON),
# in the same order as external_cols.
feat_dynamic_real_names = list(external_cols)

def sanitize_for_json(obj):
    """Recursively replace non-finite floats with None so the result is valid JSON.

    Python's ``json`` module serialises NaN and +/-infinity as ``NaN`` /
    ``Infinity`` / ``-Infinity``, none of which are valid JSON. Every such
    float is therefore mapped to ``None`` (written as ``null``).

    Parameters
    ----------
    obj : Any
        A float, list, dict, or any other value. Lists and dicts are walked
        recursively; dict keys are left untouched.

    Returns
    -------
    Any
        A new list/dict with the same structure, or ``obj`` itself when it is
        neither a container nor a non-finite float.
    """
    if isinstance(obj, float) and not math.isfinite(obj):
        return None
    if isinstance(obj, list):
        return [sanitize_for_json(x) for x in obj]
    if isinstance(obj, dict):
        return {k: sanitize_for_json(v) for k, v in obj.items()}
    return obj
# Dataset: one JSON object per line, one line per series.
with open("../data/processed/deepar_dataset.jsonl", "w") as f:
    for item in series_list:
        clean_item = sanitize_for_json(item)
        f.write(json.dumps(clean_item) + "\n")

# Save mapping
with open("../data/processed/fee_type_to_id.json", "w") as f:
    f.write(json.dumps(fee_type_to_id))

# Save the names of the dynamic feature rows.
with open("../data/processed/feat_dynamic_real_names.json", "w") as f:
    f.write(json.dumps(feat_dynamic_real_names, indent=2))
