In [1]:
import os
import re
from time import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
import calendar
from catboost import CatBoostRegressor

warnings.filterwarnings('ignore')

# Getting the prediction from block level model

In [14]:
df = pd.read_csv('df_joined_with_basefee.csv').drop(columns='Unnamed: 0')
df['datetime'] = pd.to_datetime(df['datetime'])

df['gas_used_block_real'] = df['gas_used_block'].shift(-1)
df = df.dropna(subset='gas_used_block_real')
df = df[['block_number', 'avg_receipt_gas_used', 'total_value_transactions',
       'avg_value_transactions', 'datetime', 'size', 'gas_limit',
       'transaction_count', 'gas_used_block', 'Volume', 'Close', 'RSI_adj', 'MACD',
       'MACD_exponential_MA_9', 'Ethereum', 'Solana','gas_used_block_real','base_fee_per_gas']]
df = df[df['datetime'] >= '2022-04-08 13:00:00']

df['total_value_transactions'] = df['total_value_transactions'].apply(lambda x: float(x))

In [17]:
for i in range(1,5):
    df['gas_used_block_real_pct_' + str(i)] = df['gas_used_block_real'].pct_change(i)
    df['transaction_count_pct_' + str(i)] = df['transaction_count'].pct_change(i)
    for j in range(3,5):
        df['gas_used_block_real_pct_' + str(i) + '_ewm_' + str(j)] = df['gas_used_block_real_pct_' + str(i)].ewm(span=j, adjust=False).mean()
        df['transaction_count_pct_' + str(i) + '_ewm_' + str(j)] = df['transaction_count_pct_' + str(i)].ewm(span=j, adjust=False).mean()


In [18]:
vars_selected = ['gas_used_block_real_pct_1',
 'gas_used_block',
 'gas_used_block_real_pct_2',
 'gas_used_block_real_pct_3',
 'gas_used_block_real_pct_4',
 'gas_used_block_real_pct_1_ewm_3',
 'gas_used_block_real_pct_2_ewm_3',
 'gas_used_block_real_pct_4_ewm_3',
 'gas_used_block_real_pct_1_ewm_4',
 'gas_used_block_real_pct_3_ewm_3',
 'gas_used_block_real_pct_4_ewm_4',
 'transaction_count_pct_3',
 'transaction_count',
 'transaction_count_pct_2',
 'transaction_count_pct_4_ewm_3',
 'transaction_count_pct_1',
 'transaction_count_pct_3_ewm_3',
 'transaction_count_pct_1_ewm_4',
 'Solana',
 'avg_receipt_gas_used',
 'size',
 'gas_used_block_real_pct_2_ewm_4',
 'transaction_count_pct_4_ewm_4',
 'gas_used_block_real_pct_3_ewm_4',
 'transaction_count_pct_4']

In [19]:
input_df = df[(df['datetime'] >= '2023-01-01 00:00:00') & (df['datetime'] < '2024-01-01 00:00:00')]
input_x = input_df[vars_selected]
input_y = input_df['gas_used_block_real']

# Preprocess
# Winsorization
pctil_95 = input_x.quantile(0.95)
pctil_10 = input_x.quantile(0.10)

input_x = input_x.clip(lower=pctil_10, upper=pctil_95, axis=1)

# Normalization and standardize to [-1,1]
input_x_mean = input_x.mean()
input_x_std = input_x.std()
input_x_max = input_x.max()
input_x_min = input_x.min()

input_x = (input_x - input_x.mean()) / input_x.std()
input_x = 2 * (input_x - input_x.min()) / (input_x.max() - input_x.min()) - 1

# fill na
fillna_mean = input_x.mean()
input_x.fillna(input_x.mean(),inplace=True)

In [20]:
cbr = CatBoostRegressor(verbose=0, depth=3, iterations=150, learning_rate=0.01, l2_leaf_reg=5, random_seed=42)
cbr.fit(input_x, input_y)


<catboost.core.CatBoostRegressor at 0x7f8dda6d93d0>

In [21]:
predict_x = df[df['datetime'] >= '2024-01-01 00:00:00'][vars_selected]
predict_x = predict_x.clip(lower=pctil_10, upper=pctil_95, axis=1)

predict_x = (predict_x - input_x_mean) / input_x_std
predict_x = 2 * (predict_x - input_x_min) / (input_x_max - input_x_min) - 1

# fill na
predict_x.fillna(fillna_mean,inplace=True)


In [22]:
predicted_gas_used = cbr.predict(predict_x)

In [51]:
test_df = df[df['datetime'] >= '2024-01-01 00:00:00']
test_df['predicted_gas_used'] = predicted_gas_used


In [52]:
# In the begining, gas_used_block_real was shifted -1 so that a block can be in the same row of the next block's gas used. With such a setting, inputting i-th and before block data, next block data can be predicted.
# Now with the prediction in hand, and try to compute the associated gas price, which has parent block's data in the equation, shift the predicted gas used by i.
test_df = test_df.reset_index(drop=True)
test_df['predicted_gas_used'] = test_df['predicted_gas_used'].shift(1)
test_df = test_df.dropna(subset='predicted_gas_used')

test_df['predicted_base_fee'] = 0

ElasticityMultiplier = 2 # EIP-1559 The block size has been expanded, the maximum multiple is 2
BaseFeeChangeDenominator = 8 # The amount the base fee can change between blocks

for i in range(1,len(test_df)):
  parent_block = test_df.iloc[i-1, :]
  parent_gas_target = parent_block['gas_limit'] // ElasticityMultiplier

  if parent_block['predicted_gas_used'] == parent_gas_target:
    test_df.loc[i, 'predicted_base_fee'] = parent_block['base_fee_per_gas']

  elif parent_block['predicted_gas_used'] > parent_gas_target:
    gas_used_delta = parent_block['predicted_gas_used'] - parent_gas_target
    x = parent_block['base_fee_per_gas'] * gas_used_delta
    y = x // parent_gas_target
    base_fee_delta = max(
            y // BaseFeeChangeDenominator,
            1
        )
    test_df.loc[i, 'predicted_base_fee'] = parent_block['base_fee_per_gas'] + base_fee_delta
  else:
    gas_used_delta = parent_gas_target - parent_block['predicted_gas_used']
    x = parent_block['base_fee_per_gas'] * gas_used_delta
    y = x // parent_gas_target
    base_fee_delta = y // BaseFeeChangeDenominator
    test_df.loc[i, 'predicted_base_fee'] = max(parent_block['base_fee_per_gas'] - base_fee_delta, 0)

In [53]:
block_level = test_df[['datetime', 'predicted_base_fee','base_fee_per_gas']]

In [55]:
block_level['diff'] = block_level['base_fee_per_gas'] - block_level['predicted_base_fee']

In [58]:
block_level = block_level.iloc[:-1,:]

# Combining the output from minute level model

In [27]:
df_min = pd.read_csv('df_min.csv').drop(columns='Unnamed: 0')
df_min['datetime'] = pd.to_datetime(df_min['datetime'])

In [30]:
for i in range(7,10):
    df_min['base_fee_real_pct_' + str(i)] = df_min['base_fee_real'].pct_change(i)
    df_min['transaction_count_avg_pct_' + str(i)] = df_min['transaction_count_avg'].pct_change(i)
    for j in range(3,7):
        df_min['base_fee_real_pct_' + str(i) + '_ewm_' + str(j)] = df_min['base_fee_real_pct_' + str(i)].ewm(span=j, adjust=False).mean()
        df_min['transaction_count_avg_pct_' + str(i) + '_ewm_' + str(j)] = df_min['transaction_count_avg_pct_' + str(i)].ewm(span=j, adjust=False).mean()

for col in ['total_value_transactions_sum', 'size_sum', 'gas_limit_sum',
       'transaction_count_sum', 'gas_used_block_sum', 'block_cnt',
       'total_value_transactions_avg', 'size_avg', 'gas_limit_avg',
       'transaction_count_avg', 'gas_used_block_avg']:
   for i in range(1,3):
      df_min[col + '_pct_' + str(i)] = df_min[col].pct_change(i)
      df_min[col + '_lag_' + str(i)] = df_min[col].shift(i)

In [31]:
min_vars = ['Solana',
 'MACD_exponential_MA_9',
 'MACD',
 'Close',
 'Ethereum',
 'transaction_count_avg',
 'transaction_count_avg_lag_1',
 'transaction_count_avg_lag_2',
 'RSI_adj',
 'base_fee_real_pct_9_ewm_6',
 'Volume',
 'transaction_count_avg_pct_9_ewm_6',
 'transaction_count_sum_lag_1',
 'total_value_transactions_avg',
 'size_sum_lag_1',
 'size_avg',
 'size_sum',
 'gas_used_block_avg',
 'total_value_transactions_avg_lag_1',
 'size_avg_lag_2',
 'transaction_count_sum',
 'base_fee_real_pct_7',
 'size_sum_lag_2',
 'gas_used_block_avg_lag_2',
 'total_value_transactions_avg_lag_2']

In [33]:
df_min_all = df_min[(df_min['datetime'] >= '2023-01-01 00:00:00') & (df_min['datetime'] < '2024-01-01 00:00:00')]
min_x = df_min_all[min_vars]
min_y = df_min_all['base_fee_real']

# Preprocess
# Winsorization
pctil_95 = min_x.quantile(0.95)
pctil_10 = min_x.quantile(0.10)

min_x = min_x.clip(lower=pctil_10, upper=pctil_95, axis=1)

# Normalization and standardize to [-1,1]
input_x_mean = min_x.mean()
input_x_std = min_x.std()
input_x_max = min_x.max()
input_x_min = min_x.min()

min_x = (min_x - min_x.mean()) / min_x.std()
min_x = 2 * (min_x - min_x.min()) / (min_x.max() - min_x.min()) - 1

# fill na
fillna_mean = min_x.mean()
min_x.fillna(min_x.mean(),inplace=True)

In [34]:
cbr = CatBoostRegressor(verbose=0, depth=5, iterations=350, learning_rate=0.01, l2_leaf_reg=5, random_seed=42)
cbr.fit(min_x, min_y)

<catboost.core.CatBoostRegressor at 0x7f8e3ab32430>

In [36]:
predict_min_x = df_min[df_min['datetime'] >= '2024-01-01 00:00:00'][min_vars]
predict_min_x = predict_min_x.clip(lower=pctil_10, upper=pctil_95, axis=1)

predict_min_x = (predict_min_x - input_x_mean) / input_x_std
predict_min_x = 2 * (predict_min_x - input_x_min) / (input_x_max - input_x_min) - 1

# fill na
predict_min_x.fillna(fillna_mean,inplace=True)


In [37]:
predicted_base_fee = cbr.predict(predict_min_x)

In [41]:
test_min_df = df_min[df_min['datetime'] >= '2024-01-01 00:00:00']
test_min_df['predicted_base_fee'] = predicted_base_fee


In [62]:
min_level = test_min_df[['datetime', 'predicted_base_fee']]

# Join the two results

In [60]:
block_level

Unnamed: 0,datetime,predicted_base_fee,base_fee_per_gas,diff
1,2024-01-01 00:00:00,12956097077,1.238341e+10,-572690075.0
2,2024-01-01 00:00:00,12584369567,1.307809e+10,493724380.0
3,2024-01-01 00:00:00,12214297840,1.250051e+10,286216583.0
4,2024-01-01 00:00:00,12181507802,1.220578e+10,24275118.0
5,2024-01-01 00:01:00,12336845032,1.217861e+10,-158236355.0
...,...,...,...,...
220987,2024-01-31 23:58:00,23043050045,2.227433e+10,-768724089.0
220988,2024-01-31 23:58:00,23446269186,2.418898e+10,742712352.0
220989,2024-01-31 23:59:00,22544972870,2.229720e+10,-247775697.0
220990,2024-01-31 23:59:00,21230987057,2.161776e+10,386773083.0


In [63]:
min_level

Unnamed: 0,datetime,predicted_base_fee
908235,2024-01-01 00:00:00,1.713160e+10
908236,2024-01-01 00:01:00,2.160337e+10
908237,2024-01-01 00:02:00,2.274513e+10
908238,2024-01-01 00:03:00,2.274513e+10
908239,2024-01-01 00:04:00,2.274513e+10
...,...,...
952869,2024-01-31 23:54:00,1.335887e+10
952870,2024-01-31 23:55:00,1.335887e+10
952871,2024-01-31 23:56:00,1.335887e+10
952872,2024-01-31 23:57:00,1.335887e+10


In [73]:
merged = pd.merge(block_level, min_level, on=['datetime'], how='left').rename(columns={'predicted_base_fee_x':'predicted_base_fee','predicted_base_fee_y':'predicted_avg_base_fee'}).drop(columns=['diff']).dropna()

In [74]:
merged

Unnamed: 0,datetime,predicted_base_fee,base_fee_per_gas,predicted_avg_base_fee
0,2024-01-01 00:00:00,12956097077,1.238341e+10,1.713160e+10
1,2024-01-01 00:00:00,12584369567,1.307809e+10,1.713160e+10
2,2024-01-01 00:00:00,12214297840,1.250051e+10,1.713160e+10
3,2024-01-01 00:00:00,12181507802,1.220578e+10,1.713160e+10
4,2024-01-01 00:01:00,12336845032,1.217861e+10,2.160337e+10
...,...,...,...,...
220983,2024-01-31 23:58:00,22669687347,2.355909e+10,1.335887e+10
220984,2024-01-31 23:58:00,21578995604,2.242561e+10,1.335887e+10
220985,2024-01-31 23:58:00,22120983479,2.069684e+10,1.335887e+10
220986,2024-01-31 23:58:00,23043050045,2.227433e+10,1.335887e+10


# Simulation

In [109]:
t = merged['datetime'].unique()[0]
cur_df = merged[merged['datetime'] == t]
cur_predicted_avg = cur_df['predicted_avg_base_fee'].unique()[0]

stg_choice_df = cur_df[cur_df['predicted_base_fee'] <= cur_predicted_avg].reset_index(drop=True)
# stg_choice_df.loc[0,'base_fee_per_gas']
# stg_choice_df.loc[-1,'base_fee_per_gas']
random.choice(np.array(cur_df['base_fee_per_gas']))

12205782920.0

In [110]:
# assume every minute has to happen a transaction
import random


normal = []
strategy = []

for t in merged['datetime'].unique():
    cur_df = merged[merged['datetime'] == t]
    normal.append(random.choice(np.array(cur_df['base_fee_per_gas'])))

    cur_predicted_avg = cur_df['predicted_avg_base_fee'].unique()[0]
    stg_choice_df = cur_df[cur_df['predicted_base_fee'] <= cur_predicted_avg].reset_index(drop=True)
    if len(stg_choice_df) > 0:
        strategy.append(stg_choice_df.loc[0,'base_fee_per_gas'])
    else:
        strategy.append(cur_df.iloc[-1,2])
    


In [111]:
sum(normal) - sum(strategy)

4794335809046.0

In [114]:
# assume every minute has to happen a transaction
import random

strategy = []
for t in merged['datetime'].unique():
    cur_df = merged[merged['datetime'] == t]

    cur_predicted_avg = cur_df['predicted_avg_base_fee'].unique()[0]
    stg_choice_df = cur_df[cur_df['predicted_base_fee'] <= cur_predicted_avg].reset_index(drop=True)
    if len(stg_choice_df) > 0:
        strategy.append(stg_choice_df.loc[0,'base_fee_per_gas'])
    else:
        strategy.append(cur_df.iloc[-1,2])

sim = []
for i in range(100):
    print('----- working on i = %d ----- '%i)
    normal = []

    for t in merged['datetime'].unique():
        cur_df = merged[merged['datetime'] == t]
        normal.append(random.choice(np.array(cur_df['base_fee_per_gas'])))

    sim.append(normal)


----- working on i = 0 ----- 
----- working on i = 1 ----- 
----- working on i = 2 ----- 
----- working on i = 3 ----- 
----- working on i = 4 ----- 
----- working on i = 5 ----- 
----- working on i = 6 ----- 
----- working on i = 7 ----- 
----- working on i = 8 ----- 
----- working on i = 9 ----- 
----- working on i = 10 ----- 
----- working on i = 11 ----- 
----- working on i = 12 ----- 
----- working on i = 13 ----- 
----- working on i = 14 ----- 
----- working on i = 15 ----- 
----- working on i = 16 ----- 
----- working on i = 17 ----- 
----- working on i = 18 ----- 
----- working on i = 19 ----- 
----- working on i = 20 ----- 
----- working on i = 21 ----- 
----- working on i = 22 ----- 
----- working on i = 23 ----- 
----- working on i = 24 ----- 
----- working on i = 25 ----- 
----- working on i = 26 ----- 
----- working on i = 27 ----- 
----- working on i = 28 ----- 
----- working on i = 29 ----- 
----- working on i = 30 ----- 
----- working on i = 31 ----- 
----- working on i

In [117]:
diff = []
for i in [sum(i) for i in sim]:
    diff.append(i-sum(strategy))

In [124]:
np.mean([i/1000000000 for i in diff])

4737.03042287921

In [125]:
np.mean([i/1000000000 * 0.058336037997 for i in diff])

276.33958674202654

In [None]:
0.058336037997