In [11]:
import pandas as pd
import numpy as np

# Lift pandas' display limits so frames are rendered without column or row truncation.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

In [35]:
# Registry of the pools to process.
#   key   -> path of that pool's swaps CSV (read with pd.read_csv in the loading loop)
#   value -> "pool_address": string written to the pool_address column of every row
#            "fee_tier": fee as a fraction of swap volume (multiplied directly into
#                        volume_usd to obtain fee_usd)
# Iteration order of this dict is the order in which pools are concatenated into df_all.
pool_info = {
    "../../datasets/sol_pengu_pool/sol_pengu_pool_swaps.csv": {
        "pool_address": "FAqh648xeeaTqL7du49sztp9nfj5PjRQrfvaMccyd9cz",
        "fee_tier": 0.003
    },
    "../../datasets/msol_mnde_pool/msol_mnde_pool_swaps.csv": {
        "pool_address": "BVXNG6BrL2Tn3NmppnMeXHjBHTaQSnSnLE99JKwZSWPg",
        "fee_tier": 0.01
    },
    "../../datasets/sol_hnt_pool/sol_hnt_pool_swaps.csv": {
        "pool_address": "CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4",
        "fee_tier": 0.0005
    },
    "../../datasets/sol_usdc_pool/sol_usdc_pool_swaps.csv": {
        "pool_address": "Czfq3xZZDmsdGdUyrNLtRhGc47cXcZtLG4crryfu44zE",
        "fee_tier": 0.0004
    },
    "../../datasets/pyusd_usdc_pool/pyusd_usdc_pool_swaps.csv": {
        "pool_address": "9tXiuRRw7kbejLhZXtxDxYs2REe43uH2e7k1kocgdM9B",
        "fee_tier": 0.0001
    },
    # NOTE(review): the drift/jitosol pool below is disabled; the reason is not recorded here.
    # "../../datasets/drift_jitosol_pool/drift_jitosol_pool_swaps.csv": {
    #     "pool_address": "7u3wk63dbFfN6WUdxpJ6SDNMwDixK1ti2J3Q21ws5Vxs",
    #     "fee_tier": 0.0016
    # },
    "../../datasets/fart_usdc_pool/fart_usdc_with_price_final.csv": {
        "pool_address": "J5jzvT22u1Mt6de4gkBhEsTSTjBfYS7A6aF5jzu9ihkC",
        "fee_tier": 0.0005
    },
    "../../datasets/sol_weth_pool/sol_weth_pool_swaps.csv": {
        "pool_address": "HktfL7iwGKT5QHjywQkcDnZXScoh811k7akrMZJkCcEF",
        "fee_tier": 0.0005
    },

}

In [36]:
def add_pre_aggregation_features(df: pd.DataFrame, pool_address: str, fee_tier: float,
                                 lp_fee_share: float = 0.87) -> pd.DataFrame:
    """Derive per-swap features for one pool's raw swap rows.

    The caller's frame is not modified: every derived column is added to a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Raw swap rows. Must contain ``slot``, ``block_time``, ``tx_signature``,
        ``token_amount_a``/``_b``, ``decimals_a``/``_b``, ``token_price_a``/``_b``
        and ``post_balance_a``/``_b``.
    pool_address : str
        Value written to the ``pool_address`` column of every row.
    fee_tier : float
        Fee as a fraction of ``volume_usd``; also written to the ``fee_tier`` column.
    lp_fee_share : float, default 0.87
        Fraction of ``fee_usd`` booked as ``lp_fee_usd``. The default is the value
        that used to be hard-coded here.

    Returns
    -------
    pd.DataFrame
        A new frame sorted ascending by ``block_time`` with ``slot``,
        ``block_time`` and ``tx_signature`` removed and the derived columns added.
    """
    out = df.copy()

    out['pool_address'] = pool_address
    out['fee_tier'] = fee_tier

    # Raw amounts are scaled down by 10**decimals to get UI (human-readable) units.
    out['token_amount_a_ui'] = out['token_amount_a'] / 10 ** out['decimals_a']
    out['token_amount_b_ui'] = out['token_amount_b'] / 10 ** out['decimals_b']

    # Volume is valued from the token A leg only.
    out['volume_usd'] = out['token_amount_a_ui'] * out['token_price_a']
    out['fee_usd'] = out['volume_usd'] * out['fee_tier']
    out['lp_fee_usd'] = out['fee_usd'] * lp_fee_share

    # block_time is interpreted as Unix epoch seconds.
    out['date'] = pd.to_datetime(out['block_time'], unit='s')
    out['price_ratio'] = out['token_price_a'] / out['token_price_b']

    # Balances are multiplied by price without decimal scaling
    # (assumes post_balance_* are already in UI units — TODO confirm against the CSV export).
    value_a = out['post_balance_a'] * out['token_price_a']
    value_b = out['post_balance_b'] * out['token_price_b']
    out['tvl_usd'] = value_a + value_b
    out['tvl_utilization'] = out['volume_usd'] / out['tvl_usd']
    out['balance_ratio'] = value_a / value_b
    out['balance_imbalance'] = (out['balance_ratio'] - 1.0).abs()

    # Stable sort: swaps sharing a block_time keep their input order, so the
    # first/last picks made during the per-minute aggregation are reproducible.
    out = out.sort_values(by='block_time', ascending=True, kind='mergesort')
    return out.drop(columns=['slot', 'block_time', 'tx_signature'])

def aggregate_date(df: pd.DataFrame):
    """Collapse per-swap rows into one-minute bars keyed on the ``date`` column.

    Identifier columns take the first value in each bar, flow columns
    (swap count, volume, fees, UI token amounts) are summed, ``pre_balance_*``
    takes the first value and ``post_balance_*`` the last, and every state or
    price column is emitted twice as ``<name>_start`` (first) and
    ``<name>_end`` (last).
    """
    spec = {}

    # Per-pool identifiers: take the first value seen in the bar.
    for name in ('pool_address', 'fee_tier',
                 'token_mint_a', 'token_mint_b',
                 'token_vault_a', 'token_vault_b'):
        spec[name] = (name, 'first')

    # Flow quantities accumulate over the bar.
    for name in ('num_swaps', 'volume_usd', 'fee_usd', 'lp_fee_usd'):
        spec[name] = (name, 'sum')

    # Summed token amounts come from the UI-scaled columns but keep the plain name.
    for side in ('a', 'b'):
        spec[f'token_amount_{side}'] = (f'token_amount_{side}_ui', 'sum')

    # Opening balance of the bar, then closing balance.
    for side in ('a', 'b'):
        spec[f'pre_balance_{side}'] = (f'pre_balance_{side}', 'first')
    for side in ('a', 'b'):
        spec[f'post_balance_{side}'] = (f'post_balance_{side}', 'last')

    # State/price columns are snapshotted at both ends of the bar.
    for name in ('tvl_usd', 'tvl_utilization', 'balance_ratio', 'balance_imbalance',
                 'token_price_a', 'token_price_b',
                 'token_ema_a', 'token_ema_b',
                 'price_ratio'):
        spec[f'{name}_start'] = (name, 'first')
        spec[f'{name}_end'] = (name, 'last')

    per_minute = df.resample('1min', on='date')
    return per_minute.agg(**spec)


def post_aggregation_features(df: pd.DataFrame, window: int = 5) -> pd.DataFrame:
    """Add per-bar derived features to an aggregated (one row per time bar) frame.

    The caller's frame is not modified: every derived column is added to a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Aggregated bars. Must contain the ``*_start``/``*_end`` price, EMA,
        price-ratio and TVL columns plus ``token_amount_a``/``_b``,
        ``pre_balance_a``/``_b``, ``post_balance_a``/``_b``, ``volume_usd``,
        ``num_swaps`` and ``lp_fee_usd``.
    window : int, default 5
        Number of consecutive bars used for the rolling standard deviation of the
        per-bar price changes. The first ``window - 1`` bars, and any window that
        contains a NaN price change, yield NaN.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the derived feature columns appended.
    """
    eps = 1e-8  # keeps the ratio denominators away from exactly zero
    out = df.copy()

    # Relative price move of each token inside the bar.
    out['pct_change_token_a'] = (out['token_price_a_end'] - out['token_price_a_start']) / out['token_price_a_start']
    out['pct_change_token_b'] = (out['token_price_b_end'] - out['token_price_b_start']) / out['token_price_b_start']

    # Relative gap between the closing price and the closing EMA.
    out['price_ema_deviation_a'] = (out['token_price_a_end'] - out['token_ema_a_end']) / out['token_ema_a_end']
    out['price_ema_deviation_b'] = (out['token_price_b_end'] - out['token_ema_b_end']) / out['token_ema_b_end']

    # Rolling volatility of the in-bar price changes and the ratio between both tokens.
    out['vol_token_a'] = out['pct_change_token_a'].rolling(window).std()
    out['vol_token_b'] = out['pct_change_token_b'].rolling(window).std()
    out['vol_ratio'] = out['vol_token_a'] / (out['vol_token_b'] + eps)

    out['price_ratio_change'] = (out['price_ratio_end'] - out['price_ratio_start']) / out['price_ratio_start']

    # Traded amount relative to the balance held at the start of the bar.
    out['utilization_a'] = out['token_amount_a'].abs() / out['pre_balance_a']
    out['utilization_b'] = out['token_amount_b'].abs() / out['pre_balance_b']
    out['tvl_change'] = (out['tvl_usd_end'] - out['tvl_usd_start']) / out['tvl_usd_start']

    out['volume_per_swap'] = out['volume_usd'] / out['num_swaps']

    # Value of the opening vs. closing balances, both priced at the bar's end prices.
    # NOTE(review): this compares pool-wide balances, not a single LP position —
    # confirm it matches the intended definition of impermanent loss.
    value_hodl = out['pre_balance_a'] * out['token_price_a_end'] + out['pre_balance_b'] * out['token_price_b_end']
    value_lp = out['post_balance_a'] * out['token_price_a_end'] + out['post_balance_b'] * out['token_price_b_end']
    out['impermanent_loss'] = value_lp / value_hodl - 1

    out['avg_swap_size_a'] = out['token_amount_a'] / out['num_swaps']
    out['avg_swap_size_b'] = out['token_amount_b'] / out['num_swaps']

    out['fee_revenue_per_swap'] = out['lp_fee_usd'] / out['num_swaps']
    out['fee_efficiency'] = out['lp_fee_usd'] / (out['volume_usd'] + eps)
    out['revenue_per_liquidity'] = out['lp_fee_usd'] / out['tvl_usd_start']

    return out

In [37]:
# Build the per-minute feature table for every pool in pool_info.
# Frames are collected in a list and concatenated once: calling pd.concat inside
# the loop re-copies all previous rows on each iteration, and seeding it with an
# empty DataFrame triggers pandas' empty-entry concat deprecation warning.
pool_frames = []
for pool, info in pool_info.items():
    df = pd.read_csv(pool)
    df = add_pre_aggregation_features(df, info['pool_address'], info['fee_tier'])
    df = aggregate_date(df)
    df = post_aggregation_features(df)
    pool_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no pool is configured.
df_all = pd.concat(pool_frames) if pool_frames else pd.DataFrame()

In [38]:
# Size of the combined per-minute table before rows with missing values are dropped.
df_all.shape

(246559, 52)

In [39]:
# Keep only fully populated rows; rebinding replaces inplace=True, which pandas discourages.
# NOTE(review): the shapes shown around this cell go from 246559 to 11597 rows, so about
# 95% of the bars are discarded here — presumably empty one-minute bars and rolling-window
# warm-up rows; confirm that this loss is intended.
df_all = df_all.dropna()

In [40]:
# Size of the combined table after rows with missing values were dropped.
df_all.shape

(11597, 52)

In [19]:
# NOTE(review): stale cell. Its execution count (19) is lower than the cells above, and
# the saved output still lists slot/block_time/tx_signature with `date` as the index,
# which the current pipeline functions do not produce. Re-run after a kernel restart or remove.
df.head()

Unnamed: 0_level_0,slot,block_time,tx_signature,token_mint_a,token_mint_b,token_vault_a,token_vault_b,num_swaps,token_amount_a,token_amount_b,pre_balance_a,pre_balance_b,post_balance_a,post_balance_b,decimals_a,decimals_b,token_price_a,token_price_b,token_ema_a,token_ema_b,pool_address,fee_tier,token_amount_a_ui,token_amount_b_ui,volume_usd,fee_usd,lp_fee_usd,price_ratio,tvl_usd,tvl_utilization,balance_ratio,balance_imbalance
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
2025-10-14 16:20:42,373348755,1760458842,4uPbbFd37Rqyy8QFSw6KivndYWQteLMSAUabKcdAuYHNCD...,So11111111111111111111111111111111111111112,2zMMhcVQEXDtdE6vsFS7S7D5oUodfJHE8vd1gnBouauv,J757hq9DXGPDYfCoeGpTcD9A71NFgNqBRMXHrdVGyRxK,SdFLxX6sWTkKWje3Xb4YNewbm5ieaj3tfEJYeLTyqyg,1,2795627,22913485,14387.629156,190247100.0,14387.62636,190247100.0,9,6,204.702406,0.025035,198.292343,0.024316,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,0.002796,22.913485,0.572272,0.000286,0.000249,8176.802421,7707929.0,7.424453e-08,0.618379,0.381621
2025-10-14 16:19:34,373348586,1760458774,3r5TJs6otoo8enW1VgkXULf2UzdDsoKYtqVd8LQ4mtxBPr...,So11111111111111111111111111111111111111112,2zMMhcVQEXDtdE6vsFS7S7D5oUodfJHE8vd1gnBouauv,J757hq9DXGPDYfCoeGpTcD9A71NFgNqBRMXHrdVGyRxK,SdFLxX6sWTkKWje3Xb4YNewbm5ieaj3tfEJYeLTyqyg,1,3010327305,24669982850,14390.639483,190222400.0,14387.629156,190247100.0,9,6,202.933575,0.024761,198.224513,0.024307,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,3.010327,24669.98285,610.896482,0.305448,0.26574,8195.597847,7630497.0,8.005986e-05,0.6198,0.3802
2025-10-14 16:19:34,373348586,1760458774,aAMzbNpGKempL8zZFpMmHeaRtjxtXHtV2nxE2f6j294yhK...,So11111111111111111111111111111111111111112,2zMMhcVQEXDtdE6vsFS7S7D5oUodfJHE8vd1gnBouauv,J757hq9DXGPDYfCoeGpTcD9A71NFgNqBRMXHrdVGyRxK,SdFLxX6sWTkKWje3Xb4YNewbm5ieaj3tfEJYeLTyqyg,1,1124893147,9216977980,14391.764376,190213200.0,14390.639483,190222400.0,9,6,202.933575,0.024761,198.224513,0.024307,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,1.124893,9216.97798,228.278588,0.114139,0.099301,8195.597847,7630497.0,2.991661e-05,0.62001,0.37999
2025-10-14 16:19:18,373348546,1760458758,5nHYm87ZnisgxQdWpHcHWikWbLjy7TnD1LhDaBLuuwPm7Z...,So11111111111111111111111111111111111111112,2zMMhcVQEXDtdE6vsFS7S7D5oUodfJHE8vd1gnBouauv,J757hq9DXGPDYfCoeGpTcD9A71NFgNqBRMXHrdVGyRxK,SdFLxX6sWTkKWje3Xb4YNewbm5ieaj3tfEJYeLTyqyg,1,1093463768,8958595270,14392.85784,190204300.0,14391.764376,190213200.0,9,6,202.651126,0.024727,198.207664,0.024305,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,1.093464,8958.59527,221.591664,0.110796,0.096392,8195.550291,7619904.0,2.908064e-05,0.620085,0.379915
2025-10-14 16:19:17,373348543,1760458757,5UdTruM4LhsDHEUdJWtS8Bed9LRrGwac1yMnAmAFYbjHrd...,So11111111111111111111111111111111111111112,2zMMhcVQEXDtdE6vsFS7S7D5oUodfJHE8vd1gnBouauv,J757hq9DXGPDYfCoeGpTcD9A71NFgNqBRMXHrdVGyRxK,SdFLxX6sWTkKWje3Xb4YNewbm5ieaj3tfEJYeLTyqyg,1,1124018075,9208037376,14393.981858,190195000.0,14392.85784,190204300.0,9,6,202.650163,0.024728,198.206866,0.024304,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,1.124018,9208.037376,227.782446,0.113891,0.099085,8195.133508,7620107.0,2.989229e-05,0.62013,0.37987


In [12]:
# NOTE(review): stale cell. Its execution count is 12 and the saved output stops at the
# `date` column (no price_ratio/tvl columns), so it appears to predate the current
# feature functions. Re-run after a kernel restart or remove.
df.head()

Unnamed: 0,slot,block_time,tx_signature,token_mint_a,token_mint_b,token_vault_a,token_vault_b,num_swaps,token_amount_a,token_amount_b,pre_balance_a,pre_balance_b,post_balance_a,post_balance_b,decimals_a,decimals_b,token_price_a,token_price_b,token_ema_a,token_ema_b,pool_address,fee_tier,token_amount_a_ui,token_amount_b_ui,volume_usd,fee_usd,lp_fee_usd,date
0,372117422,1759970010,voFRsYQztPxrn8Up585cw57YZ1FynubwpicBzc7Sy8gJnL...,So11111111111111111111111111111111111111112,hntyVP6YFm1Hg25TN9WGLqM12b8TQmcknKrdu1oxWux,BW18gGtaQjD2VgNmmWXBeqHu22k4KC4WamWyEP3S7PNU,BvquGcdP4bVHfb6RxBxUSaS2kwm36FPHMadACfYKMp6t,1,615874003,5636897771,264.81411,73738.018574,265.429984,73681.649597,9,8,228.308546,2.497639,228.811711,2.499824,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,0.615874,56.368978,140.609298,0.070305,0.061165,2025-10-09 00:33:30
1,372117410,1759970005,3D99LotRruAu3DKzXy8zv4tu28v5dwpAdFagx2wzQT3j8L...,So11111111111111111111111111111111111111112,hntyVP6YFm1Hg25TN9WGLqM12b8TQmcknKrdu1oxWux,BW18gGtaQjD2VgNmmWXBeqHu22k4KC4WamWyEP3S7PNU,BvquGcdP4bVHfb6RxBxUSaS2kwm36FPHMadACfYKMp6t,1,59736975,546843607,264.754373,73743.487011,264.81411,73738.018574,9,8,228.332011,2.497721,228.812321,2.499827,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,0.059737,5.468436,13.639864,0.00682,0.005933,2025-10-09 00:33:25
2,372117210,1759969926,4jD9zFY8wTNkE9Xw6YVATdcyTfr9Pq7UPZn81LzNp7wqCp...,So11111111111111111111111111111111111111112,hntyVP6YFm1Hg25TN9WGLqM12b8TQmcknKrdu1oxWux,BW18gGtaQjD2VgNmmWXBeqHu22k4KC4WamWyEP3S7PNU,BvquGcdP4bVHfb6RxBxUSaS2kwm36FPHMadACfYKMp6t,1,1409922915,12911334671,263.34445,73872.600357,264.754373,73743.487011,9,8,228.9851,2.498548,228.814718,2.499862,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,1.409923,129.113347,322.85134,0.161426,0.14044,2025-10-09 00:32:06
3,372117112,1759969888,2KoUoEE6eNJbLKNbzB6SSWV9MDTDuXyiNLi3F9KWwY4ULk...,So11111111111111111111111111111111111111112,hntyVP6YFm1Hg25TN9WGLqM12b8TQmcknKrdu1oxWux,BW18gGtaQjD2VgNmmWXBeqHu22k4KC4WamWyEP3S7PNU,BvquGcdP4bVHfb6RxBxUSaS2kwm36FPHMadACfYKMp6t,1,231058967,2118644068,263.575509,73851.413917,263.34445,73872.600357,9,8,229.063501,2.499054,228.812174,2.499868,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,0.231059,21.186441,52.927176,0.026464,0.023023,2025-10-09 00:31:28
4,372117071,1759969872,2H5s9zUcuSc93Akphew19gTZc8UpSjC8RzoYLRfofxMCto...,So11111111111111111111111111111111111111112,hntyVP6YFm1Hg25TN9WGLqM12b8TQmcknKrdu1oxWux,BW18gGtaQjD2VgNmmWXBeqHu22k4KC4WamWyEP3S7PNU,BvquGcdP4bVHfb6RxBxUSaS2kwm36FPHMadACfYKMp6t,1,22885893,209834120,263.598395,73849.315575,263.575509,73851.413917,9,8,228.977706,2.498333,228.811211,2.499872,CSP4RmB6kBHkKGkyTnzt9zYYXDA8SbZ5Do5WfZcjqjE4,0.0005,0.022886,2.098341,5.240359,0.00262,0.00228,2025-10-09 00:31:12
