In [8]:
from mainnet_launch.data_fetching.get_state_by_block import (
    get_state_by_one_block,
    build_blocks_to_use,
    get_raw_state_by_blocks,
    safe_normalize_6_with_bool_success,
    safe_normalize_with_bool_success,
)

import os
import pandas as pd
from tqdm import tqdm
import concurrent.futures as cf

from mainnet_launch.database.postgres_operations import (
    get_full_table_as_df_with_block,
    get_full_table_as_df,
    get_full_table_as_df_with_tx_hash,
)
from mainnet_launch.database.schema.full import (
    DestinationStates,
    Destinations,
    AutopoolDestinations,
    RebalanceEvents,
    RebalancePlans,
    Blocks,
)
import pandas as pd

from multicall import Call
import numpy as np
import time
import datetime
from mainnet_launch.data_fetching.defi_llama.fetch_timestamp import fetch_blocks_by_unix_timestamps_defillama

from mainnet_launch.constants import (
    AUTO_USD,
    ETH_CHAIN,
    AutopoolConstants,
    AUTO_DOLA,
    BASE_USD,
    BASE_CHAIN,
    ALL_AUTOPOOLS,
)

import plotly.express as px
import plotly.io as pio

pio.templates.default = None


def _extract_limited_events_data(
    autopool: AutopoolConstants,
    events: pd.DataFrame,
    plans: pd.DataFrame,
    destination_states: pd.DataFrame,
    destinations: pd.DataFrame,
) -> pd.DataFrame:

    limited_events_df = events[
        ["destination_in", "destination_out", "block", "safe_value_out", "rebalance_file_path"]
    ].copy()

    get_fee_and_base_apr = destination_states.set_index(["destination_vault_address", "rebalance_plan_key"])[
        "fee_plus_base_apr"
    ].to_dict()

    limited_events_df["fee_and_base_out"] = limited_events_df.apply(
        lambda row: get_fee_and_base_apr.get((row["destination_out"], row["rebalance_file_path"]), None), axis=1
    )
    limited_events_df["fee_and_base_in"] = limited_events_df.apply(
        lambda row: get_fee_and_base_apr.get((row["destination_in"], row["rebalance_file_path"]), None), axis=1
    )

    destination_names = destinations.set_index("destination_vault_address")["underlying_name"].to_dict()
    exchange_names = destinations.set_index("destination_vault_address")["exchange_name"].to_dict()
    pool_addresses = destinations.set_index("destination_vault_address")["pool"].to_dict()

    limited_events_df["destination_in_name"] = limited_events_df["destination_in"].map(destination_names)
    limited_events_df["destination_out_name"] = limited_events_df["destination_out"].map(destination_names)
    limited_events_df["out_exchange_name"] = limited_events_df["destination_out"].map(exchange_names)
    limited_events_df["in_exchange_name"] = limited_events_df["destination_in"].map(exchange_names)
    limited_events_df["pool_in"] = limited_events_df["destination_in"].map(pool_addresses)
    limited_events_df["pool_out"] = limited_events_df["destination_out"].map(pool_addresses)

    # Join limited_events_df with plans on rebalance_file_path = file_name
    limited_events_df = limited_events_df.merge(plans, left_on="rebalance_file_path", right_on="file_name", how="left")

    return limited_events_df


def load_data(autopool: AutopoolConstants):
    destinations = get_full_table_as_df(Destinations, where_clause=Destinations.chain_id == autopool.chain.chain_id)
    autopool_destinations = get_full_table_as_df(
        AutopoolDestinations, where_clause=AutopoolDestinations.autopool_vault_address == autopool.autopool_eth_addr
    )
    # 2 min to fetch
    destination_states = get_full_table_as_df_with_block(
        DestinationStates,
        where_clause=DestinationStates.destination_vault_address.in_(
            destinations["destination_vault_address"].tolist()
        ),
    )
    plans = get_full_table_as_df(
        RebalancePlans, where_clause=RebalancePlans.autopool_vault_address == autopool.autopool_eth_addr
    )
    events = get_full_table_as_df_with_tx_hash(
        RebalanceEvents, where_clause=RebalanceEvents.autopool_vault_address == autopool.autopool_eth_addr
    )
    mainnet_blocks = get_full_table_as_df(Blocks, where_clause=Blocks.chain_id == autopool.chain.chain_id).sort_values(
        "block"
    )

    limited_events_df = _extract_limited_events_data(autopool, events, plans, destination_states, destinations)
    return destinations, autopool_destinations, destination_states, plans, events, mainnet_blocks, limited_events_df


VP_METHODS = [
    ("getRate", ["getRate()(uint256)"], None),
    ("get_virtual_price", ["get_virtual_price()(uint256)"], None),
    ("convertToAssets_1e18", ["convertToAssets(uint256)(uint256)", int(10**18)], int(10**18)),
    ("stEthPerToken", ["stEthPerToken()(uint256)"], None),
]


def build_vp_calls(pool_address: str):
    calls = []
    for suffix, fn, _ in VP_METHODS:
        key = f"{pool_address}:{suffix}"
        calls.append(
            Call(
                target=pool_address,
                function=fn,
                returns=[(key, safe_normalize_with_bool_success)],
            )
        )
    return calls


def _get_working_virtual_price_column(df: pd.DataFrame, cols_in_priority: list[str]) -> pd.Series:
    for col in cols_in_priority:
        if not any(df[col].isna()):
            return df[col]
    print(df[cols_in_priority])

    raise ValueError("could not identify working virtual price column")


def compute_apr(vp_df: pd.DataFrame) -> pd.DataFrame:

    t0 = vp_df.index[0]
    days = (vp_df.index - t0).total_seconds() / 86400.0

    out0 = vp_df["out_vp"].iloc[0]
    in0 = vp_df["in_vp"].iloc[0]

    # annualized % using actual elapsed days; guard day=0 at start
    vp_df["out_ann_pct"] = np.where(days > 0, ((vp_df["out_vp"] / out0) ** (365.0 / days) - 1.0), np.nan)
    vp_df["in_ann_pct"] = np.where(days > 0, ((vp_df["in_vp"] / in0) ** (365.0 / days) - 1.0), np.nan)

    return vp_df[["block", "out_vp", "in_vp", "out_ann_pct", "in_ann_pct"]]


def _fetch_vp_df(blocks_to_query: list[int], row: pd.Series, autopool: AutopoolConstants) -> pd.DataFrame:
    out_addr = row["pool_out"]
    in_addr = row["pool_in"]

    calls_to_make = []
    calls_to_make += build_vp_calls(out_addr)
    calls_to_make += build_vp_calls(in_addr)

    vp_df = get_raw_state_by_blocks(
        calls_to_make,
        blocks_to_query,
        autopool.chain,
        include_block_number=True,
    )

    # Coalesce per destination in the same priority order as VP_METHODS
    out_cols = [f"{out_addr}:{suffix}" for suffix, _, _ in VP_METHODS]
    in_cols = [f"{in_addr}:{suffix}" for suffix, _, _ in VP_METHODS]

    vp_df["out_vp"] = _get_working_virtual_price_column(vp_df, out_cols)
    vp_df["in_vp"] = _get_working_virtual_price_column(vp_df, in_cols)

    apr_df = compute_apr(vp_df)
    return apr_df


def determine_forward_looking_vp(autopool: AutopoolConstants, row: pd.Series):
    try:
        start_block = int(row["block"])
        chain_to_approx_blocks_per_day = {
            ETH_CHAIN: 7150,  # crude approx
            BASE_CHAIN: 43200,  # crude approx
        }
        approx_blocks_per_day = chain_to_approx_blocks_per_day[autopool.chain]

        block_30_days = start_block + (approx_blocks_per_day * 30)
        block_60_days = start_block + (approx_blocks_per_day * 60)

        today_block = autopool.chain.get_block_near_top()

        if (block_30_days > today_block) or (block_60_days > today_block):
            return {
                **row,
                "valid": False,
            }

        blocks_to_query = [start_block, block_30_days, block_60_days]

        apr_df = _fetch_vp_df(blocks_to_query, row, autopool)

        actual_30_day_fee_and_base_out = apr_df.loc[apr_df["block"] == block_30_days, "out_ann_pct"].values[0]
        actual_60_day_fee_and_base_out = apr_df.loc[apr_df["block"] == block_60_days, "out_ann_pct"].values[0]

        actual_30_day_fee_and_base_in = apr_df.loc[apr_df["block"] == block_30_days, "in_ann_pct"].values[0]
        actual_60_day_fee_and_base_in = apr_df.loc[apr_df["block"] == block_60_days, "in_ann_pct"].values[0]

        block_timstamp_30_days = apr_df["block"].loc[apr_df["block"] == block_30_days].index[0]
        block_timstamp_60_days = apr_df["block"].loc[apr_df["block"] == block_60_days].index[0]

        return {
            **row,
            "actual_30_day_fee_and_base_out": actual_30_day_fee_and_base_out,
            "actual_60_day_fee_and_base_out": actual_60_day_fee_and_base_out,
            "actual_30_day_fee_and_base_in": actual_30_day_fee_and_base_in,
            "actual_60_day_fee_and_base_in": actual_60_day_fee_and_base_in,
            "start_block": start_block,
            "block_30_days": block_30_days,
            "block_60_days": block_60_days,
            "timestamp_30_days": block_timstamp_30_days,
            "timestamp_60_days": block_timstamp_60_days,
            "valid": True,
        }
    except Exception as e:
        print(f"Error processing row with block {row['block']}: {e}")
        return {
            **row,
            "valid": False,
            "error": str(e),
            "error_type": type(e).__name__,
        }


def add_virtual_price_values_no_threads(autopool: AutopoolConstants, limited_events_df: pd.DataFrame) -> pd.DataFrame:
    new_rows = limited_events_df.apply(lambda row: determine_forward_looking_vp(autopool, row), axis=1).tolist()
    all_results_df = pd.DataFrame.from_records(new_rows)

    all_results_df.loc[
        all_results_df["destination_out_x"] == autopool.autopool_eth_addr,
        ["actual_30_day_fee_and_base_out", "actual_60_day_fee_and_base_out"],
    ] = 0
    all_results_df.loc[
        all_results_df["destination_in_x"] == autopool.autopool_eth_addr,
        ["actual_30_day_fee_and_base_in", "actual_60_day_fee_and_base_in"],
    ] = 0

    return all_results_df


# broken

for autopool in ALL_AUTOPOOLS:
    if autopool.chain in [ETH_CHAIN, BASE_CHAIN]:
        destinations, autopool_destinations, destination_states, plans, events, mainnet_blocks, limited_events_df = (
            load_data(autopool)
        )
        print(autopool.name)
        print(
            f"shapes: destinations={destinations.shape} autopool_destinations={autopool_destinations.shape} destination_states={destination_states.shape} plans={plans.shape} events={events.shape} mainnet_blocks={mainnet_blocks.shape} limited_events_df={limited_events_df.shape}"
        )
        full_df = add_virtual_price_values_no_threads(autopool, limited_events_df.reset_index())
        df = full_df[full_df["valid"] == True]
        timestamp_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{autopool.name}_fee_and_base_apr_{timestamp_str}.csv"
        full_df.to_csv(filename)
        print(full_df["valid"].value_counts())
        print(f"Loaded {full_df.shape} rebalance events for {autopool.name}")




# for autopool in ALL_AUTOPOOLS:
#     if autopool.chain in [ETH_CHAIN, BASE_CHAIN]:
#         try:
#             destinations, autopool_destinations, destination_states, plans, events, mainnet_blocks, limited_events_df = (
#                 load_data(autopool)
#             )
#             print(autopool.name)
#             print(
#                 f"shapes: destinations={destinations.shape} autopool_destinations={autopool_destinations.shape} destination_states={destination_states.shape} plans={plans.shape} events={events.shape} mainnet_blocks={mainnet_blocks.shape} limited_events_df={limited_events_df.shape}"
#             )
#             full_df = add_virtual_price_values_no_threads(autopool, limited_events_df.reset_index())
#             df = full_df[full_df["valid"] == True]
#             timestamp_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
#             filename = f"{autopool.name}_fee_and_base_apr_{timestamp_str}.csv"
#             full_df.to_csv(filename)
#             print(full_df["valid"].value_counts())
#             print(f"Loaded {full_df.shape} rebalance events for {autopool.name}")
#         except Exception as e:
#             print(f"Error processing autopool {autopool.name}: {e}")


autoETH
shapes: destinations=(157, 12) autopool_destinations=(55, 3) destination_states=(298568, 17) plans=(1623, 28) events=(649, 14) mainnet_blocks=(139820, 3) limited_events_df=(649, 41)
valid
True     636
False     13
Name: count, dtype: int64
Loaded (649, 52) rebalance events for autoETH
balETH
shapes: destinations=(157, 12) autopool_destinations=(37, 3) destination_states=(298568, 17) plans=(1133, 28) events=(298, 14) mainnet_blocks=(139820, 3) limited_events_df=(298, 41)
valid
True    298
Name: count, dtype: int64
Loaded (298, 52) rebalance events for balETH
autoLRT
shapes: destinations=(157, 12) autopool_destinations=(19, 3) destination_states=(298568, 17) plans=(853, 28) events=(243, 14) mainnet_blocks=(139820, 3) limited_events_df=(243, 41)
valid
True    243
Name: count, dtype: int64
Loaded (243, 52) rebalance events for autoLRT
baseETH
shapes: destinations=(41, 12) autopool_destinations=(10, 3) destination_states=(97009, 17) plans=(5657, 28) events=(264, 14) mainnet_blocks=(

In [None]:
full_df.groupby("destination_out_name")["valid"].value_counts().reset_index()

In [None]:
break

In [12]:
# Create scatter plots for both 30-day and 60-day comparisons
for period in ["30_day", "60_day"]:
    # Plot for "in" destinations
    fig_in = px.scatter(
        df,
        x=f"fee_and_base_in",
        y=f"actual_{period}_fee_and_base_in",
        color="destination_in_name",
        title=f"Expected vs Actual {period.replace('_', '-').title()} Fee+Base APR (In)",
    )
    fig_in.add_trace(
        px.line(x=[0, 0.1], y=[0, 0.1]).data[0].update(line=dict(dash="dash", color="gray"), showlegend=False)
    )
    fig_in.show()

    # Plot for "out" destinations (only if fee_and_base_out exists)
    df_with_out = df.dropna(subset=["fee_and_base_out"])
    if len(df_with_out) > 0:
        fig_out = px.scatter(
            df_with_out,
            x=f"fee_and_base_out",
            y=f"actual_{period}_fee_and_base_out",
            color="destination_out_name",
            title=f"Expected vs Actual {period.replace('_', '-').title()} Fee+Base APR (Out)",
        )
        fig_out.add_trace(
            px.line(x=[0, 0.1], y=[0, 0.1]).data[0].update(line=dict(dash="dash", color="gray"), showlegend=False)
        )
        fig_out.show()

In [10]:
# one limitation could be, we can only predict at the .1% level, no way we are accurate closer than that

In [11]:
from plotly.subplots import make_subplots

import plotly.graph_objects as go

# Determine common x-axis range
x_min = min(all_results_df["actual_30_day_fee_and_base_in"].min(), all_results_df["fee_and_base_in"].min())
x_max = max(all_results_df["actual_30_day_fee_and_base_in"].max(), all_results_df["fee_and_base_in"].max())

# Create subplots with 2 rows and 1 column
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        "Distribution of actual fee + base APR we enter",
        "Distribution of expected fee + base APR we enter",
    ),
)

# Create histograms
fig1 = px.histogram(all_results_df, x="actual_30_day_fee_and_base_in")
fig2 = px.histogram(all_results_df, x="fee_and_base_in")

# Add traces
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)

for trace in fig2.data:
    fig.add_trace(trace, row=2, col=1)

# Update x-axes to have the same range
fig.update_xaxes(range=[x_min, x_max], row=1, col=1)
fig.update_xaxes(range=[x_min, x_max], row=2, col=1)

# Update layout
fig.update_xaxes(title_text="Fee+Base APR", row=2, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

fig.update_layout(height=700, showlegend=False)
fig.show()

NameError: name 'all_results_df' is not defined

In [None]:
sub_df = all_results_df.copy().dropna(subset=["actual_30_day_fee_and_base_in"])
sub_df["difference"] = sub_df["actual_30_day_fee_and_base_in"] - sub_df["fee_and_base_in"]
sub_df = sub_df.sort_values("difference", ascending=False)
sub_df

In [None]:
px.ecdf(
    all_results_df.dropna(subset=["actual_30_day_fee_and_base_in"]),
    x=["actual_30_day_fee_and_base_in", "actual_60_day_fee_and_base_in", "fee_and_base_in"],
    title="ECDF of actual fee + base APR we enter",
).show()

In [None]:
all_results_df

In [None]:
# clip to 5%, prevent the worst outliers

In [None]:
from plotly.subplots import make_subplots

import plotly.graph_objects as go

# Create subplots with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2, subplot_titles=("Expected Fee+Base APR (In)", "Actual 30-Day Fee+Base APR (In)"))

# ECDF for expected (fee_and_base_in)
plot_df_expected = all_results_df.dropna(subset=["fee_and_base_in"]).copy()
fig_expected = px.ecdf(
    plot_df_expected,
    x="fee_and_base_in",
)
for trace in fig_expected.data:
    fig.add_trace(trace, row=1, col=1)

# ECDF for actual 30-day (actual_30_day_fee_and_base_in)
plot_df_actual = all_results_df.dropna(subset=["actual_30_day_fee_and_base_in"]).copy()
fig_actual = px.ecdf(plot_df_actual, x="actual_30_day_fee_and_base_in")
for trace in fig_actual.data:
    fig.add_trace(trace, row=1, col=2)

# Update layout
fig.update_xaxes(title_text="Fee+Base APR", row=1, col=1)
fig.update_xaxes(title_text="Fee+Base APR", row=1, col=2)
fig.update_yaxes(title_text="Cumulative Probability", row=1, col=1)
fig.update_yaxes(title_text="Cumulative Probability", row=1, col=2)

fig.update_layout(title_text="Expected vs Actual Fee+Base APR Distribution (In)", height=500, showlegend=True)

fig.show()

In [None]:
break

In [None]:
all_results_df["30_day_out_diff"] = (
    all_results_df["actual_30_day_fee_and_base_out"] - all_results_df["fee_and_base_out"]
)
all_results_df["60_day_out_diff"] = (
    all_results_df["actual_60_day_fee_and_base_out"] - all_results_df["fee_and_base_out"]
)
all_results_df["30_day_in_diff"] = all_results_df["actual_30_day_fee_and_base_in"] - all_results_df["fee_and_base_in"]
all_results_df["60_day_in_diff"] = all_results_df["actual_60_day_fee_and_base_in"] - all_results_df["fee_and_base_in"]


import plotly.express as px

plot_df = all_results_df.dropna(subset=["destination_out_name", "30_day_out_diff"]).copy()

fig = px.ecdf(
    plot_df,
    x="30_day_out_diff",
    color="destination_out_name",  # one ECDF line per destination
    title="Difference between Actual and Planned 30-Day Fee+Base APR (Out)",
)

fig.update_layout(
    legend_title_text="Destination (out)",
)
print("diff >0 means actual > expected")
print("diff <0 means actual < expected")
fig.show()

In [None]:
plot_df = all_results_df.dropna(subset=["destination_in_name", "30_day_in_diff"]).copy()

fig = px.ecdf(
    plot_df,
    x="30_day_in_diff",
    color="destination_in_name",
    title="Difference between Actual and Planned 30-Day Fee+Base APR (In)",
)

fig.update_layout(
    legend_title_text="Destination (in)",
)
print("diff >0 means actual > expected")
print("diff <0 means actual < expected")
fig.show()

In [None]:
# maybe we should be doing the lowest hanging fruit first? like the ones that are way off?
# Calculate absolute differences for sorting
all_results_df["abs_30_day_out_diff"] = all_results_df["30_day_out_diff"].abs()
all_results_df["abs_60_day_out_diff"] = all_results_df["60_day_out_diff"].abs()
all_results_df["abs_30_day_in_diff"] = all_results_df["30_day_in_diff"].abs()
all_results_df["abs_60_day_in_diff"] = all_results_df["60_day_in_diff"].abs()

# Find the most off predictions for each category
print("=" * 80)
print("TOP 10 WORST PREDICTIONS (60-day OUT)")
print("=" * 80)
all_results_df.columns

In [None]:
interesting_cols = [
    "destination_in_name",
    "destination_out_name",
    "actual_30_day_fee_and_base_out",
    "actual_60_day_fee_and_base_out",
    "actual_30_day_fee_and_base_in",
    "actual_60_day_fee_and_base_in",
    "fee_and_base_out",
    "fee_and_base_in",
    "30_day_out_diff",
    "60_day_out_diff",
    "30_day_in_diff",
    "60_day_in_diff",
    "abs_30_day_out_diff",
    "block",
    "safe_value_out",
]

all_results_df.sort_values(by="abs_60_day_out_diff", ascending=False)[interesting_cols].head(10)