In [168]:
from glob import glob

import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
pio.templates.default = None

# Read each CSV file into a dictionary with keys based on the filename
dfs = {}
for path in glob('*.csv'):
    # Extract the key from the filename (e.g., 'autoLRT' from 'autoLRT_fee_and_base_apr_20260123_132442.csv')
    key = path.split('_')[0]
    df =pd.read_csv(path, index_col=0)
    df['autopool'] = key
    dfs[key] = df
    

# Concatenate all dataframes, keeping track of autopool name
all_dfs = []
for key, df_temp in dfs.items():
    df_temp = df_temp.copy()
    df_temp['autopool'] = key
    all_dfs.append(df_temp)

df = pd.concat(all_dfs, ignore_index=True)
df.loc[df['rebalance_type'] == 'FromIdle', 'fee_and_base_out'] = 0.0

cols = ['fee_and_base_out', 'datetime_generated', 'fee_and_base_in', 'destination_in_name', 
    'destination_out_name', 'out_exchange_name', 'in_exchange_name', 'chain_id',
    'rebalance_type', 'amount_out_safe_value', 'min_amount_in_safe_value', 
    'out_dest_apr', 'min_amount_in_spot_value', 'in_dest_apr', 'in_dest_adj_apr', 
    'actual_30_day_fee_and_base_out', 'actual_60_day_fee_and_base_out', 
    'actual_30_day_fee_and_base_in', 'actual_60_day_fee_and_base_in', 
    'timestamp_30_days', 'timestamp_60_days', 'error', 'autopool']

df['measurement_diff_out'] = 100 * (df['fee_and_base_out'] - df['actual_30_day_fee_and_base_out'])
df['measurement_diff_in'] = 100 * (df['fee_and_base_in'] - df['actual_30_day_fee_and_base_in'])

# df = df[cols].copy()
# df['datetime_generated'] = pd.to_datetime(df['datetime_generated'], utc=True)
# df['timestamp_30_days'] = pd.to_datetime(df['timestamp_30_days'], utc=True)
# df['timestamp_60_days'] = pd.to_datetime(df['timestamp_60_days'], utc=True)
# sub_df = base_USD_df[['datetime_generated', 'amount_out_safe_value', 'destination_out_name', 'destination_in_name', 'fee_and_base_out', 'fee_and_base_in', 'actual_30_day_fee_and_base_out',  'actual_30_day_fee_and_base_in']].copy()

In [181]:
out_df = df[['datetime_generated', 'amount_out_safe_value', 'destination_out_name', 'fee_and_base_out', 'actual_30_day_fee_and_base_out', 'measurement_diff_out', 'autopool']].copy()
out_df['label'] = out_df['destination_out_name'] + ' - Out'
in_df = df[['datetime_generated', 'amount_out_safe_value', 'destination_in_name', 'fee_and_base_in', 'actual_30_day_fee_and_base_in', 'measurement_diff_in', 'autopool']].copy()
in_df['label'] = in_df['destination_in_name'] + ' - In'

out_df.rename(columns={'fee_and_base_out': 'estimated_fee_and_base_apr',
                       'actual_30_day_fee_and_base_out': 'actual_fee_and_base_apr',
                       'measurement_diff_out': 'measurement_diff',}, inplace=True)
in_df.rename(columns={'fee_and_base_in': 'estimated_fee_and_base_apr',
                      'actual_30_day_fee_and_base_in': 'actual_fee_and_base_apr',
                      'measurement_diff_in': 'measurement_diff'}, inplace=True)
cols = ['datetime_generated', 'amount_out_safe_value', 'label', 'estimated_fee_and_base_apr', 'actual_fee_and_base_apr', 'estimated_fee_and_base_apr', 'autopool']
wide_df = pd.concat([in_df[cols], out_df[cols]], ignore_index=True)

wide_df.isna().sum() / len(wide_df)  # check for NaNs

datetime_generated            0.063663
amount_out_safe_value         0.063663
label                         0.000000
estimated_fee_and_base_apr    0.242997
actual_fee_and_base_apr       0.079824
estimated_fee_and_base_apr    0.242997
autopool                      0.000000
dtype: float64

In [167]:
def build_single_box_plot_fee_and_base_apr_accuracy(df, autopool_name: str = "baseUSD"):
    sub_df = df[df["autopool"] == autopool_name].copy()

    out_df = (
        sub_df.loc[sub_df["destination_out_name"].astype(str).str.strip() != autopool_name,
                   ["measurement_diff_out", "destination_out_name"]]
        .rename(columns={"measurement_diff_out": "diff", "destination_out_name": "destination"})
    )
    out_df["side"] = "out"
    out_df["label"] = out_df["destination"].astype(str).str.strip() + " (out)"

    in_df = (
        sub_df.loc[sub_df["destination_in_name"].astype(str).str.strip() != autopool_name,
                   ["measurement_diff_in", "destination_in_name"]]
        .rename(columns={"measurement_diff_in": "diff", "destination_in_name": "destination"})
    )
    in_df["side"] = "in"
    in_df["label"] = in_df["destination"].astype(str).str.strip() + " (in)"

    long_df = pd.concat([out_df, in_df], ignore_index=True).dropna(subset=["diff", "label"])

    # stable ordering + consistent colors by base destination (ignores (out)/(in))
    base_labels = list(pd.Index(long_df["destination"].unique()))
    palette = px.colors.qualitative.Plotly
    base_color_map = {lab: palette[i % len(palette)] for i, lab in enumerate(base_labels)}
    long_df["base_dest"] = long_df["destination"].astype(str).str.strip()

    fig = px.box(
        long_df,
        x="diff",
        y="label",
        color="base_dest",                 # same destination = same color for out/in
        color_discrete_map=base_color_map,
        orientation="h",
        points="outliers",
        title=f"{autopool_name}: Fee and Base APR (Out/In) - Estimated vs Actual",
    )

    fig.update_xaxes(range=[-5, 5], zeroline=True, zerolinewidth=2)
    fig.update_layout(width=1800, height=900, margin=dict(l=320, r=260, t=70, b=50))

    fig.add_annotation(x=0.95, y=0.95, xref="paper", yref="paper",
                       text="Overestimated", showarrow=False,
                       font=dict(size=18, color="black"), align="right")
    fig.add_annotation(x=0.05, y=0.95, xref="paper", yref="paper",
                       text="Underestimated", showarrow=False,
                       font=dict(size=18, color="black"), align="right")

    return fig

fig = build_single_box_plot_fee_and_base_apr_accuracy(df, autopool_name='autoUSD')
fig.show()

In [162]:
# don't like the view here


from plotly.subplots import make_subplots

def build_side_by_side_box_plots_of_fee_and_base_apr_accuracy(df, autopool_name: str = 'baseUSD'):
    sub_df = df[df['autopool'] == autopool_name].copy()
    all_labels = (
    pd.Index(sub_df["destination_out_name"].dropna().astype(str).str.strip().unique())
    .append(pd.Index(sub_df["destination_in_name"].dropna().astype(str).str.strip().unique()))
    .unique()
)
    all_labels = list(all_labels)

    palette = px.colors.qualitative.Plotly
    color_map = {lab: palette[i % len(palette)] for i, lab in enumerate(all_labels)}



    def create_box_plot_with_annotations(df, data_column, color_col):
        if 'out' in data_column:
            df = df[df['destination_out_name'] != autopool_name].copy()
        elif 'in' in data_column:
            df = df[df['destination_in_name'] != autopool_name].copy()

        # ---- MINIMAL ADD: normalize labels so mapping matches exactly ----
        df = df.copy()
        df[color_col] = df[color_col].astype(str).str.strip()

        fig = px.box(
            df,
            x=data_column,
            y=color_col,
            color=color_col,
            color_discrete_map=color_map,
            category_orders={color_col: all_labels},   # ---- ADD: enforce same order ----
            orientation="h",                           # optional but usually what you want
        )


        return fig


    # Create subplot with 1 row and 2 columns
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(
            f'{autopool_name}: Fee and Base APR Out - Estimated vs Actual',
            f'{autopool_name}: Fee and Base APR In - Estimated vs Actual '
        )
    )

    # Create the out plot
    fig_out = create_box_plot_with_annotations(sub_df, 'measurement_diff_out', 'destination_out_name', )
    for trace in fig_out.data:
        fig.add_trace(trace, row=1, col=1)

    # Create the in plot
    fig_in = create_box_plot_with_annotations(sub_df, 'measurement_diff_in', 'destination_in_name')
    for trace in fig_in.data:
        fig.add_trace(trace, row=1, col=2)

    fig.update_layout(
        margin=dict(l=260, r=40, t=60, b=40),  # increase l as needed
    )
    fig.update_yaxes(showticklabels=False, row=1, col=2)
    fig.update_xaxes(range=[-5, 5], row=1, col=1)
    fig.update_xaxes(range=[-5, 5], row=1, col=2)
    fig.update_layout(
        width=1800,          # wider
        height=700,          # taller
    )

    fig.add_annotation(x=0.95, y=0.95, xref='paper', yref='paper',
                        text='Overestimated', showarrow=False,
                        font=dict(size=18, color='black'), align='right')
    fig.add_annotation(x=0.05, y=0.95, xref='paper', yref='paper',
                        text='Underestimated', showarrow=False,
                        font=dict(size=18, color='black'), align='right')


    return fig


fig = build_side_by_side_box_plots_of_fee_and_base_apr_accuracy(df, autopool_name='autoUSD')
fig.show()



In [None]:
break

In [None]:
fig = px.scatter(sub_df, x='fee_and_base_out', y='actual_30_day_fee_and_base_out', color='destination_out_name', title='baseUSD autopool: Fee and Base APR Out - Estimated vs Actual over 30 days', labels={'fee_and_base_out': 'Estimated Fee and Base APR Out', 'actual_30_day_fee_and_base_out': 'Actual Fee and Base APR Out over 30 days'})

fig.add_trace(
    px.line(x=[0, 0.1], y=[0, 0.1]).data[0].update(line=dict(dash="dash", color="gray"), showlegend=False)
)

In [None]:
break

In [None]:
# Create scatter plots for both 30-day and 60-day comparisons
for period in ["30_day", "60_day"]:
    # Plot for "in" destinations
    fig_in = px.scatter(
        df,
        x=f"fee_and_base_in",
        y=f"actual_{period}_fee_and_base_in",
        color="destination_in_name",
        title=f"Expected vs Actual {period.replace('_', '-').title()} Fee+Base APR (In)",
    )
    fig_in.add_trace(
        px.line(x=[0, 0.1], y=[0, 0.1]).data[0].update(line=dict(dash="dash", color="gray"), showlegend=False)
    )
    fig_in.show()

    # Plot for "out" destinations (only if fee_and_base_out exists)
    df_with_out = df.dropna(subset=["fee_and_base_out"])
    if len(df_with_out) > 0:
        fig_out = px.scatter(
            df_with_out,
            x=f"fee_and_base_out",
            y=f"actual_{period}_fee_and_base_out",
            color="destination_out_name",
            title=f"Expected vs Actual {period.replace('_', '-').title()} Fee+Base APR (Out)",
        )
        fig_out.add_trace(
            px.line(x=[0, 0.1], y=[0, 0.1]).data[0].update(line=dict(dash="dash", color="gray"), showlegend=False)
        )
        fig_out.show()

In [None]:
# one limitation could be, we can only predict at the .1% level, no way we are accurate closer than that

In [None]:
from plotly.subplots import make_subplots

import plotly.graph_objects as go

# Determine common x-axis range
x_min = min(all_results_df["actual_30_day_fee_and_base_in"].min(), all_results_df["fee_and_base_in"].min())
x_max = max(all_results_df["actual_30_day_fee_and_base_in"].max(), all_results_df["fee_and_base_in"].max())

# Create subplots with 2 rows and 1 column
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        "Distribution of actual fee + base APR we enter",
        "Distribution of expected fee + base APR we enter",
    ),
)

# Create histograms
fig1 = px.histogram(all_results_df, x="actual_30_day_fee_and_base_in")
fig2 = px.histogram(all_results_df, x="fee_and_base_in")

# Add traces
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)

for trace in fig2.data:
    fig.add_trace(trace, row=2, col=1)

# Update x-axes to have the same range
fig.update_xaxes(range=[x_min, x_max], row=1, col=1)
fig.update_xaxes(range=[x_min, x_max], row=2, col=1)

# Update layout
fig.update_xaxes(title_text="Fee+Base APR", row=2, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

fig.update_layout(height=700, showlegend=False)
fig.show()

In [None]:
sub_df = all_results_df.copy().dropna(subset=["actual_30_day_fee_and_base_in"])
sub_df["difference"] = sub_df["actual_30_day_fee_and_base_in"] - sub_df["fee_and_base_in"]
sub_df = sub_df.sort_values("difference", ascending=False)
sub_df

In [None]:
px.ecdf(
    all_results_df.dropna(subset=["actual_30_day_fee_and_base_in"]),
    x=["actual_30_day_fee_and_base_in", "actual_60_day_fee_and_base_in", "fee_and_base_in"],
    title="ECDF of actual fee + base APR we enter",
).show()

In [None]:
all_results_df

In [None]:
# clip to 5%, prevent the worst outliers

In [None]:
from plotly.subplots import make_subplots

import plotly.graph_objects as go

# Create subplots with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2, subplot_titles=("Expected Fee+Base APR (In)", "Actual 30-Day Fee+Base APR (In)"))

# ECDF for expected (fee_and_base_in)
plot_df_expected = all_results_df.dropna(subset=["fee_and_base_in"]).copy()
fig_expected = px.ecdf(
    plot_df_expected,
    x="fee_and_base_in",
)
for trace in fig_expected.data:
    fig.add_trace(trace, row=1, col=1)

# ECDF for actual 30-day (actual_30_day_fee_and_base_in)
plot_df_actual = all_results_df.dropna(subset=["actual_30_day_fee_and_base_in"]).copy()
fig_actual = px.ecdf(plot_df_actual, x="actual_30_day_fee_and_base_in")
for trace in fig_actual.data:
    fig.add_trace(trace, row=1, col=2)

# Update layout
fig.update_xaxes(title_text="Fee+Base APR", row=1, col=1)
fig.update_xaxes(title_text="Fee+Base APR", row=1, col=2)
fig.update_yaxes(title_text="Cumulative Probability", row=1, col=1)
fig.update_yaxes(title_text="Cumulative Probability", row=1, col=2)

fig.update_layout(title_text="Expected vs Actual Fee+Base APR Distribution (In)", height=500, showlegend=True)

fig.show()

In [None]:
break

In [None]:
all_results_df["30_day_out_diff"] = (
    all_results_df["actual_30_day_fee_and_base_out"] - all_results_df["fee_and_base_out"]
)
all_results_df["60_day_out_diff"] = (
    all_results_df["actual_60_day_fee_and_base_out"] - all_results_df["fee_and_base_out"]
)
all_results_df["30_day_in_diff"] = all_results_df["actual_30_day_fee_and_base_in"] - all_results_df["fee_and_base_in"]
all_results_df["60_day_in_diff"] = all_results_df["actual_60_day_fee_and_base_in"] - all_results_df["fee_and_base_in"]


import plotly.express as px

plot_df = all_results_df.dropna(subset=["destination_out_name", "30_day_out_diff"]).copy()

fig = px.ecdf(
    plot_df,
    x="30_day_out_diff",
    color="destination_out_name",  # one ECDF line per destination
    title="Difference between Actual and Planned 30-Day Fee+Base APR (Out)",
)

fig.update_layout(
    legend_title_text="Destination (out)",
)
print("diff >0 means actual > expected")
print("diff <0 means actual < expected")
fig.show()

In [None]:
plot_df = all_results_df.dropna(subset=["destination_in_name", "30_day_in_diff"]).copy()

fig = px.ecdf(
    plot_df,
    x="30_day_in_diff",
    color="destination_in_name",
    title="Difference between Actual and Planned 30-Day Fee+Base APR (In)",
)

fig.update_layout(
    legend_title_text="Destination (in)",
)
print("diff >0 means actual > expected")
print("diff <0 means actual < expected")
fig.show()

In [None]:
# maybe we should be doing the lowest hanging fruit first? like the ones that are way off?
# Calculate absolute differences for sorting
all_results_df["abs_30_day_out_diff"] = all_results_df["30_day_out_diff"].abs()
all_results_df["abs_60_day_out_diff"] = all_results_df["60_day_out_diff"].abs()
all_results_df["abs_30_day_in_diff"] = all_results_df["30_day_in_diff"].abs()
all_results_df["abs_60_day_in_diff"] = all_results_df["60_day_in_diff"].abs()

# Find the most off predictions for each category
print("=" * 80)
print("TOP 10 WORST PREDICTIONS (60-day OUT)")
print("=" * 80)
all_results_df.columns

In [None]:
interesting_cols = [
    "destination_in_name",
    "destination_out_name",
    "actual_30_day_fee_and_base_out",
    "actual_60_day_fee_and_base_out",
    "actual_30_day_fee_and_base_in",
    "actual_60_day_fee_and_base_in",
    "fee_and_base_out",
    "fee_and_base_in",
    "30_day_out_diff",
    "60_day_out_diff",
    "30_day_in_diff",
    "60_day_in_diff",
    "abs_30_day_out_diff",
    "block",
    "safe_value_out",
]

all_results_df.sort_values(by="abs_60_day_out_diff", ascending=False)[interesting_cols].head(10)