## Regression Analysis on Ionic Emissions Supply/Borrow
For latest period 11/15 - 12/15


## Imports

In [33]:
pip install dune-client pycoingecko

Collecting pycoingecko
  Downloading pycoingecko-3.2.0-py3-none-any.whl.metadata (16 kB)
Downloading pycoingecko-3.2.0-py3-none-any.whl (10 kB)
Installing collected packages: pycoingecko
Successfully installed pycoingecko-3.2.0


## Load data




In [2]:
from dune_client.client import DuneClient
from google.colab import userdata
userdata.get('DUNE_API_KEY')



dune_api_key = userdata.get('DUNE_API_KEY')
dune = DuneClient(dune_api_key)
deposits = dune.get_latest_result_dataframe(4003406) # mrwild
print("deposits loaded")
borrows= dune.get_latest_result_dataframe(4301345) # mrwild
print("borrows loaded")
withdrawals = dune.get_latest_result_dataframe(4003238) # mrwild
print("withdrawals loaded")
tvl_agg = dune.get_latest_result_dataframe(4001052) # mrwild TVL by POOL
print("tvl by pool loaded")
protocol_tvl = dune.get_latest_result_dataframe(4301363) # NEW  TVL cumulative (old 4069195)
print("total tvl loaded")
protocol_tvl_notional = dune.get_latest_result_dataframe(4309385) # NEW TVL cumulative value (UZL)
print("protocol_tvl_notional loaded")


deposits loaded
borrows loaded
withdrawals loaded
tvl by pool loaded
total tvl loaded
protocol_tvl_notional loaded


In [3]:
# copies of the dataframes
deposits_copy = deposits.copy()
borrows_copy = borrows.copy()
withdrawals_copy = withdrawals.copy()
tvl_agg_copy = tvl_agg.copy()
protocol_tvl_copy = protocol_tvl.copy()
protocol_tvl_notional_copy = protocol_tvl_notional.copy()


In [None]:
deposits_copy.tail(5)

Unnamed: 0,date,vaultName,mintedAmount_USD
4813,2024-12-04 00:00:00.000 UTC,ionsUSDem,668.568189
4814,2024-12-04 00:00:00.000 UTC,ionwUSDMb,100.367283
4815,2024-12-04 00:00:00.000 UTC,ionweETHm,5.855677
4816,2024-12-04 00:00:00.000 UTC,ionwrsETHm,5.399112
4817,2024-12-04 00:00:00.000 UTC,ionwsuperOETHb,0.09399


In [8]:
borrows_copy.head(5)

Unnamed: 0,date,vaultName,daily_borrowed_amount_usd,total_borrowed_in_vault_usd
0,2024-12-06,ionAEROb,0.0,156258.54173791732
1,2024-12-06,ionEURCb,0.0,42081.14146271733
2,2024-12-06,ionLUSDo,0.0,4.097540019327064
3,2024-12-06,ionMBTCm,0.0,682495.9033103749
4,2024-12-06,ionMODEmi,0.0,447519.3634626311


In [None]:
withdrawals_copy.tail(5)

Unnamed: 0,date,vaultName,redeemedAmount_USD
4875,2024-12-04 00:00:00.000 UTC,ionsUSDzb,300.044497
4876,2024-12-04 00:00:00.000 UTC,ionweETHb,16.403964
4877,2024-12-04 00:00:00.000 UTC,ionweETHm,6.235062
4878,2024-12-04 00:00:00.000 UTC,ionwrsETHm,0.007894
4879,2024-12-04 00:00:00.000 UTC,ionwsuperOETHb,0.09399


In [None]:
tvl_agg_copy.tail(5)

Unnamed: 0,vaultName,TVL_USD,TotalBorrowed_USD,ActiveDeposits_USD
44,ioneUSDb,179822.242772,865526.4,1045349.0
45,ionAEROb,139346.406401,149163.6,288510.0
46,ionbsdETH,339867.47115,385040.9,724908.4
47,ionWETHb,650829.470357,1147480.0,1798309.0
48,ionUSDCb,537188.112669,1289691.0,1826879.0


In [None]:
protocol_tvl_copy.tail(5)

Unnamed: 0,date,ionicVault,TVL,chain
16606,2024-12-04,ionweETHm,770236.1,mode
16607,2024-12-04,ionwrsETHm,993336.8,mode
16608,2024-12-04,ionwstETHb,124285.0,base
16609,2024-12-04,ionwstETHo,24.62299,optimism
16610,2024-12-04,ionwsuperOETHb,1288179.0,base


In [None]:
protocol_tvl_notional_copy.tail(5)

Unnamed: 0,date,ionicVault,TVL,chain
16606,2024-12-04,ionweETHm,202.478087,mode
16607,2024-12-04,ionwrsETHm,261.126329,mode
16608,2024-12-04,ionwstETHb,29.031249,base
16609,2024-12-04,ionwstETHo,0.005752,optimism
16610,2024-12-04,ionwsuperOETHb,356.945528,base


## Data Preprocessing

In [None]:
# from 11-15-12-15, take the emissions in row V-AC and multiply by how much time has passed thru the epoch,
# then we have all emissions per vault, per side (supply or borrow)
# size of pool, type of pool, and then model the impact of emissions based on those

In [21]:
import pandas as pd
import numpy as np
from google.colab import auth
import gspread
from google.auth import default

# Ensure we're authenticated
auth.authenticate_user()
creds, _ = default()
access = gspread.authorize(creds)

# Connect to the workbook
WORKBOOK_ID = '1tWPMKIqRxg_noABRmQLhti0qXwG3c8bM30bvdzvxruE'
SHEET_ID = '357844907'

wb = access.open_by_key(WORKBOOK_ID)
sheet = wb.get_worksheet_by_id(int(SHEET_ID))

# Get chain names, vault names, and emissions data
chain_names = sheet.get_values('E:E')
vault_names = sheet.get_values('F:F')
emissions_data = sheet.get_values('V:AC')

if emissions_data and vault_names and chain_names:
    # Create emissions DataFrame first
    emissions_df = pd.DataFrame(emissions_data[2:], columns=[
        'borrow_emissions_tokens',
        'borrow_emissions_usd',
        'borrow_emissions_apr',
        'borrow_net_apr',
        'supply_emissions_tokens',
        'supply_emissions_usd',
        'supply_emissions_apr',
        'supply_net_apr'
    ])

    # Create temporary series for chain and vault
    chain_series = pd.Series(chain_names[2:len(emissions_data)])
    vault_series = pd.Series(vault_names[2:len(emissions_data)])

    # Create names DataFrame
    names_df = pd.DataFrame({
        'chain': chain_series,
        'vault': vault_series
    })

    def clean_value(val):
        if pd.isna(val) or val == '':
            return ''
        return str(val).replace('[', '').replace(']', '').replace("'", "").replace('"', '').strip()

    # Clean the values first
    names_df['chain'] = names_df['chain'].apply(clean_value)
    names_df['vault'] = names_df['vault'].apply(clean_value)

    # Forward fill the chain names to handle merged cells
    names_df['chain'] = names_df['chain'].replace('', np.nan).fillna(method='ffill')

    def get_chain_suffix(chain):
        chain_map = {
            'Mode': 'm',
            'Base': 'b',
            'Optimism': 'o',
            'FRAXTAL': 'f',
            'BOB': 'bob'
        }
        return chain_map.get(chain.strip(), '')

    # Combine into proper vault name format
    names_df['standardized_vault'] = names_df.apply(
        lambda x: f"ion{x['vault']}{get_chain_suffix(x['chain'])}"
        if x['vault'] != ''
        else '', axis=1
    )

    # Print debug info
    print("Sample of processing:")
    print(names_df[['chain', 'vault', 'standardized_vault']].head(10))

    # Add standardized vault names to emissions_df
    emissions_df.insert(0, 'Vault', names_df['standardized_vault'])


    # Print sample of standardized vault names to verify
    print("\nSample of standardized vault names:")
    print(emissions_df['Vault'].head(10))
else:
    print("No data found in the specified range")

Sample of processing:
  chain       vault standardized_vault
0  Mode        WETH           ionWETHm
1  Mode        USDC           ionUSDCm
2  Mode       msDAI          ionmsDAIm
3  Mode       ezETH          ionezETHm
4  Mode       STONE          ionSTONEm
5  Mode       sUSDe          ionsUSDem
6  Mode        USDe           ionUSDem
7  Mode      wrsETH         ionwrsETHm
8  Mode  weETH.mode     ionweETH.modem
9  Mode       dMTBC          iondMTBCm

Sample of standardized vault names:
0          ionWETHm
1          ionUSDCm
2         ionmsDAIm
3         ionezETHm
4         ionSTONEm
5         ionsUSDem
6          ionUSDem
7        ionwrsETHm
8    ionweETH.modem
9         iondMTBCm
Name: Vault, dtype: object


  names_df['chain'] = names_df['chain'].replace('', np.nan).fillna(method='ffill')


In [22]:
emissions_df.head

Unnamed: 0,Vault,borrow_emissions_tokens,borrow_emissions_usd,borrow_emissions_apr,borrow_net_apr,supply_emissions_tokens,supply_emissions_usd,supply_emissions_apr,supply_net_apr
0,ionWETHm,50000.0,"$1,856",0.51%,-0.50%,50000.0,"$1,856",0%,9.3%
1,ionUSDCm,50000.0,"$1,856",0.99%,-19.24%,50000.0,"$1,856",1%,12.5%
2,ionmsDAIm,,$0,0.00%,,,,,
3,ionezETHm,,$0,0.00%,,,$0,0%,2.2%
4,ionSTONEm,,$0,0.00%,,,$0,0%,0.6%
5,ionsUSDem,,$0,0.00%,,,,,
6,ionUSDem,,$0,0.00%,,,,,
7,ionwrsETHm,,$0,0.00%,,,$0,0%,1.6%
8,ionweETH.modem,10000.0,$371,2.66%,-0.63%,,$0,0%,2.1%
9,iondMTBCm,,$0,,,50000.0,"$1,856",0%,0.1%


## Regression on All Vaults + Normalized

In [41]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pycoingecko import CoinGeckoAPI
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from scipy import stats


# Define key dates
START_DATE = pd.Timestamp('2024-09-15')
CURRENT_DATE = pd.Timestamp.now().normalize()

# Get token prices
cg = CoinGeckoAPI()

def get_price_data(token_ids, days=5):
    price_data = {}
    for token_id in token_ids:
        try:
            data = cg.get_coin_market_chart_by_id(id=token_id, vs_currency='usd', days=days)
            df = pd.DataFrame(data['prices'], columns=['timestamp', 'price'])
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
            df.set_index('timestamp', inplace=True)
            price_data[token_id] = df['price']
        except Exception as e:
            print(f"Error fetching data for {token_id}: {e}")
    return pd.DataFrame(price_data).resample('D').last().ffill()

coingecko_ids = ['ionic-protocol', 'ethereum', 'aerodrome-finance']
price_data = get_price_data(coingecko_ids)

latest_price_ion = price_data.iloc[-1]['ionic-protocol']
latest_price_eth = price_data.iloc[-1]['ethereum']
latest_price_aero = price_data.iloc[-1]['aerodrome-finance']

def clean_numeric(val):
    """Clean and convert values to numeric, handling special cases"""
    if pd.isna(val) or val == '' or val in ['#N/A', '#DIV/0!']:
        return 0
    if isinstance(val, str):
        val = val.replace('$', '').replace(',', '').strip()
        if '%' in val:
            val = val.replace('%', '')
            try:
                return float(val) / 100
            except ValueError:
                return 0
        try:
            return float(val)
        except ValueError:
            return 0
    return float(val) if val else 0

# Process emissions data
emissions_df = pd.DataFrame(emissions_data[2:], columns=[
    'borrow_emissions_tokens', 'borrow_emissions_usd', 'borrow_emissions_apr',
    'borrow_net_apr', 'supply_emissions_tokens', 'supply_emissions_usd',
    'supply_emissions_apr', 'supply_net_apr'
])
emissions_df.insert(0, 'Vault', names_df['standardized_vault'])

# Clean numeric columns
numeric_cols = emissions_df.columns.drop('Vault')
for col in numeric_cols:
    emissions_df[col] = emissions_df[col].apply(clean_numeric)

# Calculate USD values
emissions_df['borrow_emissions_usd'] = emissions_df['borrow_emissions_tokens'] * latest_price_ion
emissions_df['supply_emissions_usd'] = emissions_df['supply_emissions_tokens'] * latest_price_ion

# Extract chain information from vault names
emissions_df['chain'] = emissions_df['Vault'].str.extract('([mbo])$').fillna('unknown')
emissions_df['chain'] = emissions_df['chain'].map({'m': 'mode', 'b': 'base', 'o': 'optimism'})

def get_enhanced_snapshot_data(df, date, value_column):
    """Get enhanced snapshot data with additional metrics"""
    closest_date = df[df['date'] <= date]['date'].max()
    if pd.isna(closest_date):
        return pd.DataFrame()

    # Handle different column names for vault identifier
    vault_col = 'vaultName'
    if 'ionicVault' in df.columns:
        vault_col = 'ionicVault'

    result = df[df['date'] == closest_date][[vault_col, value_column]].copy()
    result = result.rename(columns={
        vault_col: 'Vault',
        value_column: f"{value_column}_{date.strftime('%Y%m%d')}"
    })
    return result

# Get all snapshot data
print("Getting snapshot data...")
supply_start = get_enhanced_snapshot_data(deposits_copy, START_DATE, 'mintedAmount_USD')
supply_end = get_enhanced_snapshot_data(deposits_copy, CURRENT_DATE, 'mintedAmount_USD')
borrow_start = get_enhanced_snapshot_data(borrows_copy, START_DATE, 'total_borrowed_in_vault_usd')
borrow_end = get_enhanced_snapshot_data(borrows_copy, CURRENT_DATE, 'total_borrowed_in_vault_usd')
tvl_start = get_enhanced_snapshot_data(protocol_tvl_copy, START_DATE, 'TVL')

# Debug prints
print("\nSnapshot data shapes:")
print(f"Supply start: {supply_start.shape}")
print(f"Supply end: {supply_end.shape}")
print(f"Borrow start: {borrow_start.shape}")
print(f"Borrow end: {borrow_end.shape}")
print(f"TVL start: {tvl_start.shape}")

def normalize_changes(series, lower_quantile=0.05, upper_quantile=0.95):
    """Normalize extreme values using winsorization"""
    lower_bound = series.quantile(lower_quantile)
    upper_bound = series.quantile(upper_quantile)
    return series.clip(lower=lower_bound, upper=upper_bound)

def log_transform_changes(series):
    """Apply log transformation to handle extreme values"""
    # Add a constant to handle negative values
    min_val = series.min()
    if min_val < 0:
        series = series - min_val + 1
    # Log transform
    return np.log1p(series)

# Get snapshot data
print("Getting snapshot data...")
supply_start = get_enhanced_snapshot_data(deposits_copy, START_DATE, 'mintedAmount_USD')
supply_end = get_enhanced_snapshot_data(deposits_copy, CURRENT_DATE, 'mintedAmount_USD')
borrow_start = get_enhanced_snapshot_data(borrows_copy, START_DATE, 'total_borrowed_in_vault_usd')
borrow_end = get_enhanced_snapshot_data(borrows_copy, CURRENT_DATE, 'total_borrowed_in_vault_usd')
tvl_start = get_enhanced_snapshot_data(protocol_tvl_copy, START_DATE, 'TVL')

print("\nSnapshot data shapes:")
print(f"Supply start: {supply_start.shape}")
print(f"Supply end: {supply_end.shape}")
print(f"Borrow start: {borrow_start.shape}")
print(f"Borrow end: {borrow_end.shape}")
print(f"TVL start: {tvl_start.shape}")

# Create analysis DataFrame
analysis_df = emissions_df.copy()

# Merge all snapshot data
for df in [supply_start, supply_end, borrow_start, borrow_end, tvl_start]:
    if not df.empty:
        analysis_df = analysis_df.merge(df, on='Vault', how='left')

# Calculate percentage changes
def calculate_pct_change(end_val, start_val):
    try:
        end_val = float(end_val) if pd.notnull(end_val) else 0
        start_val = float(start_val) if pd.notnull(start_val) else 0
        if start_val == 0:
            return 100 if end_val > 0 else 0
        return ((end_val - start_val) / start_val) * 100
    except:
        return 0

# Calculate raw changes
analysis_df['supply_pct_change_raw'] = analysis_df.apply(
    lambda x: calculate_pct_change(
        x[f"mintedAmount_USD_{CURRENT_DATE.strftime('%Y%m%d')}"],
        x[f"mintedAmount_USD_{START_DATE.strftime('%Y%m%d')}"]
    ), axis=1
)

analysis_df['borrow_pct_change_raw'] = analysis_df.apply(
    lambda x: calculate_pct_change(
        x[f"total_borrowed_in_vault_usd_{CURRENT_DATE.strftime('%Y%m%d')}"],
        x[f"total_borrowed_in_vault_usd_{START_DATE.strftime('%Y%m%d')}"]
    ), axis=1
)

# Normalize changes
analysis_df['supply_pct_change'] = normalize_changes(analysis_df['supply_pct_change_raw'])
analysis_df['borrow_pct_change'] = normalize_changes(analysis_df['borrow_pct_change_raw'])

# Log transform borrow changes due to extreme values
analysis_df['borrow_pct_change_log'] = log_transform_changes(analysis_df['borrow_pct_change'])

# Extract chain information
analysis_df['chain'] = analysis_df['Vault'].str.extract('([mbo])$').fillna('unknown')
analysis_df['chain'] = analysis_df['chain'].map({'m': 'mode', 'b': 'base', 'o': 'optimism'})

# Encode categorical variables
le = LabelEncoder()
analysis_df['chain_encoded'] = le.fit_transform(analysis_df['chain'])

def run_enhanced_regression(df, dependent_var, regression_type="supply"):
    """Run enhanced regression with all available variables"""
    if regression_type == "supply":
        X_cols = [
            'supply_emissions_usd',
            'supply_emissions_apr',
            'supply_net_apr',
            f"TVL_{START_DATE.strftime('%Y%m%d')}",
            'chain_encoded'
        ]
    else:
        X_cols = [
            'borrow_emissions_usd',
            'borrow_emissions_apr',
            'borrow_net_apr',
            f"TVL_{START_DATE.strftime('%Y%m%d')}",
            'chain_encoded'
        ]

    # Filter relevant rows
    df = df[~df[dependent_var].isin([np.inf, -np.inf])]

    X = df[X_cols].fillna(0)
    X = sm.add_constant(X)
    y = df[dependent_var].fillna(0)

    if len(df) < 5:
        print(f"Insufficient data for {regression_type} regression")
        return None

    model = sm.OLS(y, X).fit()
    return model, X_cols

print("\nEnhanced Regression Analysis (with Normalized Changes)")
print("===================================================")

# Supply regression
supply_vaults = analysis_df[analysis_df['supply_emissions_usd'] > 0].copy()
print(f"\nSupply Analysis (n={len(supply_vaults)})")
supply_model, supply_cols = run_enhanced_regression(supply_vaults, 'supply_pct_change', "supply")

if supply_model:
    print("\nSupply Impact Regression Results:")
    print("--------------------------------")
    print(supply_model.summary().tables[1])
    print(f"\nR-squared: {supply_model.rsquared:.4f}")
    print("\nFeatures used:", supply_cols)

# Borrow regression (using log-transformed changes)
borrow_vaults = analysis_df[analysis_df['borrow_emissions_usd'] > 0].copy()
print(f"\nBorrow Analysis (n={len(borrow_vaults)})")
borrow_model, borrow_cols = run_enhanced_regression(borrow_vaults, 'borrow_pct_change_log', "borrow")

if borrow_model:
    print("\nBorrow Impact Regression Results (Log-transformed):")
    print("------------------------------------------------")
    print(borrow_model.summary().tables[1])
    print(f"\nR-squared: {borrow_model.rsquared:.4f}")
    print("\nFeatures used:", borrow_cols)

# Export results with both raw and normalized values
results_df = analysis_df[[
    'Vault', 'chain',
    'supply_pct_change_raw', 'supply_pct_change',
    'borrow_pct_change_raw', 'borrow_pct_change', 'borrow_pct_change_log',
    'supply_emissions_usd', 'borrow_emissions_usd',
    'supply_emissions_apr', 'borrow_emissions_apr',
    'supply_net_apr', 'borrow_net_apr',
    f"TVL_{START_DATE.strftime('%Y%m%d')}"
]].copy()

# Display summary statistics by chain for normalized values
print("\nSummary Statistics by Chain (Normalized Values):")
print("---------------------------------------------")
for chain in results_df['chain'].unique():
    chain_data = results_df[results_df['chain'] == chain]
    print(f"\nChain: {chain}")
    print("\nSupply Changes:")
    print(chain_data['supply_pct_change'].describe())
    print("\nBorrow Changes (Log-transformed):")
    print(chain_data['borrow_pct_change_log'].describe())

# Export results
today = datetime.now().strftime('%Y%m%d')
results_df.to_csv(f'normalized_emissions_analysis_{today}.csv', index=False)
print(f"\nNormalized results exported to normalized_emissions_analysis_{today}.csv")

Getting snapshot data...

Snapshot data shapes:
Supply start: (17, 2)
Supply end: (22, 2)
Borrow start: (49, 2)
Borrow end: (49, 2)
TVL start: (49, 2)
Getting snapshot data...

Snapshot data shapes:
Supply start: (17, 2)
Supply end: (22, 2)
Borrow start: (49, 2)
Borrow end: (49, 2)
TVL start: (49, 2)

Enhanced Regression Analysis (with Normalized Changes)

Supply Analysis (n=29)

Supply Impact Regression Results:
--------------------------------
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -1.7478     27.961     -0.063      0.951     -59.589      56.093
supply_emissions_usd    -0.0137      0.019     -0.704      0.489      -0.054       0.027
supply_emissions_apr   276.3818    512.105      0.540      0.595    -782.988    1335.752
supply_net_apr          84.1879    304.137      0.277      0.784    -544.967     713.343
TVL_20240915    

  res = hypotest_fun_out(*samples, **kwds)


## Validation Checks

In [42]:
import pandas as pd
import numpy as np
from datetime import datetime
import statsmodels.api as sm

print("=== DATA VALIDATION CHECKS ===\n")

# 1. Check Original DataFrames
print("Original DataFrame Shapes:")
print(f"Deposits: {deposits_copy.shape}")
print(f"Borrows: {borrows_copy.shape}")
print(f"Withdrawals: {withdrawals_copy.shape}")
print(f"TVL Aggregate: {tvl_agg_copy.shape}")
print(f"Protocol TVL: {protocol_tvl_copy.shape}")
print(f"Protocol TVL Notional: {protocol_tvl_notional_copy.shape}\n")

# 2. Check Emissions Data
print("Emissions Data Sample:")
print("\nRaw emissions_data first 5 rows:")
for i, row in enumerate(emissions_data[:5]):
    print(f"Row {i}: {row}")

print("\nProcessed emissions_df:")
print(f"Shape: {emissions_df.shape}")
print("\nColumns:", emissions_df.columns.tolist())
print("\nSample (first 5 rows):")
print(emissions_df.head())
print("\nNon-zero emissions counts:")
print(f"Supply emissions > 0: {(emissions_df['supply_emissions_usd'] > 0).sum()}")
print(f"Borrow emissions > 0: {(emissions_df['borrow_emissions_usd'] > 0).sum()}\n")

# 3. Check Vault Names Consistency
print("Vault Names Across DataFrames:")
print("\nDeposits unique vaults:", len(deposits_copy['vaultName'].unique()))
print("Sample:", deposits_copy['vaultName'].unique()[:5])
print("\nBorrows unique vaults:", len(borrows_copy['vaultName'].unique()))
print("Sample:", borrows_copy['vaultName'].unique()[:5])
print("\nProtocol TVL unique vaults:", len(protocol_tvl_copy['ionicVault'].unique()))
print("Sample:", protocol_tvl_copy['ionicVault'].unique()[:5])
print("\nEmissions unique vaults:", len(emissions_df['Vault'].unique()))
print("Sample:", emissions_df['Vault'].unique()[:5])

# 4. Check Date Ranges
print("\nDate Ranges:")
print(f"Deposits: {deposits_copy['date'].min()} to {deposits_copy['date'].max()}")
print(f"Borrows: {borrows_copy['date'].min()} to {borrows_copy['date'].max()}")
print(f"Protocol TVL: {protocol_tvl_copy['date'].min()} to {protocol_tvl_copy['date'].max()}")

# 5. Check Value Ranges
print("\nValue Ranges:")
print("\nDeposits USD:")
print(deposits_copy['mintedAmount_USD'].describe())
print("\nBorrows USD:")
print(borrows_copy['total_borrowed_in_vault_usd'].describe())
print("\nTVL:")
print(protocol_tvl_copy['TVL'].describe())

# 6. Check Final Analysis DataFrame
if 'analysis_df' in locals():
    print("\nFinal Analysis DataFrame:")
    print(f"Shape: {analysis_df.shape}")
    print("\nColumns:", analysis_df.columns.tolist())
    print("\nSample statistics for key metrics:")
    for col in ['supply_pct_change', 'borrow_pct_change', 'supply_emissions_usd', 'borrow_emissions_usd']:
        if col in analysis_df.columns:
            print(f"\n{col}:")
            print(analysis_df[col].describe())

# 7. Check Chain Distribution
if 'analysis_df' in locals() and 'chain' in analysis_df.columns:
    print("\nChain Distribution:")
    print(analysis_df['chain'].value_counts())

# 8. Check for Missing Values
if 'analysis_df' in locals():
    print("\nMissing Values in Analysis DataFrame:")
    print(analysis_df.isnull().sum())

print("\n=== END OF VALIDATION CHECKS ===")

=== DATA VALIDATION CHECKS ===

Original DataFrame Shapes:
Deposits: (4873, 3)
Borrows: (16709, 4)
Withdrawals: (4938, 3)
TVL Aggregate: (49, 4)
Protocol TVL: (16709, 4)
Protocol TVL Notional: (16709, 4)

Emissions Data Sample:

Raw emissions_data first 5 rows:
Row 0: ['Borrow - Next Epoch', '', '', '', 'Supply - Next Epoch', '', '', '']
Row 1: ['Emissions - Tokens', 'Emissions - $', 'Emissions APR', 'Net Borrow APR', 'Emissions - Tokens', 'Emissions - $', 'Emissions APR', 'Net Supply APR']
Row 2: ['50000', '$1,856', '0.51%', '-0.50%', '50000', '$1,856', '0%', '9.3%']
Row 3: ['50000', '$1,856', '0.99%', '-19.24%', '50000', '$1,856', '1%', '12.5%']
Row 4: ['', '$0', '0.00%', '', '', '', '', '']

Processed emissions_df:
Shape: (48, 10)

Columns: ['Vault', 'borrow_emissions_tokens', 'borrow_emissions_usd', 'borrow_emissions_apr', 'borrow_net_apr', 'supply_emissions_tokens', 'supply_emissions_usd', 'supply_emissions_apr', 'supply_net_apr', 'chain']

Sample (first 5 rows):
       Vault  bor

In [44]:
print("=== DETAILED DATA VALIDATION ===\n")

# 1. Analyze vault name patterns
def analyze_vault_names():
    print("Vault Name Analysis:")

    # Get all unique vaults from each source
    deposit_vaults = set(deposits_copy['vaultName'].unique())
    borrow_vaults = set(borrows_copy['vaultName'].unique())
    tvl_vaults = set(protocol_tvl_copy['ionicVault'].unique())
    emission_vaults = set(emissions_df['Vault'].unique())

    # Find vaults that should have emissions but don't
    active_vaults = deposit_vaults.intersection(borrow_vaults).intersection(tvl_vaults)
    missing_emissions = active_vaults - emission_vaults

    print(f"\nActive vaults (in all three main DFs): {len(active_vaults)}")
    print(f"Vaults with emissions data: {len(emission_vaults)}")
    print(f"Active vaults missing emissions: {len(missing_emissions)}")

    # Check for possible name mismatches
    print("\nPossible name variations:")
    for v in missing_emissions:
        similar_names = [ev for ev in emission_vaults if v[:-1] in ev or ev[:-1] in v]
        if similar_names:
            print(f"{v} might match with: {similar_names}")

# 2. Analyze time series completeness
def analyze_time_series():
    print("\nTime Series Analysis:")

    # Check data availability around our key dates
    start_window = pd.Timestamp(START_DATE) - pd.Timedelta(days=7)
    end_window = pd.Timestamp(CURRENT_DATE) + pd.Timedelta(days=7)

    for df_name, df, date_col in [
        ('Deposits', deposits_copy, 'date'),
        ('Borrows', borrows_copy, 'date'),
        ('Protocol TVL', protocol_tvl_copy, 'date')
    ]:
        df['date'] = pd.to_datetime(df[date_col])
        start_data = df[(df['date'] >= start_window) & (df['date'] <= START_DATE)]
        end_data = df[(df['date'] <= end_window) & (df['date'] >= CURRENT_DATE)]

        print(f"\n{df_name}:")
        print(f"Data points near start date: {len(start_data)}")
        print(f"Data points near end date: {len(end_data)}")
        print(f"Unique vaults near start: {start_data['vaultName'].nunique() if 'vaultName' in df.columns else start_data['ionicVault'].nunique()}")
        print(f"Unique vaults near end: {end_data['vaultName'].nunique() if 'vaultName' in df.columns else end_data['ionicVault'].nunique()}")

# 3. Analyze value distributions
def analyze_value_distributions():
    print("\nValue Distribution Analysis:")

    # Analyze emissions values
    print("\nEmissions Distribution:")
    for col in ['supply_emissions_usd', 'borrow_emissions_usd']:
        non_zero = emissions_df[emissions_df[col] > 0][col]
        print(f"\n{col}:")
        print("Non-zero values statistics:")
        print(non_zero.describe())

    # Analyze percentage changes
    if 'analysis_df' in locals():
        print("\nPercentage Changes Distribution:")
        for col in ['supply_pct_change', 'borrow_pct_change']:
            non_zero = analysis_df[analysis_df[col] != 0][col]
            print(f"\n{col}:")
            print("Non-zero values statistics:")
            print(non_zero.describe())

# 4. Check data consistency
def check_data_consistency():
    print("\nData Consistency Checks:")

    # Check for vaults with emissions but no activity
    if 'analysis_df' in locals():
        supply_emitting = analysis_df[analysis_df['supply_emissions_usd'] > 0]
        borrow_emitting = analysis_df[analysis_df['borrow_emissions_usd'] > 0]

        print("\nVaults with emissions but no activity:")
        print("Supply side:")
        print(supply_emitting[supply_emitting['supply_pct_change'] == 0]['Vault'].tolist())
        print("\nBorrow side:")
        print(borrow_emitting[borrow_emitting['borrow_pct_change'] == 0]['Vault'].tolist())

# Run all validations
analyze_vault_names()
analyze_time_series()
analyze_value_distributions()
check_data_consistency()

print("\n=== END OF DETAILED VALIDATION ===")

=== DETAILED DATA VALIDATION ===

Vault Name Analysis:

Active vaults (in all three main DFs): 49
Vaults with emissions data: 46
Active vaults missing emissions: 17

Possible name variations:
ionMODEmi might match with: ['ionMODEm']
ionweETH_OLDm might match with: ['ionweETHo', 'ionweETHb']
ionUSDTmi might match with: ['ionUSDTm', 'ionUSDTo']
ionWETHo might match with: ['ionWETHm']
ionbsdETH might match with: ['ionbsdETHb']
ionweETH might match with: ['ionweETH.modem', 'ionweETHo', 'ionweETHb']
ionWETHb might match with: ['ionWETHm']
ionweETHm might match with: ['ionweETH.modem', 'ionweETHo', 'ionweETHb']
ionUSDCmi might match with: ['ionUSDCo', 'ionUSDCb', 'ionUSDCm']
ionWETHmi might match with: ['ionWETHm']
ionhyUSD might match with: ['ionhyUSDb']

Time Series Analysis:

Deposits:
Data points near start date: 136
Data points near end date: 22
Unique vaults near start: 27
Unique vaults near end: 22

Borrows:
Data points near start date: 392
Data points near end date: 49
Unique vaults 

## Regression - Only Vaults w/ Emissons

In [46]:
import pandas as pd
import numpy as np
from datetime import datetime
import statsmodels.api as sm
from scipy import stats

# Filter for only vaults with emissions
active_emissions_df = emissions_df[
    (emissions_df['supply_emissions_usd'] > 0) |
    (emissions_df['borrow_emissions_usd'] > 0)
].copy()

print(f"Number of vaults with any emissions: {len(active_emissions_df)}")
print(f"Supply emissions > 0: {(active_emissions_df['supply_emissions_usd'] > 0).sum()}")
print(f"Borrow emissions > 0: {(active_emissions_df['borrow_emissions_usd'] > 0).sum()}\n")

def get_snapshot_data(df, date, value_column, vault_col='vaultName'):
    """Get data closest to specified date"""
    closest_date = df[df['date'] <= date]['date'].max()
    if pd.isna(closest_date):
        return pd.DataFrame()

    result = df[df['date'] == closest_date][[vault_col, value_column]].copy()
    result = result.rename(columns={
        vault_col: 'Vault',
        value_column: f"{value_column}_{date.strftime('%Y%m%d')}"
    })
    # Fill NaN values with 0
    result = result.fillna(0)
    return result

# Get snapshots only for vaults with emissions
active_vaults = active_emissions_df['Vault'].unique()

def filter_and_get_snapshot(df, date, value_column, vault_col='vaultName'):
    df_filtered = df[df[vault_col].isin(active_vaults)]
    return get_snapshot_data(df_filtered, date, value_column, vault_col)

# Get snapshots
print("Getting snapshots for active vaults...")
supply_start = filter_and_get_snapshot(deposits_copy, START_DATE, 'mintedAmount_USD')
supply_end = filter_and_get_snapshot(deposits_copy, CURRENT_DATE, 'mintedAmount_USD')
borrow_start = filter_and_get_snapshot(borrows_copy, START_DATE, 'total_borrowed_in_vault_usd')
borrow_end = filter_and_get_snapshot(borrows_copy, CURRENT_DATE, 'total_borrowed_in_vault_usd')
tvl_start = filter_and_get_snapshot(protocol_tvl_copy, START_DATE, 'TVL', 'ionicVault')

# Create analysis DataFrame starting with active emissions vaults only
analysis_df = active_emissions_df.copy()

# Merge snapshot data
for df in [supply_start, supply_end, borrow_start, borrow_end, tvl_start]:
    if not df.empty:
        analysis_df = analysis_df.merge(df, on='Vault', how='left')

# Fill NaN values with 0
analysis_df = analysis_df.fillna(0)

# Calculate percentage changes with proper handling of edge cases
def calculate_pct_change(end_val, start_val):
    try:
        end_val = float(end_val) if pd.notnull(end_val) else 0
        start_val = float(start_val) if pd.notnull(start_val) else 0
        if start_val == 0:
            return 100 if end_val > 0 else 0
        return ((end_val - start_val) / start_val) * 100
    except:
        return 0

# Calculate changes
analysis_df['supply_pct_change'] = analysis_df.apply(
    lambda x: calculate_pct_change(
        x[f"mintedAmount_USD_{CURRENT_DATE.strftime('%Y%m%d')}"],
        x[f"mintedAmount_USD_{START_DATE.strftime('%Y%m%d')}"]
    ), axis=1
)

analysis_df['borrow_pct_change'] = analysis_df.apply(
    lambda x: calculate_pct_change(
        x[f"total_borrowed_in_vault_usd_{CURRENT_DATE.strftime('%Y%m%d')}"],
        x[f"total_borrowed_in_vault_usd_{START_DATE.strftime('%Y%m%d')}"]
    ), axis=1
)

# Add log-transformed changes for better regression
analysis_df['borrow_pct_change_log'] = analysis_df['borrow_pct_change'].apply(
    lambda x: np.log1p(x + 100) # Add 100 to handle negative values
)

def run_regression(df, dependent_var, emissions_var, type_label):
    """Run regression with proper filtering and diagnostics"""
    # Filter for relevant vaults and remove any remaining NaN values
    df = df[df[emissions_var] > 0].copy()
    df = df.replace([np.inf, -np.inf], 0)

    print(f"\n{type_label} Analysis (n={len(df)})")

    # Print data quality check
    print("\nData Quality Check:")
    print(f"NaN values in TVL: {df['TVL_20240915'].isna().sum()}")
    print(f"NaN values in {dependent_var}: {df[dependent_var].isna().sum()}")
    print(f"NaN values in {emissions_var}: {df[emissions_var].isna().sum()}")

    X = df[[emissions_var, 'TVL_20240915']]
    X = sm.add_constant(X)
    y = df[dependent_var]

    model = sm.OLS(y, X).fit()

    # Print results
    print("\nRegression Results:")
    print("==================")
    print(model.summary().tables[1])
    print(f"\nR-squared: {model.rsquared:.4f}")

    return model, df

# Run supply regression
supply_model, supply_df = run_regression(
    analysis_df,
    'supply_pct_change',
    'supply_emissions_usd',
    'Supply'
)

# Run borrow regression (using log-transformed changes)
borrow_model, borrow_df = run_regression(
    analysis_df,
    'borrow_pct_change_log',
    'borrow_emissions_usd',
    'Borrow'
)

# Print summary statistics by chain
print("\nSummary Statistics by Chain:")
print("===========================")
for chain in analysis_df['chain'].unique():
    chain_data = analysis_df[analysis_df['chain'] == chain]
    print(f"\nChain: {chain}")
    print(f"Number of vaults: {len(chain_data)}")
    print("\nSupply Changes:")
    print(chain_data[chain_data['supply_emissions_usd'] > 0]['supply_pct_change'].describe())
    print("\nBorrow Changes (Log-transformed):")
    print(chain_data[chain_data['borrow_emissions_usd'] > 0]['borrow_pct_change_log'].describe())

# Export results
results_df = analysis_df[[
    'Vault', 'chain',
    'supply_pct_change', 'borrow_pct_change', 'borrow_pct_change_log',
    'supply_emissions_usd', 'borrow_emissions_usd',
    'TVL_20240915'
]].copy()

today = datetime.now().strftime('%Y%m%d')
results_df.to_csv(f'emissions_analysis_clean_{today}.csv', index=False)
print(f"\nResults exported to emissions_analysis_clean_{today}.csv")

Number of vaults with any emissions: 33
Supply emissions > 0: 29
Borrow emissions > 0: 17

Getting snapshots for active vaults...

Supply Analysis (n=29)

Data Quality Check:
NaN values in TVL: 0
NaN values in supply_pct_change: 0
NaN values in supply_emissions_usd: 0

Regression Results:
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   16.8094     18.102      0.929      0.362     -20.399      54.018
supply_emissions_usd    -0.0193      0.017     -1.160      0.257      -0.053       0.015
TVL_20240915         -3.778e-05   2.88e-05     -1.312      0.201    -9.7e-05    2.14e-05

R-squared: 0.1457

Borrow Analysis (n=17)

Data Quality Check:
NaN values in TVL: 0
NaN values in borrow_pct_change_log: 0
NaN values in borrow_emissions_usd: 0

Regression Results:
                           coef    std err          t      P>|t|      [0.025      

  res = hypotest_fun_out(*samples, **kwds)




*   33 vaults with emissions this epoch
*   29 w/ supply emissions
*   17 w/ borrow emissions

Supply side:
*   no statistically significant features, r2=0.1457
*   base effect: not significant, p=0.362
*   emissions effect: not sig,p=0.257
*   TVL effect: very small negative impact, not sig, p=-3.778e-05

Borrow side:
*   base effect: significant +6.85 log units (p<.0001)
*   emissions effect: very small, not significant
*   TVL effect: negligible, not significant


Chain-Specific Patterns:
*  Mode (10 vaults):
 *  Supply: Negative trend (-41.91% mean)
 *  Borrow: Consistent moderate growth (mean log = 4.80)

* Base (18 vaults):
 *  Supply: Slight positive (4.61% mean)
 *  Borrow: Higher growth than Mode (mean log = 6.13)

* Optimism (5 vaults):
 *  Supply: Positive trend (20% mean)
 *  Borrow: Highest growth (mean log = 9.50)
OP most volatile, mode least volatile, base mid


When normalized:

* Borrow growth is still significant but more reasonably distributed
* Chain effects are less dramatic than initially appeared
* APR remains the strongest predictor of borrowing behavior
* Supply side behavior is robust to outlier treatment



## Regression on 11/15-12/15

In [71]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from datetime import datetime

# 1. Define analysis period
START_DATE = pd.Timestamp('2024-11-15')
END_DATE = pd.Timestamp('2024-12-06')

def standardize_vault_name(name):
    """Standardize vault names between datasets"""
    name = str(name)
    name = name.replace('.mode', '')
    name = name.replace('weETH.mode', 'WETHm')
    return name

def log_transform_growth(growth):
    """
    Log transform growth rates while handling negative values
    Add 101 to shift all values positive (since min is -100%)
    """
    return np.log1p(growth + 101)

# 2. Get active vaults with emissions
borrow_vaults = emissions_df[emissions_df['borrow_emissions_usd'] > 0].copy()
supply_vaults = emissions_df[emissions_df['supply_emissions_usd'] > 0].copy()
borrow_vaults['Vault'] = borrow_vaults['Vault'].apply(standardize_vault_name)
supply_vaults['Vault'] = supply_vaults['Vault'].apply(standardize_vault_name)

# 3. Prepare regression data
def prepare_regression_data(vaults_df, data_df, value_col, emissions_col):
    data = []
    for _, vault_row in vaults_df.iterrows():
        vault_name = vault_row['Vault']

        # Get start and end values
        start_data = data_df[
            (data_df['date'] <= START_DATE) &
            (data_df['vaultName'] == vault_name)
        ].sort_values('date')

        end_data = data_df[
            (data_df['date'] <= END_DATE) &
            (data_df['vaultName'] == vault_name)
        ].sort_values('date')

        if not start_data.empty and not end_data.empty:
            start_value = float(start_data[value_col].iloc[-1])
            end_value = float(end_data[value_col].iloc[-1])

            # Calculate growth
            if start_value > 0:
                growth = ((end_value - start_value) / start_value) * 100
            else:
                growth = 100 if end_value > 0 else 0

            # Log transform growth
            log_growth = log_transform_growth(growth)

            data.append({
                'vault': vault_name,
                'growth': float(growth),
                'log_growth': float(log_growth),
                'emissions': float(vault_row[emissions_col]),
                'chain': str(vault_row['chain']),
                'start_value': start_value,
                'end_value': end_value
            })

    return pd.DataFrame(data)

# Prepare both datasets
borrow_df = prepare_regression_data(
    borrow_vaults,
    borrows_copy,
    'total_borrowed_in_vault_usd',
    'borrow_emissions_usd'
)

supply_df = prepare_regression_data(
    supply_vaults,
    deposits_copy,
    'mintedAmount_USD',
    'supply_emissions_usd'
)

def run_regression_analysis(df, side="Borrow"):
    if len(df) > 0:
        # Create design matrix X with proper constant term
        X = sm.add_constant(df['emissions'])

        # Add chain dummies
        chain_dummies = pd.get_dummies(df['chain'], prefix='chain')
        X = pd.concat([X, chain_dummies], axis=1)

        # Ensure all columns are float type
        X = X.astype(float)

        # Prepare y vector (using log_growth)
        y = df['log_growth'].astype(float)

        print(f"\n{side} Side Regression (Log-Transformed)")
        print("=" * 50)

        print("\nData Types Before Regression:")
        print("X dtypes:", X.dtypes)
        print("y dtype:", y.dtype)

        try:
            model = sm.OLS(y, X).fit()

            print("\nCoefficients:")
            print(model.params)
            print("\nP-values:")
            print(model.pvalues)
            print("\nR-squared:", model.rsquared)

            print("\nRegression Statistics:")
            print(f"Number of observations: {model.nobs}")
            print(f"F-statistic: {model.fvalue}")
            print(f"Prob (F-statistic): {model.f_pvalue}")

            print(f"\n=== {side} Side Additional Validation ===")
            print(f"Number of vaults with valid start values: {len(df[df['start_value'] > 0])}")
            print(f"Number of vaults with valid end values: {len(df[df['end_value'] > 0])}")

            print("\nOriginal Growth rate distribution:")
            print(df['growth'].describe())

            print("\nLog-Transformed Growth rate distribution:")
            print(df['log_growth'].describe())

            print("\nCorrelations:")
            correlations = df[['growth', 'log_growth', 'emissions']].corr()
            print(correlations)

            print(f"\n{side} Data Sample:")
            print(df[['vault', 'growth', 'log_growth', 'emissions', 'start_value', 'end_value']].head())

        except Exception as e:
            print(f"\n{side} Side Regression Error:", str(e))
            print("\nX data sample:")
            print(X.head())
            print("\ny data sample:")
            print(y.head())
    else:
        print(f"\nNo {side.lower()} data available for regression analysis")

# Print overall analysis period
print(f"Analysis Period: {START_DATE.date()} to {END_DATE.date()}")
print(f"Borrow Vaults: {len(borrow_df)}")
print(f"Supply Vaults: {len(supply_df)}\n")

# Run both regressions
run_regression_analysis(borrow_df, "Borrow")
run_regression_analysis(supply_df, "Supply")

Analysis Period: 2024-11-15 to 2024-12-06
Borrow Vaults: 13
Supply Vaults: 20


Borrow Side Regression (Log-Transformed)

Data Types Before Regression:
X dtypes: const             float64
emissions         float64
chain_base        float64
chain_mode        float64
chain_optimism    float64
dtype: object
y dtype: float64

Coefficients:
const             3.809133
emissions        -0.000008
chain_base        1.162533
chain_mode        0.895958
chain_optimism    1.750642
dtype: float64

P-values:
const             1.843123e-10
emissions         8.776173e-01
chain_base        3.722137e-05
chain_mode        7.416418e-04
chain_optimism    2.229911e-04
dtype: float64

R-squared: 0.32039045240190034

Regression Statistics:
Number of observations: 13.0
F-statistic: 1.4142993732249747
Prob (F-statistic): 0.30125449261656134

=== Borrow Side Additional Validation ===
Number of vaults with valid start values: 13
Number of vaults with valid end values: 13

Original Growth rate distribution:
count  

Key Insights

Borrow Side:
* Log transformation revealed stronger chain-specific effects
* Chain selection is more important than emission levels
* Very stable and consistent borrowing behavior across chains
* Recommendation: Focus on chain-specific strategies rather than emission levels


Supply Side:
* Still showing high variability even after log transformation
* Weaker relationship between original and log growth (0.728 vs 0.983 for borrow)
* Model remains poor at explaining supply behavior
* Recommendation:
  * Consider additional variables beyond emissions
  * Investigate supply-specific market factors
  * May need to segment analysis by asset type

Overall Implications:
* Borrowing behavior is more predictable and chain-dependent
* Supply behavior might be driven by external market factors not captured in the model