In [41]:
# importing src directory
import sys
sys.path.append('/Users/andrewcarranti/CODE/SHIFT/2024/py_repo/post_refactor/AMM-Python/src')
# experiment imports
import os
import math
import numpy as np
import random
from datetime import datetime as dt
from scipy.stats import truncnorm
from scipy import integrate
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import shapiro
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_pacf
from scipy.optimize import minimize
# project imports
from amm.amm import AMM, SimpleFeeAMM
from amm.fee import TriangleFee, PercentFee, NoFee
# data imports
from data.kaiko import fetch_data
from api_key.my_api_key import api_key

In [42]:

def gbm_assumption_test(log_returns):
    adf_result = adfuller(log_returns) # check for stationarity
    print("ADF Statistic:", adf_result[0]) # check for stationarity
    print("P-value:", adf_result[1])
    print("Critical Values:", adf_result[4])
    print("Stationary:", adf_result[1] <= 0.05)
    # if adf_result[1] > 0.05:  # if not stationary, iteratively difference until achieved
    #     for d in range(1, max_lag + 1):
    #         diff_data = diff(log_returns, k_diff=d)
    #         adf_result = adfuller(diff_data)
    #         print(f"ADF result after differencing level {d}: {adf_result[0]}, p-value: {adf_result[1]}")
    #         if adf_result[1] <= 0.05:
    #             print("Achieved stationarity with differencing level:", d)
    #             diff_data = diff_data
    #             break
    shapiro_result = shapiro(log_returns) # check for normality
    print("Shapiro-Wilk Test Statistic:", shapiro_result[0])
    print("p-value:", shapiro_result[1])
    print("normal:", shapiro_result[1] > 0.05)
    lb_result = acorr_ljungbox(log_returns, lags=[10], return_df=True) # check for independence (autocorrelation)
    print("Ljung-Box test:")
    print(lb_result)
    print("Independent:", lb_result['lb_pvalue'].iloc[0] > 0.05)
    print("-"*50)
    # if lb_result['lb_pvalue'].iloc[0] < 0.05: # if autocorrelation detected, adjust
    #     print("Autocorrelation detected:") 
    #     plot_pacf(log_returns, lags=40) # plot partial autocorrelation function
    #     plt.title('Partial Autocorrelation Function (PACF)')
    #     plt.show()

In [43]:

# Define the log-likelihood function
def neg_log_likelihood(params, log_returns):
    """
    calculate negative log likelihood of a normal distribution for calibrating GBM
    params: tuple, mu and sigma
    """
    mu, sigma = params # define mu and sigma
    estimated_mu = np.mean(log_returns) # estimate mu
    estimated_var = np.sum((log_returns - estimated_mu)**2) / len(log_returns) # estimate variance
    return 0.5 * len(log_returns) * np.log(2 * np.pi * estimated_var) + 0.5 / estimated_var * np.sum((log_returns - mu)**2) # return negative log likelihood

In [44]:
def calibrate_gbm(asset, data, frequency, T, N, type):
    """
    calibrate geometric brownian motion for next period (t=0 is last observation in data)
    
    calibrate gbm model by pulling data from kaiko api

    asset (str): asset to calibrate
    data (pd.DataFrame): price data w/ column 'price'
    freq (str): frequency of data (1h, 1d, 1w)
    T (float): terminal time
    N (int): number of time steps
    type (str): type of calibration (reg, mle)
    max_lag (int): maximum lag for autocorrelation test (default=10)
    alpha (float): significance level for hypothesis tests (default=0.05)

    return mu (float), sigma (float), S (numpy.ndarray)
    """
    
    if type == "reg":
        print("dfghjk")

        returns = np.log((data / data.shift(1)).dropna()) # get returns
        gbm_assumption_test(returns) # test gbm assumptions
        mu = returns.mean() * 365.25  # annualized return

        print("dfghjk")

        sigma = returns.std() * 365.25 ** 0.5 # annualized volatility
        print(f'Estimated {asset} {frequency} Mu:', round(mu, 2), 'Estimated Annualized Mu:', round(mu * 365.25,2))
        print(f'Estimated {asset} {frequency} Sigma:', round(sigma, 2), 'Estimated Annualized Sigma:', round(sigma * 365.25**0.5, 2))
        S0 = data.iloc[-1] # get LAST price in series
        dt = T / N # time step size
        t = np.linspace(0, T, N)
        W = np.random.standard_normal(size=N)
        W = np.cumsum(W) * np.sqrt(dt)  # Standard Brownian motion
        X = (mu - 0.5 * sigma**2) * t + sigma * W 
        S = S0 * np.exp(X)  # Geometric Brownian motion   

        print("dfghjk")

        return mu, sigma, S
    
    elif type == "mle":
        log_returns = np.log(1 + data.pct_change().dropna()) # calculate log returns
        result = minimize(neg_log_likelihood, [0.05, 0.2], args=(log_returns,), bounds=((None, None), (1e-4, None))) # minimize the negative log-likelihood
        mu = result.x[0] * 365.25 # annualize mu
        sigma = result.x[1] * 365.25**0.5 # annualize sigma
        print(f'Estimated {asset} {frequency} Mu:', round(result.x[0],5), 'Estimated Annualized Mu:', round(mu, 5)) # using 365.25 instead of 252 bcs operate 24/7
        print(f'Estimated {asset} {frequency} Sigma:', round(result.x[1],5), 'Estimated Annualized Sigma:', round(sigma, 5))
        S0 = data.iloc[-1] # get LAST price in series
        dt = T / N # time step size
        t = np.linspace(0, T, N)
        W = np.random.standard_normal(size=N)
        W = np.cumsum(W) * np.sqrt(dt)  # standard BM
        X = (mu - 0.5 * sigma**2) * t + sigma * W 
        S = S0 * np.exp(X)  # GBM
        return mu, sigma, S

In [45]:
def get_gbm_data(pair, start_date, end_date, freq, api_key):
    """
    get gbm data from kaiko api or local storage

    asset (str): asset symbol
    start_date (str): start date of data
    end_date (str): end date of data
    freq (str): frequency of data (1h, 1d, 1w)
    api_key (str): kaiko api key

    return pd.DataFrame: price data
    """
    # check if data exists, if not fetch data
    asset1 = pair.split("-")[0] 
    asset2 = pair.split("-")[1]

    if os.path.exists(f"/data/crypto_data/{asset1}-usd_{start_date}_{end_date}_{freq}.csv"):
        data1 =  pd.read_csv(f"/data/crypto_data/{asset1}-usd_{start_date}_{end_date}_{freq}.csv")["price"]
    else: data1 = fetch_data(api_key, asset1+"-usd", start_date, end_date, freq)
    data1['timestamp'] = pd.to_datetime(data1['timestamp'], unit='ms') # convert timestamp to datetime
    data1['price'] = pd.to_numeric(data1['price'])

    if os.path.exists(f"/data/crypto_data/{asset2}-usd_{start_date}_{end_date}_{freq}.csv"):
        data2 =  pd.read_csv(f"/data/crypto_data/{asset2}-usd_{start_date}_{end_date}_{freq}.csv")["price"]
    else: data2 = fetch_data(api_key, asset2+"-usd", start_date, end_date, freq)
    data2['timestamp'] = pd.to_datetime(data2['timestamp'], unit='ms') # convert timestamp to datetime
    data2['price'] = pd.to_numeric(data2['price'])

    return pd.merge(data1, data2, on='timestamp', how='inner', suffixes=("_" + asset1, "_" + asset2)) # merge dataframes on timestamp saving price for each asset denominated in USD for storing AMM market data

In [46]:

def sim1(n, pair, start_dt, end_dt, frequency, L0=1000000, spread=0.5):
    """
    simulate AMM market with data calibrated GBM for external oracles and trading agents
    n (int): number of simulations
    pair (str): asset pair for data (e.g. btc-eth)
    asset1_n (int): number of asset1 tokens
    asset2_n (int): number of asset2 tokens
    start_dt (str): start date for data (YYYY-MM-DD)
    end_dt (str): end date for data (YYYY-MM-DD)
    frequency (str): frequency of data (1h, 1d, 1w)
    L0 (int): number of initial LP tokens
    spread (float): spread for arbitrage agents (e.g. 0.5%)
    return list: list of dataframes for each simulation 
    """

    # # SIM STORAGE # #
    # create list to store dfs from each simulation of amms
    sim_amm_dfs= []
    sim_amms = []
    # parse asset1 and asset2, create USD denominated pairs
    asset1 = pair.split("-")[0] 
    asset2 = pair.split("-")[1]

    # # DATA & GBM CALIBRATION # #
    difference = dt.strptime(end_dt, '%Y-%m-%dT%H:%M:%SZ') - dt.strptime(start_dt, '%Y-%m-%dT%H:%M:%SZ')
    T_years = difference.days / 365.25  # using 365.25 to account for leap years
    marketDF =  get_gbm_data(pair, start_dt, end_dt, frequency, api_key) # get data for assets
    n_timesteps = len(marketDF) # number of timesteps in data
    new_cols = [f'gbm_price_{asset1}', f'gbm_price_{asset2}', # inventory of each asset
                f'{asset1}_inv', f'{asset2}_inv', 'L_inv', f'F{asset1}_inv', f'F{asset2}_inv', 
                'FL_inv', f'dt_{asset1}', f'dt_{asset2}', 'dt_L', f'dt_F{asset1}', f'dt_L']
    marketDF = marketDF.assign(**{col: None for col in new_cols})
    
    L0 = 1000000 # 1 mil LP tokens
    A0 = math.sqrt(L0**2 * marketDF[f'price_{asset1}'][0]/marketDF[f'price_{asset2}'][0]) # evenly distribute assets
    B0 = L0**2 / A0
    market_portfolio = {"A": A0, "B": B0, "L":L0} # initial portfolio 
    
    # gbm_assumption_test(np.log(1 + marketDF[f"price_{asset1}"].pct_change().dropna())) # test gbm assumptions
    # gbm_assumption_test(np.log(1 + marketDF[f"price_{asset2}"].pct_change().dropna())) # test gbm assumptions

    # # TIME SERIES SIMULATIONS # #
    

    # # TODO: ISSUE BELOW
    for simulation in range(n): # for each simulation create new set of amms & run new set of trades
        _,_,marketDF[f'gbm_price_{asset1}'] = calibrate_gbm(asset1, marketDF[f"price_{asset1}"], frequency, T_years, n_timesteps, "mle") # calibrate gbm for asset1 w/ MLE
        _,_,marketDF[f'gbm_price_{asset2}'] = calibrate_gbm(asset2, marketDF[f"price_{asset2}"], frequency, T_years, n_timesteps, "mle") # calibrate gbm for asset2 w/ MLE
        marketDF[f'amm_{asset1}/{asset2}'][0] = A0/B0 # set initial amm ratio

        nofeeAMM = SimpleFeeAMM(fee_structure = NoFee(), initial_portfolio=market_portfolio)
        percentAMM = SimpleFeeAMM(fee_structure = PercentFee(0.01), initial_portfolio=market_portfolio)
        triAMM = SimpleFeeAMM(fee_structure = TriangleFee(0.003, 0.0001, -1), initial_portfolio=market_portfolio) 
        percentDF = marketDF.copy(deep=True)
        nofeeDF = marketDF.copy(deep=True)
        triDF = marketDF.copy(deep=True)
        amms = [(nofeeAMM, nofeeDF), (percentAMM, percentDF), (triAMM, triDF)] # store pairs of amm type & df for updating
        
# TODO: fix here ^
        # # SIMULATION # #
        for t in range(n_timesteps): # iterate over each timestep in crypto market data

            # print(marketDF[f'amm_{asset1}/{asset2}'][t])
            # print((marketDF[f'gbm_{asset1}/{asset2}'][t] * (1+spread/100)))
            # # ARBITRAGE AGENT # #
            
            for amm, df in amms: # update market data with amm data

                if marketDF[f'amm_{asset1}/{asset2}'][t] > (marketDF[f'gbm_{asset1}/{asset2}'][t] * (1+spread/100)): # rule-based arbitrage agents in the market
                    asset_out, asset_in, asset_in_n = asset1, asset2, random.choice(list(range(1, 50))) # modeling market efficiency
                if (marketDF[f'amm_{asset1}/{asset2}'][t] * 1.005) < marketDF[f'gbm_{asset1}/{asset2}'][t]:
                    asset_out, asset_in, asset_in_n = asset2, asset1, random.choice(list(range(1, 50)))
                else: continue



                succ, info = amm.trade_swap(asset_out, asset_in, asset_in_n) # call trade for each AMM
                new_row = {f'{asset1}_inv': amm.portfolio[asset1], f'{asset2}_inv': amm.portfolio[asset2], # add trade info to df
                           'LInv': amm.portfolio['L'], asset1: info['asset_delta'][asset1], 
                           f'{asset2}': info['asset_delta'][asset2], 'L': info['asset_delta']['L'], 
                        f'F{asset1}': amm.fees[asset1], f'F{asset2}': amm.fees[asset2], 'FL': amm.fees['L']}
                df.loc[t] = new_row # append new row to df

        for amm, df in amms:
            sim_amm_dfs.append(df)
            sim_amms.append(amm)
    return sim_amm_dfs, sim_amms # return list of dfs for each simulation

In [47]:
# # NOTES FROM LAST MEETING:
# FOCUS MORE ON TESTING FEES THROUGH SIM

# # EXPERIMENTS TODO: # #
# [1] run for large simulations and evaluate over time - explore different time periods to test from (different market conditions and lengths of historical windows) and different frequencies (1h, 1d, 1w)
# [2] identify GBM paths that deplete pools (depletion of liquidity) and have both fall in value (impermanent loss) to show how fee accumulation compares ot general trend (law of large #s)
        # impermanent loss evaluation could allow for an expected value calculation for LP returns (expected value of fees vs. impermanent loss)
# [3] use stock data to see how compares
# [4] make sure to highlight how different fee AMMs (basically fees) are affected by different market conditions and therefore how fee accumulation is affected

# # UPDATES # #
# [1] *importing stock data to use instead of crypto (more in line with goal application and can properly use GBM to simulate)
# [2] considering train/test split for calibrating GBM and simulating trades source data (not overly urgent given not forecasting)
# [3] maybe also considering changing source data from vwap if stick with crypto data
        # multiple price streams for multiple external oracles

In [48]:

sim1(2, "btc-eth", '2023-02-01T00:00:00Z', '2024-03-01T00:00:00Z', "1d")



Estimated btc 1d Mu: 0.00251 Estimated Annualized Mu: 0.91566
Estimated btc 1d Sigma: 0.2 Estimated Annualized Sigma: 3.8223
Estimated eth 1d Mu: 0.00195 Estimated Annualized Mu: 0.71192
Estimated eth 1d Sigma: 0.2 Estimated Annualized Sigma: 3.8223


KeyError: 'amm_btc/eth'