In [None]:
# GraphQL
from graphql_helper import run_query
from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport

# I/O
from pprint import pprint
import json
import os
import glob

# statistical analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import MaxNLocator
import pwlf_helper

import datetime as dt
import math

In [None]:
# GraphQL endpoint that every query in this notebook is sent to.
# NOTE(review): this is a hosted-service URL — confirm it is still being served.
uniswap_v3_subgraph_url = "https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v3"
# When True, the cells guarded by `if refetch:` query the subgraph again and
# rewrite the JSON files under `data_folder`.
refetch = True
# Root folder for the JSON files written by the fetch functions;
# per-pool daily data lives in <data_folder>/pool/day.
data_folder = "data"
pool_day_data_folder = os.path.join(data_folder, "pool", "day")

# Pools are kept only if their formatted name contains one of these symbols.
# TODO: whitelist by address, not symbols
whitelisted_symbols = ["BUSD", "USDC", "USDT", "DAI", "WETH", "WBNB", "WBTC"]

In [None]:
# Ids of the ten pools with the highest USD total value locked, largest first.
TEN_LARGEST_TVL_POOLS_QUERY = """
{
    pools(first: 10, orderBy: totalValueLockedUSD, orderDirection: desc) {
        id
    } 
}
"""
if refetch:
    largest_tvl_pool_ids = run_query(uniswap_v3_subgraph_url, TEN_LARGEST_TVL_POOLS_QUERY)
    # flatten [{"id": ...}, ...] into a plain list of pool addresses
    largest_tvl_pool_addrs = [
        pool_entry["id"] for pool_entry in largest_tvl_pool_ids["data"]["pools"]
    ]
    print(largest_tvl_pool_addrs)

In [None]:
# Parsed GraphQL document that looks up a single pool by its id (`$pool_addr`)
# and returns its tick, fee tier, sqrtPrice, liquidity and, for both tokens,
# the symbol, id and decimals.
GET_POOL_BY_ID_QUERY = gql(
    """
    query getPoolById($pool_addr: ID!) {
        pool(id: $pool_addr) {
            tick
            token0 {
                symbol
                id
                decimals
            }
            token1 {
                symbol
                id
                decimals
            }
            feeTier
            sqrtPrice
            liquidity
        }
    }
    """
)

In [None]:
async def fetch_pools_metadata(addresses: list, verbose: bool=False):
    """Fetch metadata for each pool address and cache it as JSON.

    Runs `GET_POOL_BY_ID_QUERY` once per address, in the order given, and
    writes the collected `pool` objects to `<data_folder>/topPoolDatas.json`
    under the key "topPoolDatas".

    Args:
        addresses: pool ids to look up.
        verbose: when True, pretty-print the collected result before saving.
    """
    pool_records = []

    async with Client(
        transport=AIOHTTPTransport(url=uniswap_v3_subgraph_url),
        fetch_schema_from_transport=True,
    ) as session:
        for pool_addr in addresses:
            response = await session.execute(
                GET_POOL_BY_ID_QUERY,
                variable_values={"pool_addr": pool_addr},
            )
            pool_records.append(response["pool"])

    result = {"topPoolDatas": pool_records}

    if verbose:
        pprint(result)

    # make sure the output folder exists before writing into it
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)

    with open(f"{data_folder}/topPoolDatas.json", "w") as out_file:
        json.dump(result, out_file, indent=4)

In [None]:
# Re-download the pool metadata into <data_folder>/topPoolDatas.json.
# `largest_tvl_pool_addrs` is only assigned by the earlier `if refetch:` cell.
if refetch:
    await fetch_pools_metadata(largest_tvl_pool_addrs)

In [None]:
def format_pool_name(symbol0: str, symbol1: str, fee_tier: int) -> str:
    """Build the pool label "<symbol0>_<symbol1>_<fee_tier>"."""
    return "{}_{}_{}".format(symbol0, symbol1, fee_tier)

In [None]:
with open(f"{data_folder}/topPoolDatas.json", "r") as f:
    top_pool_datas = json.load(f)

# TODO: better token symbol mapping
# token address -> token symbol, filled while flattening the pool records
token_symbols = {}

# replace each nested token dict with the token address
for pool_dict in top_pool_datas["topPoolDatas"]:
    for side in ("token0", "token1"):
        token_info = pool_dict[side]
        token_symbols[token_info["id"]] = token_info["symbol"]
        pool_dict[side] = token_info["id"]

column_types = {"token0": str, "token1": str, "feeTier": int}
top_pools_df = pd.DataFrame.from_dict(top_pool_datas["topPoolDatas"]).astype(column_types)

# add addr and name
# NOTE: `largest_tvl_pool_addrs` is only assigned when `refetch` is True, and is
# matched to the rows purely by position.
top_pools_df["pool_addr"] = largest_tvl_pool_addrs
top_pools_df["name"] = top_pools_df.apply(
    lambda pool_row: format_pool_name(
        token_symbols[pool_row["token0"]],
        token_symbols[pool_row["token1"]],
        pool_row["feeTier"],
    ),
    axis=1,
)

# drop unused cols
top_pools_df = top_pools_df.drop(columns=["tick", "sqrtPrice", "liquidity"])

# drop pools without whitelisted symbols (regex alternation over the pool name)
has_whitelisted_symbols = top_pools_df["name"].str.contains("|".join(whitelisted_symbols))
top_pools_df = top_pools_df[has_whitelisted_symbols]

top_pools_df.head()

In [None]:
# Column dtypes and non-null counts of the filtered pool table.
top_pools_df.info()

### Replicating the Info Page
For each pool we query the following two daily metrics (up to 1000 days per request):
1. TVL
2. 24H Volume

In [None]:
# Parsed GraphQL document returning one page (at most 1000 rows) of a pool's
# daily data with `date` greater than `$startTime`, ordered by date ascending.
# `$skip` is the number of rows to skip, i.e. the paging offset.
POOL_CHART = gql(
    """
    query poolDayDatas($startTime: Int!, $skip: Int!, $address: String!) {
        poolDayDatas(
            first: 1000
            skip: $skip
            where: { pool: $address, date_gt: $startTime }
            orderBy: date
            orderDirection: asc
            subgraphError: allow
        ) {
            date
            volumeUSD
            tvlUSD
            feesUSD
        }
    }
    """
)

In [None]:
# addresses of the pools whose download failed
fetch_pool_error = []

# ref: https://github.com/Uniswap/v3-info/blob/770a05dc1a191cf229432ebc43c1f2ceb3666e3b/src/data/pools/chartData.ts#L14
async def fetch_pool_chart_data(address: str, symbol0: str, symbol1: str, fee_tier: int, verbose: bool=False):
    """Download all daily data points of one pool and cache them as JSON.

    Pages through `POOL_CHART` until a page shorter than the page size is
    returned, then writes the concatenated rows to
    `<pool_day_data_folder>/<symbol0>_<symbol1>_<fee_tier>.json` under the key
    "poolDayDatas".

    If a query raises, the exception is printed, `address` is appended to the
    module-level `fetch_pool_error` list and no file is written.

    Args:
        address: pool id used in the `where` filter of the query.
        symbol0: symbol of token0, used only for the output file name.
        symbol1: symbol of token1, used only for the output file name.
        fee_tier: pool fee tier, used only for the output file name.
        verbose: when True, pretty-print the result before saving.
    """
    START_TIMESTAMP = 1619170975 # GMT: Friday, April 23, 2021 9:42:55 AM
    PAGE_SIZE = 1000 # must match `first:` in POOL_CHART

    error = False
    skip = 0
    result = {"poolDayDatas": []}

    transport = AIOHTTPTransport(url=uniswap_v3_subgraph_url)

    async with Client(
        transport=transport,
        fetch_schema_from_transport=True,
    ) as session:
        try:
            while True:
                # Rebuild the variables on every iteration so the advanced
                # offset is really sent; reusing one dict built before the
                # loop would request the first page forever.
                params = {
                    "address": address,
                    "startTime": START_TIMESTAMP,
                    "skip": skip
                }
                page = await session.execute(POOL_CHART, variable_values=params)
                rows = page["poolDayDatas"]
                result["poolDayDatas"].extend(rows)

                # a short page means there is nothing left to fetch
                if len(rows) < PAGE_SIZE:
                    break
                skip += PAGE_SIZE
        except Exception as e:
            print(e)
            error = True
            fetch_pool_error.append(address)

    if not error:
        if verbose:
            pprint(result)

        if not os.path.exists(pool_day_data_folder):
            os.makedirs(pool_day_data_folder)

        with open(f"{pool_day_data_folder}/{format_pool_name(symbol0, symbol1, fee_tier)}.json", "w") as f:
            json.dump(result, f, indent=4)

In [None]:
if refetch:
    # remove existing content in the out folder
    for f in glob.glob(pool_day_data_folder + "/*"):
        os.remove(f)

    # fetch pool data for each pool
    for i, row in top_pools_df.iterrows():
        await fetch_pool_chart_data(row["pool_addr"], token_symbols[row["token0"]], token_symbols[row["token1"]], row["feeTier"])
    print(fetch_pool_error)

In [None]:
# reads pool day datas from json into one wide frame:
# one row per "date", and per pool a "<pool>_volumeUSD" and "<pool>_tvlUSD" column
df = pd.DataFrame(columns=["date"])
pool_names = []

# Sorted so that the pool order (and thus plot/legend order) is reproducible.
for fullname in sorted(os.listdir(pool_day_data_folder)):
    # splitext keeps dots that belong to a token symbol inside the pool name
    pool_name, extension = os.path.splitext(fullname)

    # only the cached *.json files are pool data; skip anything else in the folder
    if extension != ".json":
        continue

    with open(os.path.join(pool_day_data_folder, fullname), "r") as file:
        pool_day_datas = json.load(file)

    # a pool without any daily rows has no columns to cast or merge
    if not pool_day_datas["poolDayDatas"]:
        print(f"skipping {fullname}: no poolDayDatas")
        continue

    # parse dict as df
    # Note: there is no need to analyze fees separately,
    # as it is a fixed proportion of the pool's trade volume
    temp = (
        pd.DataFrame.from_dict(pool_day_datas["poolDayDatas"])
        .astype({"volumeUSD": np.float64, "tvlUSD": np.float64})
        .drop(columns=["feesUSD"])
    )

    # prefix columns (except "date") with pool name
    temp = temp.rename(
        columns={col: f"{pool_name}_{col}" for col in temp.columns if col != "date"}
    )
    pool_names.append(pool_name)

    # outer join: union of items on "date"
    df = pd.merge(df, temp, how="outer", on=["date"])

# sort by "date" and renumber the rows from 0
df = df.sort_values(by="date").reset_index(drop=True)

df.head()

In [None]:
# ["date"]: int -> date (in "YYYY-MM-DD")
# Keep the raw timestamp in its own column. The guard makes the cell safe to
# re-run: on a second run "date" already holds date objects, so it must not
# overwrite "timestamp".
if "timestamp" not in df.columns:
    df["timestamp"] = df["date"]

# Convert in UTC rather than in the machine's local timezone, so the calendar
# day does not depend on where the notebook is executed.
# NOTE(review): assumes the subgraph's day buckets are UTC-based — confirm.
df["date"] = df["timestamp"].map(
    lambda ts: dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).date()
)

df.head()

In [None]:
# Column dtypes and non-null counts of the merged per-pool frame.
df.info()

In [None]:
# sanity check for number of days elapsed between the first row and today
first_day = df["date"][0]
today = dt.date.today()
print(first_day, "to", today, "has", (today - first_day))

## 24H Volume Analysis

In [None]:
# One line per pool: daily volume against calendar date, on a single axes.
fig, ax = plt.subplots(figsize=(19, 9))
for pool_name in pool_names:
    ax.plot(df["date"], df[pool_name + "_volumeUSD"])
ax.set_title("24H Volume over Time")
ax.set_xlabel("date")
ax.set_ylabel("24H Volume (in USD)")
# legend entries follow the plotting order above
ax.legend(pool_names, loc='upper left')
plt.show()

In [None]:
# find the days with the greatest 24H volumes
# df.sort_values(by="volumeUSD", ascending=False)[:10]

In [None]:
# # for reference only, no use now
# fig, ax = plt.subplots(figsize=(15, 1))
# sns.boxplot(data=df, x="volumeUSD")
# plt.xlim(0, 2e8)
# plt.show()

### Histogram
Observe the distribution of the 24H volumes of each pool.

In [None]:
# One histogram of daily volume per pool, laid out on a 3-column grid.
# The grid keeps at least 3 rows and grows when there are more than 9 pools,
# so indexing `axes` never runs past the last row.
n_cols = 3
n_rows = max(3, math.ceil(len(pool_names) / n_cols))

fig = plt.figure(figsize=(20, 4 * n_rows))
axes = fig.subplots(n_rows, n_cols)
for i, pool_name in enumerate(pool_names):
    volumeUSD_series = df[pool_name + "_volumeUSD"]
    ax = axes[i // n_cols, i % n_cols]
    ax.title.set_text(pool_name)
    # clip the x-range at the 99th percentile so outliers do not flatten the bins
    ax.hist(volumeUSD_series, bins=100, range=(0, volumeUSD_series.quantile(0.99)))

    # force y-axis ticks to use integers
    ax.get_yaxis().set_major_locator(MaxNLocator(integer=True))

    # highlight 25%-75% percentile
    lq = volumeUSD_series.quantile(0.25)
    uq = volumeUSD_series.quantile(0.75)
    ax.axvspan(lq, uq, color="green", alpha=0.25)

fig.suptitle("24H Volume Distributions")
fig.supxlabel("24H Volume (in USD)")
plt.show()

In [None]:
pool_df = df.drop(columns="timestamp")
# `pool_df` still contains the non-numeric "date" column, so restrict the
# reductions to numeric columns instead of letting them fail on it.
# note: df.std() is normalized by N-1
pool_metrics_df = pd.DataFrame(
    data=[pool_df.mean(numeric_only=True), pool_df.std(numeric_only=True)],
    index=["mean", "stdev"],
)
pool_metrics_df

In [None]:
# # for reference only, no use now
# df["volumeUSD"].plot.kde()
# plt.title(pool_name + " 24H Volume KDE")
# plt.xlim(0, 2e8)
# plt.show()

In [None]:
# TODO: time series / autocorrelation
# TODO: aggregate weekly and daily patterns and look for anomalies (e.g. Friday)
# TODO: ask for calculation of Greeks (Delta, Vega...)

### Fast Fourier Transform (FFT)
FFT computes the frequency content of each pool's 24H volume, treated as a signal sampled once per day.

In [None]:
# Magnitude spectrum of each pool's daily volume, one panel per pool.
# The grid keeps at least 3 rows and grows when there are more than 9 pools,
# so indexing `axes` never runs past the last row.
n_cols = 3
n_rows = max(3, math.ceil(len(pool_names) / n_cols))

fig = plt.figure(figsize=(20, 4 * n_rows))
axes = fig.subplots(n_rows, n_cols)
for i, pool_name in enumerate(pool_names):
    # only the days on which this pool has a volume value
    date_volume_df = df[["date", pool_name + "_volumeUSD"]].dropna()
    volumeUSD_series = date_volume_df[pool_name + "_volumeUSD"]

    # reference for zero-mean signal:
    # https://dsp.stackexchange.com/questions/46950/removing-mean-from-signal-massively-distorts-fft
    # only keep those with freq STRICTLY > 0
    f_max = math.ceil(date_volume_df.shape[0]/2)
    Y = abs(np.fft.fft(volumeUSD_series - volumeUSD_series.mean()))[1:f_max]
    # sample spacing of 1, i.e. one sample per day
    freq = np.fft.fftfreq(date_volume_df.shape[0], 1)[1:f_max]

    ax = axes[i // n_cols, i % n_cols]
    ax.title.set_text(pool_name)
    ax.plot(freq, Y)

fig.suptitle("24H Volume FFT")
fig.supxlabel("freq (in /day)")
fig.supylabel("24H Volume (in USD)")
plt.show()

In [None]:
# Squared FFT magnitude of each pool's daily volume on a log y-axis, one panel per pool.
# The grid keeps at least 3 rows and grows when there are more than 9 pools,
# so indexing `axes` never runs past the last row.
n_cols = 3
n_rows = max(3, math.ceil(len(pool_names) / n_cols))

fig = plt.figure(figsize=(20, 4 * n_rows))
axes = fig.subplots(n_rows, n_cols)
for i, pool_name in enumerate(pool_names):
    # only the days on which this pool has a volume value
    date_volume_df = df[["date", pool_name + "_volumeUSD"]].dropna()
    volumeUSD_series = date_volume_df[pool_name + "_volumeUSD"]

    # reference for zero-mean signal:
    # https://dsp.stackexchange.com/questions/46950/removing-mean-from-signal-massively-distorts-fft
    # only keep those with freq STRICTLY > 0
    f_max = math.ceil(date_volume_df.shape[0]/2)
    Y = abs(np.fft.fft(volumeUSD_series - volumeUSD_series.mean()))[1:f_max]
    freq = np.fft.fftfreq(date_volume_df.shape[0], 1)[1:f_max]

    # c.f. power spectral density in signal processing
    # Y is already a real magnitude, so its square is the squared modulus
    spectrum = Y * Y

    ax = axes[i // n_cols, i % n_cols]
    ax.title.set_text(pool_name)
    # NOTE(review): the left limit is the second plotted bin (freq[1]), which
    # hides freq[0] — confirm this is intended.
    ax.set_xlim(left=freq[1], right=freq[-1])

    # Note: the squared magnitude is never negative, so its logarithm exists
    # wherever it is non-zero.
    # plot log10(spectrum) against frequency
    ax.semilogy(freq, spectrum)

fig.suptitle("Semilog Plot of 24H Volume FFT")
fig.supxlabel("freq (in /day)")
fig.supylabel("Magnitude")
plt.show()

## TVL Analysis

In [None]:
# TVL of each pool over time together with a fitted trend line, one panel per pool.
# The grid keeps at least 3 rows and grows when there are more than 9 pools,
# so indexing `axes` never runs past the last row.
n_cols = 3
n_rows = max(3, math.ceil(len(pool_names) / n_cols))

fig = plt.figure(figsize=(20, 4 * n_rows))
axes = fig.subplots(n_rows, n_cols, sharex=True)
for i, pool_name in enumerate(pool_names):
    # only the days on which this pool has a TVL value
    date_tvl_df = df[["date", pool_name + "_tvlUSD", "timestamp"]].dropna()
    # TODO: customize number of turning points
    # the regression runs on the numeric timestamp; the plot uses the date column
    reg_result = pwlf_helper.regression(date_tvl_df["timestamp"], date_tvl_df[pool_name + "_tvlUSD"], 6)

    ax = axes[i // n_cols, i % n_cols]
    ax.title.set_text(pool_name + " TVL over Time")

    ax.plot(date_tvl_df["date"], date_tvl_df[pool_name + "_tvlUSD"])
    ax.plot(date_tvl_df["date"], reg_result.yHat, '--')

    # ax.legend(["TVL", "PWLF fitted trend line"])

    # # x-axis ticks are spaced out biweekly (for now)
    # ax.xticks(pd.date_range(date_tvl_df["date"].iloc[0], date_tvl_df["date"].iloc[-1], freq="14D"))

    # # annotate turning points
    # for tp in reg_result.tp[1:-1]:
    #     tp_date = dt.date.fromtimestamp(tp)
    #     tp_str = tp_date.strftime("%Y-%m-%d")
    #     pred = reg_result.predict(tp)
    #     ax.annotate(tp_str, xy=(tp_date, pred), xytext=(tp_date, pred+0.5e8),
    #         arrowprops=dict(arrowstyle="->", color='red')
    # )

fig.suptitle("Growth Stages of TVL")
fig.supxlabel("date")
fig.supylabel("TVL (in USD)")
plt.show()