# 1 - Data Collection

In this section, we fetch data from the Uniswap V3 subgraph and store it as JSON files for further processing.

In [None]:
# Standard Library
import datetime as dt
import glob
import json
import os
from pprint import pprint

# Third Party Library
import numpy as np
from flatdict import FlatDict
import pandas as pd
from gql import Client, gql
from gql.transport.aiohttp import AIOHTTPTransport

# Local Folder Library
from pyammanalysis.graphql_helper import run_query
from pyammanalysis.util import read_yaml

In [None]:
# refetch setting - if True, rerun GraphQL queries
refetch = True

# data folder paths
data_folder = "data"
pool_day_data_folder = os.path.join(data_folder, "pool", "day")

# `exist_ok=True` keeps the cell re-runnable without a separate existence check
os.makedirs(data_folder, exist_ok=True)

# config
config = read_yaml("../config.yaml")
UNISWAP_V3_SUBGRAPH_URL = config["UNISWAP_V3_SUBGRAPH_URL"]

token_dict = config["tokens"]
token_addr_dict = config["token_addr"]  # mapping from symbol to addr
whitelisted_symbols = np.sort(
    np.concatenate(list(FlatDict(token_dict).itervalues()))
)

# address-related config
# addresses in `config.yaml` follow EIP-55: Mixed-case checksum address encoding
# enforce lower case by `str.lower()`

# reverse mapping (lower-cased addr -> symbol), built once instead of on every lookup
_addr_to_symbol = {addr.lower(): symbol for symbol, addr in token_addr_dict.items()}


def sym2addr(symbol):
    """Return the lower-cased address configured for `symbol`.

    Raises:
        KeyError: if `symbol` is not listed under `token_addr` in the config.
    """
    return token_addr_dict[symbol].lower()


def addr2sym(addr):
    """Return the symbol whose configured address, lower-cased, equals `addr`.

    Raises:
        KeyError: if no configured token has that lower-cased address.
    """
    return _addr_to_symbol[addr]


whitelisted_addresses = np.array(
    [i.lower() for i in FlatDict(token_addr_dict).itervalues()]
)

In [None]:
# GraphQL query: ids of the first 20 pools ordered by totalValueLockedUSD, descending
TWENTY_LARGEST_TVL_POOLS_QUERY = """
{
    pools(first: 20, orderBy: totalValueLockedUSD, orderDirection: desc) {
        id
    } 
}
"""

# on-disk cache of the pool id list
LARGEST_TVL_POOLS_PATH = os.path.join(data_folder, "largestTVLPoolAddr.json")

if not refetch:
    # reuse the list saved by a previous run
    with open(LARGEST_TVL_POOLS_PATH, "r") as f:
        largest_tvl_pool_addrs = json.load(f)["poolList"]
else:
    # query the subgraph, keep only the pool ids, and refresh the cache file
    largest_tvl_pool_ids = run_query(
        UNISWAP_V3_SUBGRAPH_URL, TWENTY_LARGEST_TVL_POOLS_QUERY
    )
    largest_tvl_pool_addrs = [
        pool["id"] for pool in largest_tvl_pool_ids["data"]["pools"]
    ]
    with open(LARGEST_TVL_POOLS_PATH, "w") as f:
        json.dump({"poolList": largest_tvl_pool_addrs}, f, indent=4)

print(largest_tvl_pool_addrs)

In [None]:
# Parsed GraphQL query that looks up a single pool by id (`$pool_addr`) and
# requests its tick, fee tier, sqrtPrice, liquidity, and the symbol / id /
# decimals of both of its tokens.
GET_POOL_BY_ID_QUERY = gql(
    """
    query getPoolById($pool_addr: ID!) {
        pool(id: $pool_addr) {
            tick
            token0 {
                symbol
                id
                decimals
            }
            token1 {
                symbol
                id
                decimals
            }
            feeTier
            sqrtPrice
            liquidity
        }
    }
    """
)

In [None]:
async def fetch_pools_metadata(addresses: list, verbose: bool = False):
    """Query metadata for every pool in `addresses` and dump it to JSON.

    One `GET_POOL_BY_ID_QUERY` request is sent per address, in order, over a
    single client session. The "pool" entry of each response is collected
    under the "topPoolDatas" key and written to `topPoolDatas.json` inside
    `data_folder`.

    Args:
        addresses: pool ids, each passed as the `pool_addr` query variable.
        verbose: if True, pretty-print the collected result before saving.
    """
    subgraph_transport = AIOHTTPTransport(url=UNISWAP_V3_SUBGRAPH_URL)
    pool_entries = []

    async with Client(
        transport=subgraph_transport, fetch_schema_from_transport=True
    ) as session:
        for pool_addr in addresses:
            response = await session.execute(
                GET_POOL_BY_ID_QUERY, variable_values={"pool_addr": pool_addr}
            )
            pool_entries.append(response["pool"])

    result = {"topPoolDatas": pool_entries}

    if verbose:
        pprint(result)

    with open(f"{data_folder}/topPoolDatas.json", "w") as out_file:
        json.dump(result, out_file, indent=4)

In [None]:
# Only hit the subgraph when a refetch is requested; otherwise the
# `topPoolDatas.json` written by an earlier run is reused by the cells below.
if refetch:
    await fetch_pools_metadata(largest_tvl_pool_addrs)

In [None]:
def format_pool_name(symbol0: str, symbol1: str, fee_tier: int) -> str:
    """Build a pool name of the form `<symbol0>_<symbol1>_<fee_tier>`."""
    name_parts = (symbol0, symbol1, fee_tier)
    return "_".join(str(part) for part in name_parts)

In [None]:
with open(f"{data_folder}/topPoolDatas.json", "r") as f:
    top_pool_datas = json.load(f)

# replace nested dict with token addr
for pool_dict in top_pool_datas["topPoolDatas"]:
    for token in ["token0", "token1"]:
        pool_dict[token] = pool_dict[token]["id"]

top_pools_df = pd.DataFrame.from_dict(top_pool_datas["topPoolDatas"]).astype(
    {"token0": str, "token1": str, "feeTier": int}
)

# drop unused cols
top_pools_df = top_pools_df.drop(columns=["tick", "sqrtPrice", "liquidity"])

# add addr (rows are in the same order as `largest_tvl_pool_addrs`)
top_pools_df["pool_addr"] = largest_tvl_pool_addrs

# whitelist a pool if both its token0 and token1 are whitelisted
is_whitelisted_pool = top_pools_df["token0"].isin(whitelisted_addresses) & top_pools_df[
    "token1"
].isin(whitelisted_addresses)
# `.copy()` detaches the filtered rows from the original frame, so adding the
# "name" column below is not an assignment on a slice (SettingWithCopyWarning)
top_pools_df = top_pools_df[is_whitelisted_pool].copy()

# add name
# built as a plain list rather than `DataFrame.apply(axis=1)`, which returns an
# empty DataFrame (not a Series) when no pool passes the whitelist
top_pools_df["name"] = [
    format_pool_name(addr2sym(token0), addr2sym(token1), fee_tier)
    for token0, token1, fee_tier in zip(
        top_pools_df["token0"], top_pools_df["token1"], top_pools_df["feeTier"]
    )
]

top_pools_df

In [None]:
# print column dtypes and non-null counts of the filtered pool table
top_pools_df.info()

In [None]:
# Parsed GraphQL query for the daily data (date, volumeUSD, tvlUSD, feesUSD) of
# one pool (`$address`) with `date` greater than `$startTime`, ordered by date
# ascending. Each request returns at most 1000 rows; `$skip` is the offset
# used to page through the rest.
POOL_CHART = gql(
    """
    query poolDayDatas($startTime: Int!, $skip: Int!, $address: String!) {
        poolDayDatas(
            first: 1000
            skip: $skip
            where: { pool: $address, date_gt: $startTime }
            orderBy: date
            orderDirection: asc
            subgraphError: allow
        ) {
            date
            volumeUSD
            tvlUSD
            feesUSD
        }
    }
    """
)

In [None]:
# addresses of pools whose chart data could not be fetched
fetch_pool_error = []


# ref: https://github.com/Uniswap/v3-info/blob/770a05dc1a191cf229432ebc43c1f2ceb3666e3b/src/data/pools/chartData.ts#L14
async def fetch_pool_chart_data(
    address: str, symbol0: str, symbol1: str, fee_tier: int, verbose: bool = False
):
    """Download all daily data points of one pool and save them as JSON.

    Pages through the `POOL_CHART` query until a page comes back with fewer
    rows than the page size, then writes `{"poolDayDatas": [...]}` to
    `<pool_day_data_folder>/<symbol0>_<symbol1>_<fee_tier>.json`.

    If a query raises, the exception is printed, `address` is appended to
    the module-level `fetch_pool_error` list, and no file is written.

    Args:
        address: pool address, passed as the `address` query variable.
        symbol0: symbol of token0, used only for the output file name.
        symbol1: symbol of token1, used only for the output file name.
        fee_tier: pool fee tier, used only for the output file name.
        verbose: if True, pretty-print the collected result before saving.
    """
    START_TIMESTAMP = 1619170975  # GMT: Friday, April 23, 2021 9:42:55 AM
    PAGE_SIZE = 1000  # must match `first: 1000` in POOL_CHART

    error = False
    skip = 0
    result = {"poolDayDatas": []}

    transport = AIOHTTPTransport(url=UNISWAP_V3_SUBGRAPH_URL)

    async with Client(
        transport=transport,
        fetch_schema_from_transport=True,
    ) as session:
        try:
            while True:
                # rebuild the variables for every page so the advanced `skip`
                # actually reaches the query; reusing one dict built before the
                # loop would request the first page over and over
                params = {
                    "address": address,
                    "startTime": START_TIMESTAMP,
                    "skip": skip,
                }
                page = await session.execute(POOL_CHART, variable_values=params)
                rows = page["poolDayDatas"]
                result["poolDayDatas"].extend(rows)

                # a short page means there is nothing left to fetch
                if len(rows) < PAGE_SIZE:
                    break
                skip += PAGE_SIZE
        except Exception as e:
            # best effort: record the failing pool and let the caller carry on
            print(e)
            error = True
            fetch_pool_error.append(address)

    if error:
        return

    if verbose:
        pprint(result)

    os.makedirs(pool_day_data_folder, exist_ok=True)

    out_path = os.path.join(
        pool_day_data_folder, f"{format_pool_name(symbol0, symbol1, fee_tier)}.json"
    )
    with open(out_path, "w") as f:
        json.dump(result, f, indent=4)

In [None]:
if refetch:
    # remove existing content in the out folder
    for f in glob.glob(pool_day_data_folder + "/*"):
        os.remove(f)

    # fetch pool data for each pool
    for i, row in top_pools_df.iterrows():
        await fetch_pool_chart_data(
            row["pool_addr"],
            addr2sym(row["token0"]),
            addr2sym(row["token1"]),
            row["feeTier"],
        )
    print(fetch_pool_error)

In [None]:
# reads pool day datas from json
df = pd.DataFrame(columns=["date"])
pool_names = []

# sorted: `os.listdir` returns entries in arbitrary order, which would make the
# column order and `pool_names` differ between runs
for fullname in sorted(os.listdir(pool_day_data_folder)):
    # file name without extension is the pool name; `splitext` only strips the
    # final extension, so names that themselves contain a "." stay intact
    pool_name, extension = os.path.splitext(fullname)

    # only the per-pool JSON dumps are parsed; anything else in the folder
    # (e.g. the `poolDay.csv` export saved here) is skipped
    if extension != ".json":
        continue

    with open(os.path.join(pool_day_data_folder, fullname), "r") as file:
        pool_day_datas = json.load(file)

    # a pool without any day data has no columns to cast or merge
    if not pool_day_datas["poolDayDatas"]:
        continue

    # parse dict as df
    temp = pd.DataFrame.from_dict(pool_day_datas["poolDayDatas"]).astype(
        {"volumeUSD": np.float64, "tvlUSD": np.float64}
    )

    # Note: there is no need to analyze fees separately,
    # as it is a fixed proportion of the pool's trade volume
    temp.drop(columns=["feesUSD"], inplace=True)

    # prefix columns (except "date") with pool name
    cols = temp.columns[~temp.columns.isin(["date"])]
    pool_names.append(pool_name)
    temp.rename(columns=dict(zip(cols, pool_name + "_" + cols)), inplace=True)

    # outer join: union of items on "date"
    df = pd.merge(df, temp, how="outer", on=["date"])

# sort by "date"
df.sort_values(by="date", inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

In [None]:
# ["date"]: int -> date (in "YYYY-MM-DD")
df["timestamp"] = df["date"]  # keep timestamp in a new col
# Convert in UTC rather than the machine's local timezone, so the resulting
# calendar day does not depend on where the notebook is run.
# NOTE(review): assumes the subgraph's day timestamps mark UTC days - confirm.
df["date"] = df["date"].map(
    lambda ts: dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).date()
)

df.head()

In [None]:
# print column dtypes and non-null counts of the merged per-day table
df.info()

In [None]:
# sanity check for number of days elapsed
# (first row holds the earliest date; the frame is sorted by "date" above)
first_day = df["date"][0]
today = dt.date.today()
print(first_day, "to", today, "has", today - first_day)

In [None]:
# Look each pool's address up by name in `top_pools_df` instead of zipping
# `pool_names` with `largest_tvl_pool_addrs`: the two lists differ in order
# (file listing vs. TVL ranking) and in length (only whitelisted pools have a
# file), so a positional zip pairs names with the wrong addresses.
has_day_data = top_pools_df["name"].isin(pool_names)
pools_df = (
    top_pools_df.loc[has_day_data, ["name", "pool_addr"]]
    .rename(columns={"pool_addr": "addr"})
    .reset_index(drop=True)
)
pools_df.to_csv(os.path.join(data_folder, "pools_df.csv"), index=False)
df.to_csv(os.path.join(pool_day_data_folder, "poolDay.csv"), index=False)