# Check data

Investigate combined data for uni exchange.

In [13]:
import datetime
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from artool import toy
from artool.toy.toy_simu import get_pnl_simple

# Remove pandas' display limits so every row and column of a frame is shown.
# NOTE(review): with no row limit, displaying a large frame prints all of it;
# prefer .head()/.describe()/.info() when inspecting the merged data.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Select the Agg backend before pyplot is imported; the plotting cells below
# write their figures to disk with fig.savefig().
import matplotlib
matplotlib.use("Agg")  # important to improve performance
import matplotlib.pyplot as plt
import seaborn as sns


In [14]:
# Input location: the merge cell reads one "<symbol>.feather" file per symbol
# from this directory.
# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
data_dir = Path("/home/yangzhe/data/toy_data_2")

# Date window handed to the symbol lookup.
date_start = datetime.datetime(year=2022, month=3, day=1)
date_end = datetime.datetime(year=2022, month=9, day=1)
symbols = toy.toy_data.get_symbol_list(date_start, date_end)

# Output location for the figures saved by the plotting cells.
res_dir = Path("./check_data")
res_dir.mkdir(exist_ok=True)

## Merge all symbol data

In [15]:
# Merge all symbols: read each symbol's feather file, drop its first and last
# 5 rows, and stack everything into one frame with a fresh 0..n-1 index.
# The per-symbol frames are collected first and concatenated once; calling
# pd.concat inside the loop re-copies the accumulated frame on every pass.
symbol_frames = [
    pd.read_feather(data_dir / f"{symbol}.feather").iloc[5:-5]
    for symbol in symbols
]
# pd.concat raises on an empty list, so keep the empty-frame result when
# there are no symbols at all.
df = (
    pd.concat(symbol_frames, axis=0, ignore_index=True)
    if symbol_frames
    else pd.DataFrame()
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107595 entries, 0 to 107594
Columns: 120 entries, symbol to mark_price_rol_kurt_50
dtypes: float64(119), object(1)
memory usage: 98.5+ MB


## Investigate indicator distribution

In [16]:
# Summary statistics (count, mean, std, quantiles, min/max) of the
# funding_rate_expcumsum_5 column across all merged symbols.
df["funding_rate_expcumsum_5"].describe()

count    107595.000000
mean         -0.000166
std           0.001223
min          -0.052888
25%          -0.000323
50%           0.000149
75%           0.000389
max           0.003644
Name: funding_rate_expcumsum_5, dtype: float64

In [17]:
# Histogram of one indicator column. The commented loop below is kept as the
# hook for scanning other look-forward windows.
#for lookforward in [1, 2, 3, 5, 10]:
#    ft = f"funding_rate_expcumsum_{lookforward}"

ft = "funding_rate_expcumsum_5"
indicator = df[ft]
lowest, highest = indicator.min(), indicator.max()
print(f"{ft}: min={lowest}, max={highest}")

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(indicator, ax=ax)
# The x-axis is clipped to [-0.01, 0.01]; compare with the printed min/max to
# see how much of the range falls outside the drawn window.
ax.set(title=ft, xlim=(-0.01, 0.01), xlabel=ft, ylabel="count")
fig.savefig(res_dir / f"{ft}.png")


funding_rate_expcumsum_5: min=-0.052888190384714026, max=0.003643558068698228


## What are those outliers?

In [18]:
# Rows whose indicator falls below the threshold are treated as outliers.
# The threshold and the boolean mask are defined once so the filtered frame
# and the flag column cannot drift apart.
ft = "funding_rate_expcumsum_5"
outlier_threshold = -0.005
outlier_mask = df[ft] < outlier_threshold

# Filtered frame of the outlier rows, then the same mask stored as a column
# on the full frame for later cells.
df_outlier = df[outlier_mask]
df["is_outlier"] = outlier_mask

In [19]:
# Fraction of all merged rows that were flagged as outliers.
# (The bare `df_outlier.shape` expression was dropped: it was not the last
# statement of the cell, so its value was never displayed.)
outlier_ratio = len(df_outlier) / len(df)
print(f"outlier_ratio: {outlier_ratio}")

outlier_ratio: 0.007463172080487012


In [20]:
# Number of outlier rows per symbol; value_counts() sorts by count, largest
# first.
sb_cnt = df_outlier["symbol"].value_counts()

# Horizontal bar chart of the 20 symbols with the most outliers. The slice is
# reversed before plotting so the bars are drawn in ascending-count order.
top20_counts = sb_cnt.iloc[:20].iloc[::-1]
fig, ax = plt.subplots(figsize=(4, 6))
top20_counts.plot.barh(ax=ax)
ax.set(
    title="Top 20 symbols with most outliers",
    xlabel="count",
    ylabel="symbol",
)
fig.savefig(res_dir / "top20_symbols.png")

In [21]:
# Get life time of each symbol: first and last funding timestamp per symbol
# plus the number of whole days between them.
# NOTE(review): timestamps are parsed as microseconds since the epoch
# (unit="us") - confirm against the data producer.
life_time = (
    df.groupby("symbol")["funding_timestamp"]
    .agg(["min", "max"])
    .assign(
        life_time=lambda span: (
            pd.to_datetime(span["max"], unit="us")
            - pd.to_datetime(span["min"], unit="us")
        ).dt.days
    )
    .reset_index()
)
life_time.head()


Unnamed: 0,symbol,min,max,life_time
0,1INCHUSDT,1641168000000000.0,1661933000000000.0,240
1,AAVEUSDT,1641168000000000.0,1661933000000000.0,240
2,ADABUSD,1641168000000000.0,1661933000000000.0,240
3,ADAUSDT,1641168000000000.0,1661933000000000.0,240
4,AKROUSDT,1641168000000000.0,1653523000000000.0,143


In [22]:
# plot life time of the first 20 symbols horizontally
top_outlier_symbols = sb_cnt.index[:20]

# One label-based lookup on a symbol-indexed view replaces filtering the
# whole `life_time` frame once per symbol; .loc keeps the requested order.
top_outlier_lifetimes = (
    life_time.set_index("symbol")
    .loc[top_outlier_symbols, "life_time"]
    .to_numpy()
)
df_tmp = pd.DataFrame({"symbol": top_outlier_symbols, "life_time": top_outlier_lifetimes})

# Rows are reversed before plotting, matching the order used for the
# outlier-count chart of the same symbols.
fig, ax = plt.subplots(figsize=(4, 6))
df_tmp.iloc[::-1].plot(kind="barh", x="symbol", y="life_time", ax=ax)
ax.set_title("Top 20 symbols with most outliers")
ax.set_xlabel("life time (days)")
ax.set_ylabel("symbol")
fig.savefig(res_dir / "top20_symbols_lifetime.png")


## Check relation
Compare the distribution of `funding_rate_future_5` between rows flagged as `funding_rate_expcumsum_5` outliers and all other rows.

In [23]:
# plot funding_rate_future_5 distribution, split by the boolean outlier flag
outlier_flag = df["is_outlier"]
future5_regular = df.loc[~outlier_flag, "funding_rate_future_5"]
future5_outlier = df.loc[outlier_flag, "funding_rate_future_5"]

fig, ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(future5_regular, ax=ax, label="not outlier")
sns.kdeplot(future5_outlier, ax=ax, label="outlier")
# Limits are set first and the axis is then switched to log scale, so both
# density curves are drawn on a logarithmic y-axis.
ax.set_ylim(0.1, 2000)
ax.set_yscale("log")
ax.set(
    title="funding_rate_future_5",
    xlabel="funding_rate_future_5",
    ylabel="arb. unit",
)
ax.legend()
fig.savefig(res_dir / "funding_rate_future_5.png")

## Other

In [25]:
# Plot funding rate: kernel density estimate of the funding_rate column over
# all merged rows, followed by its summary statistics as the cell output.
fr = df["funding_rate"]

fig, ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(fr, ax=ax)
ax.set(title="funding_rate", xlabel="funding_rate", ylabel="arb. unit")
fig.savefig(res_dir / "funding_rate.png")

fr.describe()

count    107595.000000
mean         -0.000038
std           0.000390
min          -0.018750
25%          -0.000071
50%           0.000095
75%           0.000100
max           0.004861
Name: funding_rate, dtype: float64