## EDA (Exploratory Data Analysis) on Bybit Data

In [1]:
%%capture output
!pip install ydata-profiling;

In [14]:
import io

import pandas as pd
from dotenv import load_dotenv, find_dotenv
from ydata_profiling import ProfileReport

from pfeed import bybit
from pfeed.config_handler import ConfigHandler


load_dotenv(find_dotenv())

# load config file
config = ConfigHandler.load_config()


%load_ext autoreload
# Reload all modules (except those excluded by %aimport) every time before executing the Python code typed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Product identifier and date (YYYY-MM-DD) passed to every extract_data call below.
pdt, date = 'BTC_USDT_PERP', '2020-03-25'

### Load raw data

In [15]:
# Fetch the stored 'raw' historical payload for the selected product and date.
data = bybit.etl.extract_data(
    pdt,
    date,
    'raw',
    mode='historical',
    data_path=config.data_path,
)

# The payload is parsed as gzip-compressed CSV bytes.
raw_df = pd.read_csv(io.BytesIO(data), compression='gzip')

# Report the frame's dimensions, then preview the first rows.
print(f'{raw_df.shape=}')
raw_df.head()

raw_df.shape=(2693, 10)


Unnamed: 0,timestamp,symbol,side,size,price,tickDirection,trdMatchID,grossValue,homeNotional,foreignNotional
0,1585181000.0,BTCUSDT,Buy,0.042,6698.5,PlusTick,08ff9568-cb50-55d6-b497-13727eec09dc,28133700000.0,0.042,281.337
1,1585181000.0,BTCUSDT,Buy,0.072,6698.0,PlusTick,d9f5154a-d1e7-5ba2-9953-b41d5e74ea68,48225600000.0,0.072,482.256
2,1585181000.0,BTCUSDT,Sell,0.009,6682.0,PlusTick,09dbd416-1ea7-5033-ac1d-11604b0c9bc8,6013800000.0,0.009,60.138
3,1585180000.0,BTCUSDT,Buy,0.082,6676.5,ZeroMinusTick,fd0a0488-3769-5c38-affc-4719c9738bab,54747300000.0,0.082,547.473
4,1585180000.0,BTCUSDT,Buy,0.181,6676.5,PlusTick,8bf4d6e3-5435-5aeb-8ae3-7d3b13896674,120844600000.0,0.181,1208.4465


In [16]:
# Build the profiling report for the raw frame; leaving it as the cell's last
# expression lets the notebook render it.
report_title = f'{bybit.name} Raw Data Profiling Report'
ProfileReport(raw_df, title=report_title)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



### Load tick data

In [17]:
# Fetch the stored 'tick' historical payload for the selected product and date.
data = bybit.etl.extract_data(
    pdt,
    date,
    'tick',
    mode='historical',
    data_path=config.data_path,
)

# Parse the parquet bytes and promote the 'ts' column to the index.
tick_df = pd.read_parquet(io.BytesIO(data)).set_index('ts')

# Interpret the index values as seconds since the Unix epoch.
tick_df.index = pd.to_datetime(tick_df.index, unit='s')
tick_df.head()

Unnamed: 0_level_0,side,volume,price
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-25 23:58:20.064699888,1,0.042,6698.5
2020-03-25 23:58:20.019999981,1,0.072,6698.0
2020-03-25 23:56:15.859400034,-1,0.009,6682.0
2020-03-25 23:51:08.728499889,1,0.082,6676.5
2020-03-25 23:51:08.728499889,1,0.181,6676.5


### Load second/minute/hour/daily data

In [18]:
# Resolution to load; switch between the options to inspect other granularities.
data_type = 'second'  # second/minute/hour/daily

# Fetch the stored historical payload at the chosen resolution.
data = bybit.etl.extract_data(
    pdt,
    date,
    data_type,
    mode='historical',
    data_path=config.data_path,
)

# Parse the parquet bytes and preview the first rows.
df = pd.read_parquet(io.BytesIO(data))
df.head()

Unnamed: 0_level_0,num_buys,num_sells,volume,buy_volume,sell_volume,open,high,low,close,first
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-03-25 10:36:12,0,1,0.001,0.0,0.001,6500.0,6500.0,6500.0,6500.0,N
2020-03-25 10:44:16,1,0,0.001,0.001,0.0,6500.0,6500.0,6500.0,6500.0,N
2020-03-25 10:53:14,1,0,0.001,0.001,0.0,6588.0,6588.0,6588.0,6588.0,N
2020-03-25 10:57:19,0,1,0.001,0.0,0.001,6591.5,6591.5,6591.5,6591.5,N
2020-03-25 11:02:37,1,0,0.001,0.001,0.0,6603.5,6603.5,6603.5,6603.5,N
