# Exploratory Data Analysis (EDA)

## Importing libraries

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("xnas-itch-20230703.tbbo.csv")

In [3]:
# Preprocessing to create necessary columns
# Divisor that turns the raw integer prices into decimal prices.
# NOTE(review): presumably the data vendor's 1e-9 fixed-point price encoding - confirm.
PRICE_SCALE = 1e9

# 'Close' is created by this cell, so its absence means the raw prices have not
# been rescaled yet. Without this guard, running the cell a second time would
# divide the already-scaled prices by PRICE_SCALE again.
if 'Close' not in data.columns:
    for price_col in ['price', 'bid_px_00', 'ask_px_00']:
        data[price_col] = data[price_col] / PRICE_SCALE

    # OHLCV-style columns expected by the indicator code.
    data['Close'] = data['price']
    data['Volume'] = data['size']
    # High / Low are the larger / smaller of bid_px_00 and ask_px_00 on each row.
    data['High'] = data[['bid_px_00', 'ask_px_00']].max(axis=1)
    data['Low'] = data[['bid_px_00', 'ask_px_00']].min(axis=1)
    # Open is the previous row's Close; the first row falls back to its own Close.
    data['Open'] = data['Close'].shift(1).fillna(data['Close'])

# data

In [4]:
# Unique values for each column
# data.nunique()

In [5]:
# Remove the listed columns from the working frame, then preview what remains.
columns_to_drop = ['rtype', 'publisher_id', 'instrument_id', 'action', 'depth', 'symbol']
data = data.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,ts_recv,ts_event,side,price,size,flags,ts_in_delta,sequence,bid_px_00,ask_px_00,bid_sz_00,ask_sz_00,bid_ct_00,ask_ct_00,Close,Volume,High,Low,Open
0,1688371200660869841,1688371200660704717,B,194.12,1,130,165124,303634,193.63,194.12,27,27,1,1,194.12,1,194.12,193.63,194.12
1,1688371201201402566,1688371201201237816,B,194.11,2,130,164750,304724,193.9,194.11,5,400,1,1,194.11,2,194.11,193.9,194.12
2,1688371201233688992,1688371201233524761,B,194.11,8,130,164231,304850,193.9,194.11,5,398,1,1,194.11,8,194.11,193.9,194.11
3,1688371201317556361,1688371201317392163,B,194.11,2,130,164198,305101,193.9,194.11,5,390,1,1,194.11,2,194.11,193.9,194.11
4,1688371201478520666,1688371201478356044,B,194.0,7,130,164622,306430,193.9,194.0,5,200,1,1,194.0,7,194.0,193.9,194.11


In [6]:
import numpy as np
import talib as ta

class TechnicalIndicators:
    """Add technical-indicator columns to a DataFrame of price/volume rows.

    The frame handed to the constructor is kept by reference and modified in
    place: each ``add_*`` method writes its result columns onto ``self.data``,
    so the caller's own frame gains those columns as well.

    Every ``add_*`` method first checks that the columns it reads exist and
    raises ``KeyError`` naming the missing ones before writing anything, so a
    failed call never leaves a partially populated frame behind.
    """

    # Look-back windows used for the ``ATR_<period>`` columns when none are given.
    DEFAULT_ATR_PERIODS = (1, 2, 5, 10, 20)

    def __init__(self, data):
        """Store ``data`` (not copied) as the frame the indicators are written to."""
        self.data = data

    def _require_columns(self, *columns):
        """Raise ``KeyError`` listing any of ``columns`` absent from ``self.data``."""
        missing = [name for name in columns if name not in self.data.columns]
        if missing:
            raise KeyError(f"TechnicalIndicators needs missing column(s): {missing}")

    def add_momentum_indicators(self):
        """Add RSI, MACD (line, signal, histogram) and stochastic %K/%D columns.

        Reads ``Close``, ``High`` and ``Low``.
        Writes ``RSI``, ``MACD``, ``MACD_signal``, ``MACD_hist``, ``Stoch_k``, ``Stoch_d``.
        """
        self._require_columns('Close', 'High', 'Low')
        high, low, close = self.data['High'], self.data['Low'], self.data['Close']

        self.data['RSI'] = ta.RSI(close, timeperiod=14)
        self.data['MACD'], self.data['MACD_signal'], self.data['MACD_hist'] = ta.MACD(
            close, fastperiod=12, slowperiod=26, signalperiod=9)
        self.data['Stoch_k'], self.data['Stoch_d'] = ta.STOCH(
            high, low, close, fastk_period=14, slowk_period=3, slowd_period=3)

    def add_volume_indicators(self):
        """Add the on-balance-volume column ``OBV`` from ``Close`` and ``Volume``."""
        self._require_columns('Close', 'Volume')
        self.data['OBV'] = ta.OBV(self.data['Close'], self.data['Volume'])

    def add_volatility_indicators(self, atr_periods=None):
        """Add Bollinger Bands and one ATR column per look-back period.

        Args:
            atr_periods: iterable of ATR look-back periods. Each period ``p``
                produces a column named ``ATR_<p>``. Defaults to
                ``DEFAULT_ATR_PERIODS`` (1, 2, 5, 10, 20).

        Reads ``Close``, ``High`` and ``Low``.
        Writes ``Upper_BB``, ``Middle_BB``, ``Lower_BB`` and the ``ATR_<p>`` columns.
        """
        self._require_columns('Close', 'High', 'Low')
        if atr_periods is None:
            atr_periods = self.DEFAULT_ATR_PERIODS
        high, low, close = self.data['High'], self.data['Low'], self.data['Close']

        self.data['Upper_BB'], self.data['Middle_BB'], self.data['Lower_BB'] = ta.BBANDS(
            close, timeperiod=20)
        for period in atr_periods:
            self.data[f'ATR_{period}'] = ta.ATR(high, low, close, timeperiod=period)

    def add_trend_indicators(self):
        """Add ``ADX``, ``+DI``, ``-DI`` (14-period) and ``CCI`` (5-period) columns.

        Reads ``High``, ``Low`` and ``Close``.
        """
        self._require_columns('High', 'Low', 'Close')
        high, low, close = self.data['High'], self.data['Low'], self.data['Close']

        self.data['ADX'] = ta.ADX(high, low, close, timeperiod=14)
        self.data['+DI'] = ta.PLUS_DI(high, low, close, timeperiod=14)
        self.data['-DI'] = ta.MINUS_DI(high, low, close, timeperiod=14)
        self.data['CCI'] = ta.CCI(high, low, close, timeperiod=5)

    def add_other_indicators(self):
        """Add log-return and running-average price columns.

        Reads ``Close``, ``Volume``, ``High`` and ``Low``. Writes:

        * ``DLR``  - natural log of ``Close`` over the previous row's ``Close``
          (NaN on the first row).
        * ``TWAP`` - expanding mean of ``Close`` over rows. Each row has equal
          weight; elapsed time between rows is not used.
        * ``VWAP`` - cumulative ``Volume``-weighted mean of the
          ``(High + Low) / 2`` midpoint.
        """
        self._require_columns('Close', 'Volume', 'High', 'Low')
        close, volume = self.data['Close'], self.data['Volume']
        high, low = self.data['High'], self.data['Low']

        self.data['DLR'] = np.log(close / close.shift(1))
        self.data['TWAP'] = close.expanding().mean()
        self.data['VWAP'] = (volume * (high + low) / 2).cumsum() / volume.cumsum()

    def add_all_indicators(self):
        """Run every ``add_*`` group and return ``self.data`` (the same object, mutated)."""
        self.add_momentum_indicators()
        self.add_volume_indicators()
        self.add_volatility_indicators()
        self.add_trend_indicators()
        self.add_other_indicators()
        return self.data

In [9]:
data.head()

Unnamed: 0,ts_recv,ts_event,side,price,size,flags,ts_in_delta,sequence,bid_px_00,ask_px_00,...,ATR_5,ATR_10,ATR_20,ADX,+DI,-DI,CCI,DLR,TWAP,VWAP
0,1688371200660869841,1688371200660704717,B,194.12,1,130,165124,303634,193.63,194.12,...,,,,,,,,,194.12,193.875
1,1688371201201402566,1688371201201237816,B,194.11,2,130,164750,304724,193.9,194.11,...,,,,,,,,-5.2e-05,194.115,193.961667
2,1688371201233688992,1688371201233524761,B,194.11,8,130,164231,304850,193.9,194.11,...,,,,,,,,0.0,194.113333,193.993182
3,1688371201317556361,1688371201317392163,B,194.11,2,130,164198,305101,193.9,194.11,...,,,,,,,,0.0,194.1125,193.995
4,1688371201478520666,1688371201478356044,B,194.0,7,130,164622,306430,193.9,194.0,...,,,,,,,-74.468085,-0.000567,194.09,193.97925


In [13]:
# Interpret the integer ts_recv values as nanoseconds since the Unix epoch and
# write them back onto the same frame as pandas datetimes (no timezone attached).
recv_time = pd.to_datetime(data['ts_recv'], unit='ns')
data['ts_recv'] = recv_time

data.head()

Unnamed: 0,ts_recv,ts_event,side,price,size,flags,ts_in_delta,sequence,bid_px_00,ask_px_00,...,ATR_5,ATR_10,ATR_20,ADX,+DI,-DI,CCI,DLR,TWAP,VWAP
0,2023-07-03 08:00:00.660869841,1688371200660704717,B,194.12,1,130,165124,303634,193.63,194.12,...,,,,,,,,,194.12,193.875
1,2023-07-03 08:00:01.201402566,1688371201201237816,B,194.11,2,130,164750,304724,193.9,194.11,...,,,,,,,,-5.2e-05,194.115,193.961667
2,2023-07-03 08:00:01.233688992,1688371201233524761,B,194.11,8,130,164231,304850,193.9,194.11,...,,,,,,,,0.0,194.113333,193.993182
3,2023-07-03 08:00:01.317556361,1688371201317392163,B,194.11,2,130,164198,305101,193.9,194.11,...,,,,,,,,0.0,194.1125,193.995
4,2023-07-03 08:00:01.478520666,1688371201478356044,B,194.0,7,130,164622,306430,193.9,194.0,...,,,,,,,-74.468085,-0.000567,194.09,193.97925


In [7]:
# Compute every indicator group on the working frame.
# TechnicalIndicators keeps `data` by reference and add_all_indicators() returns
# that same object, so `df_with_indicators` and `data` are one DataFrame: the
# indicator columns appear on `data` as well.
ti = TechnicalIndicators(data)
df_with_indicators = ti.add_all_indicators()
df_with_indicators

Unnamed: 0,ts_recv,ts_event,side,price,size,flags,ts_in_delta,sequence,bid_px_00,ask_px_00,...,ATR_5,ATR_10,ATR_20,ADX,+DI,-DI,CCI,DLR,TWAP,VWAP
0,1688371200660869841,1688371200660704717,B,194.12,1,130,165124,303634,193.63,194.12,...,,,,,,,,,194.120000,193.875000
1,1688371201201402566,1688371201201237816,B,194.11,2,130,164750,304724,193.90,194.11,...,,,,,,,,-0.000052,194.115000,193.961667
2,1688371201233688992,1688371201233524761,B,194.11,8,130,164231,304850,193.90,194.11,...,,,,,,,,0.000000,194.113333,193.993182
3,1688371201317556361,1688371201317392163,B,194.11,2,130,164198,305101,193.90,194.11,...,,,,,,,,0.000000,194.112500,193.995000
4,1688371201478520666,1688371201478356044,B,194.00,7,130,164622,306430,193.90,194.00,...,,,,,,,-74.468085,-0.000567,194.090000,193.979250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59266,1688417954514485218,1688417954514320323,B,192.44,6,130,164895,252532002,192.40,192.44,...,0.031258,0.028636,0.027592,15.474528,6.612534,3.765166,118.055556,0.000208,192.701135,192.722211
59267,1688417961020718430,1688417961020553920,B,192.44,1,130,164510,252532102,192.40,192.44,...,0.033006,0.029772,0.028212,16.329018,5.953254,3.389772,83.333333,0.000000,192.701131,192.722211
59268,1688417973297905504,1688417973297741235,A,192.40,5,130,164269,252532347,192.40,192.46,...,0.038405,0.032795,0.029802,19.013869,9.751295,2.919558,12.820513,-0.000208,192.701125,192.722211
59269,1688417996889779362,1688417996889614660,B,192.45,3,130,164702,252532944,192.40,192.45,...,0.040724,0.034515,0.030812,21.506945,8.671762,2.596344,100.000000,0.000260,192.701121,192.722211


In [8]:
# Number of leading rows left out of the feature frame.
# NOTE(review): 35 presumably skips the rows where the longer-window indicators
# are still NaN (warm-up) - confirm against the indicator periods in use.
WARMUP_ROWS = 35

# Positional slice: keep everything from row WARMUP_ROWS onward.
market_features_df = df_with_indicators.iloc[WARMUP_ROWS:]
market_features_df

Unnamed: 0,ts_recv,ts_event,side,price,size,flags,ts_in_delta,sequence,bid_px_00,ask_px_00,...,ATR_5,ATR_10,ATR_20,ADX,+DI,-DI,CCI,DLR,TWAP,VWAP
35,1688371214386057385,1688371214385893078,N,194.05,50,130,164307,326232,194.0,194.30,...,0.098615,0.075141,0.072403,97.257397,30.435801,0.196362,166.666667,0.000000,194.020000,194.021894
36,1688371214386063777,1688371214385899379,N,194.05,50,130,164398,326233,194.0,194.30,...,0.138892,0.097627,0.083783,97.361721,22.989295,0.148320,83.333333,0.000000,194.020811,194.025188
37,1688371215804852019,1688371215804687301,B,194.21,10,130,164718,328131,194.0,194.21,...,0.153114,0.108864,0.090094,97.458593,19.409454,0.125224,79.268293,0.000824,194.025789,194.025596
38,1688371219671476629,1688371219671312224,N,194.14,10,130,164405,331406,194.0,194.16,...,0.164491,0.118978,0.096089,97.548546,16.622008,0.107240,-3.205128,-0.000360,194.028718,194.025873
39,1688371223368835585,1688371223368671235,B,194.13,10,130,164350,334235,194.0,194.13,...,0.159593,0.121080,0.098285,97.632074,15.068361,0.097216,-113.095238,-0.000052,194.031250,194.026071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59266,1688417954514485218,1688417954514320323,B,192.44,6,130,164895,252532002,192.4,192.44,...,0.031258,0.028636,0.027592,15.474528,6.612534,3.765166,118.055556,0.000208,192.701135,192.722211
59267,1688417961020718430,1688417961020553920,B,192.44,1,130,164510,252532102,192.4,192.44,...,0.033006,0.029772,0.028212,16.329018,5.953254,3.389772,83.333333,0.000000,192.701131,192.722211
59268,1688417973297905504,1688417973297741235,A,192.40,5,130,164269,252532347,192.4,192.46,...,0.038405,0.032795,0.029802,19.013869,9.751295,2.919558,12.820513,-0.000208,192.701125,192.722211
59269,1688417996889779362,1688417996889614660,B,192.45,3,130,164702,252532944,192.4,192.45,...,0.040724,0.034515,0.030812,21.506945,8.671762,2.596344,100.000000,0.000260,192.701121,192.722211
