# Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
# import cvxopt # <- installation via conda recommended
from collections import defaultdict
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import nltk
import scipy.optimize as sopt
import scipy.stats as sstats
import csv

# Data loading

BID = BUY, ASK = SELL

We will be using LOB (Limit Order Book) data from ___ market, September 2013.
Every row of our data represents all active ask and bid orders at some moment in time. A row can be described as below:

date time 'BID' $p_{b1}$ $w_{b1}$ $p_{b2}$ $w_{b2}$ ... $p_{bn}$ $w_{bn}$ 'ASK' $p_{a1}$ $w_{a1}$ $p_{a2}$ $w_{a2}$ ... $p_{am}$ $w_{am}$,
where $p_b$, $w_b$ are the prices and sizes of bid orders and $p_a$, $w_a$ are the prices and sizes of ask orders. Prices $p_x$ are sorted in ascending order.

LOB data are often represented as 3-element tuples $(p_x,w_x,t_x)$, where $p_x,w_x,t_x$ represent the price, size and time of the $x$-th order, and $w_x$ is greater than zero for an ask order.

In our case it will be better to represent the data as a list in which every element is a tuple of bid and ask order lists. The bid and ask lists consist of $(p_x,w_x)$ tuples, and $w_x > 0$ for all orders.

We consider orders from $8:30$ to $16:30$ to eliminate abnormal trading behaviour that can occur shortly after the opening auction or shortly before closing auction.





In [None]:
def _parse_levels(fields):
    """Convert a flat ``[p1, w1, p2, w2, ...]`` list of strings into ``[(p1, w1), ...]`` float tuples."""
    levels = []
    for start in range(0, len(fields), 2):
        # A dangling price without a size raises ValueError here, which
        # surfaces malformed rows instead of silently dropping data.
        p, w = map(float, fields[start:start + 2])
        levels.append((p, w))
    return levels


def load_data(path,start_time=83000000,stop_time=163000000):
    """Load limit-order-book snapshots from a tab-separated file.

    Each non-empty row is parsed as::

        "<date> <time>" <label> p w p w ... ASK p w p w ...

    The first field holds two space-separated integers (date and time), the
    second field is skipped (it is the side label preceding the bid levels),
    and the remaining fields are price/size pairs, with the literal ``'ASK'``
    separating the bid levels from the ask levels.

    Parameters
    ----------
    path : str
        Location of the tab-separated snapshot file.
    start_time, stop_time : int
        Inclusive bounds on the integer time field; rows outside
        ``[start_time, stop_time]`` are skipped.

    Returns
    -------
    list
        One ``(BID_list, ASK_list)`` tuple per retained row, where each list
        contains ``(price, size)`` float tuples in file order.

    Raises
    ------
    ValueError
        If the date/time field or a price/size pair cannot be parsed.
    """
    X = []
    with open(path,newline='') as file:
        for row in csv.reader(file,delimiter='\t'):
            # csv.reader yields [] for blank lines (e.g. a trailing newline
            # at the end of the file); skip them instead of failing on row[0].
            if not row:
                continue

            # The date is converted too, so a malformed stamp still raises,
            # but only the time is needed for filtering.
            _date, snapshot_time = map(int,row[0].split(' '))
            if snapshot_time < start_time or snapshot_time > stop_time:
                continue

            fields = row[2:]
            try:
                separator = fields.index('ASK')
            except ValueError:
                # No 'ASK' marker: every level belongs to the bid side.
                bid_fields, ask_fields = fields, []
            else:
                bid_fields = fields[:separator]
                ask_fields = fields[separator + 1:]

            X.append((_parse_levels(bid_fields), _parse_levels(ask_fields)))

    return X

In [None]:
# Raw string keeps the Windows backslashes literal instead of relying on
# "\P" and "\O" being unrecognised (and deprecated) escape sequences.
path = r"C:\Projekt_ED\OrderBookSnapshots.csv"
data = load_data(path)

In [None]:
# Number of snapshots returned by load_data (rows inside the time window).
len(data)

In [None]:
# Inspect the first snapshot: a (BID_list, ASK_list) tuple of (price, size) pairs.
data[0]

At a given time $t$, the bid price $b(t)$ is the highest stated price among active buy orders,  
<center>$b(t) = \max_{x \in BIDlist(t)} p_x $</center>  
and the ask price $a(t)$ is the lowest stated price among active sell orders,  
<center>$a(t) = \min_{x \in ASKlist(t)} p_x $</center>  
The mid price at time $t$ is  
<center>$m(t) = \frac{a(t)+b(t)}{2} $</center>  
  
The bid size $n_b(t)$ is the total size of active buy orders with price equal to the bid price  
<center>$n_b(t) = \sum_{x \in BIDlist(t) | p_x = b(t)} w_x $</center>  
and the ask size $n_a(t)$ is the total size of active sell orders with price equal to the ask price  
<center>$n_a(t) = \sum_{x \in ASKlist(t) | p_x = a(t)} w_x $</center>  
  
At a given time $t$, the queue imbalance $I(t)$ is the normalized difference between $n_b(t)$ and $n_a(t)$  
<center>$I(t) = \frac{n_b(t) - n_a(t)}{n_b(t) + n_a(t)} $</center>  

In [None]:
def bid_price(data,t):
    """Return the price of the last bid level of snapshot ``t``.

    ``data[t][0]`` is the bid side of the snapshot; its last entry is taken
    as the best bid, which relies on the levels being stored in ascending
    price order.
    """
    bid_levels = data[t][0]
    best_level = bid_levels[-1]
    return best_level[0]

In [None]:
def ask_price(data,t):
    """Return the price of the first ask level of snapshot ``t``.

    ``data[t][1]`` is the ask side of the snapshot; its first entry is taken
    as the best ask, which relies on the levels being stored in ascending
    price order.
    """
    ask_levels = data[t][1]
    best_level = ask_levels[0]
    return best_level[0]

In [None]:
def mid_price(data,t):
    """Return the mid price of snapshot ``t``: the mean of its bid and ask prices."""
    best_bid = bid_price(data,t)
    best_ask = ask_price(data,t)
    return (best_bid + best_ask)/2

In [None]:
def bid_size(data,t):
    """Return the size stored with the last bid level of snapshot ``t``.

    This is the size paired with the price that ``bid_price`` reports, i.e.
    the second element of the last ``(price, size)`` tuple on the bid side.
    """
    bid_levels = data[t][0]
    best_level = bid_levels[-1]
    return best_level[1]

In [None]:
def ask_size(data,t):
    """Return the size stored with the first ask level of snapshot ``t``.

    This is the size paired with the price that ``ask_price`` reports, i.e.
    the second element of the first ``(price, size)`` tuple on the ask side.
    """
    ask_levels = data[t][1]
    best_level = ask_levels[0]
    return best_level[1]

In [None]:
def queue_imbalance(data,t):
    """Return the queue imbalance of snapshot ``t``.

    Computed as ``(n_b - n_a) / (n_b + n_a)``, where ``n_b`` and ``n_a`` are
    the values returned by ``bid_size`` and ``ask_size`` for the snapshot.
    """
    bid_volume = bid_size(data,t)
    ask_volume = ask_size(data,t)
    difference = bid_volume - ask_volume
    total = bid_volume + ask_volume
    return difference/total

In [None]:
# Spot check: mid price of the snapshot at index 3.
mid_price(data,3)

In [None]:
# Collect the snapshot indices at which the mid price differs from the mid
# price at the most recently recorded index; index 0 is always included.
change_points = [0]
for t in range(1,len(data)):
    if mid_price(data,t) != mid_price(data,change_points[-1]):
        change_points.append(t)
T = np.array(change_points)