# Imports

In [1]:
import csv
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy.optimize as sopt
import scipy.stats as sstats
import seaborn as sns
# import cvxopt # <- installation via conda recommended
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Data loading

BID = BUY, ASK = SELL

We will be using LOB (Limit Order Book) data from ___ market, September 2013.
Every row of our data represents all active ask and bid orders at some moment in time. A row can be described as follows:

date time 'BID' $p_{b1}$ $w_{b1}$ $p_{b2}$ $w_{b2}$ ... $p_{bn}$ $w_{bn}$ 'ASK' $p_{a1}$ $w_{a1}$ $p_{a2}$ $w_{a2}$ ... $p_{am}$ $w_{am}$,
where $p_b$, $w_b$ are the price and size of a bid order and $p_a$, $w_a$ are the price and size of an ask order. Prices $p_x$ are sorted in ascending order.

LOB data are often represented as 3-element tuples $(p_x,w_x,t_x)$, where $p_x$, $w_x$, $t_x$ represent the price, size and time of the $x$-th order, and $w_x$ is greater than zero for an ask order.

In our case it will be better to represent the data as a list in which every element is a tuple of the bid and ask order lists. The bid and ask lists consist of $(p_x,w_x)$ tuples, and $w_x > 0$ for all orders.

We consider orders from $8:30$ to $16:30$ to eliminate abnormal trading behaviour that can occur shortly after the opening auction or shortly before the closing auction.





In [2]:
def _parse_levels(fields):
    """Convert a flat ``[p1, w1, p2, w2, ...]`` list of strings into ``[(p, w), ...]`` floats."""
    if len(fields) % 2:
        raise ValueError(
            "expected price/size pairs, got an odd number of fields: %r" % (fields,))
    values = [float(field) for field in fields]
    return list(zip(values[0::2], values[1::2]))


def load_data(path,start_time=83000000,stop_time=163000000):
    """Load limit-order-book snapshots from a tab-separated file.

    Every non-empty row is expected to have the layout
    ``"<date> <time>"  <marker>  p w  p w ...  ASK  p w  p w ...``:
    the first field holds two integers separated by whitespace, the second
    field is skipped (the notebook describes it as the 'BID' marker), the
    price/size pairs up to the 'ASK' field are bid orders and the pairs
    after it are ask orders.

    Parameters
    ----------
    path : str or path-like
        Location of the snapshot file.
    start_time, stop_time : int
        Inclusive bounds on the integer time stamp; rows outside the window
        are skipped. The notebook describes the defaults as the 8:30-16:30
        trading window (presumably HHMMSSmmm encoding - TODO confirm).

    Returns
    -------
    list of (list, list)
        One ``(BID_list, ASK_list)`` tuple per kept row; both lists contain
        ``(price, size)`` float tuples in file order.

    Raises
    ------
    ValueError
        If a time stamp, price or size cannot be parsed, or if a side does
        not consist of complete price/size pairs.
    """
    snapshots = []
    with open(path,newline='') as file:
        for row in csv.reader(file,delimiter='\t'):
            # csv.reader yields an empty list for blank lines (e.g. a
            # trailing newline); there is nothing to parse in that case.
            if not row:
                continue

            # split() tolerates repeated or surrounding whitespace.
            _date,time = map(int,row[0].split())
            if time < start_time or time > stop_time:
                continue

            # Orders start at index 2. Without an 'ASK' marker every pair
            # is treated as a bid order.
            try:
                ask_marker = row.index('ASK',2)
            except ValueError:
                ask_marker = len(row)

            BID_list = _parse_levels(row[2:ask_marker])
            ASK_list = _parse_levels(row[ask_marker+1:])
            snapshots.append((BID_list,ASK_list))

    return snapshots

In [3]:
# Raw string: the backslashes are Windows path separators, not escape
# sequences ("\P" and "\O" are invalid escapes in a normal string literal).
path = r"C:\Projekt_ED\OrderBookSnapshots.csv"
data = load_data(path)

In [4]:
len(data)

4810

In [5]:
data[0]

([(258.6, 830.0),
  (2100.0, 6774.0),
  (2104.0, 20000.0),
  (2220.0, 100.0),
  (2225.0, 1335.0),
  (2250.0, 1300.0),
  (2300.0, 2697.0),
  (2336.5, 50.0),
  (2350.0, 581.0),
  (2370.0, 710.0),
  (2400.0, 5503.0),
  (2450.0, 502.0),
  (2485.0, 330.0),
  (2493.0, 697.0),
  (2500.0, 9167.0),
  (2515.0, 272.0),
  (2525.0, 1000.0),
  (2540.0, 120.0),
  (2550.0, 2000.0),
  (2569.0, 2750.0),
  (2575.0, 250.0),
  (2585.0, 330.0),
  (2590.0, 519.0),
  (2595.0, 1000.0),
  (2597.0, 30.0),
  (2600.0, 8823.0),
  (2607.0, 3518.0),
  (2610.0, 976.0),
  (2636.0, 325.0),
  (2649.0, 250.0),
  (2649.5, 100.0),
  (2650.0, 1103.0),
  (2651.0, 2250.0),
  (2655.0, 261.0),
  (2665.0, 400.0),
  (2670.0, 300.0),
  (2686.0, 208.0),
  (2690.0, 820.0),
  (2700.0, 3134.0),
  (2708.0, 75.0),
  (2710.0, 344.0),
  (2715.0, 500.0),
  (2726.0, 77.0),
  (2731.0, 150.0),
  (2750.0, 3277.0),
  (2770.0, 400.0),
  (2785.0, 155.0),
  (2787.0, 200.0),
  (2800.0, 10011.0),
  (2805.0, 1959.0),
  (2810.0, 4207.0),
  (2820.0, 101

At a given time $t$, the bid price $b(t)$ is the highest stated price among active buy orders,  
<center>$b(t) = \max_{x \in BIDlist(t)} p_x $</center>  
and the ask price $a(t)$ is the lowest stated price among active sell orders,  
<center>$a(t) = \min_{x \in ASKlist(t)} p_x $</center>  
The mid price at time $t$ is  
<center>$m(t) = \frac{a(t)+b(t)}{2} $</center>  
  
The bid size $n_b(t)$ is the total size of active buy orders with price equal to the bid price  
<center>$n_b(t) = \sum_{x \in BIDlist(t) \mid p_x = b(t)} w_x $</center>  
and the ask size $n_a(t)$ is the total size of active sell orders with price equal to the ask price  
<center>$n_a(t) = \sum_{x \in ASKlist(t) \mid p_x = a(t)} w_x $</center>  
  
At a given time $t$, the queue imbalance $I(t)$ is the normalized difference between $n_b(t)$ and $n_a(t)$  
<center>$I(t) = \frac{n_b(t) - n_a(t)}{n_b(t) + n_a(t)} $</center>  

In [6]:
def bid_price(data,t):
    """Return the bid price b(t) of snapshot ``t``.

    The price is read from the last entry of the snapshot's bid list; the
    notebook states that prices are stored in ascending order, which makes
    that entry the highest bid.
    """
    bid_orders = data[t][0]
    best_order = bid_orders[-1]
    return best_order[0]

In [7]:
def ask_price(data,t):
    """Return the ask price a(t) of snapshot ``t``.

    The price is read from the first entry of the snapshot's ask list; the
    notebook states that prices are stored in ascending order, which makes
    that entry the lowest ask.
    """
    ask_orders = data[t][1]
    best_order = ask_orders[0]
    return best_order[0]

In [8]:
def mid_price(data,t):
    """Return the mid price m(t): the average of the bid and ask prices of snapshot ``t``."""
    best_bid = bid_price(data,t)
    best_ask = ask_price(data,t)
    return (best_bid + best_ask)/2

In [9]:
def bid_size(data,t):
    """Return the bid size n_b(t): the total size quoted at the bid price.

    The bid price is taken from the last entry of the bid list (the same
    convention ``bid_price`` uses). The sizes of *all* bid entries quoted at
    that price are summed, matching the notebook's definition of n_b(t);
    when the price occurs only once this equals the size of the last entry.
    """
    bid_orders = data[t][0]
    best_price = bid_orders[-1][0]
    return sum(size for price, size in bid_orders if price == best_price)

In [10]:
def ask_size(data,t):
    """Return the ask size n_a(t): the total size quoted at the ask price.

    The ask price is taken from the first entry of the ask list (the same
    convention ``ask_price`` uses). The sizes of *all* ask entries quoted at
    that price are summed, matching the notebook's definition of n_a(t);
    when the price occurs only once this equals the size of the first entry.
    """
    ask_orders = data[t][1]
    best_price = ask_orders[0][0]
    return sum(size for price, size in ask_orders if price == best_price)

In [11]:
def queue_imbalance(data,t):
    """Return the queue imbalance I(t) = (n_b - n_a) / (n_b + n_a) of snapshot ``t``.

    n_b and n_a are obtained from ``bid_size`` and ``ask_size``.
    """
    bid_volume, ask_volume = bid_size(data,t), ask_size(data,t)
    total_volume = bid_volume + ask_volume
    return (bid_volume - ask_volume)/total_volume

In [None]:
def queue_imbalance_k(data,t):
    """Unfinished placeholder for a variant of ``queue_imbalance``.

    The original cell contained only the ``def`` line, which is a syntax
    error when executed. NOTE(review): the name suggests an imbalance over
    several price levels, but the intended definition is not visible in the
    notebook - TODO confirm and implement.

    Raises
    ------
    NotImplementedError
        Always, until the function is implemented.
    """
    raise NotImplementedError("queue_imbalance_k is not implemented yet")

In [12]:
mid_price(data,3)

2997.0

In [41]:
def get_time_and_target(data):
    """Find the mid-price change points and the direction of each change.

    Starting from snapshot 0, the snapshots are scanned in order; whenever
    the mid price differs from the mid price at the most recently recorded
    index, the current index is recorded together with a label: 1 if the
    mid price went up, 0 otherwise.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The recorded indices without the last one, and the labels. Entry
        ``i`` of the labels is the direction of the change that follows
        index ``i`` of the first array, so both arrays have equal length.
    """
    change_points = [0]
    directions = []
    for idx in range(1,len(data)):
        current_mid = mid_price(data,idx)
        reference_mid = mid_price(data,change_points[-1])
        if current_mid == reference_mid:
            continue
        change_points.append(idx)
        directions.append(1 if current_mid > reference_mid else 0)
    return np.array(change_points[:-1]),np.array(directions)

In [42]:
# T[i]: snapshot index at which an observation starts (0 or a mid-price change);
# target[i]: 1 if the next mid-price change after T[i] is upward, else 0.
T,target = get_time_and_target(data)

In [43]:
target

array([1, 0, 0, ..., 1, 0, 1])

In [59]:
# Single feature per observation: the queue imbalance evaluated at each index in T.
X = np.array([queue_imbalance(data,t) for t in T])

In [60]:
# Chronological split (shuffle=False): the first 80% of the observations are
# used for training and the last 20% for testing, so the model is never
# evaluated on data that precedes its training data.
X_train, X_test, y_train, y_test = train_test_split(
         X, target, test_size=0.2, random_state=42,shuffle=False)

In [61]:
# Logistic regression with scikit-learn's default hyper-parameters.
LR = LogisticRegression()

In [62]:
X

array([-0.81956155,  0.87846764,  0.92975207, ..., -0.36411609,
        0.35603996, -0.97839006])

In [63]:
X_train.reshape(-1, 1)

array([[-0.81956155],
       [ 0.87846764],
       [ 0.92975207],
       ...,
       [ 0.53651685],
       [-0.55586987],
       [ 0.76680108]])

In [64]:
# X_train is 1-D; reshape(-1, 1) turns it into a single-column feature matrix for fit().
LR.fit(X_train.reshape(-1, 1),y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
# Predicted labels (0/1) for the held-out observations.
preds = LR.predict(X_test.reshape(-1, 1))

In [66]:
preds

array([1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,

In [67]:
# Test accuracy: fraction of held-out observations whose label was predicted correctly.
np.mean(preds == y_test)

0.5403225806451613

In [69]:
# Predictions on the training portion, for the in-sample accuracy.
preds_train = LR.predict(X_train.reshape(-1,1))

In [70]:
# Training (in-sample) accuracy: fraction of training observations predicted correctly.
np.mean(preds_train == y_train)

0.5452710495963091