# High frequency market data: could you guess the stock?

## Import the data and libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Notebook-wide display setting: None removes the cap on how many columns
# pandas shows when a DataFrame is rendered.
pd.options.display.max_columns = None

In [78]:
# Read the three CSV files in a single pass, in this order:
#   X        <- data/X_train.csv
#   y        <- data/y_train.csv
#   X_submit <- data/X_test.csv
# (presumably training features, training targets and the features to
# predict on for submission -- confirm against the data description).
X, y, X_submit = map(
    pd.read_csv,
    ("data/X_train.csv", "data/y_train.csv", "data/X_test.csv"),
)

Convert the categorical features (`action`, `side`, `venue`, `order_id`) to the pandas `category` dtype, in both the training and the submission data, to reduce memory usage and speed up later operations.

In [81]:
# Cast each discrete-valued column of X to the pandas "category" dtype,
# one column at a time so the rest of the frame is left untouched.
for column_name in ("action", "side", "venue", "order_id"):
    X[column_name] = X[column_name].astype("category")

In [82]:
# Cast the same four discrete-valued columns of X_submit to the pandas
# "category" dtype. Categories are inferred from X_submit's own values.
for column_name in ("action", "side", "venue", "order_id"):
    X_submit[column_name] = X_submit[column_name].astype("category")

## Inspect the data

In [83]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
X.info()

# Last expression of the cell: display ten randomly drawn rows of X.
X.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16080000 entries, 0 to 16079999
Data columns (total 12 columns):
 #   Column    Dtype   
---  ------    -----   
 0   obs_id    int64   
 1   venue     category
 2   order_id  category
 3   action    category
 4   side      category
 5   price     float64 
 6   bid       float64 
 7   ask       float64 
 8   bid_size  int64   
 9   ask_size  int64   
 10  trade     bool    
 11  flux      int64   
dtypes: bool(1), category(4), float64(3), int64(4)
memory usage: 935.4 MB
None


Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
15059524,150595,4,20,A,A,0.02,0.0,0.02,21,519,False,100
3277453,32774,4,23,D,A,0.03,0.0,0.02,205,100,False,-50
12583449,125834,4,38,A,A,0.24,0.0,0.19,175,1,False,100
15060224,150602,2,16,A,B,-0.01,0.0,0.14,2,355,False,100
4674884,46748,0,48,D,B,-0.02,-0.02,0.02,1700,200,False,-100
9705729,97057,4,28,D,A,0.01,0.0,0.01,501,69,False,-6
11833380,118333,5,27,D,B,0.0,0.01,0.12,200,135,False,-100
15592448,155924,1,33,D,A,0.1,0.02,0.1,100,100,False,-100
1873293,18732,2,76,A,A,0.09,0.03,0.05,5,100,False,3
7301953,73019,4,35,A,B,-0.57,0.02,0.13,20,20,False,100


First check for null entries:

In [7]:
# Count missing values per column (isna is the same check as isnull).
X.isna().sum()

obs_id      0
venue       0
order_id    0
action      0
side        0
price       0
bid         0
ask         0
bid_size    0
ask_size    0
trade       0
flux        0
dtype: int64

In [8]:
# Compute summary statistics of the numeric columns once, then print them
# with the display precision temporarily set to 2 so the table stays compact.
# The option is restored when the `with` block exits.
summary_stats = X.describe()
with pd.option_context("display.precision", 2):
    print(summary_stats)

         obs_id     price       bid       ask  bid_size  ask_size      flux
count  1.61e+07  1.61e+07  1.61e+07  1.61e+07  1.61e+07  1.61e+07  1.61e+07
mean   8.04e+04  8.81e-01  1.71e-04  5.95e-01  4.30e+02  4.29e+02  2.71e-02
std    4.64e+04  2.18e+02  3.66e-02  1.11e+01  7.40e+02  7.27e+02  1.46e+02
min    0.00e+00 -5.70e+02 -7.60e-01 -3.81e+02 -2.00e+00  1.00e+00 -5.00e+04
25%    4.02e+04 -2.00e-02  0.00e+00  1.00e-02  1.00e+02  1.00e+02 -1.00e+02
50%    8.04e+04  1.00e-02  0.00e+00  4.00e-02  2.23e+02  2.21e+02 -1.00e+00
75%    1.21e+05  9.00e-02  0.00e+00  8.00e-02  5.01e+02  5.00e+02  1.00e+02
max    1.61e+05  2.00e+05  1.62e+01  2.68e+02  8.77e+04  1.50e+05  3.13e+04
