# Main Code

In [1]:
import gzip
from collections import defaultdict
import struct
from datetime import time as dt_t
from sys import getsizeof
import time
import pandas as pd
import gzip

In [3]:
# Mutable state shared by the message handlers defined further down.

next_trigger_time = None            # next hourly report time (ns); None while no report is scheduled
order_book = {}                     # order reference number -> {"stock", "price", "shares"}
vwap = {}                           # stock -> {"PV": running sum of price*shares, "V": running share count}
report_book = defaultdict(dict)     # stock -> {report time -> VWAP at that time}

In [4]:
# Initialise constants

INP_FILE_LOCATION = '01302019_NASDAQ_ITCH50.gz'  # Set the path of input zipped binary file here
NS_PER_HOUR = 60*60*10**9  # nanoseconds in one hour; timestamps in this notebook are integer nanoseconds

# Total length in bytes of each message type, INCLUDING the 1-byte message type
# itself (the reading loop reads the type byte and then `events_length[t] - 1`
# further bytes).
events_length = {
    "S": 12,
    "R": 39,
    "H": 25,
    "Y": 20,
    "L": 26,
    "V": 35,
    "W": 12,
    "K": 28,
    "J": 35,
    "h": 21,
    "A": 36, # Add order
    "F": 40, # Add order
    "E": 31, # Order executed
    "C": 36, # Order executed with price
    "X": 23,
    "D": 19,
    "U": 35, # Order replace msg
    "P": 44, # Trade msg
    "Q": 40, # Trade msg
    "B": 19,
    "I": 50,
    "N": 20,
    "O": 48 
}

# For every message type that is actually parsed:
#   "format_str" - struct format of the message body (everything after the type byte)
#   other keys   - index of that field in the tuple returned by struct.unpack
# "time_stmp" points at a 6-byte field that the handlers zero-pad to 8 bytes
# and decode as a big-endian unsigned integer.
msg_formats = {
    "S": {  # System event
        "format_str": ">HH6sc",
        "time_stmp": 2,
        "event_code": 3
    }, 
    "E": {  # Order executed
        "format_str": ">HH6sQIQ",
        "time_stmp": 2,
        "ord_ref_num": 3,
        "shares": 4
    },
    "C": {  # Order executed with price
        "format_str": ">HH6sQIQcI", 
        "time_stmp": 2,
        "ord_ref_num": 3,
        "shares": 4,
        "printable": 6,
        "price": 7
    },
    "P": {  # Non cross for hidden orders
        "format_str": ">HH6sQcI8sIQ",
        "time_stmp": 2,
        "shares": 5,
        "stock": 6,
        "price": 7
    },
    "Q": {  # Cross trades
        "format_str": ">HH6sQ8sIQc",
        "time_stmp": 2,
        "shares": 3,
        "stock": 4,
        "price": 5
    },
    "A": {  # Add Order
        "format_str": ">HH6sQcI8sI",
        "time_stmp": 2,
        "ord_ref_num": 3,
        "shares": 5,
        "stock": 6,
        "price": 7
    },
    "F": {  # Add Order with MPID
        "format_str": ">HH6sQcI8sI4s",
        "time_stmp": 2,
        "ord_ref_num": 3,
        "shares": 5,
        "stock": 6,
        "price": 7
    },
    "U": {  # Replace Order
        "format_str": ">HH6sQQII",
        "time_stmp": 2,
        "old_ord_ref_num": 3,
        "ord_ref_num": 4,
        "shares": 5,
        "price": 6
    },
    "D": {  # Delete Order
        "format_str": ">HH6sQ",
        "time_stmp": 2,
        "ord_ref_num": 3
    },
    "X": {  # Cancel Order (carries the number of shares being cancelled)
        "format_str": ">HH6sQI",
        "time_stmp": 2,
        "ord_ref_num": 3,
        "shares": 4
    },
}


In [14]:
# Defined functions for calculating VWAP and Updating order book 

def calc_vwap(msg_type, data):
    """Fold one execution/trade message into the running per-stock VWAP totals.

    Args:
        msg_type: Message type key into ``msg_formats``; the format must
            define a "shares" field (the main loop passes "E", "C", "P", "Q").
        data: Raw message body (everything after the type byte), matching
            ``msg_formats[msg_type]["format_str"]``.

    Side effects:
        * Emits the pending hourly report (``report_vwap``) and advances
          ``next_trigger_time`` by one hour when the message timestamp has
          reached it.
        * For "E"/"C", reduces the remaining shares of the referenced order in
          ``order_book`` and removes the order once it reaches zero.
        * Adds ``price * shares`` and ``shares`` to ``vwap[stock]``.

    Raises:
        KeyError: if an "E"/"C" message references an order that is not in
            ``order_book``.
    """
    global next_trigger_time

    fmt = msg_formats[msg_type]
    value = struct.unpack(fmt["format_str"], data)
    # The timestamp is a 6-byte field; zero-pad to 8 bytes to decode it as a
    # big-endian unsigned 64-bit integer.
    timestamp = struct.unpack(">Q", b"\x00\x00" + value[fmt["time_stmp"]])[0]
    if next_trigger_time and timestamp >= next_trigger_time:
        report_vwap(next_trigger_time)
        # next_trigger_time is an integer nanosecond count, so it has to be
        # advanced with NS_PER_HOUR (the same way update_order_book does).
        next_trigger_time += NS_PER_HOUR

    shares = value[fmt["shares"]]
    if shares == 0:
        # According to section 1.5.2 of ITCH specification if Cross cannot be
        # conducted then shares is returned as 0
        return None

    if "ord_ref_num" in fmt:
        ord_ref_num = value[fmt["ord_ref_num"]]

    # Messages that do not carry the stock / price themselves inherit them
    # from the order they reference.
    if "stock" in fmt:
        stock = value[fmt["stock"]].decode('ascii').strip()
    else:
        stock = order_book[ord_ref_num]["stock"]

    if "price" in fmt:
        price = value[fmt["price"]] / 10000
    else:
        price = order_book[ord_ref_num]["price"]

    if msg_type in {"E", "C"}:
        # An execution consumes shares of the resting order.
        order_book[ord_ref_num]["shares"] -= shares
        if order_book[ord_ref_num]["shares"] == 0:
            del order_book[ord_ref_num]

    if msg_type == "C" and value[fmt["printable"]].decode('ascii') == "N":
        # According to section 1.4.2 of ITCH specification non-printable
        # should be ignored for volume calculations
        return None

    if stock in vwap:
        vwap[stock]["PV"] += price * shares
        vwap[stock]["V"] += shares
    else:
        vwap[stock] = {
            "PV": price * shares,
            "V": shares
        }
        

def update_order_book(msg_type, data):
    """Apply one order-lifecycle message to ``order_book``.

    Args:
        msg_type: Message type key into ``msg_formats``:
            "A"/"F" add an order, "U" replaces an order, "D" deletes an order,
            "X" cancels part (or all) of an order's shares.
        data: Raw message body (everything after the type byte), matching
            ``msg_formats[msg_type]["format_str"]``.

    Side effects:
        * Emits the pending hourly report (``report_vwap``) and advances
          ``next_trigger_time`` by one hour when the message timestamp has
          reached it.
        * Adds, replaces, shrinks or removes entries of ``order_book``.

    Raises:
        KeyError: if a "D", "X" or "U" message references an order that is
            not in ``order_book``.
    """
    global next_trigger_time

    fmt = msg_formats[msg_type]
    value = struct.unpack(fmt["format_str"], data)
    # The timestamp is a 6-byte field; zero-pad to 8 bytes to decode it as a
    # big-endian unsigned 64-bit integer.
    timestamp = struct.unpack(">Q", b"\x00\x00" + value[fmt["time_stmp"]])[0]
    if next_trigger_time and timestamp >= next_trigger_time:
        report_vwap(next_trigger_time)
        next_trigger_time += NS_PER_HOUR

    ord_ref_num = value[fmt["ord_ref_num"]]

    if msg_type == "D":
        del order_book[ord_ref_num]
        return None

    shares = value[fmt["shares"]]

    if msg_type == "X":
        # A cancel only carries the number of shares to remove (its format has
        # no price field), so it must be handled before the price lookup and
        # must decrement the order's "shares" entry, not the order dict itself.
        order = order_book[ord_ref_num]
        order["shares"] -= shares
        if order["shares"] == 0:
            del order_book[ord_ref_num]
        return None

    price = value[fmt["price"]] / 10000

    if "stock" in fmt:
        stock = value[fmt["stock"]].decode('ascii').strip()
    else:
        # Replace message: the new order inherits the stock of the order it
        # replaces, and the old order leaves the book.
        old_ord_ref_num = value[fmt["old_ord_ref_num"]]
        stock = order_book[old_ord_ref_num]["stock"]
        del order_book[old_ord_ref_num]

    order_book[ord_ref_num] = {
        "stock": stock,
        "price": price,
        "shares": shares
    }


In [15]:
# Functions for converting nanoseconds since midnight to a time of day, displaying information about system events, and recording the VWAP report

def nanoseconds_to_time(nanoseconds):
    """Turn a nanosecond count into a ``datetime.time`` (microsecond precision).

    The sub-microsecond remainder is truncated, not rounded. A count that
    corresponds to 24 hours or more makes the ``time`` constructor raise
    ``ValueError``.
    """
    ns_in_second = 10**9
    secs_in_hour = 60 * 60
    secs_in_minute = 60

    whole_secs = nanoseconds // ns_in_second
    leftover_ns = nanoseconds % ns_in_second

    hrs = whole_secs // secs_in_hour
    secs_past_hour = whole_secs % secs_in_hour
    mins = secs_past_hour // secs_in_minute
    secs = secs_past_hour % secs_in_minute

    # 1 microsecond = 1,000 nanoseconds
    micros = leftover_ns // 1000

    return dt_t(int(hrs), int(mins), int(secs), int(micros))

def display_system_event(msg_type, data):
    """Print a system event message and manage the hourly report schedule.

    Event code "Q" records a report at the event time and schedules the next
    one an hour later; event code "M" records a report and clears the
    schedule. Every event code is echoed with its time of day.
    """
    global next_trigger_time

    layout = msg_formats[msg_type]
    fields = struct.unpack(layout["format_str"], data)
    # 6-byte timestamp, zero-padded to 8 bytes and read as big-endian uint64
    padded_stamp = b"\x00\x00" + fields[layout["time_stmp"]]
    (time,) = struct.unpack(">Q", padded_stamp)
    event_code = fields[layout["event_code"]].decode('ascii')

    if event_code in ("Q", "M"):
        next_trigger_time = time + NS_PER_HOUR if event_code == "Q" else None
        report_vwap(time)

    print(f"===== System Message {event_code} {nanoseconds_to_time(time)} =====")

def report_vwap(time):
    """Record the current VWAP of every stock seen so far under the given time.

    ``time`` is a nanosecond count; it is converted with
    ``nanoseconds_to_time`` and used as the key inside ``report_book[stock]``.
    """
    report_time = nanoseconds_to_time(time)
    print(f"Generating report at {report_time}")
    for ticker, totals in vwap.items():
        # VWAP = sum(price * shares) / sum(shares)
        report_book[ticker][report_time] = totals["PV"] / totals["V"]
    

In [16]:
# Run main code: stream the gzipped feed message by message and dispatch each
# message type to its handler.
#
# NOTE(review): Order Cancel ("X") messages fall through to the skip branch
# below, so partial cancels never reduce the share counts kept in order_book.
# Confirm whether that is intended.

t0 = time.time()
with gzip.open(INP_FILE_LOCATION, mode="rb") as file:
    count = 0
    while True:
        # Skip the 2 bytes that precede every message type byte
        # (presumably a length prefix; confirm against the file format).
        file.seek(2, 1)
        msg_type = file.read(1).decode('ascii')

        if not msg_type:
            print("\n++++++ Reached EOF ++++++")
            break

        # Periodic progress line: current message type plus the size of the
        # two main dictionaries.
        if not count % 10000000:
            print(msg_type)
            print(f"order book, {len(order_book):_}, {getsizeof(order_book):_}")
            print(f"vwap, {len(vwap):_}, {getsizeof(vwap):_}")

        count += 1
        body_len = events_length[msg_type] - 1  # the type byte is already consumed

        if msg_type == "S":
            display_system_event(msg_type, file.read(body_len))
        elif msg_type in ("E", "C", "P", "Q"):
            calc_vwap(msg_type, file.read(body_len))
        elif msg_type in ("A", "F", "U", "D"):
            update_order_book(msg_type, file.read(body_len))
        else:
            # Message types that are not needed are skipped without parsing.
            file.seek(body_len, 1)

t1=time.time()
print(f"\n++++++ Time elasped: {t1-t0:_} sec ++++++")

S
order book, 0, 64
vwap, 0, 64
===== System Message O 03:03:59.687760 =====
===== System Message S 04:00:00.000181 =====
I
order book, 341_133, 20_971_600
vwap, 943, 26_032
Generating report at 09:30:00.000036
===== System Message Q 09:30:00.000036 =====
A
order book, 1_534_346, 167_772_248
vwap, 4_943, 103_856
D
order book, 1_550_363, 167_772_248
vwap, 5_404, 103_856
A
order book, 1_588_572, 167_772_248
vwap, 5_623, 207_616
A
order book, 1_597_338, 167_772_248
vwap, 5_819, 207_616
A
order book, 1_610_294, 167_772_248
vwap, 5_979, 207_616
D
order book, 1_620_179, 167_772_248
vwap, 6_095, 207_616
Generating report at 10:30:00.000036
A
order book, 1_620_147, 167_772_248
vwap, 6_235, 207_616
D
order book, 1_634_257, 167_772_248
vwap, 6_356, 207_616
U
order book, 1_637_653, 167_772_248
vwap, 6_447, 207_616
U
order book, 1_637_399, 167_772_248
vwap, 6_538, 207_616
U
order book, 1_648_515, 167_772_248
vwap, 6_602, 207_616
D
order book, 1_651_108, 167_772_248
vwap, 6_682, 207_616
Generating 

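In this output
```
order book, 1_733_364, 167_772_248
vwap, 7_517, 207_616
```
the first number is the number of items in the dictionary and the second number is the size in bytes reported by `sys.getsizeof` for the dictionary object itself (this does not include the memory used by the keys and values it refers to).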

In [17]:
# Tabulate order_book: one row per order reference number, with the
# stock / price / shares stored for that order as columns.
pd.DataFrame(order_book).T

Unnamed: 0,stock,price,shares
86786,ING,12.07,600
148981,AU,13.89,100
91140,STM,16.05,1900
132627,SAP,102.2,792
132643,SAP,102.19,333
...,...,...,...
317719972,SQQQ,12.83,3000
317729196,SQQQ,12.85,4300
329147265,CAT,129.34,10
317755860,SQQQ,12.89,5500


In [18]:
# One row per stock with its running price*shares total (PV) and share
# count (V), ordered by ascending share count.
pd.DataFrame(vwap).T.sort_values('V')

Unnamed: 0,PV,V
AHL-D,2.313000e+01,1.0
ZDGE,1.940000e+00,1.0
LOWC,8.494000e+01,1.0
NANR,3.133000e+01,1.0
USVM,4.960000e+01,1.0
...,...,...
EEM,6.278562e+08,14797898.0
VALE,2.275395e+08,18251831.0
SPY,4.956125e+09,18596334.0
HMNY,2.350729e+05,21136911.0


In [23]:
# Convert the report book to a pandas DataFrame (one row per stock, one column
# per report time) and save it in CSV format, both in insertion order and
# sorted by the row index.

df = pd.DataFrame(report_book).T
df.to_csv('output.csv')
df.sort_index().to_csv('sorted_output.csv')

In [25]:
df

Unnamed: 0,09:30:00.000036,10:30:00.000036,11:30:00.000036,12:30:00.000036,13:30:00.000036,14:30:00.000036,15:30:00.000036,16:00:00.000113
XLV,88.103750,88.724635,88.703853,88.775650,88.794778,88.902260,88.984446,89.044593
TVIX,45.950857,46.259556,46.175834,46.046122,45.961344,45.462876,45.070423,44.872387
AAPL,162.520038,162.081871,162.190693,162.192105,162.264418,162.565211,162.974779,163.223444
DPW,0.094668,0.093719,0.093449,0.092911,0.092905,0.092856,0.092781,0.092759
UGAZ,38.769293,38.587710,38.622480,38.750595,38.779987,38.684160,38.634464,38.629441
...,...,...,...,...,...,...,...,...
FCO,,,,,,,,7.830000
LEJU,,,,,,,,1.570000
YAO,,,,,,,,30.200000
OBAS,,,,,,,,9.724000


# Sandbox

### VWAP Analysis

In [16]:
vwap

{'XLV     ': {'PV': 211754168.23999915, 'V': 2378065},
 'TVIX    ': {'PV': 148590871.62000084, 'V': 3315087},
 'AAPL    ': {'PV': 2374162743.47502, 'V': 14515529},
 'DPW     ': {'PV': 37707.194800000005, 'V': 406915},
 'UGAZ    ': {'PV': 35626923.585000046, 'V': 922187},
 'HMY     ': {'PV': 778212.52, 'V': 398588},
 'GOLD    ': {'PV': 22044120.79999992, 'V': 1696227},
 'DRD     ': {'PV': 29408.61, 'V': 13460},
 'QCOM    ': {'PV': 270387522.98999995, 'V': 5403884},
 'SPY     ': {'PV': 4956124728.079975, 'V': 18596334},
 'UPRO    ': {'PV': 41177930.435, 'V': 989017},
 'AMD     ': {'PV': 887216970.7500011, 'V': 39866581},
 'TEF     ': {'PV': 1772342.9200000002, 'V': 203815},
 'BX      ': {'PV': 23815655.760000028, 'V': 725381},
 'FB      ': {'PV': 1844045783.2899954, 'V': 12054147},
 'AMZN    ': {'PV': 2980297732.475007, 'V': 1802474},
 'GOOG    ': {'PV': 540689659.5200012, 'V': 499566},
 'ALGN    ': {'PV': 272919895.92500114, 'V': 1218520},
 'OCX     ': {'PV': 3362258.1650000014, 'V': 63

In [17]:
len(vwap)  # number of distinct stocks with at least one recorded trade

8713

In [18]:
# Stocks ranked by traded share count (V), largest first.
sorted(vwap.items(), key=lambda x: x[1]['V'], reverse=True) 

[('AMD     ', {'PV': 887216970.7500011, 'V': 39866581}),
 ('HMNY    ', {'PV': 235072.89759999537, 'V': 21136911}),
 ('SPY     ', {'PV': 4956124728.079975, 'V': 18596334}),
 ('VALE    ', {'PV': 227539450.68000004, 'V': 18251831}),
 ('EEM     ', {'PV': 627856224.1599989, 'V': 14797898}),
 ('AAPL    ', {'PV': 2374162743.47502, 'V': 14515529}),
 ('MSFT    ', {'PV': 1406069485.1449919, 'V': 13347965}),
 ('SIRI    ', {'PV': 74316784.67999983, 'V': 12540032}),
 ('FB      ', {'PV': 1844045783.2899954, 'V': 12054147}),
 ('CZR     ', {'PV': 107218176.64499982, 'V': 11928705}),
 ('T       ', {'PV': 294457183.0300002, 'V': 10031383}),
 ('QQQ     ', {'PV': 1616273808.4999976, 'V': 9803170}),
 ('GE      ', {'PV': 87443838.27499999, 'V': 9584381}),
 ('XLF     ', {'PV': 226054602.1200001, 'V': 8707144}),
 ('BAC     ', {'PV': 251752882.01999998, 'V': 8606374}),
 ('GDX     ', {'PV': 189603172.19000006, 'V': 8571637}),
 ('INTC    ', {'PV': 400552842.2349996, 'V': 8494587}),
 ('CODX    ', {'PV': 24208549.

In [19]:
sum(totals['V'] for totals in vwap.values())  # Total volume of shares traded

1408908286

In [20]:
39866581/1408908286

0.0282960795930758

### Delete Order events need to be handled, as they reduce the size of the order book

In [7]:
# Count how many messages of each type the input file contains.
count = defaultdict(int)
with gzip.open(INP_FILE_LOCATION, mode="rb") as file:
    cnt = 0
    while True:
        # Skip the 2 bytes that precede every message type byte.
        file.seek(2, 1)
        data = file.read(1)
        msg_type = data.decode('ascii')

        # Progress snapshot of the counters every 100 million iterations
        # (the first one is printed before anything has been counted).
        if cnt % 100000000 == 0:
            print(count)
        cnt += 1

        if msg_type == "":
            print("Reached EOF")
            break

        count[msg_type] += 1
        # Skip the rest of the message; the type byte is already consumed.
        file.seek(events_length[msg_type] - 1, 1)


defaultdict(<class 'int'>, {})
defaultdict(<class 'int'>, {'S': 3, 'R': 8714, 'H': 8726, 'Y': 8768, 'L': 193738, 'A': 43762947, 'D': 41720703, 'F': 1051447, 'U': 7840455, 'X': 2149719, 'E': 1804131, 'P': 350754, 'V': 1, 'I': 1052716, 'C': 38458, 'Q': 8715, 'J': 5})
defaultdict(<class 'int'>, {'S': 3, 'R': 8714, 'H': 8753, 'Y': 8792, 'L': 193752, 'A': 89214226, 'D': 85706955, 'F': 1281753, 'U': 14842947, 'X': 3060168, 'E': 3855365, 'P': 673252, 'V': 1, 'I': 1062015, 'C': 74550, 'Q': 8717, 'J': 37})
defaultdict(<class 'int'>, {'S': 3, 'R': 8714, 'H': 8790, 'Y': 8810, 'L': 193753, 'A': 134106673, 'D': 129116146, 'F': 1561441, 'U': 22586813, 'X': 4119882, 'E': 6101759, 'P': 993297, 'V': 1, 'I': 1067417, 'C': 117613, 'Q': 8717, 'J': 55, 'B': 116})
Reached EOF


In [11]:
# Data

from pprint import pprint

pprint(count)

defaultdict(<class 'int'>,
            {'A': 162970455,
             'B': 116,
             'C': 158886,
             'D': 158273361,
             'E': 8096995,
             'F': 1725898,
             'H': 8805,
             'I': 3684511,
             'J': 62,
             'L': 193769,
             'P': 1326184,
             'Q': 17430,
             'R': 8714,
             'S': 6,
             'U': 27222746,
             'V': 1,
             'X': 4669874,
             'Y': 8821})


In [12]:
# Total number of events
sum(count.values())

368366634

In [13]:
# Message types ranked by number of occurrences, most frequent first.
sorted(count.items(), key=lambda x: x[1], reverse=True)

[('A', 162970455),
 ('D', 158273361),
 ('U', 27222746),
 ('E', 8096995),
 ('X', 4669874),
 ('I', 3684511),
 ('F', 1725898),
 ('P', 1326184),
 ('L', 193769),
 ('C', 158886),
 ('Q', 17430),
 ('Y', 8821),
 ('H', 8805),
 ('R', 8714),
 ('B', 116),
 ('J', 62),
 ('S', 6),
 ('V', 1)]

This shows that the number of deleted orders is of the same order of magnitude as the number of added orders, so processing Delete Order events is worthwhile to keep the order book lean.