In [1]:
import gzip
import struct
from datetime import datetime
import pandas as pd
import os
import time
import struct
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import folium
import glob
import random
import scipy.stats as stats
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.distributions.empirical_distribution import ECDF

from pmdarima.arima import auto_arima

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from numpy import array
import keras
import keras.backend as K
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Bidirectional
from keras.layers import Reshape
from keras.layers import Input
from keras.layers import Dropout
from keras.layers import Concatenate
from keras.layers import RepeatVector

from tensorflow.keras.optimizers import SGD
from keras.preprocessing.sequence import TimeseriesGenerator

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

from keras.layers import Flatten
from keras.layers import TimeDistributed
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

import tensorflow as tf
from datetime import datetime
from textwrap import wrap
import itertools

from sklearn.model_selection import train_test_split



In [2]:
class StockDataProcessor:
    def __init__(self):
        self.temp_data = []   # Temporary data storage
        self.time_flag = None  # Flag to hold the current hour
        self.csv_counter = 0   # Counter for the number of CSV files generated

    def read_binary_data(self, size):
        read_data = nasdaq_file.read(size)  # Read binary data of the given size
        return read_data

    def transform_timestamp(self, stamp):
        time_obj = datetime.fromtimestamp(stamp / 1e9)  # Convert timestamp to datetime object
        return time_obj.strftime('%H:%M:%S')  # Format the datetime object to string

    def append_time_data(self, row):
        return str(row['time']) + ':00:00'  # Append ':00:00' to the time data

    def calculate_vwap(self, message):
        parsed_data, hour = self.process_trade_message(message)  # Process the trade message and get the parsed data and hour
        if self.time_flag is None:  # If the time flag is not set
            self.time_flag = hour  # Set the time flag to the current hour
        if self.time_flag != hour:  # If the current hour is different from the time flag
            self.generate_and_save_csv()  # Generate and save CSV with VWAP data
            self.reset_temp_data_and_time_flag(hour)  # Reset temporary data and time flag
        self.temp_data.append(parsed_data)  # Append the parsed data to the temporary data


    def generate_and_save_csv(self):
        df = pd.DataFrame(self.temp_data, columns=['time', 'symbol', 'price', 'volume'])
        df['total_amount'] = df['price'] * df['volume']
        df['time'] = pd.to_datetime(df['time'])
        df = df.groupby([df['time'].dt.hour, df['symbol']]).agg({'total_amount': 'sum', 'volume': 'sum'}).reset_index()
        df['VWA'] = df['total_amount'] / df['volume']
        df.to_excel(f'result{self.csv_counter}.xlsx', index=False)
        self.csv_counter += 1
        print(df)

    def reset_temp_data_and_time_flag(self, hour):
        self.temp_data = []  # Clear the temporary data
        self.time_flag = hour  # Set the time flag to the current hour
    
    def process_trade_message(self, raw_message):
        message_type = b'P'
        unpacked_data = struct.unpack('>4s6sQcI8cIQ', raw_message)

        packed_message = struct.pack('>s4s2s6sQsI8sIQ', message_type, unpacked_data[0], b'\x00\x00', unpacked_data[1],
                                     unpacked_data[2], unpacked_data[3], unpacked_data[4],
                                     b''.join(list(unpacked_data[5:13])), unpacked_data[13], unpacked_data[14])

        processed_data = struct.unpack('>sHHQQsI8sIQ', packed_message)
        processed_data = list(processed_data)

        processed_data[3] = self.transform_timestamp(processed_data[3])
        processed_data[7] = processed_data[7].strip()
        processed_data[8] = float(processed_data[8])
        processed_data[8] = processed_data[8] / 10000

        return [processed_data[3], processed_data[7], processed_data[8], processed_data[6]], processed_data[3].split(':')[0] # Return the data and the hour


In [3]:
message_length_mapping = {
    "S": 11,
    "R": 38,
    "H": 24,
    "Y": 19,
    "L": 25,
    "V": 34,
    "W": 11,
    "K": 27,
    "A": 35,
    "F": 39,
    "E": 30,
    "C": 35,
    "X": 22,
    "D": 18,
    "U": 34,
    "P": 43,
    "Q": 39,
    "B": 18,
    "I": 49,
    "N": 19
}

In [4]:
nasdaq_file = gzip.open(os.path.join('01302019.NASDAQ_ITCH50.gz'), 'rb')
char_read= nasdaq_file.read(1)
stockdataprocessor = StockDataProcessor()

while char_read:
    try:
        char_read = char_read.decode("utf-8")
    except UnicodeDecodeError:
        char_read= nasdaq_file.read(1)
        continue
    
    if char_read not in message_length_mapping:
        char_read= nasdaq_file.read(1)
        continue

    length = message_length_mapping[char_read]

    message = stockdataprocessor.read_binary_data(length)
    if char_read == "P":
        stockdataprocessor.calculate_vwap(message)

    char_read= nasdaq_file.read(1)

nasdaq_file.close()

    time   symbol  total_amount  volume          VWA
0      9  b'AAPL'     180154.08    1108   162.593935
1      9   b'AMD'      82082.68    3938    20.843748
2      9  b'AMZN'      88847.33      55  1615.406000
3      9  b'BABA'     657345.69    4111   159.899219
4      9    b'FB'       7550.92      52   145.210000
5      9   b'GLD'        248.54       2   124.270000
6      9   b'MCD'       7295.60      40   182.390000
7      9  b'NVDA'      26800.00     200   134.000000
8      9   b'PCG'      22963.50    1695    13.547788
9      9   b'SPY'      67408.25     255   264.346078
10     9   b'TME'        318.80      20    15.940000
11     9  b'TQQQ'       9994.14     234    42.710000
12     9  b'UGAZ'      22201.58     573    38.746213
13     9  b'VALE'      34416.00    2900    11.867586
    time    symbol  total_amount  volume          VWA
0     10   b'AAPL'     214871.71    1319   162.905011
1     10   b'ALGN'      19905.60      96   207.350000
2     10    b'AMD'      60678.07    2915   

      time    symbol  total_amount  volume        VWA
0       18      b'A'      61386.96     825  74.408436
1       18     b'AA'      28232.45     975  28.956359
2       18   b'AABA'      64068.91     956  67.017688
3       18    b'AAL'     123500.79    3394  36.387976
4       18    b'AAN'      19731.50     400  49.328750
...    ...       ...           ...     ...        ...
3217    18    b'ZTO'      26861.00    1580  17.000633
3218    18    b'ZTS'      90135.06    1058  85.193819
3219    18   b'ZUMZ'       4028.21     161  25.019938
3220    18    b'ZUO'      11312.49     531  21.304124
3221    18  b'ZVZZT'       6210.90     620  10.017581

[3222 rows x 5 columns]
   time                            symbol  total_amount  volume          VWA
0     8  b'\x00\x00\x00\x00\x00\x13D\x13'  6.434532e+10  152800  421108.1257
      time    symbol  total_amount  volume        VWA
0       18      b'A'    146512.920    1968  74.447622
1       18     b'AA'     34936.560    1203  29.041197
2       18 