Import libraries

In [1]:
import os
import pandas as pd
import re

Define line parsing function

In [2]:
def parse_line(line):
    """Split one raw order-book line into timestamp, exchange, bids and asks.

    The first and last characters of the stripped line are dropped and the
    rest is split on ', ' into at most three fields. Presumably a line looks
    like ``[ts, 'exch', [['bid', [[p, q], ...]], ['ask', [[p, q], ...]]]]``
    (inferred from the slicing and the regexes below; confirm against the
    data files).

    Returns:
        A 4-tuple ``(timestamp, exchange, bid, ask)``: ``timestamp`` is a
        float, ``exchange`` has its surrounding single quotes stripped, and
        ``bid`` / ``ask`` are lists of integer lists (empty when that side
        has no matching entries). When the line yields fewer than three
        fields the result is ``(None, None, [], [])``.

    Raises:
        ValueError: if the timestamp field is not a valid float or a level
            entry is not a valid integer.
    """
    fields = line.strip()[1:-1].split(', ', 2)
    if len(fields) < 3:
        return None, None, [], []

    raw_time, raw_exchange, raw_book = fields
    timestamp = float(raw_time)
    exchange = raw_exchange.strip('\'')
    book_text = raw_book[1:-1]

    def levels(pattern):
        # Only the first regex match is used; no match means that side is empty.
        found = re.findall(pattern, book_text)
        if not found:
            return []
        return [[int(number) for number in level.split(', ')]
                for level in found[0].split('], [')]

    bid = levels(r'\[\'bid\', \[\[(.*?)\]\]\]')
    ask = levels(r'\[\'ask\', \[\[(.*?)\]\]\]')
    return timestamp, exchange, bid, ask

Process file chunk function

In [3]:
def process_file_chunk(file_chunk):
    """Parse each line of ``file_chunk`` and return the parsed ones as row dicts.

    Every line goes through ``parse_line``; results whose timestamp is None
    (the marker ``parse_line`` uses for a line it could not split) are left
    out. Each remaining result becomes a dict with the keys 'Timestamp',
    'Exchange', 'Bid' and 'Ask'.
    """
    parsed_lines = (parse_line(raw_line) for raw_line in file_chunk)
    return [
        {'Timestamp': timestamp, 'Exchange': exchange, 'Bid': bid, 'Ask': ask}
        for timestamp, exchange, bid, ask in parsed_lines
        if timestamp is not None
    ]

Iterative file processing with chunking

In [4]:
def process_files(directory, chunk_size=10000):
    """Yield parsed rows from every ``.txt`` file in ``directory``, chunk by chunk.

    Files are visited in sorted name order so the sequence of yielded rows is
    the same on every run and platform (``os.listdir`` alone returns names in
    arbitrary order). Each file is read line by line; whenever ``chunk_size``
    lines have been collected they are handed to ``process_file_chunk`` and
    the resulting list of row dicts is yielded. Chunks never span two files:
    the lines left over at the end of a file are yielded as a final, shorter
    chunk.

    Args:
        directory: path of the folder to scan (not recursive).
        chunk_size: maximum number of raw lines per yielded chunk.

    Yields:
        The list returned by ``process_file_chunk`` for each chunk of lines.
    """
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            chunk = []
            for line in file:
                chunk.append(line)
                if len(chunk) >= chunk_size:
                    yield process_file_chunk(chunk)
                    chunk = []  # start collecting the next chunk
            if chunk:  # leftover lines at the end of this file
                yield process_file_chunk(chunk)

directory = 'data/lob/'
chunk_size = 5000  # might need to adjust here if it crashes!

# Build one DataFrame per chunk and concatenate a single time at the end.
# Calling pd.concat inside the loop copies every row gathered so far on each
# pass, which makes the load quadratic in the number of chunks.
# (Per-chunk processing, if ever needed, can be applied to each frame here.)
chunk_frames = [
    pd.DataFrame(file_chunk)
    for file_chunk in process_files(directory, chunk_size=chunk_size)
]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# chunk was produced.
df = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()



In [5]:
# Preview the first rows of the combined frame to sanity-check the parsing
df.head()

Unnamed: 0,Timestamp,Exchange,Bid,Ask
0,0.0,Exch0,[],[]
1,0.496,Exch0,"[[63, 4]]",[]
2,0.527,Exch0,"[[115, 4], [63, 4]]",[]
3,0.713,Exch0,"[[115, 2], [63, 4]]",[]
4,0.744,Exch0,"[[115, 2], [70, 4]]",[]


In [1]:
# Writes `df` to full_lob.csv; index=False keeps the row index out of the file.
# NOTE(review): this cell needs `df` from the loading cells above. The recorded
# NameError shows it was run in a kernel where `df` had not been defined yet.
df.to_csv("full_lob.csv", index=False)

NameError: name 'df' is not defined

In [None]:
import os
import pandas as pd
import re

# Row accumulator: the file loop further down appends row dicts here, and the
# list is then turned into `df`.
data = []

def parse_line(line):
    """Split one raw order-book line into timestamp, exchange, bids and asks.

    NOTE: this redefines the ``parse_line`` from the earlier cell with the
    same logic; whichever definition ran last is the one in effect.

    The first and last characters of the stripped line are dropped and the
    rest is split on ', ' into at most three fields. Presumably a line looks
    like ``[ts, 'exch', [['bid', [[p, q], ...]], ['ask', [[p, q], ...]]]]``
    (inferred from the slicing and the regexes below; confirm against the
    data files).

    Returns:
        A 4-tuple ``(timestamp, exchange, bid, ask)``: ``timestamp`` is a
        float, ``exchange`` has its surrounding single quotes stripped, and
        ``bid`` / ``ask`` are lists of integer lists (empty when that side
        has no matching entries). When the line yields fewer than three
        fields the result is ``(None, None, [], [])``.

    Raises:
        ValueError: if the timestamp field is not a valid float or a level
            entry is not a valid integer.
    """
    # trim and split
    fields = line.strip()[1:-1].split(', ', 2)
    if len(fields) < 3:
        # not enough fields: hand back the "unparsed" marker tuple
        return None, None, [], []

    # extracting values
    raw_time, raw_exchange, raw_book = fields
    timestamp = float(raw_time)
    exchange = raw_exchange.strip('\'')
    book_text = raw_book[1:-1]

    def levels(pattern):
        # Only the first regex match is used; no match means that side is empty.
        found = re.findall(pattern, book_text)
        if not found:
            return []
        return [[int(number) for number in level.split(', ')]
                for level in found[0].split('], [')]

    # extracting bid and ask values
    bid = levels(r'\[\'bid\', \[\[(.*?)\]\]\]')
    ask = levels(r'\[\'ask\', \[\[(.*?)\]\]\]')
    return timestamp, exchange, bid, ask

directory = 'data/lob/'

# os.listdir gives no ordering guarantee; sort so rows load in a reproducible order
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(directory, filename), 'r') as file:
        for line in file:
            timestamp, exchange, bid, ask = parse_line(line)
            if timestamp is None:
                # parse_line returns a None timestamp for a line it could not
                # split; skip it (as process_file_chunk does) instead of
                # storing a row with no timestamp or exchange
                continue
            # add data to list, remembering which file the row came from
            data.append({'Timestamp': timestamp, 'Exchange': exchange, 'Bid': bid, 'Ask': ask, 'File': filename})

# create dataframe from list; naming the columns keeps the schema stable even
# when no row was collected
df = pd.DataFrame(data, columns=['Timestamp', 'Exchange', 'Bid', 'Ask', 'File'])

df

Separate bid and ask columns into rows

In [None]:
# Columns to melt: each becomes a value of the new 'Order Type' column
columns_to_melt = ['Bid', 'Ask']

# Melt the specified columns so every input row yields one 'Bid' row and one
# 'Ask' row, with the original list stored under 'Value'.
# Needs a `df` that has a 'File' column (the loading cell that records the
# file name for each row).
melted_df = df.melt(id_vars=['Timestamp', 'Exchange', 'File'], value_vars=columns_to_melt,
                    var_name='Order Type', value_name='Value')

melted_df

KeyError: "The following 'id_vars' are not present in the DataFrame: ['File']"

Reshape df to show one bid or ask per row

In [None]:
# Expand the lists held in 'Value' so each list element gets a row of its own.
# This rebinds `df`, replacing the frame built by the loading cell.
# NOTE(review): explode is expected to repeat the source row's index label on
# every row it produces (no ignore_index here) — confirm nothing downstream
# relies on unique index labels.
df = melted_df.explode('Value')

df

Separate price and quantity from the Value column

In [None]:
# Each 'Value' is expected to be a two-item list after the explode step; any
# non-list entry (presumably the NaN left for a side with no levels — confirm)
# becomes a [None, None] pair so that row gets missing Price and Quantity.
# Padding to two items and naming the columns keeps the result two columns wide
# even when the frame is empty or no row holds a list, a case where building
# the frame from bare [] placeholders gave zero columns and the assignment
# raised "Columns must be same length as key".
value_pairs = [value if isinstance(value, list) else [None, None] for value in df['Value']]
df[['Price', 'Quantity']] = pd.DataFrame(value_pairs, columns=['Price', 'Quantity'], index=df.index)

# 'Value' is redundant once split; reassign rather than mutate in place
df = df.drop(columns=['Value'])
df

Extract date from File column and store it as datetime in Date column

In [None]:
# Pull the first YYYY-MM-DD substring out of each 'File' entry
# (expand=False returns it as a single Series rather than a one-column frame)
file_dates = df['File'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
# Store it as a datetime-typed 'Date' column
df['Date'] = pd.to_datetime(file_dates)
# The file name is not needed once its date has been extracted
df.drop(columns='File', inplace=True)
df

Save to CSV

In [None]:
# Write the final frame to full_lob.csv; index=False keeps the row index out of the file.
# NOTE(review): same output path as the earlier to_csv cell, so whichever runs
# last determines the file's contents.
df.to_csv("full_lob.csv", index=False)