In [13]:
import ast

import pandas as pd

In [20]:
# Load the limit order book (LOB) snapshots and the trade tape from CSV.
lob_csv_path = 'lob_short.csv'
tape_csv_path = 'tape_short.csv'


def _parse_book_side(cell):
    """Turn one serialized Bid/Ask CSV cell into a Python list.

    Args:
        cell: Raw value pandas read for the cell. Usually a string such as
            '[[1, 6]]', but it can be a non-string (e.g. the NaN pandas
            produces for an empty field).

    Returns:
        The parsed Python literal when ``cell`` is a string starting with
        '[', otherwise an empty list.

    Raises:
        ValueError, SyntaxError: if the text starts with '[' but is not a
            valid Python literal.
    """
    if isinstance(cell, str) and cell.startswith('['):
        # SECURITY: this used to be eval(), which would execute arbitrary
        # code embedded in the CSV. literal_eval accepts literals only.
        return ast.literal_eval(cell)
    return []


# read_csv consumes the first line of the LOB file as a header row; those
# names are then replaced with the ones used throughout the notebook.
lob_df = pd.read_csv(lob_csv_path)
lob_df.columns = ['Timestamp', 'Bid', 'Ask']
lob_df['Bid'] = lob_df['Bid'].apply(_parse_book_side)
lob_df['Ask'] = lob_df['Ask'].apply(_parse_book_side)

# header=None makes pandas treat the first line of the tape file as data
# rather than as column names.
tape_df = pd.read_csv(tape_csv_path, header=None)
tape_df.columns = ['Timestamp', 'Price', 'Volume']
tape_df = tape_df.astype({'Price': int, 'Volume': int})

# Display the first few rows of each DataFrame to confirm correct loading
lob_df.head(), tape_df.head()


(   Timestamp       Bid         Ask
 0      0.000        []          []
 1      0.279  [[1, 6]]          []
 2      1.333  [[1, 6]]  [[800, 1]]
 3      1.581  [[1, 6]]  [[799, 1]]
 4      1.643  [[1, 6]]  [[798, 1]],
    Timestamp  Price  Volume
 0     10.881    267       1
 1     11.067    269       1
 2     11.222    267       2
 3     12.338    270       2
 4     13.733    267       3)

In [22]:
def _merge_tape_into_lob(lob_df, tape_df):
    """Return the LOB snapshots plus one synthetic row per tape print.

    For every tape row whose timestamp does not already appear in
    ``lob_df``, a new row is created at that timestamp. Its Bid and Ask
    lists are the ``[price, volume]`` pair of the trade followed by the Bid
    and Ask lists of the most recent LOB snapshot strictly before the trade.
    Tape rows whose timestamp already exists in ``lob_df`` are skipped.

    Args:
        lob_df: DataFrame with columns 'Timestamp', 'Bid', 'Ask', where
            'Bid' and 'Ask' hold Python lists. It is not modified.
        tape_df: DataFrame with columns 'Timestamp', 'Price', 'Volume'.

    Returns:
        A new DataFrame with columns 'Timestamp', 'Bid', 'Ask', sorted by
        'Timestamp' with a fresh 0..n-1 index.
    """
    # Look snapshots up in the ORIGINAL book, ordered by time. Searching the
    # growing merged frame would return a previously inserted tape row
    # instead of the latest real snapshot.
    lob_sorted = lob_df.sort_values(by='Timestamp', kind='stable')
    snapshot_times = lob_sorted['Timestamp'].to_numpy()
    snapshot_bids = lob_sorted['Bid'].tolist()
    snapshot_asks = lob_sorted['Ask'].tolist()
    known_timestamps = set(lob_df['Timestamp'].tolist())

    # Column-wise tolist() keeps each column's own dtype. iterrows() would
    # upcast the integer Price/Volume to float64 alongside the timestamp.
    trades = zip(
        tape_df['Timestamp'].tolist(),
        tape_df['Price'].tolist(),
        tape_df['Volume'].tolist(),
    )

    new_rows = []
    for trade_time, price, volume in trades:
        if trade_time in known_timestamps:
            continue

        # side='left' -> every snapshot before `pos` is strictly earlier.
        pos = snapshot_times.searchsorted(trade_time, side='left')
        if pos > 0:
            prev_bid = snapshot_bids[pos - 1]
            prev_ask = snapshot_asks[pos - 1]
        else:
            # The trade precedes every snapshot: start from an empty book
            # instead of raising IndexError.
            prev_bid, prev_ask = [], []

        new_rows.append({
            'Timestamp': trade_time,
            'Bid': [[price, volume]] + prev_bid,
            'Ask': [[price, volume]] + prev_ask,
        })

    if not new_rows:
        merged = lob_df.copy()
    else:
        # Build the extra rows once; concatenating inside the loop is
        # quadratic in the number of trades.
        merged = pd.concat([lob_df, pd.DataFrame(new_rows)], ignore_index=True)

    # A stable sort keeps the input order of rows sharing a timestamp.
    return merged.sort_values(by='Timestamp', kind='stable').reset_index(drop=True)


merged_lob_df = _merge_tape_into_lob(lob_df, tape_df)

# output to csv
merged_lob_df.to_csv('merged_lob.csv', index=False)

# Keep the preview as the cell's final expression so the notebook renders it.
merged_lob_df.tail()