In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# load the first n rows from data/lob/full_lob.csv
df = pd.read_csv('data/lob/full_lob_head.csv', index_col=0)
df

Unnamed: 0,Timestamp,Exchange,Order Type,Price,Quantity,Date
25781997,0.000,Exch0,Bid,,,2025-01-02
67020690,0.000,Exch0,Ask,,,2025-01-02
25781998,0.279,Exch0,Bid,1.0,6.0,2025-01-02
67020691,0.279,Exch0,Ask,,,2025-01-02
25781999,1.333,Exch0,Bid,1.0,6.0,2025-01-02
...,...,...,...,...,...,...
26123018,29573.938,Exch0,Bid,292.0,2.0,2025-01-02
26123018,29573.938,Exch0,Bid,291.0,9.0,2025-01-02
26123018,29573.938,Exch0,Bid,288.0,3.0,2025-01-02
26123018,29573.938,Exch0,Bid,287.0,4.0,2025-01-02


In [3]:
# Ensure 'Timestamp' is treated as a number
df['Timestamp'] = pd.to_numeric(df['Timestamp'])
# Sort the DataFrame by date and Timestamp to ensure it's in chronological order
df.sort_values(by=['Date','Timestamp'], inplace=True)


In [4]:
# Pivot the 'Order Type' column
pivot_df = df.pivot_table(index=['Timestamp', 'Exchange', 'Date'], 
                          columns='Order Type', 
                          values=['Price', 'Quantity'])  # You can change 'mean' to another aggregation function if needed

# The resulting 'pivot_df' will have multi-level column headers ('Price' and 'Quantity' under 'Ask' and 'Bid')
# Flatten the multi-level column headers
pivot_df.columns = [' '.join(col).strip() for col in pivot_df.columns.values]

In [5]:
# Step 1 & 2: Feature Engineering - Calculate mid-price and create lag features
pivot_df['Mid_Price'] = (pivot_df['Price Ask'] + pivot_df['Price Bid']) / 2
pivot_df['Mid_Price_Future'] = pivot_df['Mid_Price'].shift(-1)  # Future mid-price as target
pivot_df = pivot_df.dropna()  # Drop rows with NaN values created by shifting

In [6]:
pivot_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Price Ask,Price Bid,Quantity Ask,Quantity Bid,Mid_Price,Mid_Price_Future
Timestamp,Exchange,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.333,Exch0,2025-01-02,800.0,1.000000,1.0,6.000000,400.500000,400.000000
1.581,Exch0,2025-01-02,799.0,1.000000,1.0,6.000000,400.000000,399.500000
1.643,Exch0,2025-01-02,798.0,1.000000,1.0,6.000000,399.500000,464.500000
1.736,Exch0,2025-01-02,798.0,131.000000,1.0,3.500000,464.500000,464.000000
1.984,Exch0,2025-01-02,797.0,131.000000,1.0,3.500000,464.000000,349.250000
...,...,...,...,...,...,...,...,...
29573.721,Exch0,2025-01-02,406.0,255.181818,2.8,3.727273,330.590909,343.490909
29573.752,Exch0,2025-01-02,431.8,255.181818,2.8,3.727273,343.490909,370.190909
29573.783,Exch0,2025-01-02,485.2,255.181818,2.8,3.727273,370.190909,393.590909
29573.814,Exch0,2025-01-02,532.0,255.181818,2.8,3.727273,393.590909,384.490909


In [7]:
# Step 3: Splitting the dataset
X = pivot_df.drop(['Mid_Price_Future'], axis=1)  # Features
y = pivot_df['Mid_Price_Future']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Normalizing/Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Model Selection
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Step 6: Training
model.fit(X_train_scaled, y_train)

# Step 7: Evaluation
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Calculate R^2 Score
r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2}')

Mean Squared Error: 148.19743746307472
R^2 Score: 0.8679612791722835
