# Baseline Model

Before model selection a simpel model must be run to act as a baseline for compaison of model performance. For this baseline we will use a simple Random Forest Regressor to predic the Mid-Price in 20 ticks time. 

The baseline model will be trained using a simple featureset consiting solely of data from the first level of the Limit Order Book.

### Import Libraries

In [1]:
#import required libraries
from ..utils import aws # used to create aws session and load parquet 
import pandas as pd
import numpy as np
import ast 
# import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


### Load Dataset

Currently this is run using the sample dataset, but going forward will need to be run using the full LOB dataset.

In [2]:
#load sample feature set from s3 to a dask dataframe
samp_lob_ddf = aws.load_s3_file_as_ddf("s3://dsmp-ol2/processed-data/temp_sample_lob_feature_set.parquet")

In [3]:
#compute the dask datafram to a pandas dataframe
df = samp_lob_ddf.compute()

In [4]:
df.head()

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price,Total_Order_Volume,OBV,Total_Volume_Imbalance,Mid_Price_Future,...,Lower_BB,Log_Returns,Realised_Semi_Variance,Squared_Log_Returns,Realised_Volatility,Abs_Log_Returns,Realised_Bipower_Variation,Total_Quadratic_Variation,Jump_Variation,Smoothed_Mid_Price
3,1.581,Exch0,"[[1, 6]]","[[799, 1]]",2025-01-02,400.0,7,-7,0.714286,399.5,...,,-0.001249,,1.560549e-06,,0.001249,,,,361.728571
4,1.643,Exch0,"[[1, 6]]","[[798, 1]]",2025-01-02,399.5,7,-14,0.714286,529.5,...,,-0.001251,,1.564455e-06,,0.001251,0.000556,,,480.685714
5,1.736,Exch0,"[[261, 1], [1, 6]]","[[798, 1]]",2025-01-02,529.5,8,-6,0.75,529.0,...,,0.281719,,0.07936582,,0.281719,0.000972,,,515.571429
6,1.984,Exch0,"[[261, 1], [1, 6]]","[[797, 1]]",2025-01-02,529.0,8,-14,0.75,299.5,...,,-0.000945,,8.925208e-07,,0.000945,0.001262,,,483.014286
7,2.015,Exch0,"[[261, 1], [1, 6]]","[[338, 3], [797, 1]]",2025-01-02,299.5,11,-25,0.272727,279.0,...,,-0.568874,,0.3236176,,0.568874,0.064202,,,353.157143


In [5]:
cols_to_keep = ['Timestamp', 'Mid_Price', 'Total_Order_Volume', 'Total_Volume_Imbalance', 'Bid_Ask_Spread', 'Level_1_Bid_Price', 'Level_1_Bid_Quantity', 'Level_1_Ask_Price', 'Level_1_Ask_Quantity', 'Level_1_Order_Imbalance']

simple_df = df[cols_to_keep]

In [6]:
simple_df

Unnamed: 0,Timestamp,Mid_Price,Total_Order_Volume,Total_Volume_Imbalance,Bid_Ask_Spread,Level_1_Bid_Price,Level_1_Bid_Quantity,Level_1_Ask_Price,Level_1_Ask_Quantity,Level_1_Order_Imbalance
3,1.581,400.0,7,0.714286,798,1,6,799,1,5
4,1.643,399.5,7,0.714286,797,1,6,798,1,5
5,1.736,529.5,8,0.750000,537,261,1,798,1,0
6,1.984,529.0,8,0.750000,536,261,1,797,1,0
7,2.015,299.5,11,0.272727,77,261,1,338,3,-2
...,...,...,...,...,...,...,...,...,...,...
1037929,30599.418,330.5,25,-0.040000,15,323,2,338,1,1
1037930,30599.449,330.5,25,-0.040000,15,323,2,338,1,1
1037931,30599.635,330.5,25,-0.040000,15,323,2,338,1,1
1037932,30599.697,330.5,25,-0.040000,15,323,2,338,1,1


### Create Target Column

The baseline model will be used to predict the future Mid-Price at a single horizon. 

#### Set the Horizon

In [7]:
# The horizon is how far in the future the Mid-Proce is being predicted
horizon = 20

In [8]:
# Assuming 'simple_df' is your DataFrame
simple_df = simple_df.copy()

simple_df['Target'] = simple_df['Mid_Price'].shift(-horizon)

# Drop rows where all values in the 'Target' column are NaN
simple_df.dropna(subset=['Target'], how='all', inplace=True)

simple_df

Unnamed: 0,Timestamp,Mid_Price,Total_Order_Volume,Total_Volume_Imbalance,Bid_Ask_Spread,Level_1_Bid_Price,Level_1_Bid_Quantity,Level_1_Ask_Price,Level_1_Ask_Quantity,Level_1_Order_Imbalance,Target
3,1.581,400.0,7,0.714286,798,1,6,799,1,5,275.0
4,1.643,399.5,7,0.714286,797,1,6,798,1,5,274.0
5,1.736,529.5,8,0.750000,537,261,1,798,1,0,273.5
6,1.984,529.0,8,0.750000,536,261,1,797,1,0,273.5
7,2.015,299.5,11,0.272727,77,261,1,338,3,-2,273.5
...,...,...,...,...,...,...,...,...,...,...,...
1037909,30598.054,324.5,34,0.058824,5,322,2,327,4,-2,330.5
1037910,30598.178,324.5,34,0.058824,5,322,2,327,4,-2,330.5
1037911,30598.240,324.5,34,0.058824,5,322,2,327,4,-2,330.5
1037912,30598.302,331.5,30,0.200000,19,322,2,341,2,0,330.5


In [11]:
def create_sequences(df, seq_length):
    data = df.values  # Convert the entire DataFrame to a NumPy array first
    num_samples = data.shape[0] - seq_length + 1
    X = np.zeros((num_samples, seq_length, data.shape[1] - 1))
    y = np.zeros((num_samples,))

    for i in range(num_samples):
        X[i] = data[i:i + seq_length, :-1]  # all features except the last
        y[i] = data[i + seq_length - 1, -1]  # the last feature (target)
    return X, y

seq_length = 10  # Number of timesteps in each sequence
X, y = create_sequences(simple_df, seq_length)


In [12]:
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]


In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Flatten the data to fit the scaler, then reshape back to the original
X_train_reshaped = X_train.reshape(-1, X_train.shape[2])
X_test_reshaped = X_test.reshape(-1, X_test.shape[2])
X_train_scaled = scaler.fit_transform(X_train_reshaped).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test_reshaped).reshape(X_test.shape)


In [14]:
# Define the Bidirectional LSTM model
import torch
import torch.nn as nn

class BidirectionalLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(BidirectionalLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])
        return x


In [15]:
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)

model = BidirectionalLSTM(input_dim=X_train.shape[2], hidden_dim=50)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()


In [17]:
# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


Epoch 1, Loss: 133.3868865966797
Epoch 2, Loss: 86.40515899658203
Epoch 3, Loss: 339.5355529785156


In [18]:
# Ensure your model is in evaluation mode
model.eval()

# Convert scaled test data to tensors if not already done
# Assuming this has been done: X_test_tensor and y_test_tensor are available

# DataLoader for the test set
test_data = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# No gradient computation is needed for evaluation
with torch.no_grad():
    test_predictions = []
    test_targets = []
    for inputs, targets in test_loader:
        outputs = model(inputs)
        test_predictions.extend(outputs.view(-1).tolist())
        test_targets.extend(targets.view(-1).tolist())

# Convert predictions and actual values to tensors for loss computation
test_predictions_tensor = torch.tensor(test_predictions)
test_targets_tensor = torch.tensor(test_targets)

# Calculate MSE and RMSE
mse = torch.nn.functional.mse_loss(test_predictions_tensor, test_targets_tensor)
rmse = torch.sqrt(mse)

print(f'Test MSE: {mse.item()}')
print(f'Test RMSE: {rmse.item()}')


Test MSE: 658.0306396484375
Test RMSE: 25.65210723876953
