# Baseline Model

Before model selection a simpel model must be run to act as a baseline for compaison of model performance. For this baseline we will use a simple Random Forest Regressor to predic the Mid-Price in 20 ticks time. 

The baseline model will be trained using a simple featureset consiting solely of data from the first level of the Limit Order Book.

### Import Libraries

In [2]:
#import required libraries
from utils import aws # used to create aws session and load parquet 
import pandas as pd
import numpy as np
import ast 
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

### Load Dataset

Currently this is run using the sample dataset, but going forward will need to be run using the full LOB dataset.

In [3]:
#load sample feature set from s3 to a dask dataframe
samp_lob_ddf = aws.load_s3_file_as_ddf("s3://dsmp-ol2/processed-data/temp_sample_lob_feature_set.parquet")

In [4]:
#compute the dask datafram to a pandas dataframe
df = samp_lob_ddf.compute()

In [5]:
df.head()

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price,Total_Order_Volume,OBV,Total_Volume_Imbalance,Bid_Ask_Spread,...,Lower_BB,Log_Returns,Realised_Semi_Variance,Squared_Log_Returns,Realised_Volatility,Abs_Log_Returns,Realised_Bipower_Variation,Total_Quadratic_Variation,Jump_Variation,Spot_Volatility
3,1.581,Exch0,"[array([1, 6], dtype=int64)]","[array([799, 1], dtype=int64)]",2025-01-02,400.0,7,-7,0.714286,798,...,,-0.001249,,1.560549e-06,,0.001249,,,,
4,1.643,Exch0,"[array([1, 6], dtype=int64)]","[array([798, 1], dtype=int64)]",2025-01-02,399.5,7,-14,0.714286,797,...,,-0.001251,,1.564455e-06,,0.001251,0.000556,,,
5,1.736,Exch0,"[array([261, 1], dtype=int64) array([1, 6], ...","[array([798, 1], dtype=int64)]",2025-01-02,529.5,8,-6,0.75,537,...,,0.281719,,0.07936582,,0.281719,0.000972,,,
6,1.984,Exch0,"[array([261, 1], dtype=int64) array([1, 6], ...","[array([797, 1], dtype=int64)]",2025-01-02,529.0,8,-14,0.75,536,...,,-0.000945,,8.925208e-07,,0.000945,0.001262,,,
7,2.015,Exch0,"[array([261, 1], dtype=int64) array([1, 6], ...","[array([338, 3], dtype=int64) array([797, ...",2025-01-02,299.5,11,-25,0.272727,77,...,,-0.568874,,0.3236176,,0.568874,0.064202,,,


In [6]:
cols_to_keep = ['Timestamp', 'Mid_Price', 'Total_Order_Volume', 'Total_Volume_Imbalance', 'Bid_Ask_Spread', 'Level_1_Bid_Price', 'Level_1_Bid_Quantity', 'Level_1_Ask_Price', 'Level_1_Ask_Quantity', 'Level_1_Order_Imbalance']

simple_df = df[cols_to_keep]

In [7]:
simple_df

Unnamed: 0,Timestamp,Mid_Price,Total_Order_Volume,Total_Volume_Imbalance,Bid_Ask_Spread,Level_1_Bid_Price,Level_1_Bid_Quantity,Level_1_Ask_Price,Level_1_Ask_Quantity,Level_1_Order_Imbalance
3,1.581,400.0,7,0.714286,798,1,6,799,1,5
4,1.643,399.5,7,0.714286,797,1,6,798,1,5
5,1.736,529.5,8,0.750000,537,261,1,798,1,0
6,1.984,529.0,8,0.750000,536,261,1,797,1,0
7,2.015,299.5,11,0.272727,77,261,1,338,3,-2
...,...,...,...,...,...,...,...,...,...,...
1037929,30599.418,330.5,25,-0.040000,15,323,2,338,1,1
1037930,30599.449,330.5,25,-0.040000,15,323,2,338,1,1
1037931,30599.635,330.5,25,-0.040000,15,323,2,338,1,1
1037932,30599.697,330.5,25,-0.040000,15,323,2,338,1,1


### Create Target Column

The baseline model will be used to predict the future Mid-Price at a single horizon. 

#### Set the Horizon

In [8]:
# The horizon is how far in the future the Mid-Proce is being predicted
horizon = 20

In [9]:
# Assuming 'simple_df' is your DataFrame
simple_df = simple_df.copy()

simple_df['Target'] = simple_df['Mid_Price'].shift(-horizon)

# Drop rows where all values in the 'Target' column are NaN
simple_df.dropna(subset=['Target'], how='all', inplace=True)

simple_df

Unnamed: 0,Timestamp,Mid_Price,Total_Order_Volume,Total_Volume_Imbalance,Bid_Ask_Spread,Level_1_Bid_Price,Level_1_Bid_Quantity,Level_1_Ask_Price,Level_1_Ask_Quantity,Level_1_Order_Imbalance,Target
3,1.581,400.0,7,0.714286,798,1,6,799,1,5,275.0
4,1.643,399.5,7,0.714286,797,1,6,798,1,5,274.0
5,1.736,529.5,8,0.750000,537,261,1,798,1,0,273.5
6,1.984,529.0,8,0.750000,536,261,1,797,1,0,273.5
7,2.015,299.5,11,0.272727,77,261,1,338,3,-2,273.5
...,...,...,...,...,...,...,...,...,...,...,...
1037909,30598.054,324.5,34,0.058824,5,322,2,327,4,-2,330.5
1037910,30598.178,324.5,34,0.058824,5,322,2,327,4,-2,330.5
1037911,30598.240,324.5,34,0.058824,5,322,2,327,4,-2,330.5
1037912,30598.302,331.5,30,0.200000,19,322,2,341,2,0,330.5


In [11]:
# Step 3: Splitting the dataset
X = simple_df.drop(['Target'], axis=1)  # Features
y = simple_df['Target']  # Target - Shift 'Mid_Price' by 20 timestamps into the future

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Normalizing/Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Model Selection
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Step 6: Training
model.fit(X_train_scaled, y_train)

# Step 7: Evaluation
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Calculate Mean Squared Error
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Calculate R^2 Score
r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2}')

Mean Squared Error: 82.60650700484662
Root Mean Squared Error: 9.088812188886214
R^2 Score: 0.887556598275256
