In [94]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# import speicidfic models here
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

import shap


In [95]:
target_column  = 'Mid_Price_Future'

In [96]:
# load your dataset
def load_data():
    df = pd.read_csv('data/lob/full_lob_head.csv', index_col=0)
    return df

In [97]:
# preprocessing steps
def preprocess_data(df):
    # Ensure 'Timestamp' is treated as a number
    df['Timestamp'] = pd.to_numeric(df['Timestamp'])
    # Sort the DataFrame by date and Timestamp to ensure it's in chronological order
    df.sort_values(by=['Date','Timestamp'], inplace=True)
    
    # Pivot the 'Order Type' column
    df = df.pivot_table(index=['Timestamp', 'Exchange', 'Date'], 
                            columns='Order Type', 
                            values=['Price', 'Quantity'])  # You can change 'mean' to another aggregation function if needed

    # The resulting 'pivot_df' will have multi-level column headers ('Price' and 'Quantity' under 'Ask' and 'Bid')
    # Flatten the multi-level column headers
    df.columns = [' '.join(col).strip() for col in df.columns.values]

    # Step 1 & 2: Feature Engineering - Calculate mid-price and create lag features
    df['Mid_Price'] = (df['Price Ask'] + df['Price Bid']) / 2
    df['Mid_Price_Future'] = df['Mid_Price'].shift(-1)  # Future mid-price as target
    df = df.dropna()  # Drop rows with NaN values created by shifting

    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [98]:
# model pipeline
def test_model(model, X_train, X_test, y_train, y_test):
    # Define pipeline steps
    steps = [
        ('scaler', StandardScaler()),  # Feature scaling
        ('model', model)  # Classifier/regressor model
    ]
    
    # Create pipeline
    pipeline = Pipeline(steps)
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Evaluate model performance
    # Calculate Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')
    # Calculate R^2 Score
    r2 = r2_score(y_test, y_pred)
    print(f'R^2 Score: {r2}')

    # # Create an explainer object
    # explainer = shap.Explainer(model)

    # # Calculate SHAP values
    # shap_values = explainer.shap_values(X_train)

    # # Plot the SHAP summary plot
    # shap.summary_plot(shap_values, X_train)
    
    # Additional evaluation metrics or visualizations can be added here
    
    # Return the trained pipeline
    return pipeline

In [99]:
# Main function to run the pipeline
def main():
    # load data
    data = load_data()
    # preprocess data
    X_train, X_test, y_train, y_test = preprocess_data(data)
    
    models = {
        # models go here
        # "Support Vector Machine": SVC(),
        # "Linear Regression": LinearRegression(),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
    }
    
    # model testing
    for name, model in models.items():
        print("\nTesting", name)
        pipeline = test_model(model, X_train, X_test, y_train, y_test)
        
        # todo: Perform cross-validation
        # cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
        # print("Cross-validation scores:", cv_scores)
        
        # todo: Hyperparameter tuning using GridSearchCV
        # param_grid = {...}
        # grid_search = GridSearchCV(pipeline, param_grid, cv=5)
        # grid_search.fit(X_train, y_train)
        # best_params = grid_search.best_params_
        # print("Best hyperparameters:", best_params)
        
        # todo: Save the best model
        # best_model = grid_search.best_estimator_
        # Save the best_model for future use

In [100]:
main()


Testing Random Forest
Mean Squared Error: 148.22240561909663
R^2 Score: 0.8679390334206767
