In [1]:
#import required libraries
from utils import aws # used to create aws session and load parquet 
import pandas as pd
import numpy as np
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor

In [2]:
# Step 1: Load historical data
# Load historical limit order book and tape data into pandas DataFrame
# load sample feature set from s3 to a dask dataframe
# samp_lob_ddf = aws.load_s3_file_as_ddf("s3://dsmp-ol2/processed-data/temp_sample_lob_feature_set.parquet")
# # compute the dask datafram to a pandas dataframe
# df = samp_lob_ddf.compute()

df = pd.read_parquet('data/output/temp_sample_lob_feature_set.parquet')

# Preprocess the data to extract relevant features
# Create a target column
# The horizon is how far in the future the Mid-Proce is being predicted
horizon = 20

# Assuming 'simple_df' is your DataFrame
df = df.copy()

df['Target'] = df['Mid_Price'].shift(-horizon)

# Drop rows where all values in the 'Target' column are NaN
df.dropna(subset=['Target'], how='all', inplace=True)

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price,Total_Order_Volume,OBV,Total_Volume_Imbalance,Mid_Price_Future,...,Log_Returns,Realised_Semi_Variance,Squared_Log_Returns,Realised_Volatility,Abs_Log_Returns,Realised_Bipower_Variation,Total_Quadratic_Variation,Jump_Variation,Smoothed_Mid_Price,Target
3,1.581,Exch0,"[[1, 6]]","[[799, 1]]",2025-01-02,400.0,7,-7,0.714286,399.5,...,-0.001249,,1.560549e-06,,0.001249,,,,361.728571,275.0
4,1.643,Exch0,"[[1, 6]]","[[798, 1]]",2025-01-02,399.5,7,-14,0.714286,529.5,...,-0.001251,,1.564455e-06,,0.001251,0.000556,,,480.685714,274.0
5,1.736,Exch0,"[[261, 1], [1, 6]]","[[798, 1]]",2025-01-02,529.5,8,-6,0.750000,529.0,...,0.281719,,7.936582e-02,,0.281719,0.000972,,,515.571429,273.5
6,1.984,Exch0,"[[261, 1], [1, 6]]","[[797, 1]]",2025-01-02,529.0,8,-14,0.750000,299.5,...,-0.000945,,8.925208e-07,,0.000945,0.001262,,,483.014286,273.5
7,2.015,Exch0,"[[261, 1], [1, 6]]","[[338, 3], [797, 1]]",2025-01-02,299.5,11,-25,0.272727,279.0,...,-0.568874,,3.236176e-01,,0.568874,0.064202,,,353.157143,273.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037909,30598.054,Exch0,"[[322, 2], [321, 4], [152, 2], [104, 3], [55, ...","[[327, 4], [341, 2], [442, 1], [680, 4], [724,...",2025-01-06,324.5,34,-2799920,0.058824,324.5,...,0.001542,0.000468,2.377827e-06,0.029506,0.001542,0.000004,0.000871,0.000867,324.414286,330.5
1037910,30598.178,Exch0,"[[322, 2], [321, 4], [104, 3], [55, 1], [7, 6]...","[[327, 4], [341, 2], [442, 1], [680, 4], [724,...",2025-01-06,324.5,34,-2799920,0.058824,324.5,...,0.000000,0.000468,0.000000e+00,0.029506,0.000000,0.000000,0.000871,0.000871,323.942857,330.5
1037911,30598.240,Exch0,"[[322, 2], [321, 4], [306, 1], [104, 3], [7, 6...","[[327, 4], [341, 2], [442, 1], [680, 4], [724,...",2025-01-06,324.5,34,-2799920,0.058824,331.5,...,0.000000,0.000468,0.000000e+00,0.029506,0.000000,0.000000,0.000871,0.000871,326.300000,330.5
1037912,30598.302,Exch0,"[[322, 2], [321, 4], [306, 1], [104, 3], [7, 6...","[[341, 2], [442, 1], [680, 4], [724, 5]]",2025-01-06,331.5,30,-2799890,0.200000,331.5,...,0.021342,0.000468,4.554926e-04,0.036416,0.021342,0.000000,0.001326,0.001326,329.700000,330.5


In [3]:
# Step 2: Load and train prediction model
# Split the data
X = df.drop(['Target'], axis=1)  # Features
y = df['Target']  # Target - Shift 'Mid_Price' by 20 timestamps into the future

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Load the prediction model
prediction_model = DummyRegressor(strategy="mean") # Instantiate a dummy model as a placeholder 

# Train the model
prediction_model.fit(X_train, y_train)

In [4]:
# Step 3: Define trading strategy
# Define your trading strategy based on model predictions and trading parameters
def simple_trading_strategy(prediction, current_mid_price):
    if prediction > current_mid_price:
        return 'buy'  # Buy if the prediction is positive
    elif prediction < current_mid_price:
        return 'sell'  # Sell if the prediction is negative
    else:
        return 'hold'  # Hold if the prediction is neutral

In [5]:
# Step 4: Implement trading simulator
class TradingSimulator:
    def __init__(self, historical_data, prediction_model, trading_strategy):
        self.historical_data = historical_data
        self.prediction_model = prediction_model
        self.trading_strategy = trading_strategy
        self.asset_val = 0  # Initial portfolio value
        self.cash_val = 1000 # Initial cash value
        self.profit_loss = []  # Store profit/loss at each time step
        self.trades = [] # Store trade history

    def run_simulation(self):
        for index, row in self.historical_data.iterrows():
            # Extract features from historical data for prediction
            features = row

            # Use prediction model to generate predictions
            prediction = self.prediction_model.predict([features])[0]

            # Get the current mid-price (replace 'current_mid_price' with the actual value)
            current_mid_price = row['Mid_Price']

            # Implement trading strategy within the simulator
            trade_action = self.trading_strategy(prediction, current_mid_price)

            # Execute trade and update portfolio
            # Update portfolio value, profit/loss, trade history, etc.
            self.trades.append([row['Date'], row['Timestamp'], trade_action])
            
            # Calculate metrics (e.g., profit/loss, Sharpe ratio, maximum drawdown) and store
        return self.trades
            
    def get_performance_metrics(self):
        # Calculate and return performance metrics such as profit/loss, Sharpe ratio, etc.
        pass

In [6]:
# Step 5: Run simulation
simulator = TradingSimulator(X_test, prediction_model, simple_trading_strategy)
trades = simulator.run_simulation()
trades

[['2025-01-03', 19507.091, 'buy'],
 ['2025-01-03', 29398.044, 'buy'],
 ['2025-01-06', 11705.476, 'buy'],
 ['2025-01-03', 2805.562, 'buy'],
 ['2025-01-03', 5102.786, 'sell'],
 ['2025-01-02', 7704.926, 'buy'],
 ['2025-01-06', 14428.392, 'sell'],
 ['2025-01-06', 3027.057, 'buy'],
 ['2025-01-03', 411.618, 'buy'],
 ['2025-01-06', 6253.196, 'sell'],
 ['2025-01-02', 7768.817, 'buy'],
 ['2025-01-03', 11460.111, 'buy'],
 ['2025-01-06', 11583.801, 'buy'],
 ['2025-01-02', 4973.826, 'sell'],
 ['2025-01-03', 23942.075, 'sell'],
 ['2025-01-02', 22240.423, 'buy'],
 ['2025-01-06', 19659.022, 'sell'],
 ['2025-01-02', 18936.97, 'buy'],
 ['2025-01-03', 13659.003, 'buy'],
 ['2025-01-06', 19394.623, 'sell'],
 ['2025-01-03', 7187.009, 'buy'],
 ['2025-01-03', 23027.048, 'sell'],
 ['2025-01-02', 14230.612, 'buy'],
 ['2025-01-02', 5686.516, 'buy'],
 ['2025-01-02', 22166.581, 'buy'],
 ['2025-01-02', 13031.47, 'buy'],
 ['2025-01-06', 2085.804, 'sell'],
 ['2025-01-03', 12997.897, 'buy'],
 ['2025-01-03', 713.093, 

In [7]:
# Step 6: Get performance metrics
performance_metrics = simulator.get_performance_metrics()
print(performance_metrics)

None
