In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from typing import Tuple

# 1. Function to compute bid and ask size deltas
def compute_bid_ask_deltas(df: pd.DataFrame, levels: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    bid_cols = [f"bid_sz_{i:02d}" for i in range(levels)]
    ask_cols = [f"ask_sz_{i:02d}" for i in range(levels)]
    bid_deltas = df[bid_cols].diff().fillna(0)
    ask_deltas = df[ask_cols].diff().fillna(0)
    return bid_deltas, ask_deltas

# 2. Compute best-level OFI
def compute_ofi_best(bid_deltas: pd.DataFrame, ask_deltas: pd.DataFrame) -> pd.Series:
    return bid_deltas["bid_sz_00"] - ask_deltas["ask_sz_00"]

# 3. Compute multi-level OFI
def compute_ofi_multi(bid_deltas: pd.DataFrame, ask_deltas: pd.DataFrame) -> pd.Series:
    return (bid_deltas.values - ask_deltas.values).sum(axis=1)

# 4. Compute integrated OFI via PCA
def compute_ofi_integrated(bid_deltas: pd.DataFrame, ask_deltas: pd.DataFrame) -> pd.Series:
    ofi_levels = bid_deltas.values - ask_deltas.values
    pca = PCA(n_components=1)
    pca.fit(ofi_levels)
    weights = pca.components_[0]
    weights /= np.sum(np.abs(weights))
    return ofi_levels @ weights

# 5. Wrapper function to build all OFI features
def build_ofi_features(
    df: pd.DataFrame,
    timestamp_col: str = "ts_event",
    levels: int = 10
) -> pd.DataFrame:
    # Sort by timestamp
    df_sorted = df.sort_values(timestamp_col).reset_index(drop=True)
    # Compute deltas
    bid_deltas, ask_deltas = compute_bid_ask_deltas(df_sorted, levels)
    # Build features
    ofi_df = pd.DataFrame({
        "timestamp": df_sorted[timestamp_col],
        "ofi_best": compute_ofi_best(bid_deltas, ask_deltas),
        "ofi_multi": compute_ofi_multi(bid_deltas, ask_deltas),
        "ofi_integrated": compute_ofi_integrated(bid_deltas, ask_deltas)
    })
    return ofi_df

# --------------------------
# Main execution
# --------------------------

# Load the dataset (adjust path as needed)
df = pd.read_csv("first_25000_rows.csv")

# Generate OFI features
ofi_features = build_ofi_features(df, levels=10)

# Display the first few rows
print(ofi_features.head())

# Optionally, save to CSV
ofi_features.to_csv("ofi_features.csv", index=False)



                        timestamp  ofi_best  ofi_multi  ofi_integrated
0  2024-10-21T11:54:29.221064336Z       0.0        0.0        0.000000
1  2024-10-21T11:54:29.223769812Z       2.0        2.0       -0.007063
2  2024-10-21T11:54:29.225030400Z       3.0        3.0       -0.010594
3  2024-10-21T11:54:29.712434212Z       0.0      200.0      -43.400473
4  2024-10-21T11:54:29.764673165Z       0.0     -200.0       43.400473
