In [8]:
import os
import sys

# Make the parent of the current working directory importable so that
# project-local modules can be imported from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [9]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report


In [10]:
# Read the processed feature/label table from disk and preview its first rows.
# The path is relative to the notebook's working directory.
data_path = "../data/processed/ofi_features_labels.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,timestamp,bid_price,bid_size,ask_price,ask_size,buy_volume,sell_volume,mid_price,ofi,ofi_sum_5,ofi_sum_20,total_volume,ofi_norm,mid_price_future,ret_future,direction
0,2025-01-01 09:30:30,99.993047,27,100.013047,48,45.0,0.0,100.003047,45.0,45.0,45.0,45.0,1.0,99.975237,-0.000278,-1
1,2025-01-01 09:30:31,99.982647,68,100.002647,53,0.0,29.0,99.992647,-29.0,16.0,16.0,29.0,-1.0,99.983015,-9.6e-05,0
2,2025-01-01 09:30:32,99.990152,46,100.010152,31,0.0,9.0,100.000152,-9.0,7.0,7.0,9.0,-1.0,99.983675,-0.000165,-1
3,2025-01-01 09:30:33,99.999557,40,100.019557,80,0.0,39.0,100.009557,-39.0,-32.0,-32.0,39.0,-1.0,99.994947,-0.000146,-1
4,2025-01-01 09:30:34,99.980047,72,100.000047,62,8.0,0.0,99.990047,8.0,-24.0,-24.0,8.0,1.0,99.999623,9.6e-05,0


In [11]:
# Columns of `df` used as model inputs; the same feature matrix feeds both
# the regression and the classification model below.
feature_cols = [
    "ofi", "ofi_sum_5", "ofi_sum_20", "ofi_norm",  # presumably order-flow-imbalance features; confirm upstream
    "buy_volume", "sell_volume",
    "mid_price",
]

# Target column names: future return (regression) and direction label
# taking values -1, 0, 1 (classification).
reg_target = "ret_future"
clf_target = "direction"

X = df.loc[:, feature_cols]
y_reg = df[reg_target]
y_clf = df[clf_target]


In [12]:
# Ordered (unshuffled) hold-out split: the first 80% of rows are used for
# training and the remaining 20% for testing.
# NOTE(review): rows look time-ordered in the preview above, which would make
# this a chronological split — confirm the file is sorted by timestamp.
train_fraction = 0.8
n = len(df)
split_idx = int(train_fraction * n)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_reg_train, y_reg_test = y_reg.iloc[:split_idx], y_reg.iloc[split_idx:]
y_clf_train, y_clf_test = y_clf.iloc[:split_idx], y_clf.iloc[split_idx:]

# Sanity check: total rows, split position, and the resulting shapes.
n, split_idx, X_train.shape, X_test.shape


(990, 792, (792, 7), (198, 7))

In [13]:
# Fit ordinary least squares on the training rows (fit() returns the fitted
# estimator itself), then score the held-out rows with mean squared error.
reg = LinearRegression().fit(X_train, y_reg_train)
y_reg_pred = reg.predict(X_test)

mse = mean_squared_error(y_reg_test, y_reg_pred)
print("Linear regression MSE on test set:", mse)


Linear regression MSE on test set: 1.463053271979196e-07


In [14]:
# Side-by-side preview of the first few true vs. predicted test-set returns.
n_preview = 10
ret_true_head = y_reg_test.iloc[:n_preview].to_numpy()
ret_pred_head = y_reg_pred[:n_preview]

comparison_reg = pd.DataFrame({"ret_true": ret_true_head, "ret_pred": ret_pred_head})
comparison_reg


Unnamed: 0,ret_true,ret_pred
0,1.745145e-06,8.4e-05
1,5.436998e-07,4e-05
2,0.0003087503,0.000122
3,0.0004313413,8e-05
4,0.0003182206,6.5e-05
5,8.728716e-05,7.5e-05
6,-4.752735e-05,0.000144
7,-2.965418e-05,0.000132
8,3.479349e-05,0.000112
9,8.550439e-05,9.9e-05


In [16]:
# Multiclass logistic regression on the direction label (-1, 0, 1).
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases (passing it raises a FutureWarning) and leaving
# it at its default selects the same behaviour as the former "auto".
clf = LogisticRegression(max_iter=1000)

clf.fit(X_train, y_clf_train)

y_clf_pred = clf.predict(X_test)

print("Accuracy on test set:", accuracy_score(y_clf_test, y_clf_pred))
print("\nClassification report:\n")
# zero_division=0: in the recorded run class 0 is never predicted, so its
# precision is undefined. Reporting it as 0.0 explicitly yields the same
# numbers as the default but without the UndefinedMetricWarning noise.
print(classification_report(y_clf_test, y_clf_pred, zero_division=0))


Accuracy on test set: 0.5555555555555556

Classification report:

              precision    recall  f1-score   support

          -1       0.67      0.60      0.63        92
           0       0.00      0.00      0.00        41
           1       0.47      0.85      0.61        65

    accuracy                           0.56       198
   macro avg       0.38      0.48      0.41       198
weighted avg       0.47      0.56      0.49       198



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# 8️⃣ (Optional) Cell 7 – confusion matrix

In [17]:
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so rows/columns always appear as -1, 0, 1,
# even if some class is missing from the predictions.
class_labels = [-1, 0, 1]
row_names = ["true -1", "true 0", "true 1"]
col_names = ["pred -1", "pred 0", "pred 1"]

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_clf_test, y_clf_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
cm_df


Unnamed: 0,pred -1,pred 0,pred 1
true -1,55,0,37
true 0,17,0,24
true 1,10,0,55
