In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Baseline classifier: predict whether a trade closes with positive PnL
# ("Closed PnL" > 0) from three categorical columns and two numeric columns,
# then report ROC-AUC and a per-class report on a stratified 20% hold-out.

CATEGORICAL_COLS = ["regime", "lev_bucket", "Side"]
NUMERIC_COLS = ["Size Tokens", "Execution Price"]
TARGET_COL = "Closed PnL"

# Rows without a regime label cannot be used. Rows without a PnL are dropped
# as well: `NaN > 0` evaluates to False, so a missing PnL would otherwise be
# silently labelled as a losing trade.
df = pd.read_parquet("../data/features.parquet").dropna(
    subset=["regime", TARGET_COL]
)

X = df[CATEGORICAL_COLS + NUMERIC_COLS].copy()
y = (df[TARGET_COL] > 0).astype(int)  # 1 = positive PnL, 0 = zero or negative

pre = ColumnTransformer([
    # handle_unknown="ignore": a category not seen during fit does not raise
    # at predict time.
    ("cat", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_COLS),
    # Standardise the numeric columns. LogisticRegression applies an L2
    # penalty by default, so features on very different scales are penalised
    # unevenly and the solver converges slowly. The scaler lives inside the
    # pipeline, so its statistics are learned from the training split only.
    ("num", StandardScaler(), NUMERIC_COLS),
])

model = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=1000)),
])

# Stratify on the target so both splits keep the same class balance;
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
pred = model.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, pred))

# Threshold at 0.5 and cast to int so predictions share the dtype of y_test.
print(classification_report(y_test, (pred > 0.5).astype(int)))


ModuleNotFoundError: No module named 'sklearn'