In [2]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier

### LOADING DATA

In [4]:
# Read the labeled trades, parse the timestamp column, and order the rows
# chronologically with a fresh 0..n-1 index.
df = (
    pd.read_csv("../output/dataset_labeled_with_exit.csv")
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .sort_values("time")
    .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,time,side,spread_pts,atr_pts,r1,r2,r3,r4,r5,r6,...,loww8,body9,upw9,loww9,body10,upw10,loww10,y,exit_reason,exit_points
0,2025-11-05 03:50:00,SELL,18,547.0,-450.0,-218.0,-229.0,217.0,-265.0,180.0,...,86.0,42.0,196.0,195.0,131.0,88.0,286.0,1,EARLY_PROFIT,30.0
1,2025-11-05 04:15:00,SELL,18,525.142857,-244.0,115.0,393.0,-2.0,36.0,-450.0,...,0.0,218.0,193.0,3.0,275.0,210.0,37.0,1,EARLY_PROFIT,114.0
2,2025-11-05 07:45:00,SELL,18,355.0,-518.0,-86.0,-430.0,-75.0,36.0,105.0,...,29.0,217.0,85.0,123.0,202.0,109.0,5.0,1,EARLY_PROFIT,12.0
3,2025-11-05 08:25:00,SELL,18,401.0,-278.0,322.0,217.0,-193.0,14.0,171.0,...,166.0,520.0,6.0,179.0,87.0,31.0,79.0,1,EARLY_PROFIT,16.0
4,2025-11-05 08:55:00,SELL,18,419.571429,-250.0,-59.0,-184.0,166.0,-192.0,417.0,...,264.0,217.0,12.0,110.0,194.0,170.0,55.0,1,EARLY_PROFIT,97.0
5,2025-11-05 11:10:00,SELL,18,431.571429,-289.0,-103.0,-49.0,-178.0,179.0,-189.0,...,56.0,173.0,160.0,3.0,195.0,122.0,157.0,1,EARLY_PROFIT,1049.0
6,2025-11-05 13:05:00,SELL,18,465.285714,-590.0,427.0,228.0,-559.0,72.0,121.0,...,41.0,173.0,33.0,0.0,47.0,107.0,43.0,1,EARLY_PROFIT,633.0
7,2025-11-05 14:00:00,SELL,18,351.142857,-305.0,144.0,-69.0,298.0,259.0,178.0,...,65.0,231.0,40.0,17.0,11.0,75.0,63.0,1,EARLY_PROFIT,155.0
8,2025-11-05 14:10:00,SELL,18,319.928571,-143.0,185.0,-305.0,144.0,-69.0,298.0,...,119.0,62.0,30.0,56.0,53.0,133.0,65.0,1,EARLY_PROFIT,60.0
9,2025-11-05 14:35:00,SELL,18,323.642857,-294.0,-54.0,5.0,-44.0,359.0,-143.0,...,165.0,144.0,68.0,47.0,22.0,256.0,96.0,1,EARLY_PROFIT,206.0


### TARGET

In [None]:
# Binary label: 1 when the trade's exit_points is strictly positive, otherwise 0.
df["target"] = df["exit_points"].gt(0).astype(int)

### FEATURE SELECTION

In [None]:
# Feature layout (order is significant, it is saved alongside the model):
#   spread_pts, atr_pts, r1..r10, then body/upw/loww interleaved per index 1..10.
# NOTE(review): body/upw/loww are presumably per-bar candle body and wick sizes —
# confirm against the dataset builder.
base_cols = ["spread_pts", "atr_pts"]
r_cols = [f"r{i}" for i in range(1, 11)]
shape_cols = []
for i in range(1, 11):
    shape_cols.extend(f"{prefix}{i}" for prefix in ("body", "upw", "loww"))
feature_cols = base_cols + r_cols + shape_cols

# Model inputs, classification labels, and the raw point outcome used for P&L evaluation.
X = df[feature_cols].astype(float)
y = df["target"]
points = df["exit_points"]

### TIME-BASED TRAIN/TEST SPLIT

In [None]:
# Positional 80/20 cut: the first 80% of rows train the model, the remainder is held out.
split = int(0.8 * len(df))

X_train, y_train, points_train = X.iloc[:split], y.iloc[:split], points.iloc[:split]
X_test, y_test, points_test = X.iloc[split:], y.iloc[split:], points.iloc[split:]

print("Train size:", len(points_train))
print("Test size:", len(points_test))

### MODEL (XGBOOST CLASSIFIER)

In [None]:
# Hyperparameters for the binary classifier, kept in one place so they are easy to tune.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 3,
    "learning_rate": 0.03,
    "n_estimators": 600,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}

model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

### PREDICT PROBABILITY

In [None]:
# predict_proba returns one column per class; keep column 1
# (presumably the probability of target == 1, following the scikit-learn class ordering).
class_proba = model.predict_proba(X_test)
proba_test = class_proba[:, 1]

### EVALUATION

In [None]:
def evaluate_threshold(th, proba=None, points=None):
    """Score the trades whose predicted probability is at least ``th``.

    Parameters
    ----------
    th : float
        Probability cut-off; a trade is taken when ``proba >= th``.
    proba : array-like, optional
        Predicted probabilities, one per trade. Defaults to the notebook-level
        ``proba_test`` so existing ``evaluate_threshold(th)`` calls keep working.
    points : array-like, optional
        Realised points per trade, aligned with ``proba`` by position.
        Defaults to the notebook-level ``points_test``.

    Returns
    -------
    tuple
        ``(expectancy, net, win_rate, n_trades)``. When no trade passes the
        threshold the placeholder ``(0, 0, 0, 0)`` is returned, so callers
        should check ``n_trades`` before trusting the other values.

    Raises
    ------
    ValueError
        If ``proba`` and ``points`` have different lengths.
    """
    # Fall back to the notebook globals only at call time, so the function can
    # also be used on other slices (e.g. a validation set).
    if proba is None:
        proba = proba_test
    if points is None:
        points = points_test

    proba = np.asarray(proba)
    # Rebuild the outcomes with a default index: selection below is positional,
    # regardless of whatever index the caller's Series carries.
    outcomes = pd.Series(np.asarray(points, dtype=float))
    if len(proba) != len(outcomes):
        raise ValueError(
            f"proba and points must have the same length, got {len(proba)} and {len(outcomes)}"
        )

    mask = proba >= th
    if mask.sum() == 0:
        return 0, 0, 0, 0

    pts = outcomes[mask]
    wins = pts[pts > 0]
    losses = pts[pts <= 0]  # break-even trades are counted as losses

    win_rate = len(wins) / len(pts)
    avg_win = wins.mean() if len(wins) > 0 else 0
    avg_loss = losses.mean() if len(losses) > 0 else 0

    # Expected points per taken trade, split into win and loss contributions.
    expectancy = win_rate * avg_win + (1 - win_rate) * avg_loss
    net = pts.sum()

    return expectancy, net, win_rate, len(pts)

# NOTE(review): the threshold is tuned on the same test slice whose metrics are
# printed below, so the reported win rate / expectancy are optimistic. Tune on a
# separate validation slice before trusting these numbers.

# Thresholds that select fewer trades than this are ignored. With the default of 1
# this only skips empty selections, whose placeholder expectancy of 0 would
# otherwise beat every threshold with a real (negative) expectancy.
MIN_TRADES = 1

best_th = 0.5          # lowest grid value; kept if no threshold qualifies
best_exp = -np.inf

# Build the 0.50..0.89 grid from integers: a float step in np.arange accumulates
# rounding error (e.g. 0.5700000000000001), which would end up in the saved model.
for th in np.arange(50, 90) / 100:
    exp, net, wr, ntr = evaluate_threshold(th)
    if ntr < MIN_TRADES:
        continue
    if exp > best_exp:
        best_exp = exp
        best_th = float(th)  # plain Python float, not a numpy scalar

print("Best threshold:", round(best_th, 2))

exp, net, wr, ntr = evaluate_threshold(best_th)

print("Test trades taken:", ntr)
print("Win rate:", wr)
print("Net points:", net)
print("Expectancy per trade:", exp)
# roc_auc_score raises ValueError when the labels contain a single class.
if y_test.nunique() < 2:
    print("AUC: undefined (test labels contain a single class)")
else:
    print("AUC:", roc_auc_score(y_test, proba_test))

### FEATURE IMPORTANCE

In [None]:
# Rank the features by the importance scores exposed by the fitted model.
importances = model.feature_importances_
fi = (
    pd.DataFrame({"feature": feature_cols, "importance": importances})
    .sort_values("importance", ascending=False)
)

print("\nTop 10 Features:")
print(fi.head(10))

### SAVE MODEL

In [None]:
# Persist the fitted model together with the feature order and the chosen
# probability threshold.
# NOTE(review): the dataset is read from "../output/" but the model is written
# under "./output/" — confirm this location is intended.
model_path = Path("./output/xgb_classifier_confirm.pkl")
# joblib.dump does not create missing directories; make sure the target exists.
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump({
    "model": model,
    "feature_cols": feature_cols,
    # Store a plain float snapped to the 0.01 search grid rather than a numpy
    # scalar carrying floating-point drift (e.g. 0.5700000000000001).
    "threshold": round(float(best_th), 2)
}, model_path)

print("Saved xgb_classifier_confirm.pkl")