In [None]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_recall_curve
import joblib
from data_preprocess import X_train, y_train, X_val, y_val

In [None]:
# Class-imbalance weight: number of negative training labels divided by the
# number of positive ones. Passed to XGBoost as `scale_pos_weight` so the
# positive class is up-weighted during training.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# NOTE: the keyword names are `objective` and `eval_metric` (singular).
# The plural spellings are not XGBClassifier parameters, so the requested
# "aucpr" metric would not be applied and early stopping would monitor the
# library's default metric instead.
model = XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="aucpr",
    scale_pos_weight=scale_pos_weight,
    # Stop adding trees once the eval metric on the validation set has not
    # improved for 30 consecutive rounds.
    early_stopping_rounds=30,
    random_state=42
)

# The validation split is supplied as the evaluation set that early stopping
# monitors; verbose=True logs the metric for each boosting round.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

# Persist the fitted classifier for later use.
joblib.dump(model, "plutus_xgb.pkl")

In [None]:
# Minimum recall the decision threshold must still achieve.
target_recall = 0.8

# Positive-class probability for every validation row (column 1 of the output).
y_probs = model.predict_proba(X_val)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_val, y_probs)

# Take the last position on the curve whose recall still meets the target
# and use the threshold stored at that same position.
idx = np.flatnonzero(recalls >= target_recall)[-1]
chosen_threshold = thresholds[idx]

print("Chosen threshold:", chosen_threshold)

# Persist the selected threshold alongside the model.
joblib.dump(chosen_threshold, "threshold.pkl")

In [None]:
# Binarise the validation probabilities at the selected threshold:
# 1 where the probability is at or above it, 0 otherwise.
meets_threshold = y_probs >= chosen_threshold
y_pred = meets_threshold.astype(int)

# Per-class precision / recall / F1 on the validation labels.
validation_report = classification_report(y_val, y_pred)
print(validation_report)

In [None]:
# Fixed set of cut-offs to compare side by side on the validation set.
candidate_thresholds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.9)

for t in candidate_thresholds:
    # Label a row positive when its probability reaches the cut-off.
    preds = (y_probs >= t).astype(int)
    sweep_report = classification_report(y_val, preds, digits=3)
    print(f"\nThreshold: {t}")
    print(sweep_report)