## Model Inference 

- Run inference on the unseen test data
- Apply the optimised decision threshold 
- Produce the final metrics 

In [10]:
# imports

import pandas as pd
import numpy as np 

from sklearn.metrics import (
  roc_auc_score,
  average_precision_score,
  precision_score,
  recall_score,
  f1_score,
  confusion_matrix,
  classification_report,
  accuracy_score
)

In [2]:
# Read the held-out features and labels from the processed-data directory.

from pathlib import Path

# Relative path, resolved against the kernel's current working directory.
DATA_DIR = Path("../data/processed")

features_path = DATA_DIR / "X_test.parquet"
labels_path = DATA_DIR / "y_test.parquet"

X_test = pd.read_parquet(features_path)
# The labels are stored as a frame; flatten them into a 1-D array.
y_test = pd.read_parquet(labels_path).values.ravel()

# Quick sanity check that features and labels line up row for row.
print(X_test.shape, y_test.shape)

(42722, 30) (42722,)


In [7]:
# Restore the saved bundle that holds the fitted model and its tuned threshold.

import joblib

# NOTE(review): joblib.load unpickles the file, so only point this at
# artifacts produced by a trusted training run.
MODEL_PATH = Path("../models/final_xgb_with_threshold.joblib")

model = joblib.load(MODEL_PATH)

# Show the container type and the keys it exposes.
print("Model loaded successfully:", type(model))
print(model.keys())


Model loaded successfully: <class 'dict'>
dict_keys(['model', 'threshold', 'val_metrics', 'test_metrics'])


In [8]:
# Unpack the two bundle entries needed for inference.
threshold = model["threshold"]
xgb_model = model["model"]

# Keep only the second column of predict_proba (the class at index 1).
y_test_proba = xgb_model.predict_proba(X_test)[:, 1]

In [9]:
# Turn scores into hard 0/1 labels: a score at or above the stored
# threshold becomes 1, anything below becomes 0.

y_test_pred = np.greater_equal(y_test_proba, threshold).astype(int)

In [11]:
# Score the predictions, but only when ground-truth labels are present in the session.
if "y_test" in globals():
    # Metrics computed from the thresholded 0/1 predictions.
    acc = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    cm = confusion_matrix(y_test, y_test_pred)

    # ROC-AUC is computed from the raw scores, not the thresholded labels.
    roc_auc = roc_auc_score(y_test, y_test_proba)

    # Assemble the report once, then emit it in a single write.
    report_lines = [
        "\n=== Test Set Evaluation ===",
        f"Accuracy : {acc:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall   : {recall:.4f}",
        f"F1 Score : {f1:.4f}",
        f"ROC-AUC  : {roc_auc:.4f}",
        "Confusion Matrix:",
    ]
    print("\n".join(report_lines))
    print(cm)


=== Test Set Evaluation ===
Accuracy : 0.9995
Precision: 0.9655
Recall   : 0.7568
F1 Score : 0.8485
ROC-AUC  : 0.9718
Confusion Matrix:
[[42646     2]
 [   18    56]]


In [12]:
# Put the raw score and the thresholded label side by side, one row per test sample.
predictions_df = pd.DataFrame(
    dict(y_proba=y_test_proba, y_pred=y_test_pred)
)

# Bare trailing expression so the notebook renders the frame.
predictions_df

Unnamed: 0,y_proba,y_pred
0,3.456435e-08,0
1,1.629356e-09,0
2,1.503698e-09,0
3,4.495799e-06,0
4,1.405755e-07,0
...,...,...
42717,4.533736e-09,0
42718,1.506428e-07,0
42719,3.479468e-08,0
42720,6.894880e-10,0


In [13]:
# Persist the test-set predictions as CSV.
#
# A dedicated OUTPUT_DIR constant is used instead of rebinding DATA_DIR, so the
# Path-valued input directory defined by the data-loading cell is not replaced
# by a plain string when this cell runs.
OUTPUT_DIR = Path("../data/outputs")

# to_csv does not create missing parent directories; create the folder up
# front so the cell also works on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

predictions_path = OUTPUT_DIR / "test_predictions.csv"

# index=False: only the y_proba / y_pred columns are written.
predictions_df.to_csv(predictions_path, index=False)
# as_posix keeps the forward-slash form of the message on every platform.
print(f"\nPredictions saved to '{predictions_path.as_posix()}'")


Predictions saved to '../data/outputs/test_predictions.csv'
