# ML Pipeline: Fraud Detection Training
This notebook loads gold parquet data, performs time-based train/val/test/OOT splits, preprocesses data for Logistic Regression and XGBoost, handles class imbalance, trains models with Optuna-tuned hyperparameters, and evaluates them with MLflow logging.

In [6]:
import numpy as np
import pandas as pd
import os
import mlflow
from glob import glob
from src.data_loader import load_gold_parquet
from src.data_splitter import time_based_split
from src.features import preprocess_features
from src.imbalance_handler import handle_imbalance
from src.model_trainer import train_logistic_regression_tuned, train_xgboost_tuned
from src.model_evaluator import evaluate_model


In [2]:
# Configuration
MONTHS = pd.date_range("2017-01-01", "2019-10-01", freq="MS").strftime("%Y_%m").tolist()
CUTOFFS = {
    "oot1": "2018-11-01",
    "oot2": "2019-03-01",
    "oot3": "2019-07-01"
}
FEATURE_DIR = "/app/datamart/gold/feature_store"
LABEL_PATH = "/app/datamart/gold/label_store/gold_labels.parquet"

In [3]:
folder_path = "/app/datamart/gold/feature_store"
parquet_files = glob(os.path.join(folder_path, "*.parquet"))

print("Found parquet files:", parquet_files)  # Debugging

df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
print(df.head())
print(df.shape)

Found parquet files: ['/app/datamart/gold/feature_store/gold_features2015_01_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_02_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_03_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_04_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_05_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_06_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_07_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_08_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_09_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_10_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_11_01.parquet', '/app/datamart/gold/feature_store/gold_features2015_12_01.parquet', '/app/datamart/gold/feature_store/gold_features2016_01_01.parquet', '/app/datamart/gold/feature_store/gold_features2016_02_01.parquet', '/app/datamart/gold/featur

In [6]:
folder_path = "/app/datamart/gold/label_store/gold_labels.parquet"
parquet_files = glob(os.path.join(folder_path, "*.parquet"))

print("Found parquet files:", parquet_files)  # Debugging

df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
print(df.head())
print(df.shape)

Found parquet files: ['/app/datamart/gold/label_store/gold_labels.parquet/part-00000-1ba3aad8-01ec-4321-8732-2547e7bebdd9-c000.snappy.parquet', '/app/datamart/gold/label_store/gold_labels.parquet/part-00001-1ba3aad8-01ec-4321-8732-2547e7bebdd9-c000.snappy.parquet', '/app/datamart/gold/label_store/gold_labels.parquet/part-00002-1ba3aad8-01ec-4321-8732-2547e7bebdd9-c000.snappy.parquet', '/app/datamart/gold/label_store/gold_labels.parquet/part-00003-1ba3aad8-01ec-4321-8732-2547e7bebdd9-c000.snappy.parquet', '/app/datamart/gold/label_store/gold_labels.parquet/part-00004-1ba3aad8-01ec-4321-8732-2547e7bebdd9-c000.snappy.parquet', '/app/datamart/gold/label_store/gold_labels.parquet/part-00005-1ba3aad8-01ec-4321-8732-2547e7bebdd9-c000.snappy.parquet', '/app/datamart/gold/label_store/gold_labels.parquet/part-00006-1ba3aad8-01ec-4321-8732-2547e7bebdd9-c000.snappy.parquet', '/app/datamart/gold/label_store/gold_labels.parquet/part-00007-1ba3aad8-01ec-4321-8732-2547e7bebdd9-c000.snappy.parquet', '/

In [4]:
# Load and split data
df = load_gold_parquet(FEATURE_DIR, LABEL_PATH, MONTHS)
splits = time_based_split(df, date_col="date", target_col="is_fraud", cutoffs=CUTOFFS)

for key in splits:
    splits[key] = (
        splits[key][0],
        splits[key][1].astype(str).str.lower().map({"no": 0, "yes": 1}).astype(int)
    )

X_train, y_train = splits["train"]
X_val, y_val = splits["val"]
X_test, y_test = splits["test"]
X_oot1, y_oot1 = splits["oot1"]
X_oot2, y_oot2 = splits["oot2"]
X_oot3, y_oot3 = splits["oot3"]

print("Date range in dataset:")
print(df["date"].min(), "→", df["date"].max())

print("Train columns:", X_train.columns.tolist())
print("OOT1 columns:", X_oot1.columns.tolist())

[Train] Size: (1029521, 40)
[Val]   Size: (343174, 40)
[Test]  Size: (343174, 40)
[OOT1]  Size: (306433, 41)
[OOT2]  Size: (312160, 41)
[OOT3]  Size: (314760, 41)
Date range in dataset:
2017-01-01 00:00:00 → 2019-10-31 00:00:00
Train columns: ['transaction_id', 'date', 'client_id', 'card_id', 'amount', 'use_chip', 'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'errors', 'year_month', 'current_age', 'retirement_age', 'birth_year', 'birth_month', 'gender', 'address', 'latitude', 'longitude', 'per_capita_income', 'yearly_income', 'total_debt', 'credit_score', 'num_credit_cards', 'card_brand', 'card_type', 'card_number', 'expires', 'cvv', 'has_chip', 'num_cards_issued', 'credit_limit', 'acct_open_date', 'year_pin_last_changed', 'card_on_dark_web', 'mcc_description', 'acct_opened_months', 'yrs_since_pin_changed']
OOT1 columns: ['transaction_id', 'date', 'client_id', 'card_id', 'amount', 'use_chip', 'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'errors', 'y

In [7]:
# Check the label distribution
print("Train label distribution:", np.bincount(y_train))
print("Validation label distribution:", np.bincount(y_val))
print("Test label distribution:", np.bincount(y_test))
print("OOT1 label distribution:", np.bincount(y_oot1))
print("OOT2 label distribution:", np.bincount(y_oot2))
print("OOT3 label distribution:", np.bincount(y_oot3))

Train label distribution: [1028653     868]
Validation label distribution: [342885    289]
Test label distribution: [342885    289]
OOT1 label distribution: [305836    597]
OOT2 label distribution: [311578    582]
OOT3 label distribution: [314224    536]


In [None]:
# Logistic Regression pipeline
print("[Step] Preprocessing features for Logistic Regression...")
X_train_lr, lr_pipeline = preprocess_features(X_train, model_type="logistic", fit_pipeline=True)
X_val_lr, _ = preprocess_features(X_val, model_type="logistic", fit_pipeline=False, pipeline=lr_pipeline)
X_test_lr, _ = preprocess_features(X_test, model_type="logistic", fit_pipeline=False, pipeline=lr_pipeline)
X_oot1_lr, _ = preprocess_features(X_oot1, model_type="logistic", fit_pipeline=False, pipeline=lr_pipeline)
print("[Done] Feature preprocessing completed.")

print("[Step] Handling imbalance with SMOTE...")
X_train_lr, y_train_lr = handle_imbalance(X_train_lr, y_train, strategy="smote")
print("[Done] SMOTE resampling completed.")

In [None]:
# Set experiment and start training
print("[Step] Starting MLflow experiment...")
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment("fraud_detection")

with mlflow.start_run(run_name="LogisticRegression"):
    print("[Step] Training logistic regression with Optuna tuning...")
    logreg_model = train_logistic_regression_tuned(X_train_lr, y_train_lr, X_val_lr, y_val, pipeline=lr_pipeline, X_raw_train=X_train)
    print("[Done] Logistic regression training completed.")

    print("[Step] Evaluating on test set...")
    evaluate_model(logreg_model, X_test_lr, y_test, model_name="LogReg", dataset_label="Test")
    print("[Done] Test set evaluation completed.")

    # print("[Step] Evaluating on OOT1 set...")
    # evaluate_model(logreg_model, X_oot1_lr, y_oot1, model_name="LogReg", dataset_label="OOT1")
    # print("[Done] OOT1 set evaluation completed.")

print("[ALL COMPLETE] Logistic regression pipeline executed successfully.")

In [8]:
# XGBoost pipeline
print("[Step] Preprocessing features for XGBoost...")
X_train_xgb, _, xgb_input_example = preprocess_features(X_train, model_type="xgboost", return_sample=True)
X_val_xgb, _ = preprocess_features(X_val, model_type="xgboost")
X_test_xgb, _ = preprocess_features(X_test, model_type="xgboost")
X_oot1_xgb, _ = preprocess_features(X_oot1, model_type="xgboost")
print("[Done] Feature preprocessing completed.")

print("[Step] Handling imbalance for XGBoost...")
X_train_xgb, y_train_xgb = handle_imbalance(X_train_xgb, y_train, strategy="undersample")
print("[Done] Imbalance handling completed.")

[Step] Preprocessing features for XGBoost...
[Done] Feature preprocessing completed.
[Step] Handling imbalance for XGBoost...
[Imbalance Handler] Strategy: undersample
[Resampled] Samples: 1029521 → 1736 (Class 1: 868, Class 0: 868)
[Done] Imbalance handling completed.


In [9]:
print("[Step] Starting MLflow run for XGBoost...")
mlflow.set_experiment("fraud_detection")

with mlflow.start_run(run_name="XGBoost"):
    print("[Step] Training XGBoost with Optuna tuning...")
    xgb_model = train_xgboost_tuned(
        X_train_xgb, y_train_xgb, X_val_xgb, y_val, input_example=xgb_input_example
    )
    print("[Done] XGBoost training completed.")

    print("[Step] Evaluating XGBoost on test set...")
    evaluate_model(xgb_model, X_test_xgb, y_test, model_name="XGBoost", dataset_label="Test")
    print("[Done] Test set evaluation completed.")

    # print("[Step] Evaluating XGBoost on OOT1 set...")
    # evaluate_model(xgb_model, X_oot1_xgb, y_oot1, model_name="XGBoost", dataset_label="OOT1")
    # print("[Done] OOT1 set evaluation completed.")

    print("[Step] Ending MLflow run for XGBoost...")
    mlflow.end_run()
    print("[Done] MLflow run ended.")

print("[ALL COMPLETE] XGBoost pipeline executed successfully.")


[Step] Starting MLflow run for XGBoost...


[I 2025-05-30 02:22:29,318] A new study created in memory with name: no-name-2d0b001e-ba5b-4f0d-b2a7-f677f2636c21


[Step] Training XGBoost with Optuna tuning...


[I 2025-05-30 02:22:34,057] Trial 0 finished with value: 0.9982807739810621 and parameters: {'n_estimators': 397, 'max_depth': 9, 'learning_rate': 0.1791196565096831, 'subsample': 0.8279480853078705, 'colsample_bytree': 0.6152226492958887}. Best is trial 0 with value: 0.9982807739810621.
[I 2025-05-30 02:22:37,236] Trial 1 finished with value: 0.9982807739810621 and parameters: {'n_estimators': 271, 'max_depth': 8, 'learning_rate': 0.039616248610473574, 'subsample': 0.6662601004961927, 'colsample_bytree': 0.7407303950174817}. Best is trial 0 with value: 0.9982807739810621.
[I 2025-05-30 02:22:40,808] Trial 2 finished with value: 0.9988571428571429 and parameters: {'n_estimators': 489, 'max_depth': 9, 'learning_rate': 0.19339196112090265, 'subsample': 0.8541924560299821, 'colsample_bytree': 0.6401276674551465}. Best is trial 2 with value: 0.9988571428571429.
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[Validation Performance - XGBoost]
              precision    recall  f1-score   support

           0     0.9992    0.9999    0.9995    342885
           1     0.0000    0.0000    0.0000       289

    accuracy                         0.9990    343174
   macro avg     0.4996    0.4999    0.4998    343174
weighted avg     0.9983    0.9990    0.9987    343174



  self.get_booster().save_model(fname)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[Saved] xgboost model -> artifacts\xgboost_model.pkl
[Done] XGBoost training completed.
[Step] Evaluating XGBoost on test set...

[Evaluating] XGBoost on Test set...
Confusion Matrix:
 [[342851     34]
 [   289      0]]
F1-score: 0.0000 | ROC AUC: 0.9980 | PR AUC: 0.1729
[Done] Test set evaluation completed.
[Step] Ending MLflow run for XGBoost...
[Done] MLflow run ended.
[ALL COMPLETE] XGBoost pipeline executed successfully.


In [None]:
import numpy as np
# Class distribution check
print("Train class distribution:", np.bincount(y_train))
print("Validation class distribution:", np.bincount(y_val))

Train class distribution: [1028653     868]
Val class distribution: [342885    289]


In [15]:
# Class prediction distribution
y_pred = best_model.predict(X_val)
print("Predicted class distribution:", np.bincount(y_pred))

# AUC-PR
from sklearn.metrics import average_precision_score
print("AUC-PR:", average_precision_score(y_val, best_model.predict_proba(X_val)[:, 1]))

NameError: name 'best_model' is not defined