In [9]:
import pandas as pd
import glob
import os

# path to your data folder (adjust if needed)
data_path = "data"   # for example: "/Users/rounakkumar/Desktop/data"

# read all daily pickle files
all_files = sorted(glob.glob(os.path.join(data_path, "*.pkl")))

dfs = []
for f in all_files:
    df = pd.read_pickle(f)
    dfs.append(df)

# merge into one big DataFrame
full_df = pd.concat(dfs, ignore_index=True)

# save as CSV for the pipeline
full_df.to_csv("transactions.csv", index=False)

print("✅ transactions.csv created successfully!")
print("Shape:", full_df.shape)
print("Columns:", full_df.columns.tolist())


✅ transactions.csv created successfully!
Shape: (1754155, 9)
Columns: ['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TX_FRAUD', 'TX_FRAUD_SCENARIO']


In [3]:
! pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn, imblearn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [imblearn]
[1A[2KSuccessfully installed imbalanced-learn-0.14.0 imblearn-0.0


In [5]:
# 1. Install libomp (needed by xgboost)
! brew install libomp

# 2. Link it so xgboost can find it
! brew link libomp --force

# 3. Reinstall XGBoost so it compiles correctly for your architecture
! pip uninstall xgboost -y
! pip install xgboost --no-cache-dir


[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with `$HOMEBREW_AUTO_UPDATE_SECS` or disable with
`$HOMEBREW_NO_AUTO_UPDATE=1`. Hide these hints with `$HOMEBREW_NO_ENV_HINTS=1` (see `man brew`).
[34m==>[0m [1mHomebrew collects anonymous analytics.[0m
[1mRead the analytics documentation (and how to opt-out) here:
  [4mhttps://docs.brew.sh/Analytics[24m[0m
No analytics have been recorded yet (nor will be during this `brew` run).

[34m==>[0m [1mHomebrew is run entirely by unpaid volunteers. Please consider donating:[0m
  [4mhttps://github.com/Homebrew/brew#donations[24m

[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
airtable-mcp-server: MCP Server for Airtable
archgw: CLI for Arch Gateway
chrome-devtools-mcp: Chrome DevTools for coding agents
cliproxyapi: Wrap Gemini CLI, Codex, Claude Code, Qwen Code as an API service
config-file-validator: CLI tool to validate dif

In [7]:
! pip install shap


Collecting shap
  Downloading shap-0.48.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba>=0.54 (from shap)
  Downloading numba-0.62.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.8 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting llvmlite<0.46,>=0.45.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.45.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.8 kB)
Downloading shap-0.48.0-cp311-cp311-macosx_11_0_arm64.whl (548 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m548.0/548.0 kB[0m [31m5.3 MB/s[0m  [33m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading numba-0.62.1-cp311-cp311-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m3.1 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25

In [10]:
# fraud_pipeline.py
# Requirements: pandas, numpy, scikit-learn, xgboost, imbalanced-learn, joblib, matplotlib, seaborn, shap
# pip install pandas numpy scikit-learn xgboost imbalanced-learn joblib matplotlib seaborn shap

import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# modeling
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (precision_recall_fscore_support, classification_report,
                             roc_auc_score, average_precision_score, confusion_matrix)
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import joblib
import shap
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# 1) Load data
df = pd.read_csv("transactions.csv", parse_dates=["TX_DATETIME"])  # adjust filename/column names if necessary

# Preview
print("Rows:", len(df))
print(df.head())

# 2) Basic cleaning
# Drop duplicates
df = df.drop_duplicates(subset=["TRANSACTION_ID"])
# Ensure TX_FRAUD is int (0/1)
df["TX_FRAUD"] = df["TX_FRAUD"].astype(int)

# 3) Feature engineering
# Time features
df["tx_hour"] = df["TX_DATETIME"].dt.hour
df["tx_dayofweek"] = df["TX_DATETIME"].dt.dayofweek
df["tx_day"] = df["TX_DATETIME"].dt.day
df["tx_month"] = df["TX_DATETIME"].dt.month
df["tx_is_weekend"] = df["tx_dayofweek"].isin([5,6]).astype(int)

# Amount transformations
df["tx_amount_log"] = np.log1p(df["TX_AMOUNT"])

# Rolling / historical aggregates per customer and terminal
# To avoid future leakage we will compute aggregated features *up to the transaction time* in a time-ordered manner.
df = df.sort_values("TX_DATETIME").reset_index(drop=True)

# Helper to compute expanding aggregates up to previous transaction per entity
def expanding_aggregates(df, group_col, amount_col="TX_AMOUNT", prefix="cust"):
    # compute for each row, the prior average and count per group
    df[f"{prefix}_tx_count_prior"] = 0
    df[f"{prefix}_tx_amount_mean_prior"] = 0.0
    # We'll iterate group by group for safe "prior only"
    for g, gdf in df.groupby(group_col):
        idx = gdf.index
        # expanding count and mean, shifted by 1 to be prior-only
        cumsum = gdf[amount_col].cumsum() - gdf[amount_col]
        count = np.arange(len(gdf))
        mean_prior = np.where(count == 0, 0.0, cumsum / np.where(count == 0, 1, count))
        df.loc[idx, f"{prefix}_tx_count_prior"] = count
        df.loc[idx, f"{prefix}_tx_amount_mean_prior"] = mean_prior
    return df

# Customer aggregates
df = expanding_aggregates(df, group_col="CUSTOMER_ID", amount_col="TX_AMOUNT", prefix="cust")
# Terminal aggregates
df = expanding_aggregates(df, group_col="TERMINAL_ID", amount_col="TX_AMOUNT", prefix="term")

# Additional features: ratio of amount to customer's avg, terminal's avg
df["tx_amount_over_cust_avg"] = df["TX_AMOUNT"] / (df["cust_tx_amount_mean_prior"].replace(0, np.nan))
df["tx_amount_over_term_avg"] = df["TX_AMOUNT"] / (df["term_tx_amount_mean_prior"].replace(0, np.nan))
# fill inf/nan
df["tx_amount_over_cust_avg"].replace([np.inf, -np.inf], np.nan, inplace=True)
df["tx_amount_over_cust_avg"].fillna(0, inplace=True)
df["tx_amount_over_term_avg"].replace([np.inf, -np.inf], np.nan, inplace=True)
df["tx_amount_over_term_avg"].fillna(0, inplace=True)

# Flag for high amount (based on PDF rule)
df["flag_amount_gt_220"] = (df["TX_AMOUNT"] > 220).astype(int)

# 4) Select features and target
feature_cols = [
    "TX_AMOUNT", "tx_amount_log", "tx_hour", "tx_dayofweek", "tx_is_weekend",
    "cust_tx_count_prior", "cust_tx_amount_mean_prior",
    "term_tx_count_prior", "term_tx_amount_mean_prior",
    "tx_amount_over_cust_avg", "tx_amount_over_term_avg",
    "flag_amount_gt_220"
]
target_col = "TX_FRAUD"

X = df[feature_cols].fillna(0)
y = df[target_col]

# 5) Time-based split: use earliest 80% (by date) as train, latest 20% as test
cutoff_index = int(0.8 * len(df))
train_idx = df.index[:cutoff_index]
test_idx = df.index[cutoff_index:]

X_train, y_train = X.loc[train_idx], y.loc[train_idx]
X_test, y_test = X.loc[test_idx], y.loc[test_idx]

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Fraud ratio train:", y_train.mean(), "test:", y_test.mean())

# 6) Modeling pipeline (with class weighting via scale_pos_weight for XGBoost)
# We'll try XGBoost as baseline with SMOTE + scaling for numerical stability (scaling helps the over-sampler)
numeric_features = feature_cols
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ],
    remainder="drop"
)

# Use imblearn Pipeline to apply SMOTE only on training folds
model = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="logloss",
    n_jobs=4,
    random_state=42
)

pipe = ImbPipeline(steps=[
    ("preproc", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("clf", model)
])

# 7) Hyperparameter search (randomized)
param_dist = {
    "clf__n_estimators": [100, 200, 400],
    "clf__max_depth": [3, 6, 10],
    "clf__learning_rate": [0.01, 0.05, 0.1],
    "clf__subsample": [0.6, 0.8, 1.0],
    "clf__colsample_bytree": [0.6, 0.8, 1.0],
}

tscv = TimeSeriesSplit(n_splits=3)
rs = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=12,
                        scoring="average_precision", cv=tscv, verbose=2, random_state=42)

rs.fit(X_train, y_train)

print("Best params:", rs.best_params_)
best_model = rs.best_estimator_

# 8) Evaluation on test set
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("PR AUC (avg precision):", average_precision_score(y_test, y_proba))
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

# Save model and feature list and columns
os.makedirs("artifacts", exist_ok=True)
joblib.dump(best_model, "artifacts/fraud_xgb_pipeline.pkl")
joblib.dump(feature_cols, "artifacts/feature_cols.pkl")

# 9) SHAP explainability (sample)
# This will show which features drive predictions for a few transactions
explainer = shap.TreeExplainer(best_model.named_steps["clf"])
# Need to provide preprocessed features to explainer
X_test_pre = best_model.named_steps["preproc"].transform(X_test)
# Convert to DataFrame for SHAP labeling
X_test_pre_df = pd.DataFrame(X_test_pre, columns=feature_cols)
shap_values = explainer.shap_values(X_test_pre_df)
# Summary plot (save to file)
shap.summary_plot(shap_values, X_test_pre_df, show=False)
plt.savefig("artifacts/shap_summary.png", bbox_inches="tight")
plt.close()

print("Pipeline finished. Artifacts saved in ./artifacts")


Rows: 1754155
   TRANSACTION_ID         TX_DATETIME  CUSTOMER_ID  TERMINAL_ID  TX_AMOUNT  \
0               0 2018-04-01 00:00:31          596         3156      57.16   
1               1 2018-04-01 00:02:10         4961         3412      81.51   
2               2 2018-04-01 00:07:56            2         1365     146.00   
3               3 2018-04-01 00:09:29         4128         8737      64.49   
4               4 2018-04-01 00:10:34          927         9906      50.99   

   TX_TIME_SECONDS  TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               31             0         0                  0  
1              130             0         0                  0  
2              476             0         0                  0  
3              569             0         0                  0  
4              634             0         0                  0  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["tx_amount_over_cust_avg"].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["tx_amount_over_cust_avg"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediat

Train shape: (1403324, 12) Test shape: (350831, 12)
Fraud ratio train: 0.00821691925742024 test: 0.008978682043491025
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=10, clf__n_estimators=400, clf__subsample=0.6; total time=   4.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=10, clf__n_estimators=400, clf__subsample=0.6; total time=   9.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=10, clf__n_estimators=400, clf__subsample=0.6; total time=  14.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=400, clf__subsample=0.6; total time=   2.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=400, clf__subsample=0.6; total time=   4.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=400, clf__subsample=0.6; total time=   7.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=1.0; total time=   3.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=1.0; total time=   5.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=1.0; total time=   8.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=0.8; total time=   3.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=0.8; total time=   6.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=0.8; total time=   9.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=10, clf__n_estimators=200, clf__subsample=0.8; total time=   2.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=10, clf__n_estimators=200, clf__subsample=0.8; total time=   5.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.1, clf__max_depth=10, clf__n_estimators=200, clf__subsample=0.8; total time=   7.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=6, clf__n_estimators=200, clf__subsample=0.8; total time=   2.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=6, clf__n_estimators=200, clf__subsample=0.8; total time=   3.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.01, clf__max_depth=6, clf__n_estimators=200, clf__subsample=0.8; total time=   5.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=400, clf__subsample=1.0; total time=   5.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=400, clf__subsample=1.0; total time=   9.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=400, clf__subsample=1.0; total time=  13.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=0.6; total time=   3.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=0.6; total time=   6.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.1, clf__max_depth=6, clf__n_estimators=400, clf__subsample=0.6; total time=  10.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=6, clf__n_estimators=100, clf__subsample=0.6; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=6, clf__n_estimators=100, clf__subsample=0.6; total time=   2.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.01, clf__max_depth=6, clf__n_estimators=100, clf__subsample=0.6; total time=   3.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   2.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.8, clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   3.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__subsample=0.6; total time=   1.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__subsample=0.6; total time=   3.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=0.6, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__subsample=0.6; total time=   4.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__subsample=1.0; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__subsample=1.0; total time=   2.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END clf__colsample_bytree=1.0, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__subsample=1.0; total time=   4.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params: {'clf__subsample': 1.0, 'clf__n_estimators': 200, 'clf__max_depth': 3, 'clf__learning_rate': 0.05, 'clf__colsample_bytree': 0.8}
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    347681
           1       0.47      0.32      0.38      3150

    accuracy                           0.99    350831
   macro avg       0.73      0.66      0.69    350831
weighted avg       0.99      0.99      0.99    350831

ROC AUC: 0.6642750705205368
PR AUC (avg precision): 0.33264595124364976
Confusion matrix:
 [[346543   1138]
 [  2130   1020]]
Pipeline finished. Artifacts saved in ./artifacts
