# Training

In [2]:
# -----------------------------------------------
# 📦 Imports
# -----------------------------------------------
from features.preprocess import drop_leakage_cols, apply_smote
from data.load_data import load_processed_data
from models.logistic_regression import train_logistic_regression
from models.random_forest import train_random_forest
from models.gradient_boosting import train_gradient_boosting
from models.xgboost_model import train_xgboost
from utils.metrics import evaluate_model, threshold_tuning, print_classification_report
from utils.mlflow_utils import setup_mlflow, log_model_mlflow
from sklearn.model_selection import train_test_split

In [3]:
# -----------------------------------------------
# 📦 Load Data
# -----------------------------------------------
X, y = load_processed_data("../../data/processed/processed_data.parquet")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
# -----------------------------------------------
# 📦 Logistic Regression
# -----------------------------------------------
lr_model = train_logistic_regression(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]
lr_metrics = evaluate_model(y_test, y_pred_lr, y_prob_lr)
print("Logistic Regression Metrics:", lr_metrics)

Logistic Regression Metrics: {'accuracy': 0.5715405723245786, 'precision': 0.11776608660461352, 'recall': 0.9797979797979798, 'f1': 0.21026011560693642, 'roc_auc': 0.7787303800715756}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# -----------------------------------------------
# 📦 Random Forest
# -----------------------------------------------
cols_to_drop = ["FraudResult"]
X_train_clean = drop_leakage_cols(X_train, cols_to_drop)

X_train_smote, y_train_smote = apply_smote(X_train_clean, y_train)

param_grid_rf = {
    "n_estimators": [100, 300, 500],
    "max_depth": [5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
}

best_rf, best_params_rf = train_random_forest(X_train_smote, y_train_smote, param_grid_rf)

X_test_clean = drop_leakage_cols(X_test, cols_to_drop)
y_prob_rf = best_rf.predict_proba(X_test_clean)[:, 1]
best_thresh_rf, best_f1_rf = threshold_tuning(y_test, y_prob_rf)
y_pred_rf = (y_prob_rf >= best_thresh_rf).astype(int)
rf_metrics = evaluate_model(y_test, y_pred_rf, y_prob_rf)

print("\nRandom Forest Metrics:", rf_metrics)
print_classification_report(y_test, y_pred_rf)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  23.2s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  23.9s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  24.2s
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time=  23.9s
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time=  24.6s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=  12.6s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=  15.7s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=  15.3s
[CV] END max_depth=10, min_samples_split=5, n_estimators=300; total time=  42.5s
[CV] END max_depth=10, min_samples_split=10, n_estimators=300; total time=  43.5s
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time=  21.2s
[CV] END max_depth=10, min_samples_split=

In [6]:
# -----------------------------------------------
# 📦 Gradient Boosting
# -----------------------------------------------
param_grid_gbm = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
    "subsample": [0.8],
}

best_gbm, best_params_gbm = train_gradient_boosting(X_train_smote, y_train_smote, param_grid_gbm)
# GBM predictions
y_pred_gbm = best_gbm.predict(X_test_clean)
y_prob_gbm = best_gbm.predict_proba(X_test_clean)[:, 1]

gbm_metrics = evaluate_model(y_test, y_pred_gbm, y_prob_gbm)

print("\nGradient Boosting Metrics:", gbm_metrics)
print_classification_report(y_test, y_pred_gbm)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=  34.2s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time=  34.0s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=  34.3s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=  35.0s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time=  35.5s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time=  35.7s
[CV] END learning_rate=0.05, max_depth=5, n_estimators=100, subsample=0.8; total time=  52.6s
[CV] END learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time=  52.4s
[CV] END learning_rate=0.05, max_depth=5, n_estimators=100, subsample=0.8; total time=  53.5s
[CV] END learning_rate=0.05, max_depth=5, n_estimators=100, subsample=0.8; total time=  53.9s
[CV]

In [7]:
# -----------------------------------------------
# 📦 XGBoost
# -----------------------------------------------
param_grid_xgb = {
    "n_estimators": [100, 300, 500],
    "max_depth": [3, 6, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "scale_pos_weight": [1, 2, 5],
}

best_xgb, best_params_xgb = train_xgboost(X_train_smote, y_train_smote, param_grid_xgb)

# XGBoost predictions
y_prob_xgb = best_xgb.predict_proba(X_test_clean)[:, 1]
best_thresh_xgb, best_f1_xgb = threshold_tuning(y_test, y_prob_xgb)
y_pred_xgb = (y_prob_xgb >= best_thresh_xgb).astype(int)
xgb_metrics = evaluate_model(y_test, y_pred_xgb, y_prob_xgb)

print("\nXGBoost Metrics:", xgb_metrics)
print_classification_report(y_test, y_pred_xgb)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, scale_pos_weight=2, subsample=0.8; total time=   1.9s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, scale_pos_weight=2, subsample=0.8; total time=   1.9s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, scale_pos_weight=2, subsample=0.8; total time=   2.1s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   2.0s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   1.9s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   1.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.6; total time=   1.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time=   3.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time=   3.8s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.6; total time=   1.5s

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time=   4.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, scale_pos_weight=2, subsample=1.0; total time=   4.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, scale_pos_weight=2, subsample=1.0; total time=   4.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, scale_pos_weight=2, subsample=1.0; total time=   4.5s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=500, scale_pos_weight=1, subsample=0.6; total time=   4.0s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=10, n_estimators=300, scale_pos_weight=2, subsample=0.8; total time=   8.5s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=500, scale_pos_weight=1, subsample=0.6; total time=   4.4s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=500, scale_pos_weight=1, subsample=0.6; total time=   4.3s
[CV] END colsample_bytree=0.6, learning_rate=0.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Metrics: {'accuracy': 0.887952436952829, 'precision': 0.2831578947368421, 'recall': 0.6038159371492705, 'f1': 0.38552490146900753, 'roc_auc': 0.9132033325119231}
              precision    recall  f1-score   support

           0     0.9737    0.9055    0.9384     14415
           1     0.2832    0.6038    0.3855       891

    accuracy                         0.8880     15306
   macro avg     0.6284    0.7547    0.6619     15306
weighted avg     0.9335    0.8880    0.9062     15306



In [8]:
# -----------------------------------------------
# 📦 MLflow Logging
# -----------------------------------------------
setup_mlflow()

rf_run_id = log_model_mlflow(
    best_rf,
    model_name="CreditRiskModel",
    params=best_params_rf,
    metrics=rf_metrics,
    threshold=best_thresh_rf,
    run_name="RandomForest_SMOTE_ThresholdTuned",
    flavor="sklearn",
    register=True,             # ✅ NEW
    stage="Production",        # ✅ NEW
)

xgb_run_id = log_model_mlflow(
    best_xgb,
    model_name="CreditRiskModel",
    params=best_params_xgb,
    metrics=xgb_metrics,
    threshold=best_thresh_xgb,
    run_name="XGBoost_SMOTE_ThresholdTuned",
    flavor="xgboost",
    register=True,             # ✅ NEW
    stage="Production",        # ✅ NEW
)


print(f"Random Forest run ID: {rf_run_id}")
print(f"XGBoost run ID: {xgb_run_id}")

Registered model 'CreditRiskModel' already exists. Creating a new version of this model...
2025/06/30 12:56:50 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: CreditRiskModel, version 3
Created version '3' of model 'CreditRiskModel'.
  self.get_booster().save_model(fname)


Registered model 'CreditRiskModel' as version 3
Moved model version 3 to stage 'Production'


Registered model 'CreditRiskModel' already exists. Creating a new version of this model...
2025/06/30 12:56:51 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: CreditRiskModel, version 4


Registered model 'CreditRiskModel' as version 4
Moved model version 4 to stage 'Production'
Random Forest run ID: eec6ca038f764c54b2003080c8763888
XGBoost run ID: 348c63120b4241458b5f584dd0d62669


Created version '4' of model 'CreditRiskModel'.


In [11]:
import pandas as pd

df = pd.read_parquet("../../data/processed/processed_data.parquet")
print(df.columns.tolist())


['datetime__TransactionStartTime_hour', 'datetime__TransactionStartTime_day', 'datetime__TransactionStartTime_month', 'datetime__TransactionStartTime_weekday', 'datetime__TransactionStartTime_year', 'numeric__Amount', 'numeric__Value', 'numeric__Amount_log', 'numeric__Amount_capped', 'numeric__total_transaction_amount', 'numeric__avg_transaction_amount', 'numeric__transaction_count', 'numeric__std_transaction_amount', 'woe__ProductCategory_woe', 'woe__ChannelId_woe', 'woe__ProviderId_woe', 'woe__ProductId_woe', 'onehot__PricingStrategy_0', 'onehot__PricingStrategy_1', 'onehot__PricingStrategy_2', 'onehot__PricingStrategy_4', 'onehot__is_large_transaction_0', 'onehot__is_large_transaction_1', 'is_high_risk', 'FraudResult']
