In [None]:
import xgboost as xgb
import optuna
import shap
import joblib
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import roc_auc_score

TRAINING MODEL

Defining x an y for train and test

In [None]:
'''Loadtrain_encoded.parquet and test_encoded.parquet
Split into X_train, y_train, X_test, y_test
Define an Optuna objective function:
Tune: eta, max_depth, min_child_weight, subsample, colsample_bytree
Use xgb.train() with early_stopping_rounds=50
Return AUC on test set

Run Optuna for 30 trials
Train final model with best params
Print final test AUC
Create SHAP explainer
Save:
Model → models/xgb_model.pkl
Explainer → models/shap_explainer.pkl

Save SHAP summary plot → shap_summary.png'''

In [16]:
os.chdir('c:\\Renzo\\Projects\\credit-risk-ai\\')
train = pd.read_parquet('data/train_encoded.parquet')
test = pd.read_parquet('data/test_encoded.parquet')

In [17]:
X_train = train.drop(columns=['target'])
y_train = train['target']
X_test = test.drop(columns=['target'])
y_test =  test['target']

In [18]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:verification_status: object

Defining Optuna Objective function

In [9]:
def train(params,trial):

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-auc")

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtest, 'test')],
        early_stopping_rounds=50,
        verbose_eval=False,
        callbacks=[pruning_callback]
    )

    y_pred = model.predict(dtest)
    auc = roc_auc_score(y_test, y_pred)

    return auc


In [10]:
def objective(trial):
    
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'random_state': 42,
        'tree_method': 'hist',
        'n_jobs': -1
    }

    return train(params, trial)

In [13]:
func = lambda trail: objective(trail)
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study.optimize(func, n_trials=30, timeout=1800)

[I 2025-11-16 00:04:33,529] A new study created in memory with name: no-name-d7d9d6a8-8f2e-4c0d-81c4-e00ecca6ff0d
[W 2025-11-16 00:04:33,536] Trial 0 failed with parameters: {'eta': 0.2375473566813336, 'max_depth': 4, 'min_child_weight': 2.7759913554297815, 'subsample': 0.9529685366184227, 'colsample_bytree': 0.6366003571248084, 'lambda': 0.003516225113647617, 'alpha': 0.0013537491384377055} because of the following error: ModuleNotFoundError('\nCould not find `optuna-integration` for `xgboost`.\nPlease run `pip install optuna-integration[xgboost]`.').
Traceback (most recent call last):
  File "c:\Renzo\Projects\credit-risk-ai\venv\Lib\site-packages\optuna\integration\xgboost.py", line 5, in <module>
    from optuna_integration.xgboost import XGBoostPruningCallback
ModuleNotFoundError: No module named 'optuna_integration'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Renzo\Projects\credit-risk-ai\venv\Lib\site-packag

ModuleNotFoundError: 
Could not find `optuna-integration` for `xgboost`.
Please run `pip install optuna-integration[xgboost]`.

In [14]:
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'random_state': 42,
        'tree_method': 'hist',
        'n_jobs': -1
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-auc")

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtest, 'test')],
        early_stopping_rounds=50,
        verbose_eval=False,
        callbacks=[pruning_callback]
    )

    preds = model.predict(dtest)
    auc = roc_auc_score(y_test, preds)
    return auc

# 3. Run Optuna Study
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=30, timeout=1800)  # 30 min max

print(f"\nBest AUC: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")

[I 2025-11-16 00:05:41,519] A new study created in memory with name: no-name-5a727adc-5d34-44a5-b066-23b56427ca07
[W 2025-11-16 00:05:41,520] Trial 0 failed with parameters: {'eta': 0.025651948859844356, 'max_depth': 9, 'min_child_weight': 4.34108613048495, 'subsample': 0.6894076935067541, 'colsample_bytree': 0.6307646124698039, 'lambda': 0.001215737392450457, 'alpha': 0.48131730742797046} because of the following error: NameError("name 'X_train' is not defined").
Traceback (most recent call last):
  File "c:\Renzo\Projects\credit-risk-ai\venv\Lib\site-packages\optuna\study\_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Owner\AppData\Local\Temp\ipykernel_26352\2688898350.py", line 17, in objective
    dtrain = xgb.DMatrix(X_train, label=y_train)
                         ^^^^^^^
NameError: name 'X_train' is not defined. Did you mean: 'train'?
[W 2025-11-16 00:05:41,521] Trial 0 failed with value None.


NameError: name 'X_train' is not defined