In [1]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import optuna
from utils.preprocessing import preprocess_data
from lightgbm import early_stopping, log_evaluation

  from pandas.core import (


In [2]:
# --- Config tuned for CPU
N_TRIALS = 20
OPTUNA_N_JOBS = 1
RANDOM_STATE = 42
CV_FOLDS = 3

In [3]:
DATA_DIR = "data"
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

In [4]:
# --- Create sample dataset if missing
sample_path = os.path.join(DATA_DIR, "sample_dataset.csv")
if not os.path.exists(sample_path):
    np.random.seed(RANDOM_STATE)
    n_samples = 1000
    data = {
        'customer_id': [f'CUST{str(i).zfill(5)}' for i in range(1, n_samples + 1)],
        'age': np.random.randint(25, 70, n_samples),
        'employment_status': np.random.choice(['employed', 'self-employed', 'unemployed', 'student'], n_samples, p=[0.6,0.2,0.1,0.1]),
        'annual_income': np.random.randint(20000,100000,n_samples),
        'education_level': np.random.choice(['High School','Diploma','Bachelor','Master'], n_samples, p=[0.2,0.3,0.4,0.1]),
        'credit_history_length': np.random.randint(1,20,n_samples),
        'num_previous_loans': np.random.randint(0,10,n_samples),
        'num_defaults': np.random.randint(0,3,n_samples),
        'avg_payment_delay_days': np.random.randint(0,15,n_samples),
        'current_credit_score': np.random.randint(500,800,n_samples),
        'loan_amount': np.random.randint(5000,150000,n_samples),
        'loan_term_months': np.random.choice([12,24,36,48,60], n_samples),
        'loan_purpose': np.random.choice(['Business','Crypto-Backed','Car Loan','Education','Home Loan'], n_samples),
        'collateral_present': np.random.choice(['Yes','No'], n_samples, p=[0.7,0.3]),
        'identity_verified_on_chain': np.random.randint(0,2,n_samples),
        'transaction_consistency_score': np.round(np.random.uniform(0.2,1.0,n_samples),2),
        'fraud_alert_flag': np.random.randint(0,2,n_samples, p=[0.9,0.1]),
        'on_chain_credit_history': np.random.randint(0,10,n_samples)
    }
    df = pd.DataFrame(data)
    prob_default = (
        0.3 * (df['num_defaults'] > 0).astype(int) +
        0.2 * (df['employment_status'] == 'unemployed').astype(int) +
        0.1 * (df['current_credit_score'] < 600).astype(int) +
        0.1 * (df['avg_payment_delay_days'] > 7).astype(int) +
        0.1 * (df['loan_amount'] / df['annual_income'] > 0.5).astype(int) +
        0.1 * (df['fraud_alert_flag'] == 1).astype(int) +
        np.random.normal(0, 0.1, len(df))
    )
    df['default_flag'] = (prob_default > 0.5).astype(int)
    df['probability_of_default'] = np.clip(1 / (1 + np.exp(-prob_default)), 0.01, 0.99)
    os.makedirs(DATA_DIR, exist_ok=True)
    df.to_csv(sample_path, index=False)
    print("Sample dataset created at", sample_path)

In [5]:
# --- Load and preprocess
df = pd.read_csv(sample_path)
df = preprocess_data(df)

features = [
 'age','employment_status','annual_income','education_level',
 'credit_history_length','num_previous_loans','num_defaults',
 'avg_payment_delay_days','current_credit_score','loan_amount',
 'loan_term_months','loan_purpose','collateral_present',
 'identity_verified_on_chain','transaction_consistency_score',
 'fraud_alert_flag','on_chain_credit_history','income_to_loan_ratio',
 'credit_utilization','default_rate'
]

X = df[features]
y = df['default_flag']

joblib.dump(features, os.path.join(MODELS_DIR,'feature_columns.pkl'))

['models\\feature_columns.pkl']

In [6]:
# --- Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# --- Optuna objective
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_jobs': 2
    }

    cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    for train_idx, val_idx in cv.split(X_train, y_train):
        Xt, Xv = X_train.iloc[train_idx], X_train.iloc[val_idx]
        yt, yv = y_train.iloc[train_idx], y_train.iloc[val_idx]

        sm = SMOTE(random_state=RANDOM_STATE)
        Xt_res, yt_res = sm.fit_resample(Xt, yt)

        model = lgb.LGBMClassifier(**params)
        model.fit(
            Xt_res, yt_res,
            eval_set=[(Xv, yv)],
            eval_metric='auc',
            callbacks=[early_stopping(50), log_evaluation(0)]
        )

        preds = model.predict_proba(Xv)[:, 1]
        scores.append(roc_auc_score(yv, preds))

    return float(np.mean(scores))


# --- Run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS, n_jobs=OPTUNA_N_JOBS)

print("Best trial params:", study.best_trial.params)
best = study.best_trial.params
best.update({
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'n_jobs': 2
})


[I 2025-09-07 11:42:42,220] A new study created in memory with name: no-name-dd240d10-b6a1-40af-b6e5-4c93ac8abc62


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[142]	valid_0's auc: 0.80225	valid_0's binary_logloss: 0.525738
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	valid_0's auc: 0.808174	valid_0's binary_logloss: 0.624938
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:43,715] Trial 0 finished with value: 0.7999303436377515 and parameters: {'n_estimators': 301, 'learning_rate': 0.011650672954748257, 'num_leaves': 21, 'max_depth': 4, 'min_child_samples': 30, 'subsample': 0.7772412155010655, 'colsample_bytree': 0.6757356950409328, 'reg_alpha': 8.115945837833762e-05, 'reg_lambda': 0.07758010288880915}. Best is trial 0 with value: 0.7999303436377515.


Early stopping, best iteration is:
[3]	valid_0's auc: 0.789366	valid_0's binary_logloss: 0.682568
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[42]	valid_0's auc: 0.800431	valid_0's binary_logloss: 0.515495
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	valid_0's auc: 0.809584	valid_0's binary_logloss: 0.549045
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:44,401] Trial 1 finished with value: 0.8006154051583941 and parameters: {'n_estimators': 118, 'learning_rate': 0.051062493007548775, 'num_leaves': 57, 'max_depth': 3, 'min_child_samples': 44, 'subsample': 0.9355909123995245, 'colsample_bytree': 0.8115854595901646, 'reg_alpha': 1.030438818802362e-06, 'reg_lambda': 4.250632498684083}. Best is trial 1 with value: 0.8006154051583941.


Early stopping, best iteration is:
[19]	valid_0's auc: 0.791832	valid_0's binary_logloss: 0.559512
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	valid_0's auc: 0.802095	valid_0's binary_logloss: 0.518299
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[121]	valid_0's auc: 0.815166	valid_0's binary_logloss: 0.47469
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:45,600] Trial 2 finished with value: 0.8035791730404077 and parameters: {'n_estimators': 373, 'learning_rate': 0.06378125571603142, 'num_leaves': 77, 'max_depth': 5, 'min_child_samples': 23, 'subsample': 0.8449637552880775, 'colsample_bytree': 0.7818633285201985, 'reg_alpha': 1.7059874571396516e-07, 'reg_lambda': 0.000911473495737746}. Best is trial 2 with value: 0.8035791730404077.


Early stopping, best iteration is:
[89]	valid_0's auc: 0.793476	valid_0's binary_logloss: 0.506157
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.800623	valid_0's binary_logloss: 0.493861
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[11]	valid_0's auc: 0.811905	valid_0's binary_logloss: 0.546561
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:46,851] Trial 3 finished with value: 0.800773230604852 and parameters: {'n_estimators': 168, 'learning_rate': 0.07281852285425537, 'num_leaves': 48, 'max_depth': 7, 'min_child_samples': 40, 'subsample': 0.9576708662275372, 'colsample_bytree': 0.9418338055830278, 'reg_alpha': 1.8363129768262239, 'reg_lambda': 6.441356439478513e-07}. Best is trial 2 with value: 0.8035791730404077.


Early stopping, best iteration is:
[70]	valid_0's auc: 0.789792	valid_0's binary_logloss: 0.506551
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[31]	valid_0's auc: 0.805769	valid_0's binary_logloss: 0.558183
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[63]	valid_0's auc: 0.811704	valid_0's binary_logloss: 0.510331
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:48,082] Trial 4 finished with value: 0.8033034196719585 and parameters: {'n_estimators': 142, 'learning_rate': 0.023403653283452813, 'num_leaves': 21, 'max_depth': 7, 'min_child_samples': 29, 'subsample': 0.7961395262956045, 'colsample_bytree': 0.9709863086997461, 'reg_alpha': 2.3299777808222995e-08, 'reg_lambda': 9.4280454764533e-08}. Best is trial 2 with value: 0.8035791730404077.


Did not meet early stopping. Best iteration is:
[142]	valid_0's auc: 0.792437	valid_0's binary_logloss: 0.506444
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[123]	valid_0's auc: 0.806483	valid_0's binary_logloss: 0.489482
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[88]	valid_0's auc: 0.81483	valid_0's binary_logloss: 0.487443
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:50,824] Trial 5 finished with value: 0.8047959328247902 and parameters: {'n_estimators': 276, 'learning_rate': 0.03263895188043232, 'num_leaves': 82, 'max_depth': 9, 'min_child_samples': 22, 'subsample': 0.8132207242218386, 'colsample_bytree': 0.6393286571835568, 'reg_alpha': 0.00019962354313519113, 'reg_lambda': 0.001419652202696352}. Best is trial 5 with value: 0.8047959328247902.


Early stopping, best iteration is:
[142]	valid_0's auc: 0.793074	valid_0's binary_logloss: 0.510273
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[102]	valid_0's auc: 0.803591	valid_0's binary_logloss: 0.512151
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[129]	valid_0's auc: 0.812688	valid_0's binary_logloss: 0.494171
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:52,566] Trial 6 finished with value: 0.8025363783552231 and parameters: {'n_estimators': 129, 'learning_rate': 0.018014420662708933, 'num_leaves': 59, 'max_depth': 7, 'min_child_samples': 44, 'subsample': 0.6692824786775623, 'colsample_bytree': 0.730356170748953, 'reg_alpha': 7.025367202746252e-06, 'reg_lambda': 0.013855661971873642}. Best is trial 5 with value: 0.8047959328247902.


Early stopping, best iteration is:
[60]	valid_0's auc: 0.79133	valid_0's binary_logloss: 0.54875
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[103]	valid_0's auc: 0.800371	valid_0's binary_logloss: 0.534646
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[53]	valid_0's auc: 0.809686	valid_0's binary_logloss: 0.572003
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:53,966] Trial 7 finished with value: 0.8001535737617277 and parameters: {'n_estimators': 104, 'learning_rate': 0.012227886690739164, 'num_leaves': 76, 'max_depth': 5, 'min_child_samples': 14, 'subsample': 0.6506602023514166, 'colsample_bytree': 0.8104709716073193, 'reg_alpha': 2.1368119487753614e-05, 'reg_lambda': 0.0001295560452903721}. Best is trial 5 with value: 0.8047959328247902.


Did not meet early stopping. Best iteration is:
[102]	valid_0's auc: 0.790403	valid_0's binary_logloss: 0.543227
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[95]	valid_0's auc: 0.799295	valid_0's binary_logloss: 0.495176
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[83]	valid_0's auc: 0.808484	valid_0's binary_logloss: 0.486766
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:56,133] Trial 8 finished with value: 0.7986658654503617 and parameters: {'n_estimators': 327, 'learning_rate': 0.03605216535625339, 'num_leaves': 77, 'max_depth': 7, 'min_child_samples': 5, 'subsample': 0.9295878523425744, 'colsample_bytree': 0.8772835315093697, 'reg_alpha': 6.251586509500215e-07, 'reg_lambda': 1.941302323690281e-05}. Best is trial 5 with value: 0.8047959328247902.


Early stopping, best iteration is:
[36]	valid_0's auc: 0.788219	valid_0's binary_logloss: 0.534613
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[103]	valid_0's auc: 0.807226	valid_0's binary_logloss: 0.483299
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[88]	valid_0's auc: 0.816163	valid_0's binary_logloss: 0.475594
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:57,188] Trial 9 finished with value: 0.8065602424108237 and parameters: {'n_estimators': 226, 'learning_rate': 0.15153571888069864, 'num_leaves': 73, 'max_depth': 4, 'min_child_samples': 27, 'subsample': 0.9715094024731001, 'colsample_bytree': 0.6557948651850357, 'reg_alpha': 6.875598417841964, 'reg_lambda': 0.0004562814645840411}. Best is trial 9 with value: 0.8065602424108237.


Early stopping, best iteration is:
[48]	valid_0's auc: 0.796293	valid_0's binary_logloss: 0.505436
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[28]	valid_0's auc: 0.805666	valid_0's binary_logloss: 0.49002
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.812812	valid_0's binary_logloss: 0.479171
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:42:58,517] Trial 10 finished with value: 0.8031982337247529 and parameters: {'n_estimators': 225, 'learning_rate': 0.19325545317674933, 'num_leaves': 115, 'max_depth': 9, 'min_child_samples': 35, 'subsample': 0.9954145448938799, 'colsample_bytree': 0.6084742744730259, 'reg_alpha': 7.303750363803917, 'reg_lambda': 4.545530545135894e-06}. Best is trial 9 with value: 0.8065602424108237.


Early stopping, best iteration is:
[35]	valid_0's auc: 0.791116	valid_0's binary_logloss: 0.506223
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[31]	valid_0's auc: 0.80008	valid_0's binary_logloss: 0.497302
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.809019	valid_0's binary_logloss: 0.51334
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:00,150] Trial 11 finished with value: 0.7963293234936751 and parameters: {'n_estimators': 231, 'learning_rate': 0.16209874219858653, 'num_leaves': 101, 'max_depth': 10, 'min_child_samples': 19, 'subsample': 0.8622900114718066, 'colsample_bytree': 0.6069491210905977, 'reg_alpha': 0.012275189988260181, 'reg_lambda': 0.0031951879086374643}. Best is trial 9 with value: 0.8065602424108237.


Early stopping, best iteration is:
[18]	valid_0's auc: 0.779889	valid_0's binary_logloss: 0.525341
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[28]	valid_0's auc: 0.800019	valid_0's binary_logloss: 0.50038
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	valid_0's auc: 0.806756	valid_0's binary_logloss: 0.48772
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:02,033] Trial 12 finished with value: 0.7965777610838053 and parameters: {'n_estimators': 271, 'learning_rate': 0.10605459216592869, 'num_leaves': 97, 'max_depth': 9, 'min_child_samples': 14, 'subsample': 0.7207835911222589, 'colsample_bytree': 0.6819303496634639, 'reg_alpha': 0.007095796310767704, 'reg_lambda': 0.5191905642240651}. Best is trial 9 with value: 0.8065602424108237.


Early stopping, best iteration is:
[30]	valid_0's auc: 0.782958	valid_0's binary_logloss: 0.519968
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[143]	valid_0's auc: 0.803874	valid_0's binary_logloss: 0.489356
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[157]	valid_0's auc: 0.81413	valid_0's binary_logloss: 0.479032
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:03,916] Trial 13 finished with value: 0.8043001162217361 and parameters: {'n_estimators': 197, 'learning_rate': 0.03474138169690774, 'num_leaves': 94, 'max_depth': 5, 'min_child_samples': 22, 'subsample': 0.8708626918737405, 'colsample_bytree': 0.6724927822545992, 'reg_alpha': 0.011601647927107231, 'reg_lambda': 0.00012756705205307206}. Best is trial 9 with value: 0.8065602424108237.


Did not meet early stopping. Best iteration is:
[197]	valid_0's auc: 0.794897	valid_0's binary_logloss: 0.50597
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[100]	valid_0's auc: 0.805085	valid_0's binary_logloss: 0.485453
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[129]	valid_0's auc: 0.820258	valid_0's binary_logloss: 0.474409
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:05,001] Trial 14 finished with value: 0.8068918599688938 and parameters: {'n_estimators': 280, 'learning_rate': 0.1164122329540914, 'num_leaves': 45, 'max_depth': 3, 'min_child_samples': 35, 'subsample': 0.6073612007150517, 'colsample_bytree': 0.7372700550609441, 'reg_alpha': 0.0006296566669509104, 'reg_lambda': 0.024751335478177896}. Best is trial 14 with value: 0.8068918599688938.


Early stopping, best iteration is:
[67]	valid_0's auc: 0.795332	valid_0's binary_logloss: 0.507817
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[132]	valid_0's auc: 0.806602	valid_0's binary_logloss: 0.484166
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[141]	valid_0's auc: 0.81946	valid_0's binary_logloss: 0.475128
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:06,167] Trial 15 finished with value: 0.8074082250640412 and parameters: {'n_estimators': 342, 'learning_rate': 0.11874991435729515, 'num_leaves': 44, 'max_depth': 3, 'min_child_samples': 50, 'subsample': 0.6104583219154918, 'colsample_bytree': 0.742867644462533, 'reg_alpha': 0.32677089472054405, 'reg_lambda': 0.08288837978275913}. Best is trial 15 with value: 0.8074082250640412.


Early stopping, best iteration is:
[70]	valid_0's auc: 0.796163	valid_0's binary_logloss: 0.507035
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[150]	valid_0's auc: 0.8071	valid_0's binary_logloss: 0.48376
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[132]	valid_0's auc: 0.818048	valid_0's binary_logloss: 0.476542
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:07,367] Trial 16 finished with value: 0.807586754899123 and parameters: {'n_estimators': 396, 'learning_rate': 0.0971863244886752, 'num_leaves': 42, 'max_depth': 3, 'min_child_samples': 50, 'subsample': 0.6022760516275157, 'colsample_bytree': 0.7443528354837698, 'reg_alpha': 0.23485753737446954, 'reg_lambda': 0.11535031762901428}. Best is trial 16 with value: 0.807586754899123.


Early stopping, best iteration is:
[91]	valid_0's auc: 0.797612	valid_0's binary_logloss: 0.506108
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[198]	valid_0's auc: 0.806378	valid_0's binary_logloss: 0.483855
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[184]	valid_0's auc: 0.819215	valid_0's binary_logloss: 0.474894
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:08,651] Trial 17 finished with value: 0.8061541698089402 and parameters: {'n_estimators': 400, 'learning_rate': 0.09204744760078684, 'num_leaves': 35, 'max_depth': 3, 'min_child_samples': 50, 'subsample': 0.605286446961651, 'colsample_bytree': 0.8562729755247627, 'reg_alpha': 0.11819728738420711, 'reg_lambda': 6.2270626650455485}. Best is trial 16 with value: 0.807586754899123.


Early stopping, best iteration is:
[34]	valid_0's auc: 0.792869	valid_0's binary_logloss: 0.522149
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	valid_0's auc: 0.802379	valid_0's binary_logloss: 0.526665
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[82]	valid_0's auc: 0.815748	valid_0's binary_logloss: 0.477918
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:09,686] Trial 18 finished with value: 0.8043071002913758 and parameters: {'n_estimators': 340, 'learning_rate': 0.08495294653495292, 'num_leaves': 36, 'max_depth': 4, 'min_child_samples': 50, 'subsample': 0.7241335499268673, 'colsample_bytree': 0.745100363583745, 'reg_alpha': 0.22018793279724946, 'reg_lambda': 0.3356293508730355}. Best is trial 16 with value: 0.807586754899123.


Early stopping, best iteration is:
[70]	valid_0's auc: 0.794795	valid_0's binary_logloss: 0.507398
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[44]	valid_0's auc: 0.802749	valid_0's binary_logloss: 0.490109
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[35]	valid_0's auc: 0.807939	valid_0's binary_logloss: 0.482657
Training until validation scores don't improve for 50 rounds


[I 2025-09-07 11:43:10,834] Trial 19 finished with value: 0.8003249147905761 and parameters: {'n_estimators': 375, 'learning_rate': 0.1322537139119982, 'num_leaves': 64, 'max_depth': 6, 'min_child_samples': 45, 'subsample': 0.6619725186950479, 'colsample_bytree': 0.8584437769446859, 'reg_alpha': 0.26670843558541224, 'reg_lambda': 0.7716693966719235}. Best is trial 16 with value: 0.807586754899123.


Early stopping, best iteration is:
[32]	valid_0's auc: 0.790287	valid_0's binary_logloss: 0.510411
Best trial params: {'n_estimators': 396, 'learning_rate': 0.0971863244886752, 'num_leaves': 42, 'max_depth': 3, 'min_child_samples': 50, 'subsample': 0.6022760516275157, 'colsample_bytree': 0.7443528354837698, 'reg_alpha': 0.23485753737446954, 'reg_lambda': 0.11535031762901428}


In [7]:
# --- Final training with SMOTE
sm = SMOTE(random_state=RANDOM_STATE)
X_tr_res, y_tr_res = sm.fit_resample(X_train, y_train)

final_model = lgb.LGBMClassifier(**best)
final_model.fit(
    X_tr_res, y_tr_res,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[early_stopping(50), log_evaluation(50)]  # ✅ replacement
)

joblib.dump(final_model, os.path.join(MODELS_DIR, 'trained_lgbm_model.pkl'))

Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.80372	valid_0's binary_logloss: 0.501689
[100]	valid_0's auc: 0.804595	valid_0's binary_logloss: 0.491401
[150]	valid_0's auc: 0.801739	valid_0's binary_logloss: 0.490372
Early stopping, best iteration is:
[104]	valid_0's auc: 0.805219	valid_0's binary_logloss: 0.490769


['models\\trained_lgbm_model.pkl']

In [8]:
# --- Calibration
calibrated = CalibratedClassifierCV(final_model, method='isotonic', cv=3)
calibrated.fit(X_tr_res, y_tr_res)
joblib.dump(calibrated, os.path.join(MODELS_DIR, 'calibration_model.pkl'))

['models\\calibration_model.pkl']

In [9]:
# --- Evaluation
y_proba = calibrated.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

ROC AUC: 0.8007709464207514
Accuracy: 0.7305
F1: 0.5104450499545867
              precision    recall  f1-score   support

           0       0.76      0.88      0.81      1341
           1       0.64      0.43      0.51       659

    accuracy                           0.73      2000
   macro avg       0.70      0.65      0.66      2000
weighted avg       0.72      0.73      0.71      2000

