In [2]:
import pandas as pd
import numpy as np
import optuna
import seaborn as sns
import warnings
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, ConfusionMatrixDisplay, RocCurveDisplay
from scipy.stats import ks_2samp
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
print("--- 库导入和环境设置完成 ---")
print("\n--- [阶段一] 开始数据加载与预处理 ---")

try:
    file_path = 'accepted_2007_to_2018q4.csv'
    df = pd.read_csv(file_path, low_memory=False)
    print(f"数据加载成功，原始数据形状: {df.shape}")
    df['issue_d'] = pd.to_datetime(df['issue_d'], errors='coerce')
    start_date = pd.to_datetime('2007-01-01')
    end_date = pd.to_datetime('2014-12-31')
    df_filtered = df[(df['issue_d'] >= start_date) & (df['issue_d'] <= end_date)].copy()
    print(f"筛选2007-2014年数据后，形状为: {df_filtered.shape}")
    cols_to_drop = [
        'id', 'member_id', 'url', 'desc', 'title', 'emp_title', 'pymnt_plan', 'out_prncp',
        'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
        'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d',
        'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'acc_now_delinq',
        'chargeoff_within_12_mths', 'delinq_amnt', 'mths_since_last_delinq',
        'mths_since_last_record', 'mths_since_last_major_derog', 'hardship_flag',
        'hardship_type', 'hardship_reason', 'hardship_status', 'deferral_term',
        'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',
        'hardship_length', 'hardship_dpd', 'hardship_loan_status',
        'orig_projected_additional_accrued_interest', 'hardship_payoff_balance_amount',
        'hardship_last_payment_amount', 'debt_settlement_flag', 'debt_settlement_flag_date',
        'settlement_status', 'settlement_date', 'settlement_amount', 'settlement_percentage',
        'settlement_term', 'funded_amnt', 'funded_amnt_inv', 'initial_list_status',
        'verification_status_joint'
    ]
    existing_cols_to_drop = [col for col in cols_to_drop if col in df_filtered.columns]
    df_cleaned = df_filtered.drop(columns=existing_cols_to_drop, errors='ignore')
    print(f"剔除贷后及无关特征后，形状为: {df_cleaned.shape}")
    good_status = ['Fully Paid']
    bad_status = ['Charged Off', 'Default']
    df_model_data = df_cleaned[df_cleaned['loan_status'].isin(good_status + bad_status)].copy()
    df_model_data['Y'] = df_model_data['loan_status'].apply(lambda x: 0 if x in good_status else 1)
    df_model_data = df_model_data.drop(columns=['loan_status'])
    print(f"定义Y并筛选后，数据集形状: {df_model_data.shape}")
    final_df = df_model_data.copy()
    print(f"使用全部 {len(final_df)} 条数据进行OOT划分。")
    print("开始进行特征工程与格式转换...")
    if 'term' in final_df.columns: final_df['term'] = final_df['term'].str.extract('(\d+)').astype(float)
    if 'int_rate' in final_df.columns: final_df['int_rate'] = final_df['int_rate'].astype(float) / 100.0
    if 'revol_util' in final_df.columns: final_df['revol_util'] = final_df['revol_util'].astype(float) / 100.0
    emp_map = {'< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4, '5 years': 5, '6 years': 6,
               '7 years': 7, '8 years': 8, '9 years': 9, '10+ years': 10}
    if 'emp_length' in final_df.columns: final_df['emp_length'] = final_df['emp_length'].map(emp_map)
    if 'earliest_cr_line' in final_df.columns: final_df['earliest_cr_line'] = pd.to_datetime(
        final_df['earliest_cr_line'], errors='coerce')
    if 'issue_d' in final_df.columns: final_df['issue_d'] = pd.to_datetime(final_df['issue_d'], errors='coerce')
    if 'earliest_cr_line' in final_df.columns and 'issue_d' in final_df.columns:
        final_df['credit_history_months'] = ((final_df['issue_d'] - final_df['earliest_cr_line']).dt.days) / 30.0

    cols_to_remove_after_processing = ['earliest_cr_line', 'zip_code', 'addr_state', 'sub_grade',
                                       'emp_title']
    final_df = final_df.drop(columns=[col for col in cols_to_remove_after_processing if col in final_df.columns])

    print("开始处理缺失值...")
    cols_to_fill_999 = ['mths_since_rcnt_il', 'mths_since_recent_bc_dlq', 'mths_since_recent_revol_delinq',
                        'mths_since_recent_inq', 'mo_sin_rcnt_rev_tl_op', 'mths_since_recent_bc', 'mo_sin_rcnt_tl']
    cols_to_fill_0 = ['il_util', 'all_util', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'open_acc_6m', 'open_act_il',
                      'open_il_12m', 'open_il_24m', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'total_bal_il',
                      'emp_length', 'pub_rec_bankruptcies', 'collections_12_mths_ex_med', 'mo_sin_old_il_acct',
                      'num_tl_120dp_2m', 'avg_cur_bal', 'mo_sin_old_rev_tl_op', 'num_actv_rev_tl', 'num_il_tl',
                      'num_op_rev_tl', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths',
                      'mort_acc', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_bc_sats', 'num_bc_tl',
                      'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
                      'num_tl_op_past_12m', 'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
                      'total_il_high_credit_limit']
    cols_to_fill_median = ['dti', 'revol_util', 'credit_history_months', 'last_fico_range_high', 'last_fico_range_low',
                           'pct_tl_nvr_dlq', 'bc_open_to_buy', 'bc_util', 'percent_bc_gt_75']
    ALL_COLS_TO_KEEP = cols_to_fill_999 + cols_to_fill_0 + cols_to_fill_median
    missing_rates = final_df.isnull().sum() / len(final_df)
    high_missing_cols = missing_rates[missing_rates > 0.4].index.tolist()
    cols_to_actually_drop = [col for col in high_missing_cols if col not in ALL_COLS_TO_KEEP]
    if cols_to_actually_drop: final_df = final_df.drop(columns=cols_to_actually_drop)
    for col in cols_to_fill_999:
        if col in final_df.columns: final_df[col].fillna(999, inplace=True)
    for col in cols_to_fill_0:
        if col in final_df.columns: final_df[col].fillna(0, inplace=True)
    for col in cols_to_fill_median:
        if col in final_df.columns and final_df[col].isnull().any():
            median_val = final_df[col].median()
            final_df[col].fillna(median_val, inplace=True)
    print("缺失值填充完成。")
    print("开始处理异常值...")
    cols_for_strict_cap = ['annual_inc']
    cols_for_standard_cap = ['dti', 'revol_bal', 'tot_cur_bal', 'total_rev_hi_lim',
                             'tot_hi_cred_lim', 'total_bal_ex_mort', 'avg_cur_bal']
    for col in cols_for_strict_cap:
        if col in final_df.columns:
            upper_bound = final_df[col].quantile(0.995)
            final_df[col] = np.clip(final_df[col], a_min=None, a_max=upper_bound)
    for col in cols_for_standard_cap:
        if col in final_df.columns:
            upper_bound = final_df[col].quantile(0.99)
            final_df[col] = np.clip(final_df[col], a_min=None, a_max=upper_bound)
    all_numeric_cols = final_df.select_dtypes(include=np.number).columns.tolist()
    if 'Y' in all_numeric_cols: all_numeric_cols.remove('Y')
    if 'issue_d' in all_numeric_cols: all_numeric_cols.remove('issue_d') # 排除 'issue_d'
    processed_cols = cols_for_strict_cap + cols_for_standard_cap
    remaining_numeric_cols = [col for col in all_numeric_cols if col not in processed_cols]
    for col in remaining_numeric_cols:
        lower_bound = final_df[col].quantile(0.01)
        upper_bound = final_df[col].quantile(0.99)
        final_df[col] = np.clip(final_df[col], lower_bound, upper_bound)
    print("异常值处理完成。")
    print(f"精简前原始形状: {final_df.shape}")
    cols_to_drop_manually = [
        'collections_12_mths_ex_med', 'policy_code', 'open_acc_6m', 'open_act_il',
        'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util',
        'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi',
        'total_cu_tl', 'inq_last_12m', 'num_tl_120dpd_2m', 'num_tl_30dpd',
        'fico_range_low', 'grade', 'last_fico_range_high', 'last_fico_range_low'
    ]
    existing_cols_to_drop = [col for col in cols_to_drop_manually if col in final_df.columns]
    final_df_filtered = final_df.drop(columns=existing_cols_to_drop)
    print(f"手动移除了 {len(existing_cols_to_drop)} 个特征。")
    print(f"精简后形状 (准备OOT切分): {final_df_filtered.shape}")
except FileNotFoundError:
    print("--- [阶段一] 数据预处理全部完成 ---")

except FileNotFoundError:
    print("\n错误: 'accepted_2007_to_2018q4.csv' 未找到。请确保文件在脚本所在的目录中。")

--- 库导入和环境设置完成 ---

--- [阶段一] 开始数据加载与预处理 ---
数据加载成功，原始数据形状: (2260701, 151)
筛选2007-2014年数据后，形状为: (466345, 151)
剔除贷后及无关特征后，形状为: (466345, 99)
定义Y并筛选后，数据集形状: (451060, 99)
使用全部 451060 条数据进行OOT划分。
开始进行特征工程与格式转换...
开始处理缺失值...
缺失值填充完成。
开始处理异常值...
异常值处理完成。
精简前原始形状: (451060, 81)
手动移除了 22 个特征。
精简后形状 (准备OOT切分): (451060, 59)


In [3]:
print("\n--- [阶段 2] 开始OOT (Out-of-Time) 切分 ---")
print("使用滚动时间窗口: 训练数据更接近测试数据，以对抗客群偏移")

train_df = final_df_filtered[
    (final_df_filtered['issue_d'] >= '2012-01-01') &
    (final_df_filtered['issue_d'] < '2013-07-01')
].copy()
val_df = final_df_filtered[
    (final_df_filtered['issue_d'] >= '2013-07-01') &
    (final_df_filtered['issue_d'] < '2014-01-01')
].copy()
test_df = final_df_filtered[
    (final_df_filtered['issue_d'] >= '2014-01-01') &
    (final_df_filtered['issue_d'] <= '2014-12-31')
].copy()
y_train = train_df['Y']
X_train = train_df.drop(columns=['Y', 'issue_d'])
y_val = val_df['Y']
X_val = val_df.drop(columns=['Y', 'issue_d'])
y_test = test_df['Y']
X_test = test_df.drop(columns=['Y', 'issue_d'])

print(f"训练集 (2012-2013H1): X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"验证集 (2013H2):     X_val:   {X_val.shape}, y_val:   {y_val.shape}")
print(f"测试集 (2014-OOT):   X_test:  {X_test.shape}, y_test:  {y_test.shape}")

if X_train.empty or X_val.empty or X_test.empty:
    print("\n*** 警告: 数据集为空！请检查日期筛选和切分逻辑！ ***")
    exit()


--- [阶段 2] 开始OOT (Out-of-Time) 切分 ---
使用滚动时间窗口: 训练数据更接近测试数据，以对抗客群偏移
训练集 (2012-2013H1): X_train: (106741, 57), y_train: (106741,)
验证集 (2013H2):     X_val:   (81430, 57), y_val:   (81430,)
测试集 (2014-OOT):   X_test:  (223103, 57), y_test:  (223103,)


In [5]:
print("\n--- [阶段 3] 开始特征工程 (衍生新特征) ---")

def create_stable_ratio_features(df):
    df_new = df.copy()
    df_new['annual_inc_safe'] = df_new['annual_inc'].replace(0, 0.01).fillna(0.01)
    df_new['credit_history_safe'] = df_new['credit_history_months'].replace(0, 0.01).fillna(0.01)
    df_new['loan_to_income_ratio'] = df_new['loan_amnt'] / df_new['annual_inc_safe']
    df_new['installment_to_income_ratio'] = df_new['installment'] / (df_new['annual_inc_safe'] / 12)
    df_new['monthly_debt'] = (df_new['dti'].fillna(0) * (df_new['annual_inc_safe'] / 12))
    df_new['delinq_to_history_ratio'] = df_new['delinq_2yrs'].fillna(0) / df_new['credit_history_safe']
    df_new['fico_x_dti'] = df_new['fico_range_high'].fillna(df_new['fico_range_high'].median()) * df_new['dti'].fillna(0)

    df_new = df_new.drop(columns=['annual_inc_safe', 'credit_history_safe'])
    return df_new

X_train = create_stable_ratio_features(X_train)
X_val = create_stable_ratio_features(X_val)
X_test = create_stable_ratio_features(X_test)
print(f"特征工程完成。 X_train 新形状: {X_train.shape}")


--- [阶段 3] 开始特征工程 (衍生新特征) ---
特征工程完成。 X_train 新形状: (106741, 62)


In [6]:
def calculate_psi(base_array, comparison_array, num_bins=10):
    try:
        base_array = pd.Series(base_array).replace([np.inf, -np.inf], np.nan).dropna()
        comparison_array = pd.Series(comparison_array).replace([np.inf, -np.inf], np.nan).dropna()

        bins = np.percentile(base_array, np.linspace(0, 100, num_bins + 1))
        bins = np.unique(bins)
        bins[0], bins[-1] = -np.inf, np.inf

        if len(bins) <= 2: return 0.0

        base_counts = pd.cut(base_array, bins=bins, right=False).value_counts(normalize=True)
        comp_counts = pd.cut(comparison_array, bins=bins, right=False).value_counts(normalize=True)

        psi_df = pd.DataFrame({'Base': base_counts, 'Comp': comp_counts}).fillna(0)
        psi_df['Base'] = psi_df['Base'].replace(0, 0.0001)
        psi_df['Comp'] = psi_df['Comp'].replace(0, 0.0001)

        psi_df['PSI'] = (psi_df['Comp'] - psi_df['Base']) * np.log(psi_df['Comp'] / psi_df['Base'])
        return psi_df['PSI'].sum()
    except Exception:
        return np.nan
def probability_to_score(prob, base_score=650, base_odds=(1/19), pdo=20):
    factor = pdo / np.log(2)
    offset = base_score - (factor * np.log(base_odds))
    prob = np.clip(prob, 1e-7, 1 - 1e-7)
    odds = prob / (1 - prob)
    score = offset - (factor * np.log(odds))
    return score.astype(int)

print("\n--- [阶段 4] 辅助函数定义完成 ---")


--- [阶段 4] 辅助函数定义完成 ---


In [9]:
print("\n--- [阶段 5] LightGBM 模型准备、调优与训练 ---")
import lightgbm as lgb
cols_to_drop_fix = ['application_type', 'disbursement_method']
X_train = X_train.drop(columns=cols_to_drop_fix, errors='ignore')
X_val = X_val.drop(columns=cols_to_drop_fix, errors='ignore')
X_test = X_test.drop(columns=cols_to_drop_fix, errors='ignore')
print(f"【错误修复】: 已剔除残留的 'object' 列: {cols_to_drop_fix}")
CATEGORICAL_COLS = ['purpose', 'home_ownership', 'verification_status']
CATEGORICAL_COLS = [col for col in CATEGORICAL_COLS if col in X_train.columns]
for col in CATEGORICAL_COLS:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')
    X_test[col] = X_test[col].astype('category')
print(f"已转换 {len(CATEGORICAL_COLS)} 个类别特征为 'category' dtype。")

BUSINESS_MONOTONICITY = {
    'loan_amnt': 1, 'term': 1, 'int_rate': 1, 'installment': 1,
    'emp_length': -1, 'fico_range_high': -1, 'annual_inc': -1, 'dti': 1,
    'revol_util': 1, 'delinq_2yrs': 1, 'pub_rec_bankruptcies': 1,
    'credit_history_months': -1, 'loan_to_income_ratio': 1,
    'installment_to_income_ratio': 1, 'monthly_debt': 1,
    'delinq_to_history_ratio': 1, 'fico_x_dti': 1
}
all_features = X_train.columns.tolist()
monotone_constraints_list = [BUSINESS_MONOTONICITY.get(f, 0) for f in all_features]
print("单调性约束列表已生成。")
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"计算得到 scale_pos_weight: {scale_pos_weight:.4f}")
def objective_lgb(trial):
    params = {
        'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt',
        'n_jobs': -1, 'random_state': 42, 'verbose': -1,
        'scale_pos_weight': scale_pos_weight,
        'monotone_constraints': monotone_constraints_list,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9)
    }
    model = lgb.LGBMClassifier(**params, n_estimators=1000)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
              categorical_feature=CATEGORICAL_COLS,
              callbacks=[lgb.early_stopping(50, verbose=False)])
    return model.best_score_['valid_0']['auc']
print("开始 Optuna 调优 (LGBM)...")
study_lgb = optuna.create_study(direction='maximize', study_name='lgbm_acard_oot')
study_lgb.optimize(objective_lgb, n_trials=30, show_progress_bar=True)
print(f"\n调优完成！最佳 Validation-AUC: {study_lgb.best_value:.4f}")
print("找到的最佳超参数:", study_lgb.best_params)
print("\n训练最终模型 (使用最佳参数)...")
final_lgb_params = {
    'objective': 'binary', 'metric': 'auc', 'n_jobs': -1, 'random_state': 42,
    'verbose': -1, 'scale_pos_weight': scale_pos_weight,
    'monotone_constraints': monotone_constraints_list, 'n_estimators': 2000
}
final_lgb_params.update(study_lgb.best_params)
model_lgb_final = lgb.LGBMClassifier(**final_lgb_params)
model_lgb_final.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                    categorical_feature=CATEGORICAL_COLS,
                    callbacks=[lgb.early_stopping(50, verbose=100)])
print("最终模型训练完成。")
print(f"最佳迭代次数为: {model_lgb_final.best_iteration_}")

[I 2025-10-20 21:15:01,501] A new study created in memory with name: lgbm_acard_oot



--- [阶段 5] LightGBM 模型准备、调优与训练 ---
【错误修复】: 已剔除残留的 'object' 列: ['application_type', 'disbursement_method']
已转换 3 个类别特征为 'category' dtype。
单调性约束列表已生成。
计算得到 scale_pos_weight: 5.2645
开始 Optuna 调优 (LGBM)...


Best trial: 0. Best value: 0.700608:   3%|▎         | 1/30 [00:03<01:55,  3.99s/it]

[I 2025-10-20 21:15:05,488] Trial 0 finished with value: 0.700608489015905 and parameters: {'learning_rate': 0.021113860267972958, 'num_leaves': 37, 'min_child_samples': 46, 'reg_alpha': 0.31604455921275215, 'reg_lambda': 0.7179658391439359, 'subsample': 0.7763613114680401, 'colsample_bytree': 0.8572269308706624}. Best is trial 0 with value: 0.700608489015905.


Best trial: 0. Best value: 0.700608:   7%|▋         | 2/30 [00:05<01:14,  2.66s/it]

[I 2025-10-20 21:15:07,214] Trial 1 finished with value: 0.7004713976722075 and parameters: {'learning_rate': 0.053103794284708836, 'num_leaves': 37, 'min_child_samples': 98, 'reg_alpha': 0.10619797640313552, 'reg_lambda': 0.24115089341012447, 'subsample': 0.6076736401133732, 'colsample_bytree': 0.6607867890677542}. Best is trial 0 with value: 0.700608489015905.


Best trial: 0. Best value: 0.700608:  10%|█         | 3/30 [00:09<01:21,  3.01s/it]

[I 2025-10-20 21:15:10,653] Trial 2 finished with value: 0.7003903883662295 and parameters: {'learning_rate': 0.028903106171656257, 'num_leaves': 44, 'min_child_samples': 98, 'reg_alpha': 0.4848985533508828, 'reg_lambda': 1.2930201884681602, 'subsample': 0.621677618895444, 'colsample_bytree': 0.819442836393932}. Best is trial 0 with value: 0.700608489015905.


Best trial: 0. Best value: 0.700608:  13%|█▎        | 4/30 [00:10<01:02,  2.40s/it]

[I 2025-10-20 21:15:12,099] Trial 3 finished with value: 0.7000855791708874 and parameters: {'learning_rate': 0.06673809805476873, 'num_leaves': 45, 'min_child_samples': 98, 'reg_alpha': 2.274169417172254, 'reg_lambda': 3.7667105059796953, 'subsample': 0.6679395800435182, 'colsample_bytree': 0.7309270258595176}. Best is trial 0 with value: 0.700608489015905.


Best trial: 0. Best value: 0.700608:  17%|█▋        | 5/30 [00:14<01:11,  2.86s/it]

[I 2025-10-20 21:15:15,783] Trial 4 finished with value: 0.7002353315274715 and parameters: {'learning_rate': 0.03100463902661215, 'num_leaves': 50, 'min_child_samples': 78, 'reg_alpha': 2.0445337359963904, 'reg_lambda': 0.4431139999686318, 'subsample': 0.6902146078889477, 'colsample_bytree': 0.6223802864087923}. Best is trial 0 with value: 0.700608489015905.


Best trial: 5. Best value: 0.700785:  20%|██        | 6/30 [00:16<01:04,  2.67s/it]

[I 2025-10-20 21:15:18,098] Trial 5 finished with value: 0.7007849099472194 and parameters: {'learning_rate': 0.04301147486655616, 'num_leaves': 34, 'min_child_samples': 25, 'reg_alpha': 7.664047080293793, 'reg_lambda': 3.203063621280849, 'subsample': 0.8899396472880036, 'colsample_bytree': 0.7382601409358088}. Best is trial 5 with value: 0.7007849099472194.


Best trial: 6. Best value: 0.701325:  23%|██▎       | 7/30 [00:20<01:11,  3.13s/it]

[I 2025-10-20 21:15:22,165] Trial 6 finished with value: 0.7013248265904758 and parameters: {'learning_rate': 0.020538824054361218, 'num_leaves': 31, 'min_child_samples': 68, 'reg_alpha': 5.511990028050057, 'reg_lambda': 1.9477360150435876, 'subsample': 0.6296555615325125, 'colsample_bytree': 0.6702505548151296}. Best is trial 6 with value: 0.7013248265904758.


Best trial: 6. Best value: 0.701325:  27%|██▋       | 8/30 [00:22<00:59,  2.72s/it]

[I 2025-10-20 21:15:24,014] Trial 7 finished with value: 0.7007450923794676 and parameters: {'learning_rate': 0.063785784685115, 'num_leaves': 44, 'min_child_samples': 90, 'reg_alpha': 1.5260817502665855, 'reg_lambda': 0.10884815911330682, 'subsample': 0.753607854016227, 'colsample_bytree': 0.805517331887073}. Best is trial 6 with value: 0.7013248265904758.


Best trial: 6. Best value: 0.701325:  30%|███       | 9/30 [00:26<01:06,  3.18s/it]

[I 2025-10-20 21:15:28,196] Trial 8 finished with value: 0.7010328291754677 and parameters: {'learning_rate': 0.02062156271519778, 'num_leaves': 34, 'min_child_samples': 41, 'reg_alpha': 0.4091683303984805, 'reg_lambda': 1.6711517261284305, 'subsample': 0.6955002560290658, 'colsample_bytree': 0.6878045195033176}. Best is trial 6 with value: 0.7013248265904758.


Best trial: 6. Best value: 0.701325:  33%|███▎      | 10/30 [00:30<01:04,  3.22s/it]

[I 2025-10-20 21:15:31,525] Trial 9 finished with value: 0.7003071313593 and parameters: {'learning_rate': 0.030014823194974064, 'num_leaves': 50, 'min_child_samples': 96, 'reg_alpha': 0.2729103768542334, 'reg_lambda': 0.2098344333225341, 'subsample': 0.7258040496655453, 'colsample_bytree': 0.628781290480416}. Best is trial 6 with value: 0.7013248265904758.


Best trial: 10. Best value: 0.701475:  37%|███▋      | 11/30 [00:37<01:24,  4.45s/it]

[I 2025-10-20 21:15:38,759] Trial 10 finished with value: 0.7014751382827805 and parameters: {'learning_rate': 0.011072117696650025, 'num_leaves': 21, 'min_child_samples': 68, 'reg_alpha': 8.572311021444573, 'reg_lambda': 9.387485654941184, 'subsample': 0.8202595544014842, 'colsample_bytree': 0.6864734107265067}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  40%|████      | 12/30 [00:44<01:34,  5.25s/it]

[I 2025-10-20 21:15:45,825] Trial 11 finished with value: 0.7014484386338435 and parameters: {'learning_rate': 0.010154639670473278, 'num_leaves': 20, 'min_child_samples': 68, 'reg_alpha': 9.685423821309291, 'reg_lambda': 6.870049112362454, 'subsample': 0.8255940535795313, 'colsample_bytree': 0.7027599827671308}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  43%|████▎     | 13/30 [00:51<01:37,  5.72s/it]

[I 2025-10-20 21:15:52,631] Trial 12 finished with value: 0.70130329907061 and parameters: {'learning_rate': 0.010698183649416755, 'num_leaves': 20, 'min_child_samples': 59, 'reg_alpha': 4.414619410765492, 'reg_lambda': 8.935232508592168, 'subsample': 0.8384540928109505, 'colsample_bytree': 0.7179360283577427}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  47%|████▋     | 14/30 [00:58<01:37,  6.09s/it]

[I 2025-10-20 21:15:59,566] Trial 13 finished with value: 0.7014070719593426 and parameters: {'learning_rate': 0.010587722452769198, 'num_leaves': 20, 'min_child_samples': 75, 'reg_alpha': 9.977228244502788, 'reg_lambda': 9.85084175777685, 'subsample': 0.8193880707443837, 'colsample_bytree': 0.7774448225896671}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  50%|█████     | 15/30 [01:03<01:26,  5.79s/it]

[I 2025-10-20 21:16:04,678] Trial 14 finished with value: 0.7010982208153361 and parameters: {'learning_rate': 0.014130564196435895, 'num_leaves': 26, 'min_child_samples': 56, 'reg_alpha': 3.734722132409122, 'reg_lambda': 5.674700437012853, 'subsample': 0.8145272142071622, 'colsample_bytree': 0.6930785900810643}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  53%|█████▎    | 16/30 [01:04<01:02,  4.43s/it]

[I 2025-10-20 21:16:05,955] Trial 15 finished with value: 0.7007444639899543 and parameters: {'learning_rate': 0.09267525565127159, 'num_leaves': 25, 'min_child_samples': 76, 'reg_alpha': 3.3388318679619036, 'reg_lambda': 4.563067605021998, 'subsample': 0.8943608354967996, 'colsample_bytree': 0.7717980031076062}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  57%|█████▋    | 17/30 [01:10<01:03,  4.92s/it]

[I 2025-10-20 21:16:12,012] Trial 16 finished with value: 0.7005777819451997 and parameters: {'learning_rate': 0.014920164723850068, 'num_leaves': 25, 'min_child_samples': 49, 'reg_alpha': 0.8830850214383954, 'reg_lambda': 2.6303612020272045, 'subsample': 0.8482322242240207, 'colsample_bytree': 0.8984206586078904}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  60%|██████    | 18/30 [01:16<01:03,  5.30s/it]

[I 2025-10-20 21:16:18,197] Trial 17 finished with value: 0.7012464355741367 and parameters: {'learning_rate': 0.01315599565401221, 'num_leaves': 28, 'min_child_samples': 68, 'reg_alpha': 7.167999511748447, 'reg_lambda': 6.559657100624399, 'subsample': 0.7869935030452982, 'colsample_bytree': 0.6471680646559771}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  63%|██████▎   | 19/30 [01:21<00:55,  5.07s/it]

[I 2025-10-20 21:16:22,730] Trial 18 finished with value: 0.7009260271270298 and parameters: {'learning_rate': 0.017708358853635482, 'num_leaves': 22, 'min_child_samples': 85, 'reg_alpha': 1.0010615781098806, 'reg_lambda': 0.7759929680492934, 'subsample': 0.8657309970308429, 'colsample_bytree': 0.7007463595301356}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  67%|██████▋   | 20/30 [01:29<01:00,  6.09s/it]

[I 2025-10-20 21:16:31,211] Trial 19 finished with value: 0.7014727352107956 and parameters: {'learning_rate': 0.010066869818128238, 'num_leaves': 30, 'min_child_samples': 33, 'reg_alpha': 9.678217311193777, 'reg_lambda': 6.62563714255103, 'subsample': 0.7940759558942743, 'colsample_bytree': 0.7399597432578634}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 10. Best value: 0.701475:  70%|███████   | 21/30 [01:34<00:51,  5.74s/it]

[I 2025-10-20 21:16:36,138] Trial 20 finished with value: 0.7011195630408583 and parameters: {'learning_rate': 0.016271631899602384, 'num_leaves': 30, 'min_child_samples': 21, 'reg_alpha': 2.500092017141618, 'reg_lambda': 2.4856640785122206, 'subsample': 0.7891998428272118, 'colsample_bytree': 0.7596247904340977}. Best is trial 10 with value: 0.7014751382827805.


Best trial: 21. Best value: 0.701575:  73%|███████▎  | 22/30 [01:42<00:49,  6.24s/it]

[I 2025-10-20 21:16:43,544] Trial 21 finished with value: 0.7015745734424368 and parameters: {'learning_rate': 0.01097783993183695, 'num_leaves': 23, 'min_child_samples': 38, 'reg_alpha': 9.678171706571536, 'reg_lambda': 6.362513223442218, 'subsample': 0.8160986199723593, 'colsample_bytree': 0.7132666998977618}. Best is trial 21 with value: 0.7015745734424368.


Best trial: 21. Best value: 0.701575:  77%|███████▋  | 23/30 [01:48<00:44,  6.41s/it]

[I 2025-10-20 21:16:50,346] Trial 22 finished with value: 0.7012308213607168 and parameters: {'learning_rate': 0.012319567237902585, 'num_leaves': 24, 'min_child_samples': 34, 'reg_alpha': 6.019233551356904, 'reg_lambda': 4.744461874736602, 'subsample': 0.752475370024579, 'colsample_bytree': 0.6012599069438278}. Best is trial 21 with value: 0.7015745734424368.


Best trial: 21. Best value: 0.701575:  80%|████████  | 24/30 [01:55<00:38,  6.37s/it]

[I 2025-10-20 21:16:56,620] Trial 23 finished with value: 0.7014332767227656 and parameters: {'learning_rate': 0.012206802545880366, 'num_leaves': 28, 'min_child_samples': 29, 'reg_alpha': 4.935139535387494, 'reg_lambda': 9.464503119874271, 'subsample': 0.7999450767831182, 'colsample_bytree': 0.7443826314362144}. Best is trial 21 with value: 0.7015745734424368.


Best trial: 21. Best value: 0.701575:  83%|████████▎ | 25/30 [02:00<00:30,  6.02s/it]

[I 2025-10-20 21:17:01,825] Trial 24 finished with value: 0.7015620516880321 and parameters: {'learning_rate': 0.016806853357866712, 'num_leaves': 23, 'min_child_samples': 37, 'reg_alpha': 9.277252858655926, 'reg_lambda': 6.300829814900735, 'subsample': 0.8497164417915781, 'colsample_bytree': 0.7962456344607113}. Best is trial 21 with value: 0.7015745734424368.


Best trial: 21. Best value: 0.701575:  87%|████████▋ | 26/30 [02:03<00:20,  5.14s/it]

[I 2025-10-20 21:17:04,921] Trial 25 finished with value: 0.7010573893077284 and parameters: {'learning_rate': 0.024969941050859645, 'num_leaves': 23, 'min_child_samples': 51, 'reg_alpha': 3.5402453736619384, 'reg_lambda': 3.5530295141118797, 'subsample': 0.8572348764502074, 'colsample_bytree': 0.7975387464331455}. Best is trial 21 with value: 0.7015745734424368.


Best trial: 21. Best value: 0.701575:  90%|█████████ | 27/30 [02:08<00:15,  5.03s/it]

[I 2025-10-20 21:17:09,682] Trial 26 finished with value: 0.7012446309683549 and parameters: {'learning_rate': 0.017601848687641735, 'num_leaves': 22, 'min_child_samples': 41, 'reg_alpha': 6.610122088377899, 'reg_lambda': 5.0330052033832064, 'subsample': 0.839075685388451, 'colsample_bytree': 0.8398371748903494}. Best is trial 21 with value: 0.7015745734424368.


Best trial: 21. Best value: 0.701575:  93%|█████████▎| 28/30 [02:14<00:10,  5.37s/it]

[I 2025-10-20 21:17:15,840] Trial 27 finished with value: 0.7008411220362485 and parameters: {'learning_rate': 0.012616744565826019, 'num_leaves': 28, 'min_child_samples': 37, 'reg_alpha': 1.238211892603439, 'reg_lambda': 2.1218793299288854, 'subsample': 0.8740960548905913, 'colsample_bytree': 0.7875576677873359}. Best is trial 21 with value: 0.7015745734424368.


Best trial: 21. Best value: 0.701575:  97%|█████████▋| 29/30 [02:18<00:04,  4.90s/it]

[I 2025-10-20 21:17:19,634] Trial 28 finished with value: 0.701551329935823 and parameters: {'learning_rate': 0.023747640250285702, 'num_leaves': 22, 'min_child_samples': 62, 'reg_alpha': 2.9880152900173615, 'reg_lambda': 7.814198354099138, 'subsample': 0.7676142342341808, 'colsample_bytree': 0.7151132331981793}. Best is trial 21 with value: 0.7015745734424368.


Best trial: 21. Best value: 0.701575: 100%|██████████| 30/30 [02:21<00:00,  4.71s/it]


[I 2025-10-20 21:17:22,842] Trial 29 finished with value: 0.7006234575763634 and parameters: {'learning_rate': 0.024022132607859292, 'num_leaves': 26, 'min_child_samples': 45, 'reg_alpha': 2.6530258496394703, 'reg_lambda': 1.3898014378140755, 'subsample': 0.7618504581232234, 'colsample_bytree': 0.8410898564358434}. Best is trial 21 with value: 0.7015745734424368.

调优完成！最佳 Validation-AUC: 0.7016
找到的最佳超参数: {'learning_rate': 0.01097783993183695, 'num_leaves': 23, 'min_child_samples': 38, 'reg_alpha': 9.678171706571536, 'reg_lambda': 6.362513223442218, 'subsample': 0.8160986199723593, 'colsample_bytree': 0.7132666998977618}

训练最终模型 (使用最佳参数)...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[951]	valid_0's auc: 0.701575
最终模型训练完成。
最佳迭代次数为: 951


In [10]:
print("\n--- [阶段 6] 模型评估 (在 2014 OOT 测试集上) ---")

y_pred_proba_oot = model_lgb_final.predict_proba(X_test)[:, 1]
auc_score_oot = roc_auc_score(y_test, y_pred_proba_oot)
ks_stat_oot = ks_2samp(y_pred_proba_oot[y_test == 0], y_pred_proba_oot[y_test == 1]).statistic

print(f"OOT 测试集 ROC-AUC 评分: {auc_score_oot:.4f}")
print(f"OOT 测试集 KS 统计量: {ks_stat_oot:.4f}")


--- [阶段 6] 模型评估 (在 2014 OOT 测试集上) ---
OOT 测试集 ROC-AUC 评分: 0.7143
OOT 测试集 KS 统计量: 0.3129


In [11]:
print("\n--- [阶段 7] 计算 PSI (稳定性监控) ---")

y_pred_proba_train = model_lgb_final.predict_proba(X_train)[:, 1]
score_psi = calculate_psi(y_pred_proba_train, y_pred_proba_oot)
print(f"模型分 PSI (Train vs OOT): {score_psi:.4f}")

print("\n核心特征 PSI (Train vs OOT):")
feature_psi_results = {}
constrained_numeric_cols = [f for f in all_features if BUSINESS_MONOTONICITY.get(f, 0) != 0 and f not in CATEGORICAL_COLS]
for col in constrained_numeric_cols:
    feature_psi_results[col] = calculate_psi(X_train[col], X_test[col])

psi_series = pd.Series(feature_psi_results).sort_values(ascending=False)
print(psi_series.head(10))


--- [阶段 7] 计算 PSI (稳定性监控) ---
模型分 PSI (Train vs OOT): 0.0034

核心特征 PSI (Train vs OOT):
int_rate                   0.060418
fico_range_high            0.057190
credit_history_months      0.046659
revol_util                 0.038210
delinq_2yrs                0.029577
installment                0.022475
monthly_debt               0.018896
emp_length                 0.018067
delinq_to_history_ratio    0.017019
dti                        0.014411
dtype: float64


In [16]:

print("\n--- [阶段 8] 将概率转换为评分 (Scorecard) ---")

p_real = y_train.mean()
W = (p_real / (1 - p_real)) / (0.5 / (1 - 0.5))
y_prob_calibrated = (y_pred_proba_oot * W) / (1 - y_pred_proba_oot + y_pred_proba_oot * W)
print(f"概率校准完成 (真实坏账率 P_real={p_real:.1%})。")
def calculate_score_parameters_2pt(
    p1_prob=0.05, p1_score=800,  # 锚点1: 5%的违约率 -> 800分
    p2_prob=0.20, p2_score=600   # 锚点2: 20%的违约率 -> 600分
):
    odds1 = p1_prob / (1 - p1_prob)
    odds2 = p2_prob / (1 - p2_prob)
    log_odds1 = np.log(odds1)
    log_odds2 = np.log(odds2)
    # p1_score = Offset - Factor * log_odds1
    # p2_score = Offset - Factor * log_odds2

    # (p1_score - p2_score) = Factor * (log_odds2 - log_odds1)
    Factor = (p1_score - p2_score) / (log_odds2 - log_odds1)

    Offset = p1_score + Factor * log_odds1

    print(f"\n应用“两点锚定法”参数：")
    print(f"  锚点1: P={p1_prob:.0%} -> Score={p1_score}")
    print(f"  锚点2: P={p2_prob:.0%} -> Score={p2_score}")
    print(f"  计算得到 Factor: {Factor:.4f}")
    print(f"  计算得到 Offset: {Offset:.4f}")

    return Offset, Factor

def convert_prob_to_score_2pt(prob, Offset, Factor):
    prob = np.clip(prob, 1e-7, 1 - 1e-7)
    odds = prob / (1 - prob)
    score = Offset - (Factor * np.log(odds))
    return score.astype(int)
offset, factor = calculate_score_parameters_2pt(
    p1_prob=0.05, p1_score=800,
    p2_prob=0.20, p2_score=600
)
scores_oot = convert_prob_to_score_2pt(y_prob_calibrated, offset, factor)
df_results = pd.DataFrame({
    'Actual_Y': y_test.values,
    'Prob_Model_Output': y_pred_proba_oot,
    'Prob_Calibrated': y_prob_calibrated,
    'Score': scores_oot
})

print("\n概率转换为评分示例 (OOT测试集前10条):")
print(df_results.head(10))

print("\n分数分布概览 (OOT测试集):")
print(df_results['Score'].describe())

print("\n好客户 (Y=0) 的分数分布 (OOT):")
print(df_results[df_results['Actual_Y'] == 0]['Score'].describe())

print("\n坏客户 (Y=1) 的分数分布 (OOT):")
print(df_results[df_results['Actual_Y'] == 1]['Score'].describe())


--- [阶段 8] 将概率转换为评分 (Scorecard) ---
概率校准完成 (真实坏账率 P_real=16.0%)。

应用“两点锚定法”参数：
  锚点1: P=5% -> Score=800
  锚点2: P=20% -> Score=600
  计算得到 Factor: 128.3578
  计算得到 Offset: 422.0583

概率转换为评分示例 (OOT测试集前10条):
   Actual_Y  Prob_Model_Output  Prob_Calibrated  Score
0         1           0.244680         0.057966    779
1         0           0.469794         0.144061    650
2         0           0.444750         0.132057    663
3         1           0.513356         0.166929    628
4         0           0.594761         0.218009    586
5         0           0.384820         0.106203    695
6         1           0.556064         0.192199    606
7         0           0.245683         0.058263    779
8         1           0.677589         0.285310    539
9         0           0.344115         0.090628    718

分数分布概览 (OOT测试集):
count    223103.000000
mean        669.755225
std         108.813059
min         359.000000
25%         593.000000
50%         662.000000
75%         738.000000
max        1