# ch05 — 评分卡建模

本 notebook 演示：
1. WOE 分箱与 IV 筛选
2. 逻辑回归训练与分值刻度转换
3. 评分分布可视化
4. KS 明细表
5. 通过率-坏率曲线与切分点设计
6. 评分卡明细表输出

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from creditrisk.data import TARGET, train_val_split
from creditrisk.features import WOEEncoder
from creditrisk.models import Scorecard
from creditrisk.evaluation import evaluate, ks_table, lift_table
from creditrisk.utils import plot_score_dist, plot_ks_curve, Timer

# Load the cleaned training set and separate the label column from the features.
train_clean = pd.read_parquet('../data/processed/train_clean.parquet')
X = train_clean.drop(columns=[TARGET])
y = train_clean[TARGET]

# The scorecard demo is restricted to numeric feature columns.
num_cols = list(X.select_dtypes(include='number').columns)
X_num = X.loc[:, num_cols]

# Hold out a validation set through train_val_split with val_ratio=0.2.
# NOTE(review): the original note describes this as a time-ordered split that
# keeps the last year for validation — confirm against train_val_split.
tr, val = train_val_split(train_clean, val_ratio=0.2)
X_tr, X_val = tr[num_cols], val[num_cols]
y_tr, y_val = tr[TARGET], val[TARGET]

print(f'训练集：{len(tr):,}  验证集：{len(val):,}')

## 1. WOE 编码 & IV 筛选

In [None]:
# Minimum information value (IV) a feature needs to enter the model; the
# selection step keeps features whose IV is >= this threshold.
IV_THRESHOLD = 0.02

# Fit the WOE encoder on the training split only, then apply the fitted
# encoder to the validation split.
with Timer('WOE 编码'):
    enc = WOEEncoder(bins=10, min_bin_pct=0.05, monotonic='auto')
    X_tr_woe  = enc.fit_transform(X_tr, y_tr)
    X_val_woe = enc.transform(X_val)

# Print the first 15 rows of the per-feature IV summary.
iv_df = enc.iv_summary()
print('\nIV 汇总（前15）：')
print(iv_df.head(15).to_string(index=False))

In [None]:
# Keep the features whose IV reaches the threshold.
good_features = iv_df[iv_df['IV'] >= IV_THRESHOLD]['feature'].tolist()

# WOEEncoder output columns carry a `_woe` suffix (per the original note);
# keep only the ones that actually exist in the encoded training frame.
woe_cols = [
    f'{feat}_woe'
    for feat in good_features
    if f'{feat}_woe' in X_tr_woe.columns
]

print(f'IV >= {IV_THRESHOLD} 的特征数：{len(good_features)}')

# Surface silently dropped features so the printed count cannot disagree
# with what is actually modelled.
if len(woe_cols) != len(good_features):
    print(f'其中存在 WOE 列、实际入模的特征数：{len(woe_cols)}')

# Fail early with a clear message instead of fitting on zero columns.
if not woe_cols:
    raise ValueError(
        f'没有特征满足 IV >= {IV_THRESHOLD} 且存在对应的 WOE 列，无法训练评分卡'
    )

# Modelling matrices: the selected WOE columns for both splits.
X_tr_sc  = X_tr_woe[woe_cols]
X_val_sc = X_val_woe[woe_cols]

## 2. 逻辑回归 & 评分卡

In [None]:
# Fit the scorecard on the selected WOE features; the fit is timed.
with Timer('评分卡训练'):
    sc = Scorecard(base_score=600, base_odds=50, pdo=20, C=0.1)
    sc.fit(X_tr_sc, y_tr)

# Score the validation split: predict_proba output and scorecard points.
val_prob, val_score = sc.predict_proba(X_val_sc), sc.predict_score(X_val_sc)

# Summarise the validation score range and its median.
lowest, highest = val_score.min(), val_score.max()
print(f'验证集分值范围：{lowest} ~ {highest}')
print(f'中位数分值：{np.median(val_score):.0f}')

In [None]:
# Overall evaluation of the validation predictions (labels vs. val_prob),
# tagged with the model label; the result is kept in `metrics`.
# NOTE(review): which metrics `evaluate` computes or prints is not visible here.
metrics = evaluate(y_val.values, val_prob, label='LR Scorecard')

## 3. 评分分布可视化

In [None]:
# Draw the distribution plot and the KS curve for the validation split.
# NOTE(review): plot_score_dist receives val_prob rather than val_score —
# confirm whether it expects probabilities or scorecard points.
plot_score_dist(y_val.values, val_prob)
plot_ks_curve(y_val.values, val_prob, label='LR Scorecard')

## 4. KS 明细表

In [None]:
# Build the KS table for the validation split with n_deciles=10 and print it
# without the DataFrame index.
ks_tbl = ks_table(y_val.values, val_prob, n_deciles=10)
print(ks_tbl.to_string(index=False))

## 5. 通过率-坏率曲线

业务决策的核心工具：在目标通过率下预期坏率是多少？

In [None]:
# Approval-rate vs. bad-rate table over a grid of cut-offs.
# A customer is approved when the predicted bad probability is at or below
# the cut-off (low probability = approve).
prob_arr = np.asarray(val_prob)
# Plain positional array so the boolean mask lines up row-for-row regardless
# of y_val's index (the other cells also pass y_val.values).
label_arr = np.asarray(y_val)

# Integer percent grid 30, 35, ..., 90. This avoids the rounding error of a
# float-step np.arange (which can flip whether a boundary sample is approved)
# and lets a single np.percentile call produce every cut-off.
approval_pcts = np.arange(30, 91, 5)
cutoffs = np.percentile(prob_arr, approval_pcts)

results = []
for thr in cutoffs:
    approved = prob_arr <= thr
    results.append({
        '通过率':        f'{approved.mean():.1%}',
        '阈值（概率）':  f'{thr:.4f}',
        '通过客户坏率':  f'{label_arr[approved].mean():.4%}',
        '拒绝客户坏率':  f'{label_arr[~approved].mean():.4%}',
    })

# Last expression of the cell: the notebook renders the table.
pd.DataFrame(results)

## 6. 评分卡明细表

In [None]:
# Build the scorecard detail table from the fitted scorecard and the WOE encoder.
card = sc.card_table(enc)
print('评分卡明细（前20行）：')
# Last expression of the cell: the notebook renders the first 20 rows.
card.head(20)