### 基于sklearn决策树的最优分箱与IV值计算-Python实现
https://zhuanlan.zhihu.com/p/58824825

In [1]:
import pandas
# Load the training CSV; `data` supplies the feature and target columns used
# by the binning / WOE example cells further down.
data=pandas.read_csv('GiveMeSomeCredit-cs-training.csv')
# train=pandas.read_csv('GiveMeSomeCredit-cs-training.csv')
# Load the test CSV (not referenced by the cells visible in this notebook).
test=pandas.read_csv('GiveMeSomeCredit-cs-test.csv')

In [3]:
# 2. 获得最优分箱边界值函数的实现：
import pandas as pd 
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
def optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.,
                             max_leaf_nodes: int = 6,
                             min_samples_leaf: float = 0.05) -> list:
    """Return sorted bin edges for ``x`` chosen by a decision tree fitted on ``y``.

    A single-feature ``DecisionTreeClassifier`` (entropy criterion) is trained
    on ``x`` against ``y``; the thresholds of its internal (split) nodes become
    the interior edges.  The smallest value of ``x`` is prepended and an edge
    strictly above the largest value is appended, so the result is suitable
    for left-closed binning such as ``pd.cut(..., right=False)``.

    Args:
        x: Feature values; missing entries are replaced by ``nan`` first.
        y: Target labels, positionally aligned with ``x``.
        nan: Fill value used for missing entries of ``x``.
        max_leaf_nodes: Passed to the classifier; caps the number of leaves
            and therefore the number of bins.
        min_samples_leaf: Passed to the classifier; scikit-learn interprets a
            float as the minimum fraction of samples per leaf.

    Returns:
        Ascending list of floats ``[min(x), t_1, ..., t_k, upper]`` where
        ``upper`` is strictly greater than ``max(x)``.
    """
    x = x.fillna(nan).values  # fill missing values before fitting
    y = y.values

    clf = DecisionTreeClassifier(criterion='entropy',
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_samples_leaf=min_samples_leaf)
    clf.fit(x.reshape(-1, 1), y)  # the tree expects a 2-D feature matrix

    tree = clf.tree_
    # Leaves have identical left/right child ids; every other node carries a
    # split threshold.
    is_split = tree.children_left != tree.children_right
    boundary = sorted(tree.threshold[is_split].tolist())

    min_x = float(x.min())
    max_val = float(x.max())
    # The top edge must lie strictly above the maximum so that left-closed
    # bins still contain it.  ``+ 0.1`` is kept for ordinary magnitudes, but
    # for very large values 0.1 is below float resolution and the sum would
    # equal the maximum, so fall back to the next representable float.
    max_x = max(max_val + 0.1, float(np.nextafter(max_val, np.inf)))

    return [min_x] + boundary + [max_x]

In [4]:
# Try optimal_binning_boundary on one feature column against the target column:
optimal_binning_boundary(x=data['RevolvingUtilizationOfUnsecuredLines'],
                         y=data['SeriousDlqin2yrs'])

[0.0,
 0.11458224803209305,
 0.21776090562343597,
 0.49497613310813904,
 0.6981423199176788,
 0.8596274554729462,
 50708.1]

In [5]:
# 3. 获得某个变量各个分箱的WOE、IV值函数的实现：
def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
    """Compute per-bin WOE and IV of feature ``x`` against a 0/1 target ``y``.

    ``x`` is binned with the edges returned by ``optimal_binning_boundary``
    (left-closed intervals).  Rows with ``y == 0`` are counted as "good" and
    rows with ``y == 1`` as "bad".  WOE is ``ln(good_pct / bad_pct)`` and the
    per-bin IV is ``(good_pct - bad_pct) * woe``.

    The total IV (sum over bins) is printed as a side effect.

    Args:
        x: Feature values; missing entries are replaced by ``nan`` first.
        y: Target labels (0 = good, 1 = bad).
        nan: Fill value used for missing entries of ``x``.

    Returns:
        DataFrame indexed by bin interval with columns ``good``, ``bad``,
        ``total``, ``good_pct``, ``bad_pct``, ``total_pct``, ``bad_rate``,
        ``woe`` and ``iv``.
    """
    x = x.fillna(nan)
    boundary = optimal_binning_boundary(x, y, nan)        # bin edges from the tree

    df = pd.concat([x, y], axis=1)                        # one frame for feature + target
    df.columns = ['x', 'y']
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)  # left-closed interval per row

    # ``bins`` is categorical: state ``observed=False`` explicitly so every
    # interval stays in the result regardless of the pandas default (which is
    # deprecated and changing).
    grouped = df.groupby('bins', observed=False)['y']
    result_df = grouped.agg(good=lambda labels: (labels == 0).sum(),
                            bad=lambda labels: (labels == 1).sum(),
                            total='count')

    result_df['good_pct'] = result_df['good'] / result_df['good'].sum()       # share of all goods
    result_df['bad_pct'] = result_df['bad'] / result_df['bad'].sum()          # share of all bads
    result_df['total_pct'] = result_df['total'] / result_df['total'].sum()    # share of all rows

    result_df['bad_rate'] = result_df['bad'] / result_df['total']             # bad ratio inside the bin

    # NOTE: a bin with zero goods or zero bads yields an infinite WOE (NaN if
    # both are zero), which propagates into the IV sum.
    result_df['woe'] = np.log(result_df['good_pct'] / result_df['bad_pct'])
    result_df['iv'] = (result_df['good_pct'] - result_df['bad_pct']) * result_df['woe']

    print(f"该变量IV = {result_df['iv'].sum()}")

    return result_df

In [6]:
# Try feature_woe_iv on one feature column against the target column:
feature_woe_iv(x=data['RevolvingUtilizationOfUnsecuredLines'], 
               y=data['SeriousDlqin2yrs'])

该变量IV = 1.1025918750620314


bins,good,bad,total,good_pct,bad_pct,total_pct,bad_rate,woe,iv
"[0.0, 0.115)",66466,1226,67692,0.474845,0.122282,0.45128,0.018111,1.356659,0.478308
"[0.115, 0.218)",15776,486,16262,0.112707,0.048474,0.108413,0.029886,0.843761,0.054197
"[0.218, 0.495)",23162,1245,24407,0.165474,0.124177,0.162713,0.05101,0.287103,0.011856
"[0.495, 0.698)",10499,1100,11599,0.075007,0.109715,0.077327,0.094836,-0.380305,0.0132
"[0.698, 0.86)",6716,1097,7813,0.04798,0.109416,0.052087,0.140407,-0.824361,0.050645
"[0.86, 50708.1)",17355,4872,22227,0.123987,0.485937,0.14818,0.219193,-1.365899,0.494386


In [None]:
# 如上图所示，变量RevolvingUtilizationOfUnsecuredLines，分箱WOE趋势单调，bad_rate风险排序性较好，IV值>1.0则说明该变量预测能力很强。