<h1>10.2 通过IV值筛选特征<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import numpy as np
import pandas as pd
import scipy
import scipy.stats as st

def auto_bin(DF, X, Y, n=5, iv=True, detail=False, q=20):
    """
    Bin a numeric column by chi-square merging and report per-bin WOE.

    The column ``X`` is first cut into at most ``q`` equal-frequency
    intervals (``pd.qcut`` with ``duplicates="drop"``). Adjacent intervals
    are then merged in two phases:

    1. Every interval that holds no ``Y == 0`` rows or no ``Y == 1`` rows is
       merged with a neighbour (the first interval merges forward, any other
       interval merges backward) until each interval contains both classes
       or only one interval is left.
    2. While more than ``n`` intervals remain, the adjacent pair whose
       chi-square test p-value is largest is merged.

    Intervals are left-open, right-closed: ``(min, max]``.

    The input frame is only read; no column is added to it.

    Parameters
    ----------
    DF : pandas.DataFrame
        Data containing both columns ``X`` and ``Y``.
    X : str
        Name of the column to bin.
    Y : str
        Name of the label column. Only rows where it equals 0 or 1 are
        counted.
    n : int, default 5
        Number of bins to keep. Fewer are returned if fewer are available
        after phase 1.
    iv : bool, default True
        Print the information value (IV) after every chi-square merge.
    detail : bool, default False
        Print the full bin table after every chi-square merge.
    q : int, default 20
        Number of initial equal-frequency bins requested from ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        One row per final bin with the columns ``min``, ``max``,
        ``count_0``, ``count_1``, ``total``, ``percentage`` (share of all
        counted rows), ``bad_rate`` (``count_1 / total``) and ``woe``
        (``ln((count_0 / sum(count_0)) / (count_1 / sum(count_1)))``).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # Equal-frequency initial cut. The result stays in a local variable so
    # the caller's DataFrame is not modified.
    binned, bins = pd.qcut(DF[X], retbins=True, q=q, duplicates="drop")

    # Count the 0/1 labels per interval from the integer interval codes.
    # bincount(minlength=...) yields exactly one count per interval (0 for an
    # empty one), so the counts always line up with the bin edges.
    codes = np.asarray(binned.cat.codes).astype(np.intp)
    in_some_bin = codes >= 0  # code -1 marks rows whose X value is missing
    is_zero = np.asarray(DF[Y] == 0)
    is_one = np.asarray(DF[Y] == 1)
    n_intervals = len(bins) - 1
    count_y0 = np.bincount(codes[in_some_bin & is_zero],
                           minlength=n_intervals).tolist()
    count_y1 = np.bincount(codes[in_some_bin & is_one],
                           minlength=n_intervals).tolist()

    # Each entry is (lower edge, upper edge, count of 0, count of 1).
    num_bins = [*zip(bins, bins[1:], count_y0, count_y1)]

    def merge(left, right):
        # Combine two adjacent bins into one spanning both.
        return (left[0], right[1], left[2] + right[2], left[3] + right[3])

    def get_woe(num_bins):
        # Build the per-bin statistics table, including WOE.
        columns = ["min", "max", "count_0", "count_1"]
        df = pd.DataFrame(num_bins, columns=columns)

        df["total"] = df.count_0 + df.count_1
        df["percentage"] = df.total / df.total.sum()
        df["bad_rate"] = df.count_1 / df.total
        df["woe"] = np.log((df.count_0 / df.count_0.sum()) /
                           (df.count_1 / df.count_1.sum()))
        return df

    def get_iv(bins_df):
        # IV = sum over bins of (share of 0s - share of 1s) * WOE.
        rate = ((bins_df.count_0 / bins_df.count_0.sum()) -
                (bins_df.count_1 / bins_df.count_1.sum()))
        return np.sum(rate * bins_df.woe)

    # Phase 1: make sure every bin contains both classes. The loop runs until
    # nothing is left to merge instead of a fixed number of passes, so it is
    # also complete when q is larger than 20.
    while len(num_bins) > 1:
        # The first bin has no predecessor, so it merges forward.
        if 0 in num_bins[0][2:]:
            num_bins[0:2] = [merge(num_bins[0], num_bins[1])]
            continue

        # Any other bin lacking a class merges into its predecessor.
        for idx in range(1, len(num_bins)):
            if 0 in num_bins[idx][2:]:
                num_bins[idx - 1:idx + 1] = [merge(num_bins[idx - 1],
                                                   num_bins[idx])]
                break
        else:
            # No bin lacks a class any more.
            break

    # Phase 2: merge the most similar adjacent pair until n bins remain.
    while len(num_bins) > n:
        # p-value of the chi-square test for every adjacent pair of bins.
        pvs = []
        for idx in range(len(num_bins) - 1):
            x1 = num_bins[idx][2:]
            x2 = num_bins[idx + 1][2:]
            # Index 0 of the result is the chi2 statistic, index 1 the p-value.
            pvs.append(st.chi2_contingency([x1, x2])[1])

        # The largest p-value marks the pair with the least evidence of a
        # difference; the first such pair is merged.
        best = pvs.index(max(pvs))
        num_bins[best:best + 2] = [merge(num_bins[best], num_bins[best + 1])]

        # Report the state after the merge (table built only when needed).
        if iv or detail:
            bins_df = get_woe(num_bins)
            if iv:
                print(f"{X} 分{len(num_bins):2}组 IV 值: ",get_iv(bins_df))
            if detail:
                print(bins_df)

    # Statistics of the final bins.
    return get_woe(num_bins)

In [2]:
# Load the training data from a CSV path relative to the notebook's directory,
# then preview the first rows.
train_set = pd.read_csv('../data/train.csv')
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Bin the 'Age' column against the 'Survived' label using auto_bin's defaults
# (n=5 final bins, q=20 initial bins, IV printed after each merge).
# NOTE(review): the value stored here is the table of bin statistics, yet the
# variable is called `get_woe`, which reads like a function name — consider
# renaming it once any cells that use it have been checked.
get_woe = auto_bin(train_set,'Age',"Survived")
get_woe

Age 分19组 IV 值:  0.17643940819015408
Age 分18组 IV 值:  0.1759629254743842
Age 分17组 IV 值:  0.17551035471762413
Age 分16组 IV 值:  0.1753738755214429
Age 分15组 IV 值:  0.17462694330452538
Age 分14组 IV 值:  0.1731879629384554
Age 分13组 IV 值:  0.17152785944320284
Age 分12组 IV 值:  0.16672006613068047
Age 分11组 IV 值:  0.16268531506409778
Age 分10组 IV 值:  0.15471117317074778
Age 分 9组 IV 值:  0.1450931278513105
Age 分 8组 IV 值:  0.1405003279044501
Age 分 7组 IV 值:  0.12645403552539317
Age 分 6组 IV 值:  0.12448562206331787
Age 分 5组 IV 值:  0.11011146146198407


Unnamed: 0,min,max,count_0,count_1,total,percentage,bad_rate,woe
0,0.42,4.0,13,27,40,0.056022,0.675,-1.11074
1,4.0,17.0,39,34,73,0.102241,0.465753,-0.242651
2,17.0,34.0,230,135,365,0.511204,0.369863,0.152952
3,34.0,36.0,19,22,41,0.057423,0.536585,-0.526456
4,36.0,80.0,123,72,195,0.273109,0.369231,0.155666
