# 特征工程

In [None]:
"""读取数据以及导入包"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
import pandas as pd
import seaborn as sns
# Load the training data. Two locations are tried so the notebook can run in
# different environments: a Kaggle dataset mount first, then the working directory.
# The original first call was pd.read_csv() with no path; that raises TypeError
# rather than FileNotFoundError, so the local fallback could never be reached.
# NOTE(review): the Kaggle location below is an assumed default - confirm the
# actual dataset mount path before relying on it.
KAGGLE_TRAIN_CSV = "/kaggle/input/GiveMeSomeCredit/cs-training.csv"
try:
    data = pd.read_csv(KAGGLE_TRAIN_CSV,index_col=0)  # first column becomes the index
except FileNotFoundError:
    # Only when the file is missing there, look in the current directory instead.
    data = pd.read_csv("cs-training.csv",index_col=0)  # first column becomes the index
    
    
"""定义根据IV曲线选取每个特征的最佳分箱个数函数"""
class Stack():
    """Minimal last-in-first-out container backed by a Python list."""

    def __init__(self):
        # Start with an empty backing list; the top of the stack is its last element.
        self.__items = []

    def push(self, item):
        """Place ``item`` on top of the stack."""
        self.__items.append(item)

    def pop(self):
        """Remove and return the top element (IndexError when the stack is empty)."""
        top = self.__items.pop()
        return top

    def speek(self):
        """Return the top element without removing it (IndexError when empty)."""
        top = self.__items[-1]
        return top

    def is_empty(self):
        """Return True when the stack holds no elements."""
        return len(self.__items) == 0

    def size(self):
        """Return the number of stored elements."""
        count = len(self.__items)
        return count
    
def _merge_empty_bins(bins):
    """Merge bins until every bin contains both classes.

    ``bins`` is an iterable of ``(min, max, count_0, count_1)`` tuples. Bins are
    processed in ascending order; a bin with a zero count for either class is
    merged into its right-hand neighbour. If the last bin still has a zero
    count (there is no right-hand neighbour), it is merged into the bin on its
    left instead.
    """
    merged = []
    carry = None  # bin with a zero class count, waiting to absorb its right neighbour
    for current in sorted(bins):
        if carry is not None:
            current = (carry[0], current[1],
                       carry[2] + current[2], carry[3] + current[3])
            carry = None
        if 0 in current[2:]:
            carry = current
        else:
            merged.append(current)
    if carry is not None:
        if not merged:
            # Every sample belongs to one class: no bin can hold both classes.
            raise ValueError(
                "cannot build bins: the label column contains a single class")
        left = merged.pop()
        merged.append((left[0], carry[1],
                       left[2] + carry[2], left[3] + carry[3]))
    return merged


def _woe_table(num_bins):
    """Build the per-bin statistics table (counts, rates and WOE)."""
    df = pd.DataFrame(num_bins, columns=["min", "max", "count_0", "count_1"])
    df["total"] = df["count_0"] + df["count_1"]
    df["percentage"] = df["total"] / df["total"].sum()
    df["bad_rate"] = df["count_1"] / df["total"]
    df["good%"] = df["count_0"] / df["count_0"].sum()
    df["bad%"] = df["count_1"] / df["count_1"].sum()
    df["woe"] = np.log(df["good%"] / df["bad%"])
    return df


def _information_value(df):
    """Return the IV of a table produced by ``_woe_table``."""
    return np.sum((df["good%"] - df["bad%"]) * df["woe"])


def graphforbestbin(DF,X,Y,n=5,q=20,graph=True):
    '''
    Automatic binning of one feature based on the chi-square test.

    The feature is first cut into (at most) ``q`` equal-frequency bins, bins
    lacking one of the two classes are merged away, and then the pair of
    adjacent bins with the largest chi-square p-value is merged repeatedly
    until ``n`` bins remain. The IV is recorded after each merge.

    Parameters:
    DF: input data frame
    X: name of the column to bin
    Y: name of the label column (values 0 and 1)
    n: number of bins to keep (values below 1 are treated as 1)
    q: number of initial equal-frequency bins
    graph: whether to plot IV against the number of bins
    Intervals are left-open, right-closed: (]

    Returns the statistics table of the final bins, with columns
    min, max, count_0, count_1, total, percentage, bad_rate, good%, bad%, woe.
    The same table is also stored in the module-level name ``bins_dfs``.

    Raises ValueError when the label column contains a single class.
    '''
    # Imported here because the surrounding file never imports scipy.
    from scipy.stats import chi2_contingency

    DF = DF[[X,Y]].copy()  # work on a private two-column copy
    # Initial equal-frequency cut; duplicate edges are dropped.
    DF['qcut'],edges = pd.qcut(DF[X],retbins=True,duplicates='drop',q=q)
    # observed=False keeps bins that have no rows for a class (count 0), so the
    # two count series stay aligned with the bin edges.
    count_y0 = DF.loc[DF[Y] == 0].groupby('qcut',observed=False)[Y].count()
    count_y1 = DF.loc[DF[Y] == 1].groupby('qcut',observed=False)[Y].count()

    # (min, max, count_0, count_1) per bin, with single-class bins merged away.
    num_bins = _merge_empty_bins(zip(edges,edges[1:],count_y0,count_y1))

    # Kept for backward compatibility: the latest table is published as a
    # module-level name. It is assigned before the loop so that a call needing
    # no merge returns the current bins instead of a stale or missing value.
    global bins_dfs
    bins_dfs = _woe_table(num_bins)

    IV = []
    axisx = []
    target = max(n, 1)  # at least one bin must remain for a merge to be possible
    while len(num_bins) > target:
        # p-value of the chi-square test for every pair of adjacent bins
        pvs = [chi2_contingency([left[2:], right[2:]])[1]
               for left, right in zip(num_bins, num_bins[1:])]

        # merge the pair with the largest p-value
        i = pvs.index(max(pvs))
        num_bins[i:i+2] = [(
            num_bins[i][0],
            num_bins[i+1][1],
            num_bins[i][2]+num_bins[i+1][2],
            num_bins[i][3]+num_bins[i+1][3])]
        bins_dfs = _woe_table(num_bins)
        axisx.append(len(num_bins))
        IV.append(_information_value(bins_dfs))

    if graph:
        plt.figure()
        plt.plot(axisx,IV)
        plt.xticks(axisx)
        plt.xlabel("number of box")
        plt.ylabel("IV")
        plt.grid(True)
        plt.show()
    return bins_dfs

"""绘制IV曲线判断最佳分箱数目"""
# Plot the IV curve of each feature to judge its best number of bins.
# columns[1:-1] skips the first and the last column of model_data.
# NOTE(review): model_data is assigned further down in this file (notebook
# cell order); that cell has to run before this one.
for i in model_data.columns[1:-1]:
    print(i)
    graphforbestbin(model_data,i,"SeriousDlqin2yrs",n=2,q=20)

# Number of bins chosen for each feature that is binned automatically.
auto_col_bins = {"RevolvingUtilizationOfUnsecuredLines":5,
                "age":4,
                "DebtRatio":4,
                "MonthlyIncome":3,
                "NumberOfOpenCreditLinesAndLoans":5}
# Features that cannot be binned automatically: explicit bin edges.
hand_bins = {"NumberOfTime30-59DaysPastDueNotWorse":[0,1,2,13]
            ,"NumberOfTimes90DaysLate":[0,2,17]
            ,"NumberRealEstateLoansOrLines":[0,1,2,3,32]
            ,"NumberOfTime60-89DaysPastDueNotWorse":[0,1,9]
            ,"NumberOfDependents":[0,1,2,3,20]}

# Bin edges for every feature, keyed by column name.
bins_of_col = {}
# Derive the edges of the automatically binned features from the final bins.
for col in auto_col_bins:
    bins_df = graphforbestbin(model_data,col,
                             "SeriousDlqin2yrs",
                             n=auto_col_bins[col],
                             q=20,
                             graph=False)
    bins_list = sorted(set(bins_df['min']).union(bins_df['max']))
    # Replace the outermost edges with -inf / +inf so the bins cover every value.
    bins_list[0],bins_list[-1] = -np.inf,np.inf
    bins_of_col[col] = bins_list

# Merge in the manual bins, padded the same way as the automatic ones.
# pd.cut intervals are right-closed, so with a first edge of 0 every value equal
# to 0 (and anything above the last edge) would fall outside all bins and become
# NaN. Prepending -inf keeps 0 as the upper edge of the first bin; the last edge
# is replaced by +inf. hand_bins itself is left unchanged.
bins_of_col.update({
    name: [-np.inf, *edges[:-1], np.inf]
    for name, edges in hand_bins.items()
})

# Compute the WOE of every bin and map it onto the data.
# NOTE(review): model_woe and woeall are not defined in the visible part of this
# file; they presumably come from another cell. woeall must be computed from
# this same bins_of_col, otherwise the intervals will not match - confirm.
for col in bins_of_col:
    model_woe[col] = pd.cut(model_data[col],bins_of_col[col]).apply(woeall[col])
# Append the label column.
model_woe["SeriousDlqin2yrs"] = model_data["SeriousDlqin2yrs"]

In [21]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
import pandas as pd
import seaborn as sns
# Load the modelling data. Two locations are tried so the notebook can run in
# different environments: the working directory first, then a local folder.
try:
    model_data = pd.read_csv("model_data.csv")  # no index column is set here
except FileNotFoundError:
    # Only when that file is missing, read from the local Windows folder.
    # Backslashes are escaped explicitly: the original literal relied on invalid
    # escape sequences such as "\D" and "\p", which newer Python versions warn
    # about. The resulting string is unchanged.
    path = "D:\\Desktop\\python练习文档\\python教学\\python量化金融\\Python-Quantitative-Finance\\逻辑回归制作信用评分卡\\"
    # NOTE(review): the fallback reads cs-training.csv, not model_data.csv -
    # confirm that loading the raw training file here is intended.
    model_data = pd.read_csv(path+"cs-training.csv")  # no index column is set here
    

In [22]:
# Display the first rows of model_data to inspect what was loaded.
model_data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,DebtRatio,MonthlyIncome,NumberOfDependents
0,0,1,0.233067,54,11566.75693,102.878733,0.0
1,1,1,0.026404,66,0.32115,4300.0,1.188335
2,2,0,0.438251,85,0.30134,5000.0,1.0
3,3,0,0.693844,27,0.043536,2250.0,0.0
4,4,0,0.008259,73,0.074116,8000.0,0.0
