In [1]:

'''数值型变量EDA'''
def var_eda(df,need_target,target='y'):
    """
    about:变量EAD分析
    :param df: Dataframe数据集
    :param need_target: 取值1或者0(True或者Fasle)，代表是否需要展示Y相关指标，如坏样本水平、KS和IV值等
    :param target: 0/1二分类目标变量，如果need_target=0或者False，采用默认值定义即可
    :return: Dataframe
    """
    import pandas as pd 
    import numpy as np
    print('正在进行变量描述统计...')
    describe = df.describe().T
    sample_num = df.shape[0]
    describe['varname'] = describe.index
    describe['missRate'] = 1-describe['count'] / sample_num
    describe = describe.reset_index(drop=True) 
    
    print('只对以下变量计算集中度等指标')
    notempty_var_list = list(describe.loc[describe['count'] > 0].varname)
    print(notempty_var_list)
    
    print('正在计算变量集中度...')
    sumCount = len(df.index)
    colsDf = pd.DataFrame({'tmp':['tmp',np.nan,np.nan]})
    
    for col in notempty_var_list: 
#         print(col)
        valueCountDict = {}
        colDat = df.loc[:,col]
        colValueCounts = pd.value_counts(colDat).sort_values(ascending=False)
        concentElement = colValueCounts.index[0] #获取水平最高的占比水平
        valueCountDict[col] = [concentElement,colValueCounts.iloc[0],colValueCounts.iloc[0]* 1.0 / sumCount]
        colDf = pd.DataFrame(valueCountDict)
        colsDf = colsDf.join(colDf)
    colsDf = (colsDf.rename(index={0:'concentricElement',1:'concentricCount',2:'concentricRate'})).drop('tmp',axis=1) 

    concentricRate = pd.DataFrame(colsDf.T['concentricRate'])
    concentricRate['index'] = concentricRate.index 
    
    print('正在计算变量水平个数...')
    
    uniqDf=pd.DataFrame([])
    for col in notempty_var_list:
        valueCountDict = {}
        uniqDf[col]=[len(df[col].unique())] 

    uniqDf = uniqDf.T
    uniqDf['varname'] = uniqDf.index 
    uniqDf.rename(columns={0:'levels'},inplace=True)

    if need_target==False:
        print('正在汇总指标...')
        var_des = pd.merge(describe,concentricRate,how='left',left_on=['varname'],right_on=['index'])
        var_des = pd.merge(var_des,uniqDf,how='left',left_on=['varname'],right_on=['varname'])
        var_des['missRate'] = var_des['missRate'].apply(lambda x:"{:.1%}".format(x))
        var_des['concentricRate'] = var_des['concentricRate'].apply(lambda x:"{:.1%}".format(x))
        var_des = var_des[['varname','count','levels','missRate','concentricRate','mean','25%','50%','75%','std','min','max']]   
        print('描述统计完毕...')
        
    if need_target==True:   
        notempty_var_list.remove(target)
        print('正在计算坏样本比例...')
        badrateDf=pd.DataFrame([])
        badrateDf2=pd.DataFrame([])
        for col in notempty_var_list:
            tmp=df[pd.isna(df[col])]
            badrateDf[col]=[tmp[target].mean()]

            tmp=df[pd.notna(df[col])]
            badrateDf2[col]=[tmp[target].mean()]

        badrateDf = badrateDf.T
        badrateDf['varname'] = badrateDf.index 
        badrateDf.rename(columns={0:'missbadRate'},inplace=True)
        badrateDf2 = badrateDf2.T
        badrateDf2['varname'] = badrateDf2.index 
        badrateDf2.rename(columns={0:'nomissbadRate'},inplace=True)
        badrateDf3 = pd.merge(badrateDf,badrateDf2,how='left',left_on=['varname'],right_on=['varname'])

        print('正在计算变量ks值...')
        from scipy.stats import ks_2samp
        colsDf = pd.DataFrame({'tmp':[np.nan]})
        ks_value=lambda x,y:ks_2samp(x[y==True],x[y==False]).statistic
        for col in notempty_var_list: 
    #         print(col)
            ksDict = {}
            if df[col].dtypes in ('int64','float64','int','float'): ###!!!!!!!!!!!!!!!!!!!!!!!!!!!
                colDat = df.loc[:,[col,target]]
                ks_val = ks_value(colDat[col],colDat[target]) #计算ks取值
                ksDict[col] = [ks_val] # 得到ks值
                colDf = pd.DataFrame(ksDict)
                colsDf = colsDf.join(colDf) #join 方法添加
            else:
                ksDict[col] = [''] # 得到ks值
                colDf = pd.DataFrame(ksDict)
                colsDf = colsDf.join(colDf) #join 方法添加

        #通过循环得到所有变量的取值的频率、占比
        colsDf = (colsDf.rename(index={0:'KS'})).drop('tmp',axis=1) #替换index的值，同时删除多余的tmp

        Ks_Value = pd.DataFrame(colsDf.T['KS'])
        Ks_Value['varname'] = Ks_Value.index

        print('正在计算变量iv值...')
        colsDf=pd.DataFrame([])
        for col in notempty_var_list:
            bin_bad_temp = pd.DataFrame(df.groupby(col)[target].sum()).rename(columns={target: 'bad#'})
            bin_bad=bin_bad_temp.sort_values(by=col,na_position='first')
            bin_tota_temp = pd.DataFrame(df.groupby(col).size()).rename(columns={0: 'total#'})
            bin_total=bin_tota_temp.sort_values(by=col,na_position='first')   
            bin_dis = pd.concat([bin_total, bin_bad], axis=1)
            bin_dis['good#'] = bin_dis[['bad#', 'total#']].apply(lambda x: x[1] - x[0], axis=1)
            bin_dis['bad%%'] = bin_dis['bad#'] / sum(bin_dis['bad#'])
            bin_dis['good%%'] = bin_dis['good#'] / sum(bin_dis['good#'])

            bin_dis['good%%'] = bin_dis['good%%'].apply(lambda x : 0.000000000000000001 if x ==0 else x) ##异常情况赋值，为了WOE计算不报错
            bin_dis['bad%%'] = bin_dis['bad%%'].apply(lambda x : 0.000000000000000001 if x ==0 else x)  
            bin_dis['WOEi'] = bin_dis[['bad%%', 'good%%']].apply(lambda x: np.log(x[0] / x[1]), axis=1)
            bin_dis['IVi'] = bin_dis[['bad%%', 'good%%', 'WOEi']].apply(lambda x: (x[0] - x[1]) * x[2], axis=1)  
            colsDf[col]=[float('%.2f' % sum(bin_dis['IVi']))] 

        IV_value = colsDf.T
        IV_value['varname'] = IV_value.index 
        IV_value.rename(columns={0:'IV'},inplace=True)

        print('正在汇总指标...')
        var_des = pd.merge(describe,concentricRate,how='left',left_on=['varname'],right_on=['index'])
        var_des = pd.merge(var_des,uniqDf,how='left',left_on=['varname'],right_on=['varname'])
        var_des = pd.merge(var_des,badrateDf3,how='left',left_on=['varname'],right_on=['varname'])
        var_des = pd.merge(var_des,Ks_Value,how='left',left_on=['varname'],right_on=['varname'])
        var_des = pd.merge(var_des,IV_value,how='left',left_on=['varname'],right_on=['varname'])

        for i in ['missRate','concentricRate','missbadRate','nomissbadRate','KS']:
            var_des[i] = var_des[i].apply(lambda x:"{:.1%}".format(x))

        var_des = var_des[['varname','count','levels','missRate','concentricRate','missbadRate','nomissbadRate','KS','IV','mean','25%','75%','std','min','max']]   
        print('描述统计完毕...')
    
    return var_des

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import make_classification

In [20]:
classsification=make_classification(n_classes =2)

In [27]:
X=np.array(classsification[0])

In [28]:
y=classsification[1]

In [54]:
y=pd.DataFrame(y,columns=['y'])

In [69]:
df=pd.concat([pd.DataFrame(X,columns=list('abcdefghijklmnopqrst')),y],axis=1)

In [70]:
df.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'y'],
      dtype='object')

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       100 non-null    float64
 1   b       100 non-null    float64
 2   c       100 non-null    float64
 3   d       100 non-null    float64
 4   e       100 non-null    float64
 5   f       100 non-null    float64
 6   g       100 non-null    float64
 7   h       100 non-null    float64
 8   i       100 non-null    float64
 9   j       100 non-null    float64
 10  k       100 non-null    float64
 11  l       100 non-null    float64
 12  m       100 non-null    float64
 13  n       100 non-null    float64
 14  o       100 non-null    float64
 15  p       100 non-null    float64
 16  q       100 non-null    float64
 17  r       100 non-null    float64
 18  s       100 non-null    float64
 19  t       100 non-null    float64
 20  y       100 non-null    int32  
dtypes: float64(20), int32(1)
memory usage: 1

In [72]:
var_eda(df,1,target='y')

正在进行变量描述统计...
只对以下变量计算集中度等指标
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'y']
正在计算变量集中度...
正在计算变量水平个数...
正在计算坏样本比例...
正在计算变量ks值...
正在计算变量iv值...
正在汇总指标...
描述统计完毕...


Unnamed: 0,varname,count,levels,missRate,concentricRate,missbadRate,nomissbadRate,KS,IV,mean,25%,75%,std,min,max
0,a,100.0,100,0.0%,1.0%,nan%,51.0%,12.5%,75.07,-0.107741,-0.818219,0.570487,1.028089,-2.248023,2.352691
1,b,100.0,100,0.0%,1.0%,nan%,51.0%,13.3%,75.07,0.099509,-0.603502,0.70598,1.019067,-2.596201,2.355142
2,c,100.0,100,0.0%,1.0%,nan%,51.0%,17.5%,75.07,0.09497,-0.402204,0.70944,0.861764,-2.367263,2.423142
3,d,100.0,100,0.0%,1.0%,nan%,51.0%,22.1%,75.07,0.026449,-0.657786,0.672317,0.961732,-2.140909,2.993571
4,e,100.0,100,0.0%,1.0%,nan%,51.0%,15.6%,75.07,0.198646,-0.43279,0.808159,0.996817,-2.361152,2.994534
5,f,100.0,100,0.0%,1.0%,nan%,51.0%,23.3%,75.07,0.014029,-0.551824,0.791647,1.015389,-2.709253,2.169639
6,g,100.0,100,0.0%,1.0%,nan%,51.0%,14.4%,75.07,-0.064244,-0.708793,0.524462,1.046387,-2.91976,3.521128
7,h,100.0,100,0.0%,1.0%,nan%,51.0%,15.0%,75.07,-0.103196,-0.874068,0.582111,1.073113,-2.698461,2.901226
8,i,100.0,100,0.0%,1.0%,nan%,51.0%,9.6%,75.07,0.226156,-0.457439,0.922017,1.005406,-2.0911,2.227731
9,j,100.0,100,0.0%,1.0%,nan%,51.0%,10.7%,75.07,-0.171629,-1.053152,0.563544,0.974983,-2.287773,2.29591
