In [None]:
def missrate_by_month(x_with_month, month_col, x_cols):
    """
    Compute the per-month missing rate of each feature.

    :param x_with_month: frame holding the features together with the month column
    :param month_col: name of the month (time period) column used for grouping
    :param x_cols: names of the feature columns to inspect
    :return: frame with one row per feature and one column per month (the
        fraction of missing values), plus a 'miss_rate_std' column holding
        the standard deviation of that feature's monthly missing rates
    """
    per_month = x_with_month.groupby(month_col)[x_cols]
    # The mean of the boolean NaN mask is the share of missing values in the group.
    monthly_rate = per_month.apply(lambda part: part.isna().mean())
    # Transpose so that features become rows and months become columns.
    by_feature = monthly_rate.T
    by_feature['miss_rate_std'] = by_feature.std(axis=1)
    return by_feature

In [None]:
# Fit the binning rules (quantile method, n_bins=6, empty_separate=True).
# NOTE(review): all_x_y is not defined in the visible cells; it presumably holds the
# features together with the label column named by data_utils.label - confirm.
Combiner = toad.transform.Combiner()
Combiner.fit(
    all_x_y,
    y=data_utils.label,
    method='quantile',
    n_bins=6,
    empty_separate=True,
)

# PSI per column: the first 500 rows against the remaining rows, using the fitted combiner.
var_psi = toad.metrics.PSI(
    all_x_y.iloc[:500],
    all_x_y.iloc[500:],
    combiner=Combiner,
)
var_psi_df = var_psi.to_frame(name='psi')

# Keep the columns whose PSI is below the 0.1 threshold.
selected_cols = var_psi_df.loc[var_psi_df['psi'] < 0.1].index.tolist()
print("各特征的psi值计算结果: \n", var_psi_df)
print("设置psi阈值为0.1, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols)


In [3]:
import sys
sys.path.append("./")
sys.path.append("../")

import toad
import pandas as pd
from utils import data_utils


# Load the modelling data, which already carries the 'month' column.
model_data = data_utils.get_data()

y = model_data[data_utils.label]
x = model_data[data_utils.x_cols]

# Bin the features (quantile method, n_bins=6, empty_separate=True).
Combiner = toad.transform.Combiner()
x_cat = Combiner.fit_transform(
    x,
    y,
    method='quantile',
    n_bins=6,
    empty_separate=True,
)

# Attach the month and the label to the binned features, aligned on the row index.
x_cat_with_month = x_cat.merge(
    model_data[['month', 'creditability']],
    left_index=True,
    right_index=True,
)

# Mean label per bin and per month for a single feature.
feature_col = 'age.in.years'
x_cat_one = x_cat_with_month[[feature_col, 'month', 'creditability']]
feature_var = x_cat_one.pivot_table(
    values='creditability',
    index=feature_col,
    columns='month',
    aggfunc=['mean'],
)
print("特征'age.in.years'的按月分箱逾期率统计结果: \n", feature_var)


# Month-over-month volatility of each feature's per-bin label mean
def variation_by_month(df, time_col, columns, label_col):
    """
    Score how much each feature's per-bin label mean reshuffles across periods.

    For every column in ``columns`` a pivot table of the mean of ``label_col``
    is built (rows: values of that column, columns: values of ``time_col``).
    The rows are ranked within each period, the standard deviation of every
    row's ranks across periods is taken, and those deviations are averaged
    into a single score for the column.

    :param df: frame containing ``time_col``, ``label_col`` and all of ``columns``
    :param time_col: name of the period column
    :param columns: iterable of feature column names to score
    :param label_col: name of the label column that is averaged
    :return: DataFrame indexed by feature name with a single 'variation' column
    """
    def _rank_spread(feature):
        # Rows are the feature's values, columns are the periods.
        pivot = df.pivot_table(
            values=label_col, index=feature, columns=time_col, aggfunc=['mean'])
        return pivot.rank().std(axis=1).mean()

    scores = {feature: _rank_spread(feature) for feature in columns}
    return pd.DataFrame([scores], index=['variation']).T


# Score every feature column, then keep those whose score is below 0.8.
var_badrate = variation_by_month(
    x_cat_with_month,
    'month',
    data_utils.x_cols,
    'creditability',
)
print("各特征按月逾期率的标准差: \n", var_badrate)

selected_cols = var_badrate.loc[var_badrate['variation'].lt(0.8)].index.tolist()
print("设置标准差阈值为0.8, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols)


特征'age.in.years'的按月分箱逾期率统计结果: 
                   mean                                        
month          2020-01   2020-02   2020-03   2020-04   2020-05
age.in.years                                                  
0             0.407407  0.304348  0.352941  0.439024  0.541667
1             0.393939  0.314286  0.407407  0.210526  0.250000
2             0.394737  0.218750  0.307692  0.350000  0.302326
3             0.250000  0.232558  0.285714  0.216216  0.300000
4             0.166667  0.266667  0.280000  0.311111  0.250000
5             0.297297  0.352941  0.218750  0.230769  0.153846
各特征按月逾期率的标准差: 
                                                     variation
duration.in.month                                    0.837724
credit.amount                                        1.378574
age.in.years                                         1.445965
present.residence.since                              1.189319
number.of.existing.credits.at.this.bank              0.447214
installment.r

In [5]:
import sys
sys.path.append("./")
sys.path.append("../")

import numpy as np
import pandas as pd
from utils.data_utils import stamp_to_date
from utils.data_utils import date_to_week


def data_preprocess(data, time_col, back_time, dtypes_dict):
    """
    Clean a raw frame before feature computation.

    Steps, in order: drop rows whose ``time_col`` is a missing marker, convert
    ``time_col`` with ``stamp_to_date``, keep only rows with
    ``time_col <= back_time``, drop rows that are entirely missing, turn empty
    strings into NaN, fill the remaining NaN with 0, drop duplicate rows
    (keeping the first), cast columns according to ``dtypes_dict`` and add the
    'create_time_week' and 'is_weekend' columns.

    The caller's frame is never modified: the function works on an explicit
    copy, so no column is assigned onto a filtered slice (which is what raised
    SettingWithCopyWarning before).

    :param data: frame to clean
    :param time_col: name of the time column used for the look-back filter
    :param back_time: feature computation time; rows after it are removed so
        that no future information leaks into the features
    :param dtypes_dict: mapping of column name to dtype, e.g. {'col1': int}
    :return: the cleaned frame
    """
    missing_markers = ['nan', np.nan, 'NAN', 'null', 'NULL', 'Null']

    # Drop rows without a usable time value; copy so that the column assignment
    # below acts on an independent frame rather than on a slice of the input.
    data = data[~data[time_col].isin(missing_markers)].copy()
    # Convert the timestamp column via the project's stamp_to_date helper.
    data[time_col] = data[time_col].apply(stamp_to_date)
    # Keep only rows at or before back_time to avoid look-ahead leakage.
    data = data[data[time_col] <= back_time]
    # Drop rows that are entirely missing.
    data = data.dropna(how='all')
    # Empty strings count as missing values.
    data = data.replace('', np.nan)
    # Fill the remaining missing fields with 0.
    data = data.fillna(0)
    # Remove duplicate rows, keeping the first occurrence.
    data = data.drop_duplicates(keep='first')
    # Cast the columns to the requested dtypes (returns a new frame).
    data = data.astype(dtypes_dict)
    # Derived fields: value returned by date_to_week, and a flag set when it is > 5.
    # NOTE(review): presumably date_to_week returns 1-7 with 6/7 = weekend - confirm.
    data['create_time_week'] = data[time_col].apply(date_to_week)
    data['is_weekend'] = data['create_time_week'].apply(lambda x: 1 if x > 5 else 0)

    return data