# 最终的优化版本（特征工程和模型融合）
    1. 数据预处理
            1.1 判断数据是否合规
            1.2 缺失值的检验 
            1.3 字段类型的探索
    特征衍生
    特征筛选

In [89]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from tqdm import tqdm


In [90]:
# Load the training set and the "A" test set from the local ./data directory.
data_train = pd.read_csv('./data/train.csv')
data_test_a = pd.read_csv('./data/testA.csv')
# Print the (rows, columns) shape of both frames as a quick sanity check.
print(data_train.shape, data_test_a.shape)

(800000, 47) (200000, 46)


In [91]:
# Summarise the training frame: column names, non-null counts and dtypes.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  800000 non-null  int64  
 1   loanAmnt            800000 non-null  float64
 2   term                800000 non-null  int64  
 3   interestRate        800000 non-null  float64
 4   installment         800000 non-null  float64
 5   grade               800000 non-null  object 
 6   subGrade            800000 non-null  object 
 7   employmentTitle     799999 non-null  float64
 8   employmentLength    753201 non-null  object 
 9   homeOwnership       800000 non-null  int64  
 10  annualIncome        800000 non-null  float64
 11  verificationStatus  800000 non-null  int64  
 12  issueDate           800000 non-null  object 
 13  isDefault           800000 non-null  int64  
 14  purpose             800000 non-null  int64  
 15  postCode            799999 non-nul

In [92]:
# 1.1 Check that the data is well-formed (mainly: duplicated ids / duplicated
#     rows) and consistent with the official data dictionary.
# Evaluates to True when every row of the training set has a distinct id.
data_train['id'].nunique() == data_train.shape[0]

True

In [93]:
# Count fully duplicated rows in the training set (0 means no row repeats).
data_train.duplicated().sum()

0

In [94]:
# 1.2 Missing-value check
# NOTE: this only detects None/NaN. Columns may still use some other sentinel
# value to mean "missing", so they need a closer look later on.
def missing(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns:
        ``Missing_number`` (count of null cells) and ``Missing_percent``
        (null cells divided by the number of rows), ordered from the most
        to the least incomplete column.
    """
    null_mask = data.isnull()

    # Both series are sorted independently, then aligned on the column name.
    null_counts = null_mask.sum().sort_values(ascending=False)
    null_ratio = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)

    return pd.concat(
        [null_counts, null_ratio],
        axis=1,
        keys=['Missing_number', 'Missing_percent'],
    )
  
# Missing-value summary for the training set; the bare name on the last line
# makes the notebook display the resulting table.
missing_trian = missing(data_train)
missing_trian


Unnamed: 0,Missing_number,Missing_percent
n11,69752,0.08719
employmentLength,46799,0.058499
n8,40271,0.050339
n7,40270,0.050338
n1,40270,0.050338
n2,40270,0.050338
n3,40270,0.050338
n5,40270,0.050338
n6,40270,0.050338
n14,40270,0.050338


In [95]:
# 1.3 Explore and handle the column types
# 1.3.1 Split the columns into discrete (categorical) and numerical features
category_fea = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]

label = 'isDefault'

# Every column that is not declared categorical above is treated as numerical.
# NOTE(review): the original author remarks that two "special" time-like
# columns are also in this list; they are not singled out here.
numerical_fea = [col for col in list(data_train.columns) if col not in category_fea]

# The target is not a feature; list.remove raises ValueError if it is absent.
numerical_fea.remove(label)

In [96]:
# 1.3.2 Handle the date column
# Reference date = earliest issue date found in the training set. The same
# reference is used for train and test so the day offsets are comparable.
# min() runs on the raw column before parsing; presumably these are
# zero-padded 'YYYY-MM-DD' strings, for which lexical order equals
# chronological order -- TODO confirm.
startdate = pd.to_datetime(data_train['issueDate'].min(), format='%Y-%m-%d')

for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Derived time features.
    # Days elapsed since the reference date, computed with one vectorised
    # datetime subtraction instead of a Python-level apply over every row.
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
    data['issueDate_year'] = data['issueDate'].dt.year
    data['issueDate_month'] = data['issueDate'].dt.month
    # The day-of-month is not extracted: per the original author's note it
    # takes a single unique value and would carry no information.
    # The raw date is dropped once the derived features exist.
    data.drop('issueDate', axis=1, inplace=True)


In [97]:
def employmenLength_to_int(s):
    """Convert an employment-length label such as ``'3 years'`` to an int8.

    Parameters
    ----------
    s : str or null
        Label whose first whitespace-separated token is an integer, or a
        null value (None / NaN).

    Returns
    -------
    numpy.int8 or the input itself
        The leading number as ``np.int8``; null inputs are passed through
        unchanged so missing values stay missing.
    """
    if not pd.isnull(s):
        # Only the first token ("3" in "3 years") carries the number.
        leading_token = s.split()[0]
        return np.int8(leading_token)
    return s

for data in [data_train, data_test_a]:
    # Rewrite the two labels whose first token is not a plain integer so that
    # employmenLength_to_int can parse them.
    # The result is assigned back to the column instead of calling
    # replace(..., inplace=True) on the selected column: that chained in-place
    # form is deprecated in pandas 2.x and leaves the frame untouched when
    # Copy-on-Write is enabled, which would make the parsing step fail.
    # NOTE(review): '< 1 year' is mapped to 1 and therefore becomes
    # indistinguishable from '1 year' -- confirm this is intended.
    data['employmentLength'] = data['employmentLength'].replace(
        {'10+ years': "10 years", '< 1 year': "1 years"}
    )
    data['employmentLength'] = data['employmentLength'].apply(employmenLength_to_int)

In [98]:
# 1.3.3 Cardinality of the categorical features
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]

# Stack train and test so the counts cover every value seen in either set.
data = pd.concat([data_train, data_test_a])

# Print the number of distinct (non-null) values of each categorical column.
for f, n_unique in data[cate_features].nunique().items():
    print(f, '类型数：', n_unique)

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 298101
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 935
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 47903
policyCode 类型数： 1


In [99]:
# policyCode has a single distinct value, so it carries no information:
# remove it from both frames in place.
for data in (data_train, data_test_a):
    data.drop(columns=['policyCode'], inplace=True)

# regionCode is already stored as a code, so it needs no further encoding.

In [100]:
# 1.3.4 One-hot encoding (for features with few distinct values and no
#       inherent order); the effect is similar to pd.get_dummies().
# id gender         id gender_M  gender_F
# 1   M       ->     1    1        0
# 2   F              2    0        1

def cate_colName(Transformer, category_cols, drop='if_bianry'):
    """Build the output column names for a fitted one-hot encoder.

    Parameters
    ----------
    Transformer : fitted encoder
        Object exposing ``categories_``: one sequence of category values per
        encoded column, in the same order as ``category_cols``.
    category_cols : sequence of str
        Names of the discrete columns that were fed to the encoder.
    drop : str or None, default 'if_bianry'
        When set to ``'if_binary'`` (the historical misspelling
        ``'if_bianry'`` is still accepted and remains the default), a column
        with exactly two categories keeps its original name as a single
        output column. Any other value expands every column into one name
        per category.

    Returns
    -------
    list of str
        ``<column>_<category>`` names, or the bare column name for collapsed
        binary columns.

    Notes
    -----
    Only ``categories_`` is read; the encoder's own ``drop`` setting is not
    inspected. The names therefore match the encoder output only if the
    encoder was built with a ``drop`` setting equivalent to the one passed
    here.
    """
    # Accept sklearn's real spelling as well as the legacy typo so a caller
    # passing 'if_binary' is no longer silently treated as "expand all".
    collapse_binary = drop in ('if_bianry', 'if_binary')
    categories = Transformer.categories_

    cate_col_new = []
    for position, col_name in enumerate(category_cols):
        levels = categories[position]
        if collapse_binary and len(levels) == 2:
            # Binary column: a single indicator column, original name kept.
            cate_col_new.append(col_name)
        else:
            cate_col_new.extend(col_name + '_' + str(level) for level in levels)

    return cate_col_new


In [101]:
# Unordered categorical columns that get one-hot encoded.
cate_features_no_sequence = ['homeOwnership', 'verificationStatus', 'purpose']

# Encode train and test together so both end up with the same dummy columns.
data = pd.concat([data_train, data_test_a], axis=0)

hot_data = pd.DataFrame(data, columns=cate_features_no_sequence)

ohe = OneHotEncoder()
# Fit and transform in a single pass. The previous version discarded the
# result of fit_transform and then transformed the whole matrix a second time.
hot_matrix = ohe.fit_transform(hot_data)

# The new frame gets a fresh 0..N-1 index (train rows first, then test rows),
# unlike `data`, whose index repeats after the concat.
hot_all = pd.DataFrame(
    hot_matrix.toarray(),
    columns=cate_colName(ohe, cate_features_no_sequence),
)

In [102]:
# Display the (rows, columns) shape of the training frame at this point.
data_train.shape

(800000, 48)

In [103]:
# Split the stacked one-hot matrix back into its train and test parts.
# The leading len(data_train) rows of hot_all belong to the training set.
hot_train = hot_all.iloc[:len(data_train)]
# Append the dummy columns next to the existing training columns.
data_train = pd.concat([data_train, hot_train], axis='columns')

data_train


Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,purpose_10,purpose_11,purpose_12,purpose_13
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,18000.0,5,18.49,461.90,D,D2,219843.0,5.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10.0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,799995,25000.0,3,14.49,860.41,C,C4,2659.0,7.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
799996,799996,17000.0,3,7.90,531.94,A,A4,29205.0,10.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
799997,799997,6000.0,3,13.33,203.12,C,C3,2582.0,10.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
799998,799998,19200.0,3,6.92,592.14,A,A4,151.0,10.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
# The rows after the training block belong to the test set; their index is
# reset to start at 0 so it lines up with data_test_a for the concat.
hot_test = hot_all.iloc[len(data_train):].reset_index(drop=True)
data_test_a = pd.concat([data_test_a, hot_test], axis='columns')

data_test_a

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,purpose_10,purpose_11,purpose_12,purpose_13
0,800000,14000.0,3,10.99,458.28,B,B3,7027.0,10.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,800001,20000.0,5,14.65,472.14,C,C5,60426.0,10.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,800002,12000.0,3,19.99,445.91,D,D4,23547.0,2.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,800003,17500.0,5,14.31,410.02,C,C4,636.0,4.0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,800004,35000.0,3,17.09,1249.42,D,D1,368446.0,1.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,999995,7000.0,3,11.14,229.64,B,B2,330967.0,7.0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199996,999996,6000.0,3,6.24,183.19,A,A2,38930.0,1.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199997,999997,14000.0,5,15.88,339.57,C,C4,282016.0,8.0,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199998,999998,8000.0,3,18.06,289.47,D,D2,97.0,4.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
# 1.3.5 Label encoding / hand-written mapping (for ordered features and
#       features with many distinct values)
# Grades A..G become the integers 1..7.
grade_rank = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map(grade_rank)

le = LabelEncoder()
# High-cardinality categorical columns are converted to integer codes.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    # Values are compared as strings; convert each frame's column once.
    train_text = data_train[col].astype(str).tolist()
    test_text = data_test_a[col].astype(str).tolist()
    # Fit on train + test so every value seen in either frame gets a code.
    le.fit(train_text + test_text)
    data_train[col] = le.transform(train_text)
    data_test_a[col] = le.transform(test_text)
print('Label Encoding 完成')

100%|██████████| 4/4 [00:08<00:00,  2.01s/it]

Label Encoding 完成





In [109]:
# Print the final (rows, columns) shape of both frames after encoding.
print(data_train.shape, data_test_a.shape)

(800000, 71) (200000, 70)
