## 特征工程
特征工程是从原始数据（raw data）中提取特征的过程。这些特征应从多种角度描述数据，使得基于这些特征训练的模型具有优秀的性能。特征工程流程包括特征处理、特征选择、特征变换、特征合成等。

### 特征处理
1. 数据清洗

对重复特征、无用特征进行删除：

In [None]:
# 读取数据
import datetime

import numpy as np
import pandas as pd
from scipy import stats
# Load the training and test CSV files (both decoded as UTF-8).
train_data, test_data = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ("./train.csv", "./testA.csv")
)

In [None]:
# Duplicate features: remove the repeated 'n2.*' columns.
# The two frames list different sets of duplicates, so each gets its own list.
train_data = train_data.drop(columns=['n2.1'])
test_data = test_data.drop(columns=['n2.1', 'n2.2', 'n2.3'])

In [None]:
# List the columns that hold at most one distinct value (zero variance, so
# they carry no information) and print them for both frames.
train_one_value_fea = train_data.columns[train_data.nunique() <= 1].tolist()
test_one_value_fea = test_data.columns[test_data.nunique() <= 1].tolist()
print(train_one_value_fea, test_one_value_fea)

# 'policyCode' is then removed from both frames.
train_data = train_data.drop(columns=['policyCode'])
test_data = test_data.drop(columns=['policyCode'])

2. 内存压缩

根据各列数值的取值范围，将其转换为占用内存更小的数据类型

In [None]:
# 内存压缩
def memory_compress(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that covers their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are considered; every other column is left untouched.  Integer columns are
    narrowed to int8, int16 or int32 when their min and max fit (bounds are
    inclusive), otherwise they keep their dtype.  Float columns are narrowed to
    float16 or float32 when their range fits strictly inside that type's
    limits, otherwise they are stored as float64.

    NOTE: the float narrowing is chosen from the value *range* only, so it is
    lossy -- float16 keeps roughly three significant decimal digits.

    Args:
        df: DataFrame to compress.  It is modified in place.
        verbose: When true, print the memory footprint before and after and
            the relative saving.

    Returns:
        The same ``df`` object, with narrowed column dtypes.
    """
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    # Candidate target types, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        col_min = df[col].min()
        col_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column peaking at exactly 127 fits int8.
                if limits.min <= col_min and col_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits: the column already needs its current width.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if col_min > limits.min and col_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN columns, whose min/max are NaN.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the ratio against a zero-byte starting footprint.
        ratio = (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('start_mem: {:.2f} MB'.format(start_mem))
        print('end_mem: {:.2f} MB'.format(end_mem))
        print('compress ratio: {:.2%}'.format(ratio))
    return df
# Compress the training frame first, then the test frame.
train_data, test_data = (
    memory_compress(frame) for frame in (train_data, test_data)
)

In [None]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index,
# then print the combined frame's column summary.
data = pd.concat((train_data, test_data), ignore_index=True)
data.info()

3. 非数值类型特征处理

对object字段进行处理 ['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']

In [None]:
# EDA showed that grade and subGrade have a clear ordinal relationship with
# default, so both are encoded as integers on one shared scale:
#   grade letter at position i in 'ABCDEFG'  ->  6*i + 1
#   sub-grade '<letter><k>' with k in 1..5    ->  6*i + 1 + k
grade_letters = 'ABCDEFG'

# Loan grade
grade_codes = {letter: 6 * idx + 1 for idx, letter in enumerate(grade_letters)}
data['grade'] = data['grade'].map(grade_codes)

# Loan sub-grade
sub_grade_codes = {}
for idx, letter in enumerate(grade_letters):
    for level in range(1, 6):
        sub_grade_codes[letter + str(level)] = 6 * idx + 1 + level
data['subGrade'] = data['subGrade'].map(sub_grade_codes)
# Employment length
def employmentLength_to_int(s):
    """Convert an employment-length label such as ``'3 years'`` to ``np.int8``.

    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged.  The raw labels ``'10+ years'`` and ``'< 1 year'`` are accepted
    directly and map to 10 and 0, so the function no longer depends on those
    labels having been rewritten beforehand.

    Args:
        s: A label whose first whitespace-separated token is the number of
            years (optionally suffixed with ``+``), a label starting with
            ``<``, or a null value.

    Returns:
        ``np.int8`` number of years, or ``s`` itself when it is null.

    Raises:
        ValueError: If the leading token is not an integer.
    """
    if pd.isnull(s):
        return s
    label = s.strip()
    if label.startswith('<'):
        # '< 1 year' counts as zero full years.
        return np.int8(0)
    return np.int8(int(label.split()[0].rstrip('+')))
# Rewrite the two labels that do not start with a plain number.  The result is
# assigned back to the column instead of calling replace(..., inplace=True) on
# the column selection: under pandas Copy-on-Write the in-place form only
# changes a temporary object and leaves `data` untouched.
data['employmentLength'] = data['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)
# Parse the labels to integers, then fill the gaps with the column median.
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'] = data['employmentLength'].fillna(data['employmentLength'].median())
# Distribution of the cleaned values, ordered by value (NaN included in the count).
data['employmentLength'].value_counts(dropna=False).sort_index()

# Loan issue date, expressed as the number of days elapsed since 2007-06-01.
start_date = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
issue_dates = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
data['issueDate'] = (issue_dates - start_date).dt.days

# earliesCreditLine: month in which the borrower's earliest reported credit
# line was opened ('%b-%Y' text), converted to the number of days elapsed
# since the earliest such month found in the column.
credit_line_dates = pd.to_datetime(data['earliesCreditLine'], format='%b-%Y')
start_date = credit_line_dates.min()
data['earliesCreditLine'] = (credit_line_dates - start_date).dt.days

4. 特征选择

删除EDA中相关性系数大于0.9的变量：

In [None]:
# Columns discarded at the feature-selection step.
redundancy_fea = ['id','installment','ficoRangeHigh','n10','n9','grade', 'subGrade']
# Narrower alternatives that were tried:
#redundancy_fea = ['ficoRangeHigh']
#redundancy_fea = ['id','ficoRangeHigh']

# NOTE(review): only train_data / test_data are pruned here; the combined
# `data` frame keeps these columns -- confirm that this is intended.
train_data, test_data = (
    frame.drop(columns=redundancy_fea) for frame in (train_data, test_data)
)

5. 特征转换

- 归一化、Box-Cox转换，对非决策树类模型有效
- 高维稀疏类别特征使用Label Encode转换
- 低维类别变量使用One-hot编码
- 数值跨域大且稀疏特征使用分箱

In [None]:
def scale_01(col):
    """Min-max scale one column to the [0, 1] range.

    Intended for ``DataFrame.apply(scale_01, axis=0)``, which passes each
    column as a Series whose ``name`` is the column label.

    Columns named in the categorical list below, and columns that are not
    numeric (or are boolean), are returned unchanged.  A column with no spread
    (constant, or all NaN) is shifted by its minimum instead of being divided
    by zero, so a constant column becomes all zeros.

    Args:
        col: The column to scale, as a named Series.

    Returns:
        A Series with the same index as ``col``.
    """
    categorical_cols = ('term', 'grade', 'subGrade', 'homeOwnership',
                        'verificationStatus', 'purpose', 'initialListStatus')
    # The membership test must use the column *name*: testing the Series
    # itself compares every element against each label.
    if col.name in categorical_cols:
        return col
    if not pd.api.types.is_numeric_dtype(col) or pd.api.types.is_bool_dtype(col):
        return col

    col_min = col.min()
    value_range = col.max() - col_min
    if not value_range > 0:
        # Zero or NaN spread: avoid the 0/0 division.
        return col - col_min
    return (col - col_min) / value_range
# Run scale_01 over every column of `data` (axis=0 hands each column to the
# function as a Series) and keep the result.
data = data.apply(scale_01, axis=0)

def box_cox(data, shift=2):
    """Apply a per-column Box-Cox transform to ``data`` in place.

    For every eligible column the Box-Cox lambda is fitted by
    ``scipy.stats.boxcox`` on the non-missing values (after adding ``shift``)
    and the transformed values are written back as float64; missing values
    stay NaN.  The name of each transformed column is printed.

    Skipped columns:
      * the categorical / ordinal columns listed below;
      * ``isDefault`` -- it is used elsewhere in this file as the class label,
        so it must keep its original values;
      * non-numeric columns;
      * columns with no non-missing values or a single distinct value
        (Box-Cox cannot be fitted on constant data).

    Args:
        data: DataFrame to transform.  Modified in place.
        shift: Constant added before transforming, because Box-Cox requires
            strictly positive input.

    Returns:
        The same ``data`` object.

    Raises:
        ValueError: Propagated from ``scipy.stats.boxcox`` when a shifted
            column still contains non-positive values.
    """
    skip_cols = ('term', 'grade', 'subGrade', 'ficoRangeLow', 'homeOwnership',
                 'verificationStatus', 'purpose', 'initialListStatus',
                 'isDefault')
    for col in data.columns:
        if col in skip_cols or not pd.api.types.is_numeric_dtype(data[col]):
            continue
        values = data[col].astype(float).to_numpy()
        observed = ~np.isnan(values)
        sample = values[observed] + shift
        if sample.size == 0 or np.all(sample == sample[0]):
            continue
        print(col)
        fitted, _ = stats.boxcox(sample)
        transformed = np.full(values.shape, np.nan)
        transformed[observed] = fitted
        data[col] = transformed
    return data
# Box-Cox transform the columns of `data` that box_cox does not exclude.
data = box_cox(data)

In [None]:
# 高维稀疏类别特征
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is reused; fit_transform refits it on each column.
le = LabelEncoder()

# Replace each listed column by its integer label codes, stored under '<col>_le'.
for col in ('employmentTitle', 'postCode', 'title', 'regionCode'):
    #data[col+'_cnts'] = data.groupby([col])['id'].transform('count')
    #data[col+'_rank'] = data.groupby([col])['id'].rank(ascending=False).astype(np.int32)
    encoded = le.fit_transform(data[col])
    del data[col]
    data[col + '_le'] = encoded

In [None]:
# Low-cardinality categorical variables: one-hot encode them, dropping the
# first level of each.
data = pd.get_dummies(
    data,
    columns=['purpose', 'initialListStatus', 'applicationType'],
    drop_first=True,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
def binning(x, y, nan):
    """Derive bin edges for one feature from a small decision tree.

    A ``DecisionTreeClassifier`` (entropy criterion, at most 6 leaves, each
    leaf holding at least 5% of the samples) is fitted on the single feature
    ``x`` against ``y``; the thresholds of its split nodes become the inner
    bin edges.

    Args:
        x: Feature values (a Series); missing entries are filled with ``nan``.
        y: Target values (a Series) aligned with ``x``.
        nan: Replacement value for missing entries of ``x``.

    Returns:
        A list of edges: the minimum of ``x``, the sorted split thresholds,
        then the maximum of ``x`` plus 0.1.
    """
    x = x.fillna(nan).values
    y = y.values

    clf = DecisionTreeClassifier(criterion='entropy',
                                 max_leaf_nodes=6,
                                 min_samples_leaf=0.05)
    clf.fit(x.reshape(-1, 1), y)

    fitted_tree = clf.tree_
    # Keep only nodes whose left and right child ids differ, i.e. real splits.
    is_split = fitted_tree.children_left != fitted_tree.children_right
    cut_points = sorted(fitted_tree.threshold[is_split])

    # The upper edge is pushed past the maximum by 0.1.
    return [x.min()] + cut_points + [x.max() + 0.1]

# Feature bucketing: bin edges are learned from the first `offset` rows only
# (presumably the training rows -- verify against the concat order) and then
# applied to the whole column, written to a new '<col>_bin' column.
bin_list = ['employmentLength', 'issueDate', 'earliesCreditLine', 'delinquency_2years', 'pubRec', 'pubRecBankruptcies']
offset = 800000
labels = data['isDefault'][:offset]
for col in bin_list:
    head = data[col][:offset]
    boundary = binning(x=head, y=labels, nan=head.median())
    # Left-closed bins; labels=False yields integer bin indices.
    data[col+'_bin'] = pd.cut(x=data[col], bins=boundary, right=False, labels=False)
    print('col: {}   boundary{}'.format(col, boundary))

6. 特征合成

In [None]:
# A few hand-crafted feature combinations.  All of them use vectorised column
# arithmetic; the previous per-element / per-row Python `apply` calls computed
# the same values far more slowly.

# Loan amount grown by the interest rate (rate is multiplied by 0.01 first).
data['totalLoanAmnt'] = data['loanAmnt'] * (1.0 + 0.01 * data['interestRate'])
# That total divided by the loan term.
data['loanAmntYear'] = data['totalLoanAmnt'] / data['term']
# Annual income left after subtracting the per-term loan amount.
data['freeWealthYear'] = data['annualIncome'] - data['loanAmntYear']

n_pos_list = ['n0', 'n1', 'n2', 'n4', 'n6', 'n7', 'n13', 'n14']
n_neg_list = ['n5','n8']

# Row-wise totals of the two groups of anonymous 'n*' features
# (missing values are skipped by sum's default skipna=True).
data['n_pos'] = data[n_pos_list].sum(axis=1)
data['n_neg'] = data[n_neg_list].sum(axis=1)

7. 不平衡数据集处理

In [None]:
# Rebalance the classes with imbalanced-learn's combined SMOTE + Tomek-links
# resampler; random_state=0 fixes its seed.
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=0)
# NOTE(review): `target` is not assigned anywhere in the visible notebook, and
# train_data is passed as-is rather than the engineered `data` frame --
# confirm where the label vector and the final feature matrix come from.
train_data, target = smote_tomek.fit_resample(train_data, target)