# Feature Engineering

https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.9.3b30b135yt4vF1&postId=129321

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

In [6]:
data_train = pd.read_csv('originalDataset/train.csv')
data_test_a = pd.read_csv('originalDataset/testA.csv')

In [10]:
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

see null values: 

In [9]:
data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n3                  

Fill the null. Some other schemes can be used. 

In [11]:
#按照平均数填充数值型特征
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
#按照众数填充类别型特征
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode())

process the dates: 

In [12]:
#转化成时间格式
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    #构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

change the `object` type into a numerical type: 

In [13]:
data_train['employmentLength'].value_counts(dropna=False).sort_index()

1 year        52489
10+ years    262753
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
NaN           46799
Name: employmentLength, dtype: int64

In [14]:
def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0]) ## mind this. use the int type in np, not general python int. 
    
for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [15]:
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda x: int(x[-4:]))

**Mind this**: the `earliesCreditLine` was MMM-YYYY, but after the following preprocess, this column only has YYYY. 

So in the future, we may find other ways to preprocess this column. 

In [16]:
data_train['earliesCreditLine'].sample(5)

174473    1994
132434    1997
26880     1993
13986     1989
461969    1998
Name: earliesCreditLine, dtype: int64

Use label encoding to change other object type columns. 

In [17]:
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})

In [18]:
# 类型数在2之上，又不是高维稀疏的,且纯分类特征
for data in [data_train, data_test_a]:
    data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

Find outlier, especially for numerical data that are outside of the range (3 * standard error). 

In [19]:
def find_outliers_by_3segama(data,fea):
    stdError = np.std(data[fea])
    meanVal = np.mean(data[fea])
    lowerBound = meanVal - 3*stdError
    higherBound = meanVal + 3*stdError
    data[fea+'_outliers'] = data[fea].apply(lambda x: str('异常值') if x < lowerBound or x > higherBound else str('正常值'))
    return data

`isDefault` is the label column. 

The meaning of the outputs of the following cell is that: _in each column, rows are grouped by `正常值` or `异常值`, and the `sum()` will be the sum of multiple '1's and '0's. E.g., in column `interestRate_outliers`, there are 150000+ 正常值 rows that are positive labeled, and there are 2916 异常值 rows that are positive labeled._

In [22]:
data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train,fea)
    print(data_train[fea+'_outliers'].value_counts())
    print("-"*10)
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*10)
    print()

正常值    800000
Name: id_outliers, dtype: int64
----------
id_outliers
正常值    159610
Name: isDefault, dtype: int64
**********

正常值    800000
Name: loanAmnt_outliers, dtype: int64
----------
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********

正常值    800000
Name: term_outliers, dtype: int64
----------
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********

正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
----------
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********

正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
----------
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********

正常值    800000
Name: employmentTitle_outliers, dtype: int64
----------
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********

正常值    799701
异常值       299
Name: homeOwnership_outliers, dtype: int64
----------
homeOwnership_outliers
异常值      

In [23]:
data_train.head(3).append(data_train.tail(3))

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,n5_outliers,n6_outliers,n7_outliers,n8_outliers,n9_outliers,n10_outliers,n11_outliers,n12_outliers,n13_outliers,n14_outliers
0,0,35000.0,5,19.52,917.97,5,E2,320.0,2.0,2,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
1,1,18000.0,5,18.49,461.9,4,D2,219843.0,5.0,0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
2,2,12000.0,5,16.99,298.17,4,D3,31698.0,8.0,0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
799997,799997,6000.0,3,13.33,203.12,3,C3,2582.0,10.0,1,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
799998,799998,19200.0,3,6.92,592.14,1,A4,151.0,10.0,0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
799999,799999,9000.0,3,11.06,294.91,2,B3,13.0,5.0,0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值


Delete 异常值, which is abnormal values. 

Only the rows that all numerical columns are normal values will be kept. Other rows will be given up. 

**Mind this**: sometimes abnormal values cannot be removed. They should also be kept, because sometimes the abnormal values can lead to discoveries. 

In [24]:
#删除异常值
for fea in numerical_fea:
    data_train = data_train[data_train[fea+'_outliers']=='正常值']
    data_train = data_train.reset_index(drop=True) 

Distribute the data into bins: 

In [25]:
# 通过除法映射到间隔均匀的分箱中，每个分箱的取值范围都是loanAmnt/1000
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)

In [26]:
## 通过对数函数映射到指数宽度分箱
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))

In [27]:
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)

Combinatorial features（交互特征）is the combination of original features. https://www.msra.cn/zh-cn/news/features/kdd-2018-xdeepfm#:~:text=%E7%89%B9%E5%BE%81%E4%BA%A4%E4%BA%92%E6%8C%87%E7%9A%84%E6%98%AF,user_id%2C%20item_id%5D%E7%9A%84%E8%81%94%E7%B3%BB%E3%80%82

**Mind this**: there may be multiple ways of combining features. Try more methods. 

In [28]:
for col in ['grade', 'subGrade']: 
    temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_mean'].to_dict()

    data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)


In [30]:
    
# 其他衍生变量 mean 和 std
for df in [data_train, data_test_a]:
    for item in ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')