# Setting working directory

## Load the Google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Change the working dir to:

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl'`

In [None]:
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl')
!ls

3.0-FeatureEngineering-original.ipynb		  originalDataset
3.1-FeatureEngineering-LagrangeInterpolate.ipynb  preprocessedData
3.2-FeatureEngineering-OtherChanges.ipynb	  submissionResults
4-Tweaking.ipynb				  wasted
5.1-Ensemble-Stacking.ipynb


## Go to this place for original dataset: 

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl/originalDataset'`

# Importing libraries

In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/52/39/128fff65072c8327371e3c594f3c826d29c85b21cb6485980353b168e0e4/catboost-0.24.2-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.2MB 43kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.2


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
# from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
# import tqdm
warnings.filterwarnings('ignore')

# Starting feature engineering

## Load original dataset

In [None]:
data_train = pd.read_csv('originalDataset/train.csv')
data_test_a = pd.read_csv('originalDataset/testA.csv')

## Load preprocessed dataset

In [None]:
data_train = pd.read_csv('preprocessedData/lagrangeInterpolated_train-1.csv')
data_test_a = pd.read_csv('preprocessedData/lagrangeInterpolated_test-1.csv')

In [None]:
data_train_cp = data_train.copy()
data_test_a_cp = data_test_a.copy()

In [None]:
# data_train.isnull().sum()

## Numerical features and category features

In [None]:
label = 'isDefault'
# Every non-object column is treated as numerical; all remaining columns are categorical.
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.tolist()
category_fea = [column for column in data_train.columns if column not in numerical_fea]
# The label is numeric as well, so take it out of the feature list.
numerical_fea.remove(label)

## Fill the null. 

**Mind this**: Some other filling schemes can be used. 

In [None]:
data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n3                  

In [None]:
data_train["n14"].head()

0    2.0
1    NaN
2    4.0
3    1.0
4    4.0
Name: n14, dtype: float64

Change the infinite number into NaN. 

In [None]:
# data_train.replace([np.inf, -np.inf], np.nan, inplace=True)
# data_test_a.replace([np.inf, -np.inf], np.nan, inplace=True)

### Lagrange interpolation

In [None]:
# 创建函数，做插值，以空值前后5个数据（共10个数据）为例做插值  
from scipy.interpolate import lagrange  

## https://www.programmersought.com/article/37145216331/
def fillNanWithLagr(col,nv=-1,k=3):
    """Fill missing entries of a numeric Series by local Lagrange interpolation.

    For every missing position, a Lagrange polynomial is fitted through the
    known values found up to ``k`` positions before and ``k`` positions after
    it, and evaluated at the missing position.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to fill. It is not modified; a filled copy is returned.
    nv : scalar, default -1
        Marker of a missing entry. Pass ``np.nan`` to treat nulls as missing.
    k : int, default 3
        Number of neighbouring positions to look at on each side.

    Returns
    -------
    pandas.Series
        Copy of ``col`` with the missing entries replaced. When something was
        filled the result is float-valued. A missing entry with no known
        neighbour inside its window is left as it is.
    """
    # pd.isnull catches every NaN-like marker (an identity test against np.nan
    # misses e.g. float('nan')).
    if pd.isnull(nv):
        missing = col.isnull().to_numpy()
    else:
        missing = (col == nv).to_numpy()

    if not missing.any():
        return col.copy()

    values = col.to_numpy(dtype=float, copy=True)
    n_rows = len(values)

    # `tqdm` is the callable imported with `from tqdm import tqdm`.
    for pos in tqdm(np.flatnonzero(missing), position=0, leave=True):
        # Clip the window to the valid POSITIONS of the column; negative
        # positions would otherwise wrap around to the end of the data.
        lo = max(pos - k, 0)
        hi = min(pos + k + 1, n_rows)
        window = np.arange(lo, hi)
        # Drop every originally-missing position (this includes `pos` itself):
        # a value that was itself interpolated must not feed another fit.
        window = window[~missing[lo:hi]]
        if window.size == 0:
            continue
        values[pos] = lagrange(window, values[window])(pos)

    return pd.Series(values, index=col.index, name=col.name)

In [None]:
# # data_train_cp = data_train.copy()
# # data_test_a_cp = data_test_a.copy()

# for i, data in enumerate([data_train, data_test_a]):
#     print("in the {} dataset:".format(i + 1))
#     for fea in numerical_fea:
#         if fea == "id":
#             continue
#         print(fea)
#         data[fea] = fillNanWithLagr(data[fea], np.nan, 5)

In [None]:
# data_train.to_csv("preprocessedData/lagrangeInterpolated_train.csv", index=False)
# data_test_a.to_csv("preprocessedData/lagrangeInterpolated_test.csv", index=False)

### Median interpolation

In [None]:
# Fill numerical features with the training-set median. The medians are
# computed once and reused for the test set so both frames get the same values.
numerical_medians = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(numerical_medians)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(numerical_medians)
# Fill categorical features with the training-set mode.
# DataFrame.mode() returns a DataFrame (one row per tied mode). Passing that
# frame to fillna aligns on the row index and therefore only fills the first
# row(s); taking row 0 gives one fill value per column instead.
category_mode_frame = data_train[category_fea].mode()
if not category_mode_frame.empty:
    category_modes = category_mode_frame.iloc[0]
    data_train[category_fea] = data_train[category_fea].fillna(category_modes)
    data_test_a[category_fea] = data_test_a[category_fea].fillna(category_modes)

In [None]:
# data_train = data_train.fillna(axis = 0, method = "ffill")

## Process the dates

From the `original dates` to `number of days from a certain starting date`.

In [None]:
# Parse issueDate into datetimes, then express it as whole days since the reference date.
startdate = datetime.datetime(2007, 6, 1)
for data in [data_train, data_test_a]:
    parsed_issue_date = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    data['issueDate'] = parsed_issue_date
    # Derived time feature: number of days elapsed between startdate and the issue date.
    data['issueDateDT'] = (parsed_issue_date - startdate).dt.days

## Change the `object` type into a numerical type 

In [None]:
# data_train['employmentLength'].value_counts(dropna=False).sort_index()

In [None]:
def employmentLength_to_int(s):
    """Convert an employment-length label such as '3 years' to a small integer.

    Parameters
    ----------
    s : str, number or null
        Raw cell value.

    Returns
    -------
    numpy.int8 for string labels ('10+ years' -> 10, '< 1 year' -> 0).
    Null values and values that are already numeric are returned unchanged,
    so the function is safe on columns that were converted or imputed earlier.
    """
    if pd.isnull(s):
        return s
    if not isinstance(s, str):
        # Already numeric (e.g. a previously converted or interpolated column):
        # calling .split() here would raise AttributeError.
        return s
    first_token = s.split()[0]
    if first_token == '<':
        # '< 1 year' means less than one year.
        return np.int8(0)
    # Use the NumPy int type, not the built-in Python int; strip the '+' of '10+'.
    return np.int8(first_token.rstrip('+'))
    
for data in [data_train, data_test_a]:
    # Normalise the two non-numeric labels first so every label starts with an integer.
    # The result is assigned back instead of calling replace(..., inplace=True)
    # on the column selection: that form modifies a temporary object under
    # pandas Copy-on-Write and can leave the frame itself unchanged.
    normalised_length = data['employmentLength'].replace(
        {'10+ years': '10 years', '< 1 year': '0 years'}
    )
    data['employmentLength'] = normalised_length.apply(employmentLength_to_int)

AttributeError: ignored

**Mind this**: the `earliesCreditLine` was MMM-YYYY, but after the following preprocess, this column only has YYYY. 

So in the future, we may find other ways to preprocess this column. 

### Change the MMM-YYYY to int(YYYY)

In [None]:
for data in [data_train, data_test_a]:
    # Keep only the trailing four characters of each entry (the year) as an int.
    data['earliesCreditLine'] = [int(raw_value[-4:]) for raw_value in data['earliesCreditLine']]

In [None]:
# data_train['earliesCreditLine'].sample(5)

### How about keeping the month?

In [None]:
# Alternative to the year-only conversion: keep the month by parsing the
# original 'Mon-YYYY' strings, then count the days from the reference month.
# This needs the raw strings; if the column was already overwritten with
# integer years, reload the data before running this cell.
startmonth = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    # Vectorised parsing; '%b-%Y' matches values such as 'Aug-2001'.
    data["earliesCreditLine"] = pd.to_datetime(data["earliesCreditLine"], format='%b-%Y')
    # Subtract the reference defined in this cell rather than a variable from another cell.
    data["earliesCreditLineDT"] = (data["earliesCreditLine"] - startmonth).dt.days

In [None]:
data_train["earliesCreditLineDT"].head()

## Encode the object-type columns

### Label encoding

There are 2 methods. Just run one of them. 

In [None]:
# ## Method 1: 
# ## directly use the mapping.

# for data in [data_train, data_test_a]:
#     data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})

In [None]:
# http://sofasofa.io/forum_main_post.php?postid=1001659
## Method 2: with the help of LabelEncoder()

# Label-encode the high-cardinality categorical features (plus grade / subGrade).
# Each encoder is fitted on the train and test values together, so both frames
# share a single string -> integer mapping.
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade', 'grade']): #, 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'
    train_labels = list(data_train[col].astype(str).values)
    test_labels = list(data_test_a[col].astype(str).values)
    le = LabelEncoder().fit(train_labels + test_labels)
    data_train[col] = le.transform(train_labels)
    data_test_a[col] = le.transform(test_labels)
print('Label Encoding 完成')

100%|██████████| 5/5 [00:08<00:00,  1.73s/it]

Label Encoding 完成





In [None]:
data_train["grade"].head()

0    4
1    3
2    3
3    0
4    2
Name: grade, dtype: int64

### One hot encoding

`pd.get_dummies`: https://blog.csdn.net/maymay_/article/details/80198468

originally the onehot encoding features: 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'. 

In [None]:
# One-hot encode features that have more than two levels, are not
# high-dimensional/sparse, and are purely nominal.
one_hot_cols = ['homeOwnership', 'verificationStatus', 'purpose', 'regionCode']

# Encoding train and test independently lets each frame derive its own dummy
# columns (and its own dropped "first" level) from the values it happens to
# contain. Giving both frames one shared categorical dtype per column makes
# get_dummies emit the same columns, with the same baseline level, for both.
for col in one_hot_cols:
    shared_categories = sorted(set(data_train[col].dropna()) | set(data_test_a[col].dropna()))
    shared_dtype = pd.CategoricalDtype(categories=shared_categories)
    data_train[col] = data_train[col].astype(shared_dtype)
    data_test_a[col] = data_test_a[col].astype(shared_dtype)

data_train = pd.get_dummies(data_train, columns=one_hot_cols, drop_first=True)
data_test_a = pd.get_dummies(data_test_a, columns=one_hot_cols, drop_first=True)

# for data in [data_train, data_test_a]:
#     data = pd.get_dummies(data, 
#                           columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode' 
#                                         #  'grade', 'subGrade', 
#                                         #  'employmentTitle', 'postCode', 'title'
#                                          ]
#                           , drop_first=True)

In [None]:
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,annualIncome,issueDate,isDefault,postCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,...,regionCode_11,regionCode_12,regionCode_13,regionCode_14,regionCode_15,regionCode_16,regionCode_17,regionCode_18,regionCode_19,regionCode_20,regionCode_21,regionCode_22,regionCode_23,regionCode_24,regionCode_25,regionCode_26,regionCode_27,regionCode_28,regionCode_29,regionCode_30,regionCode_31,regionCode_32,regionCode_33,regionCode_34,regionCode_35,regionCode_36,regionCode_37,regionCode_38,regionCode_39,regionCode_40,regionCode_41,regionCode_42,regionCode_43,regionCode_44,regionCode_45,regionCode_46,regionCode_47,regionCode_48,regionCode_49,regionCode_50
0,0,35000.0,5,19.52,917.97,4,21,192026,2.0,110000.0,2014-07-01,1,44,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,Aug-2001,2,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,18000.0,5,18.49,461.9,3,16,104734,5.0,46000.0,2012-08-01,0,65,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,May-2002,6327,1.0,-29.500403,-8.833409,3.833404,3.833404,10.0,-41.000365,15.499512,12.00019,-18.00007,3.833404,13.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,12000.0,5,16.99,298.17,3,17,189510,8.0,74000.0,2015-10-01,0,266,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,May-2006,1,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,11000.0,3,7.26,340.96,0,3,249632,10.0,118000.0,2015-08-01,0,56,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,May-1999,26201,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,3000.0,3,12.99,101.07,2,11,256268,7.385708,29000.0,2016-03-01,0,227,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,Aug-1977,911,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Not doing this step

In [None]:
## Nothing is done here

### Doing both of them

In [None]:
## Do both of them.

## Define a function here for finding outliers.

A value counts as an outlier here when it lies more than 3 standard deviations away from the mean of its column.

In [None]:
def find_outliers_by_3segama(data,fea):
    """Flag the values of ``data[fea]`` that lie outside mean +/- 3 standard deviations.

    A string column named ``fea + '_outliers'`` is added to ``data`` in place,
    holding '异常值' (outlier) or '正常值' (normal) for every row; the same
    frame is returned.
    """
    column = data[fea]
    centre = np.mean(column)
    spread = np.std(column)
    lower_limit, upper_limit = centre - 3*spread, centre + 3*spread
    is_outlier = (column < lower_limit) | (column > upper_limit)
    data[fea+'_outliers'] = np.where(is_outlier, str('异常值'), str('正常值'))
    return data

## See some of the labels' situation. 

`isDefault` is the label.

The meaning of the outputs of the following cell is that: _in each column, rows are grouped by `正常值` or `异常值`, and the `sum()` will be the sum of multiple '1's and '0's. E.g., in column `interestRate_outliers`, there are 150000+ 正常值 rows that are positive labeled, and there are 2916 异常值 rows that are positive labeled._

In [None]:
data_train = data_train.copy()
for fea in numerical_fea:
    # numerical_fea was built before the one-hot encoding step, so it can still
    # list columns that get_dummies has since replaced; skip those instead of
    # failing with a KeyError.
    if fea not in data_train.columns:
        continue
    data_train = find_outliers_by_3segama(data_train,fea)
    # How many rows are normal vs. outlier for this feature ...
    print(data_train[fea+'_outliers'].value_counts())
    print("-"*10)
    # ... and how many positive labels fall in each of the two groups.
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*10)
    print()

In [None]:
# Preview the first and last three rows. DataFrame.append was removed in
# pandas 2.0; pd.concat produces the same stacked frame.
pd.concat([data_train.head(3), data_train.tail(3)])

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,loanAmnt_outliers,term_outliers,interestRate_outliers,installment_outliers,employmentTitle_outliers,homeOwnership_outliers,annualIncome_outliers,verificationStatus_outliers,purpose_outliers,postCode_outliers,regionCode_outliers,dti_outliers,delinquency_2years_outliers,ficoRangeLow_outliers,ficoRangeHigh_outliers,openAcc_outliers,pubRec_outliers,pubRecBankruptcies_outliers,revolBal_outliers,revolUtil_outliers,totalAcc_outliers,initialListStatus_outliers,applicationType_outliers,title_outliers,policyCode_outliers,n0_outliers,n1_outliers,n2_outliers,n3_outliers,n4_outliers,n5_outliers,n6_outliers,n7_outliers,n8_outliers,n9_outliers,n10_outliers,n11_outliers,n12_outliers,n13_outliers,n14_outliers
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
799997,799997,6000.0,3,13.33,203.12,C,C3,2582.0,10.0,1,65000.0,2,2015-10-01,1,0,47.0,17,12.11,1.0,670.0,674.0,5.0,0.0,0.0,6381.0,51.9,36.0,1,0,2002-07-01,0.0,1.0,2.0,1.0,4.0,4.0,1.0,4.0,26.0,4.0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
799998,799998,19200.0,3,6.92,592.14,A,A4,151.0,10.0,0,96000.0,2,2015-02-01,0,4,34.0,18,29.25,0.0,675.0,679.0,16.0,0.0,0.0,69702.0,61.3,37.0,1,0,1994-01-01,4.0,1.0,0.0,5.0,8.0,8.0,7.0,10.0,6.0,12.0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值
799999,799999,9000.0,3,11.06,294.91,B,B3,13.0,5.0,0,120000.0,0,2018-08-01,0,4,62.0,13,8.99,0.0,695.0,699.0,7.0,0.0,0.0,8420.0,72.6,13.0,0,0,2002-02-01,4.0,1.0,2.0,2.0,3.0,3.0,2.0,3.0,4.0,4.0,...,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值,正常值


## Delete 异常值, which is abnormal values. 

Only rows whose numerical columns are all flagged as normal values are kept; every other row is dropped.

**Mind this**: sometimes abnormal values cannot be removed. They should also be kept, because sometimes the abnormal values can lead to discoveries. 

### Actually delete abnormal values

In [None]:
# Remove outliers: keep only the rows flagged '正常值' (normal) in every
# available *_outliers column.
# Only flag columns that actually exist are checked, so the cell does not raise
# a KeyError for features that were never flagged; with no flag columns at all,
# every row is kept.
outlier_flag_cols = [
    fea + '_outliers' for fea in numerical_fea
    if fea + '_outliers' in data_train.columns
]
# One combined mask replaces re-filtering and re-indexing the frame once per feature.
is_normal_everywhere = (data_train[outlier_flag_cols] == '正常值').all(axis=1)
data_train = data_train[is_normal_everywhere].reset_index(drop=True)

In [None]:
data_train.shape

(612742, 92)

### Don't delete any abnormal values


In [None]:
## do nothing at all here. 

## Distribute the data into bins 

The following cells are illustrative examples only.

**Mind this**: Multiple methods can be used. 

### Some examples

In [None]:
# Map to evenly spaced bins by division: each bin value is loanAmnt / 1000, floored.
# NOTE(review): `data` is not assigned in this cell; it is whatever frame an
# earlier `for data in [...]` loop left bound. Confirm which frame is intended.
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)

In [None]:
## Map to bins of exponentially growing width via the base-10 logarithm.
## NOTE(review): `data` is not assigned in this cell; it is whatever frame an
## earlier `for data in [...]` loop left bound. Confirm which frame is intended.
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))

`pd.qcut()` cuts the values into quantile-based bins.

The functionality of parameter `labels=False/True` can be seen from the following cells.  

In [None]:
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
data["loanAmnt_bin3"].head()

0    5
1    7
2    4
3    6
4    9
Name: loanAmnt_bin3, dtype: int64

In [None]:
data_train_tst = data_train.copy()
data_train_tst['loanAmnt_bin3'] = pd.qcut(data_train_tst['loanAmnt'], 10, labels=False)
data_train_tst["loanAmnt_bin3"].head()

0    9
1    7
2    4
3    0
4    4
Name: loanAmnt_bin3, dtype: int64

In [None]:
data_train_tst = data_train.copy()
data_train_tst['loanAmnt_bin3'] = pd.qcut(data_train_tst['loanAmnt'], 10)
data_train_tst["loanAmnt_bin3"].head()

0    (25000.0, 40000.0]
1    (17500.0, 20000.0]
2    (10000.0, 12000.0]
3     (499.999, 5000.0]
4    (10000.0, 12000.0]
Name: loanAmnt_bin3, dtype: category
Categories (10, interval[float64]): [(499.999, 5000.0] < (5000.0, 6500.0] < (6500.0, 8500.0] <
                                     (8500.0, 10000.0] ... (15000.0, 17500.0] < (17500.0, 20000.0] <
                                     (20000.0, 25000.0] < (25000.0, 40000.0]]

### Actually don't bin at all

In [None]:
## do nothing at all

## Combinatorial features

（交互特征）is the combination of original features. https://www.msra.cn/zh-cn/news/features/kdd-2018-xdeepfm#:~:text=%E7%89%B9%E5%BE%81%E4%BA%A4%E4%BA%92%E6%8C%87%E7%9A%84%E6%98%AF,user_id%2C%20item_id%5D%E7%9A%84%E8%81%94%E7%B3%BB%E3%80%82

**Mind this**: there may be multiple ways of combining features. Try more methods. 

In [None]:
## The "target mean" here is similar in spirit to the target encoding I used in
## an earlier Kaggle project, although the concrete algorithm differs a little.
## The algorithm:
### group the rows by `col` and take the mean of the label inside each group,
## then map that per-group mean back onto the rows as a new column.
## This is one way of building combined features and is worth remembering.
for col in ['grade', 'subGrade']:
    target_mean_col = col + '_target_mean'
    # Two-column frame: the level of `col` and the mean label of that level.
    level_means = data_train.groupby([col])['isDefault'].mean().reset_index(name=target_mean_col)
    level_means.index = level_means[col].values
    print(level_means.head())
    # level -> mean label lookup used to build the new column.
    temp_dict = level_means[target_mean_col].to_dict()
    print(col, " ", temp_dict, sum(temp_dict.values()), "\n")

    data_train[target_mean_col] = data_train[col].map(temp_dict)
    data_test_a[target_mean_col] = data_test_a[col].map(temp_dict)


   grade  grade_target_mean
0      0           0.060375
1      1           0.132992
2      2           0.225020
3      3           0.303852
4      4           0.384291
grade   {0: 0.06037476460858794, 1: 0.13299242586332322, 2: 0.22501959333914529, 3: 0.3038517241090638, 4: 0.38429061641005374, 5: 0.45352437936283, 6: 0.4970171513795675} 2.0570706550725717 

   subGrade  subGrade_target_mean
0         0              0.031919
1         1              0.045697
2         2              0.055882
3         3              0.067221
4         4              0.085399
subGrade   {0: 0.031919410243544714, 1: 0.04569698065449286, 2: 0.05588170381814169, 3: 0.06722064148991205, 4: 0.08539886975949533, 5: 0.10292105138974093, 6: 0.11226174056571778, 7: 0.12923868312757203, 8: 0.1486388238145246, 9: 0.16564893291126315, 10: 0.19135984870870515, 11: 0.20689215602957423, 12: 0.22457598712877924, 13: 0.25011293820021685, 14: 0.2615487780647725, 15: 0.27798153120702074, 16: 0.2975723763570567, 17: 0.3040

In [None]:
data_train["grade"].head()

0    4
1    3
2    3
3    0
4    2
Name: grade, dtype: int64

In [None]:
# Other derived variables: ratios to the group mean and to the group std.
## What does this step add?
## The right-hand side of each division groups the rows by one n* column,
## computes a statistic of `grade` per group and hands it back to every row of
## that group, so each row carries its own group's value.
## With the division, the result is each row's grade relative to its group's
## mean (or std) grade.
## Takeaway: groupby + transform is a handy tool for building combined features.
n_columns = ['n{}'.format(i) for i in range(15)]
for df in [data_train, data_test_a]:
    for item in n_columns:
        grade_by_item = df.groupby([item])['grade']
        df['grade_to_mean_' + item] = df['grade'] / grade_by_item.transform('mean')
        df['grade_to_std_' + item] = df['grade'] / grade_by_item.transform('std')

In [None]:
data_train[['grade', 'subGrade', 'grade_target_mean', "subGrade_target_mean"]].head()

Unnamed: 0,grade,subGrade,grade_target_mean,subGrade_target_mean
0,4,21,0.384291,0.376903
1,3,16,0.303852,0.297572
2,3,17,0.303852,0.304015
3,0,3,0.060375,0.067221
4,2,11,0.22502,0.206892


In [None]:
# df.groupby(["n0"])['grade'].transform('mean')

In [None]:
# df.groupby(["n0"])['grade'].mean()

## ~Fill the empty cells again~

Use the row above the n/a row to fill the n/a cells. In another word, fill the n/a with previous value.

There are some cells used previously. I think after they are used, there is no need to run the empty cell again. 

**Mind this**: other methods may be used.

In [None]:
# data_train = data_train.fillna(axis = 0, method = "ffill")

## Delete the features that will not be used to fit the model

No `xx_outliers` features, no labels, no original dates. 

In [None]:
# Columns that are never fed to the model: the id, the raw date columns and the label.
excluded_columns = {'id', 'issueDate', "earliesCreditLine", 'isDefault'}
features = [
    column for column in data_train.columns
    if column not in excluded_columns and '_outliers' not in column
]
y_train = data_train['isDefault']
x_train = data_train[features]
x_test = data_test_a[features]

In [None]:
# for data in [data_train, data_test_a]:
#     data.drop(["issueDate", "id"], axis = 1, inplace = True)

## Optimize the memory size of the dataset 

In [None]:
# reduce_mem_usage shrinks the memory footprint of a DataFrame by adjusting column dtypes.
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are replaced in place.
    use_float16 : bool, default True
        When True, float columns whose range fits are cast to float16. That
        cast is lossy (about three significant decimal digits); pass False to
        use float32 as the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the casts.

    Notes
    -----
    * Signed-integer columns go to the smallest of int8/int16/int32 whose
      range strictly contains the column's min and max.
    * Float columns go to float16/float32 when the range fits, else float64.
    * Object columns become ``category``.
    * Every other dtype (bool, unsigned int, datetime, category, extension
      dtypes) is left untouched: casting those to a float type would widen
      or break them.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the printed unit is really MB.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    if use_float16:
        float_candidates = (np.float16, np.float32)
    else:
        float_candidates = (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit a smaller float type (or the column is all-NaN).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# reduce_mem_usage casts the columns of the frame it receives and returns that
# same frame, so x_train_small / x_test_small are the same objects as
# x_train / x_test (not separate copies).
x_train_small = reduce_mem_usage(x_train)
x_test_small = reduce_mem_usage(x_test)

Memory usage of dataframe is 516800128.00 MB
Memory usage after optimization is: 332000128.00 MB
Decreased by 35.8%
Memory usage of dataframe is 129200128.00 MB
Memory usage after optimization is: 83000128.00 MB
Decreased by 35.8%


In [None]:
x_train_small.shape

(800000, 142)

## Change the distribution of the data

### Log all of the numerical data

## ~Calculating covariance~

The covariance is more useful when you want to select features, I guess. Perhaps we don't have to do anything here.

In [None]:
# x_train = data_train.drop(['isDefault'], axis=1)
# #计算协方差
# data_corr = x_train.corrwith(data_train["isDefault"]) #计算相关性
# data_corr

In [None]:
# result = pd.DataFrame(columns=['features', 'corr'])
# result['features'] = data_corr.index
# result['corr'] = data_corr.values
# result

Visualize the correlation: 

In [None]:
# numerical_fea

In [None]:
# # 当然也可以直接看图
# numerical_fea.remove("id")
# data_numeric = data_train[numerical_fea]
# correlation = data_numeric.corr()

# f, ax = plt.subplots(figsize = (7, 7))
# plt.title('Correlation of Numeric Features with Price',y=1,size=16)
# sns.heatmap(correlation,square = True,  vmax=0.8)

In [None]:
# list(data_train.columns)

## ~Select some features~

**Mind this**: other methods may be used. But I have tried the following code, it seems to degrade the performance. So I don't suggest to use the feature selection if we have enough computational resources. 

In [None]:
# from sklearn.feature_selection import SelectKBest
# from scipy.stats import pearsonr
# #选择K个最好的特征，返回选择特征后的数据
# #第一个参数为计算评估特征是否好的函数，该函数输入特征矩阵和目标向量，
# #输出二元组（评分，P值）的数组，数组第i项为第i个特征的评分和P值。在此定义为计算相关系数
# #参数k为选择的特征个数

# selector = SelectKBest(k=50)
# selector.fit(
#     x_train_small,
#     y_train
# )

In [None]:
# colNums = selector.get_support(True)

In [None]:
# selectedFeatures = []
# for i, col in enumerate(list(x_train_small.columns)):
#     if i in colNums:
#         selectedFeatures.append(col)
# len(selectedFeatures)

In [None]:
# x_train_small_featureSelected = x_train_small[selectedFeatures]
# x_test_small_featureSelected = x_test_small[selectedFeatures]

In [None]:
## Some other feature selection methods

# from sklearn.feature_selection import VarianceThreshold
# #其中参数threshold为方差的阈值
# VarianceThreshold(threshold=3).fit_transform(
#     data_train[['grade', 'subGrade', 'grade_target_mean', "subGrade_target_mean"]],
#     data_train["isDefault"]
# )

# Save preprocessed data

In [None]:
x_train.head()

In [None]:
x_train_small.to_csv("preprocessedData/x_train-1105-2.csv", index=False)
x_test_small.to_csv("preprocessedData/x_test-1105-2.csv", index=False)
y_train.to_csv("preprocessedData/y_train-1105-2.csv", index=False)