# Setting working directory

## Load the Google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Change the workding dir to: 

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl'`

In [2]:
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl')
!ls


2.0-EDA-1.ipynb
3.0-FeatureEngineering-original.ipynb
3.1-FeatureEngineering-LagrangeInterpolate.ipynb
3.2-FeatureEngineering-From3.1-Lgrg+onehot.ipynb
3.3-FeatureEngineering-From3.2+Log1p.ipynb
3.4-FeatureEngineering.ipynb
3.5-FeatureEngineering-backToOrigin.ipynb
3.6-FeatureEngineering-brandNewScheme.ipynb
3.7-FeatureEngineering-GoBack-1.ipynb
3.7-FeatureEngineering-GoBack.ipynb
4.0-Tweaking-Greedy.ipynb
4.1-Tweaking-Bayesian.ipynb
5.1-Ensemble-Stacking.ipynb
5.2-Ensemble-Stacking-weightedKFold.ipynb
originalDataset
preprocessedData
submissionResults
Untitled
wasted


## Go to this place for original dataset: 

`'/content/drive/My Drive/Colab Notebooks/MachineLearningPractice/FinanceRiskControl/originalDataset'`

# Importing libraries

# Old school feature engineering

In [None]:
# !pip install catboost

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
# from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
# import tqdm
warnings.filterwarnings('ignore')

In [None]:
data_train = pd.read_csv('originalDataset/train.csv')
data_test_a = pd.read_csv('originalDataset/testA.csv')

In [None]:
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

In [None]:
#按照平均数填充数值型特征
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
#按照众数填充类别型特征
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode())

In [None]:
#转化成时间格式
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    #构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

In [None]:
def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [None]:
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

In [None]:
# 部分类别特征
cate_features = ['homeOwnership', 'verificationStatus', 'initialListStatus', 'applicationType', 
                 'regionCode', 'employmentTitle', 'purpose', 'postCode', 'title', 
                 'grade', 'subGrade', ## I regard this as serial type. 
                 'policyCode', ## This is useless
                 ] ## n11, n12 is missing
for f in cate_features:
    print(f, '类型数：', data[f].nunique())

homeOwnership 类型数： 6
verificationStatus 类型数： 3
initialListStatus 类型数： 2
applicationType 类型数： 2
regionCode 类型数： 51
employmentTitle 类型数： 79282
purpose 类型数： 14
postCode 类型数： 889
title 类型数： 12058
grade 类型数： 7
subGrade 类型数： 35
policyCode 类型数： 1


In [None]:
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})

In [None]:
# 类型数在2之上，又不是高维稀疏的,且纯分类特征
## 注意，这个方法跑了其实没什么卵用。不知道为什么。
## 就暂且不跑了得了。
# for data in [data_train, data_test_a]:
#     data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

## applicationType, initialListStatus, policyCode未处理。它们属于纯分类，但是类型数在2以下。
## 为什么不处理这样的特征呢？迷，迷，迷

In [None]:
data_train = pd.get_dummies(data_train, columns=['homeOwnership', 'verificationStatus', "applicationType", "initialListStatus", "policyCode"], drop_first=True)
data_test_a = pd.get_dummies(data_test_a, columns=['homeOwnership', 'verificationStatus', "applicationType", "initialListStatus", "policyCode"], drop_first=True)


In [None]:
# 通过除法映射到间隔均匀的分箱中，每个分箱的取值范围都是loanAmnt/1000
for data in [data_train, data_test_a]:
    data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
    data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
    data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)

In [None]:
for col in ['grade', 'subGrade']: 
    temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_mean'].to_dict()

    data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)

In [None]:
# 其他衍生变量 mean 和 std
for df in [data_train, data_test_a]:
    for item in ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')

In [None]:
#label-encode:subGrade,postCode,title
# 高维类别特征需要进行转换
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']):
    le = LabelEncoder()
    le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding 完成')

100%|██████████| 4/4 [00:07<00:00,  1.77s/it]

Label Encoding 完成





In [None]:
# 删除不需要的数据
for data in [data_train, data_test_a]:
    data.drop(['issueDate','id'], axis=1,inplace=True)

In [None]:
"纵向用缺失值上面的值替换缺失值"
data_train = data_train.fillna(axis=0,method='ffill')

In [None]:
features = [f for f in data_train.columns if f not in ['id','issueDate','isDefault'] and '_outliers' not in f]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']

# Some hands-on model fitting...

Load pretrained dataset if applicable. 

In [7]:
x_train = pd.read_csv("preprocessedData/x_train-1110-3_7-1.csv")
x_test = pd.read_csv("preprocessedData/x_test-1110-3_7-1.csv")
y_train = pd.read_csv("preprocessedData/y_train-1110-3_7-1.csv")

## Regular

In [None]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 50,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': "gpu_hist", #'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            # https://stackoverflow.com/questions/55579610/xgboost-attributeerror-dataframe-object-has-no-attribute-feature-names
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(xgb.DMatrix(test_x) , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        ## 下面这个test，可以考虑像如下这样进行，或者是求平均亦可。
        test += test_pred / kf.n_splits ## Minke's invention, no basis or reference.
        # test += test_pred
        ##
        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    print("what is kf.n_splits?", kf.n_splits)
    return train, test # / kf.n_splits

In [None]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

## Weighted

In [None]:
def cv_model_weighted(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = [] # np.zeros(test_x.shape[0])
    sumTest = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': "gpu_hist", #'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            # https://stackoverflow.com/questions/55579610/xgboost-attributeerror-dataframe-object-has-no-attribute-feature-names
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(xgb.DMatrix(test_x) , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        ## 下面这个test，可以考虑像如下这样进行，或者是求平均亦可。
        # test += test_pred / kf.n_splits ## Minke's invention, no basis or reference.
        test.append(test_pred)
        sumTest += test_pred
        ##
        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    print("what is kf.n_splits?", kf.n_splits)

    print("the cv_scores: ", cv_scores)
    sum(cv_scores)
    props = [i / sum(cv_scores) for i in cv_scores]
    print("the proportion of cv scores: ", props)
    print("the sum of proportion of cv scores", sum(props))
    prop_test = np.zeros(test_x.shape[0])
    for test_score, prop in zip(test, props):
        prop_test += test_score * prop

    return train, prop_test #, sumTest / kf.n_splits

In [None]:
def lgb_model_weighted(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model_weighted(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model_weighted(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model_weighted(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model_weighted(x_train, y_train, x_test):
    cat_train, cat_test = cv_model_weighted(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

In [None]:
xgb_train, xgb_test = xgb_model_weighted(x_train, y_train, x_test)

************************************ 1 ************************************
[0]	train-auc:0.695464	eval-auc:0.696327
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.731149	eval-auc:0.728776
[400]	train-auc:0.739646	eval-auc:0.733051
[600]	train-auc:0.744984	eval-auc:0.734734
[800]	train-auc:0.749163	eval-auc:0.735674
[1000]	train-auc:0.752928	eval-auc:0.736365
[1200]	train-auc:0.756385	eval-auc:0.736759
[1400]	train-auc:0.759639	eval-auc:0.737057
[1600]	train-auc:0.762694	eval-auc:0.737201
[1800]	train-auc:0.76569	eval-auc:0.737356
[2000]	train-auc:0.768603	eval-auc:0.73738
[2200]	train-auc:0.771428	eval-auc:0.737388
Stopping. Best iteration:
[2120]	train-auc:0.770308	eval-auc:0.737444

[0.7374438644326278]
************************************ 2 ************************************
[0]	train-auc:0.696544	eval-auc:0.693111
Multiple eval metrics have been passed: 'eval-auc' wil

In [None]:
testA_result = pd.read_csv('originalDataset/sample_submit.csv')
testA_result_pred = testA_result.copy()

In [None]:
testA_result_pred["isDefault"] = xgb_test
testA_result_pred.to_csv("submissionResults/xgboost-1109-3.7-prop-7365.csv", index=False)

## Return scores and test rsts

In [None]:
def cv_model_scoreAndTest(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = [] # np.zeros(test_x.shape[0])
    sumTest = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': "gpu_hist", #'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            # https://stackoverflow.com/questions/55579610/xgboost-attributeerror-dataframe-object-has-no-attribute-feature-names
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(xgb.DMatrix(test_x) , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        ## 下面这个test，可以考虑像如下这样进行，或者是求平均亦可。
        # test += test_pred / kf.n_splits ## Minke's invention, no basis or reference.
        test.append(test_pred)
        sumTest += test_pred
        ##
        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    print("what is kf.n_splits?", kf.n_splits)

    print("the cv_scores: ", cv_scores)
    # sum(cv_scores)
    # props = [i / sum(cv_scores) for i in cv_scores]
    # print("the proportion of cv scores: ", props)
    # print("the sum of proportion of cv scores", sum(props))
    # prop_test = np.zeros(test_x.shape[0])
    # for test_score, prop in zip(test, props):
    #     prop_test += test_score * prop

    return train, cv_scores, test # train, prop_test #, sumTest / kf.n_splits

In [None]:
def lgb_model_scoreAndTest(x_train, y_train, x_test):
    lgb_train, lgb_cvs, lgb_test = cv_model_scoreAndTest(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_cvs, lgb_test

def xgb_model_scoreAndTest(x_train, y_train, x_test):
    xgb_train, xgb_cvs, xgb_test = cv_model_scoreAndTest(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_cvs, xgb_test

def cat_model_scoreAndTest(x_train, y_train, x_test):
    cat_train, cat_cvs, cat_test = cv_model_scoreAndTest(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_cvs, cat_test 

In [None]:
def generateFinalTestRst(cvs, test):
    cv_scores = cvs
    sum(cv_scores)
    props = [i / sum(cv_scores) for i in cv_scores]
    print("avg cv scores: ", sum(cv_scores) / len(cv_scores))
    print("the proportion of cv scores: ", props)
    print("the sum of proportion of cv scores", sum(props))
    prop_test = np.zeros(test[0].shape[0])
    avg_test = np.zeros(test[0].shape[0])
    for test_score, prop in zip(test, props):
        prop_test += test_score * prop
        avg_test += test_score / len(test)
    return prop_test, avg_test

In [None]:
xgb_train, xgb_cvs, xgb_test = xgb_model_scoreAndTest(x_train, y_train, x_test)

************************************ 1 ************************************


XGBoostError: ignored

In [None]:
prop_test, avg_test = generateFinalTestRst(xgb_cvs, xgb_test)

avg cv scores:  0.7365043219004344
the proportion of cv scores:  [0.2002589926857028, 0.19934515620891693, 0.20024694871612178, 0.2002125034793091, 0.1999363989099493]
the sum of proportion of cv scores 0.9999999999999999


In [None]:
testA_result = pd.read_csv('originalDataset/sample_submit.csv')
testA_result_pred_prop = testA_result.copy()
testA_result_pred_avg = testA_result.copy()

In [None]:
testA_result_pred_prop["isDefault"] = prop_test
testA_result_pred_prop.to_csv("submissionResults/xgboost-1109-3.7-prop-1-7365.csv", index=False)

In [None]:
testA_result_pred_avg["isDefault"] = avg_test
testA_result_pred_avg.to_csv("submissionResults/xgboost-1109-3.7-avg-1-7365.csv", index=False)

## Change parameters

In [8]:
def cv_model_scoreAndTest_paraChanged(clf, train_x, train_y, test_x, clf_name, params):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = [] # np.zeros(test_x.shape[0])
    sumTest = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # params = {
            #     'boosting_type': 'gbdt',
            #     'objective': 'binary',
            #     'metric': 'auc',
            #     'min_child_weight': 5,
            #     'num_leaves': 2 ** 5,
            #     'lambda_l2': 10,
            #     'feature_fraction': 0.8,
            #     'bagging_fraction': 0.8,
            #     'bagging_freq': 4,
            #     'learning_rate': 0.1,
            #     'seed': 2020,
            #     'nthread': 28,
            #     'n_jobs':24,
            #     'silent': True,
            #     'verbose': -1,
            # }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            # params = {
            #     'booster': 'gbtree',
            #     'objective': 'binary:logistic',
            #     'eval_metric': 'auc',
            #     'gamma': 2, # 1,
            #     'min_child_weight': 0.5046, #0, #1.5,
            #     'max_depth': 10, #7, #5,
            #     'lambda': 20, # 10,
            #     'subsample': 0.9768, # 0.92, #0.7,
            #     'colsample_bytree': 0.9771, # 0.8, #0.7,
            #     'colsample_bylevel': 0.8972, # 0.9, #0.7,
            #     'eta': 0.0001, # 0.06, #0.04,
            #     'tree_method': "gpu_hist", #'exact',
            #     'seed': 2020,
            #     'nthread': 36,
            #     "silent": True,
            # }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            # https://stackoverflow.com/questions/55579610/xgboost-attributeerror-dataframe-object-has-no-attribute-feature-names
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(xgb.DMatrix(test_x) , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            # params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
            #           'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        ## 下面这个test，可以考虑像如下这样进行，或者是求平均亦可。
        # test += test_pred / kf.n_splits ## Minke's invention, no basis or reference.
        test.append(test_pred)
        sumTest += test_pred
        ##
        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    print("what is kf.n_splits?", kf.n_splits)

    print("the cv_scores: ", cv_scores)

    return train, cv_scores, test 

In [9]:
# def lgb_model_scoreAndTest_paraChanged(x_train, y_train, x_test):
#     lgb_train, lgb_cvs, lgb_test = cv_model_scoreAndTest_paraChanged(lgb, x_train, y_train, x_test, "lgb")
#     return lgb_train, lgb_cvs, lgb_test

# def xgb_model_scoreAndTest_paraChanged(x_train, y_train, x_test):
#     xgb_train, xgb_cvs, xgb_test = cv_model_scoreAndTest_paraChanged(xgb, x_train, y_train, x_test, "xgb")
#     return xgb_train, xgb_cvs, xgb_test

# def cat_model_scoreAndTest_paraChanged(x_train, y_train, x_test):
#     cat_train, cat_cvs, cat_test = cv_model_scoreAndTest_paraChanged(CatBoostRegressor, x_train, y_train, x_test, "cat")
#     return cat_train, cat_cvs, cat_test 

In [10]:
def generateFinalTestRst(cvs, test):
    cv_scores = cvs
    sum(cv_scores)
    props = [i / sum(cv_scores) for i in cv_scores]
    print("avg cv scores: ", sum(cv_scores) / len(cv_scores))
    print("the proportion of cv scores: ", props)
    print("the sum of proportion of cv scores", sum(props))
    prop_test = np.zeros(test[0].shape[0])
    avg_test = np.zeros(test[0].shape[0])
    for test_score, prop in zip(test, props):
        prop_test += test_score * prop
        avg_test += test_score / len(test)
    return prop_test, avg_test

In [11]:
xgb_train, xgb_cvs, xgb_test = cv_model_scoreAndTest_paraChanged(
    xgb, x_train, y_train, x_test, "xgb", 
    {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 0, #1.5,
        'max_depth': 7, #5,
        'lambda': 10,
        'subsample': 0.92, #0.7,
        'colsample_bytree': 0.8, #0.7,
        'colsample_bylevel': 0.9, #0.7,
        'eta': 0.06, #0.04,
        'tree_method': "gpu_hist", #'exact',
        'seed': 2020,
        'nthread': 36,
        "silent": True,
    }, 
)

************************************ 1 ************************************
[0]	train-auc:0.703189	eval-auc:0.701729
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.755353	eval-auc:0.734433
[400]	train-auc:0.773971	eval-auc:0.736497
[600]	train-auc:0.788326	eval-auc:0.73683
[800]	train-auc:0.801306	eval-auc:0.736931
Stopping. Best iteration:
[728]	train-auc:0.796883	eval-auc:0.737024

[0.7370241247368534]
************************************ 2 ************************************
[0]	train-auc:0.704231	eval-auc:0.697082
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.756459	eval-auc:0.730702
[400]	train-auc:0.774747	eval-auc:0.732742
[600]	train-auc:0.789555	eval-auc:0.733445
[800]	train-auc:0.802347	eval-auc:0.733675
[1000]	train-auc:0.814594	eval-auc:0.733525
Stop

In [14]:
prop_test, avg_test = generateFinalTestRst(xgb_cvs, xgb_test)

avg cv scores:  0.7357959329772146
the proportion of cv scores:  [0.20033384032300078, 0.19942553970420232, 0.20028786442873053, 0.20010798151430056, 0.19984477402976594]
the sum of proportion of cv scores 1.0000000000000002


In [15]:
testA_result = pd.read_csv('originalDataset/sample_submit.csv')
# testA_result_pred_prop = testA_result.copy()
testA_result_pred_avg = testA_result.copy()

In [16]:
# testA_result_pred_prop["isDefault"] = prop_test
# testA_result_pred_prop.to_csv("submissionResults/xgboost-1109-3.7-prop-1-7365.csv", index=False)

In [17]:
testA_result_pred_avg["isDefault"] = avg_test
testA_result_pred_avg.to_csv("submissionResults/xgboost-1110-3.7-avg-hp1-7357.csv", index=False)

# -----(Later parts are useless)-------

# Starting feature engineering

## Load dataset

### Load original dataset

In [None]:
# l = []
# for i in range(50):
#     l.append(data_train.copy())
#     l.append(data_test_a.copy())

### Load preprocessed dataset

In [None]:
# data_train = pd.read_csv('preprocessedData/lagrangeInterpolated_train-1.csv')
# data_test_a = pd.read_csv('preprocessedData/lagrangeInterpolated_test-1.csv')

In [None]:
# data_train_cp = data_train.copy()
# data_test_a_cp = data_test_a.copy()

In [None]:
# data_train = data_train_cp.copy()
# data_test_a = data_test_a_cp.copy()

In [None]:
data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n3                  

## Numerical features and category features

In [None]:
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

In [None]:
category_fea

['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']

In [None]:
# #过滤数值型类别特征
# def get_numerical_serial_fea(data,feas):
#     numerical_serial_fea = []
#     numerical_noserial_fea = []
#     for fea in feas:
#         temp = data[fea].nunique()
#         if temp <= 10:
#             numerical_noserial_fea.append(fea)
#             continue
#         numerical_serial_fea.append(fea)
#     return numerical_serial_fea,numerical_noserial_fea
# numerical_serial_fea,numerical_noserial_fea = get_numerical_serial_fea(data_train,numerical_fea)

In [None]:
# numerical_fea

In [None]:
# numerical_noserial_fea

In [None]:
# category_fea

## Fill the null. 

**Mind this**: Some other filling schemes can be used. 

In [None]:
# data_train.isnull().sum()

In [None]:
# data_train["n14"].head()

Change the infinite number into NaN. 

In [None]:
# data_train.replace([np.inf, -np.inf], np.nan, inplace=True)
# data_test_a.replace([np.inf, -np.inf], np.nan, inplace=True)

### Lagrange interpolation

In [None]:
# 创建函数，做插值，以空值前后5个数据（共10个数据）为例做插值  
from scipy.interpolate import lagrange  

## https://www.programmersought.com/article/37145216331/
def fillNanWithLagr(col,nv=-1,k=3):
    # col "fill column vector", nv "empty value, default -1", k "Lagrangian interval, default 3"
    # Get the null position
    if nv is np.nan:
        tar = col[col.isnull()].index.tolist()
    else:
        tar = col[col==nv].index.tolist()

    for idx in tqdm.tqdm(tar, position=0, leave=True):
      ## you can also use tqdm.tqdm_notebook(). The graphic info is more beautiful. 
    # for idx in tqdm.tqdm(tar):
                # Get Lagrange interval
                # The empty value position is removed here, because the filled empty value cannot be used to calculate other empty values
        # print(idx, end="")
        rel = col.iloc[
            list(
                set(list(range(idx-k,idx))+list(range(idx+1,idx+k+1))).difference(set(tar))
            )
        ]
                # Keep a reasonable range
        rel = rel[rel>=0][rel<len(col)]
                # Fill in Lagrangian mean
        # lagrange(arg1,arg2)(arg3)
                # arg1 "Iterable object index", arg2 "Iterable object conversion table", arg3 "Fill position"
        col.iloc[idx] = lagrange(rel.index,list(rel))(idx)
    return col

In [None]:
# # data_train_cp = data_train.copy()
# # data_test_a_cp = data_test_a.copy()

# for i, data in enumerate([data_train, data_test_a]):
#     print("in the {} dataset:".format(i + 1))
#     for fea in numerical_fea:
#         if fea == "id":
#             continue
#         print(fea)
#         data[fea] = fillNanWithLagr(data[fea], np.nan, 5)

In [None]:
# data_train.to_csv("preprocessedData/lagrangeInterpolated_train.csv", index=False)
# data_test_a.to_csv("preprocessedData/lagrangeInterpolated_test.csv", index=False)

### Median interpolation

In [None]:
#按照平均数填充数值型特征
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
#按照众数填充类别型特征
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode())

`employmentLength` cannot be filled by `mode()`. I don't know why. 

In [None]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  800000 non-null  int64  
 1   loanAmnt            800000 non-null  float64
 2   term                800000 non-null  int64  
 3   interestRate        800000 non-null  float64
 4   installment         800000 non-null  float64
 5   grade               800000 non-null  object 
 6   subGrade            800000 non-null  object 
 7   employmentTitle     800000 non-null  float64
 8   employmentLength    753201 non-null  object 
 9   homeOwnership       800000 non-null  int64  
 10  annualIncome        800000 non-null  float64
 11  verificationStatus  800000 non-null  int64  
 12  issueDate           800000 non-null  object 
 13  isDefault           800000 non-null  int64  
 14  purpose             800000 non-null  int64  
 15  postCode            800000 non-nul

In [None]:
# data_train = data_train.fillna(axis = 0, method = "ffill")

## Classify the features into multiple categories

In [None]:
total_list = sorted(list(data_train.columns))

In [None]:
numerical_category_fewValues = [
    "homeOwnership", 
    "verificationStatus",
    "initialListStatus",
    "applicationType",
    "n11",
    "n12",
]

In [None]:
numerical_category_manyValues = [
    "regionCode",
    "employmentTitle",
    "purpose",
    "postCode",
    "title",
]

In [None]:
date_type = [
    "issueDate", 
    "earliesCreditLine"
]

In [None]:
numerical_serial = [
    "loanAmnt","interestRate","installment","annualIncome","dti","delinquency_2years","ficoRangeLow","ficoRangeHigh","openAcc",
    "pubRec","pubRecBankruptcies","revolBal","revolUtil","totalAcc","n0","n1","n2","n3",
    "n4","n5","n6","n7","n8","n9","n10","n13","n14",
    "term", 
]

In [None]:
object_serial = [
    "grade",
    "subGrade", 
    "employmentLength"
]

In [None]:
# len(numerical_category_fewValues) + len(numerical_category_manyValues) + len(date_type) + len(numerical_serial) + len(object_serial)
new_cates = numerical_category_fewValues + numerical_category_manyValues + date_type + numerical_serial + object_serial


In [None]:
set(total_list) - set(new_cates)

{'id', 'isDefault', 'policyCode'}

In [None]:
# data_train = data_train.head(200)
# data_test_a = data_test_a.head(200)

## Change `object_serial` 


In [None]:
for col in tqdm(['subGrade', 'grade']): #, 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'
    le = LabelEncoder()
    le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))

100%|██████████| 2/2 [00:01<00:00,  1.41it/s]


In [None]:
data_train['employmentLength'].value_counts(dropna=False).sort_index()

1 year        52489
10+ years    262753
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
NaN           46799
Name: employmentLength, dtype: int64

In [None]:
data_train[["employmentLength"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 1 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   employmentLength  753201 non-null  object
dtypes: object(1)
memory usage: 6.1+ MB


In [None]:
def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0]) ## mind this. use the int type in np, not general python int. 
    
for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

`employmentLength` can be filled by `mean()`. Great.

In [None]:
data_train['employmentLength'].fillna(data_train["employmentLength"].mean(), inplace = True)

In [None]:
data_test_a['employmentLength'].fillna(data_test_a["employmentLength"].mean(), inplace = True)

In [None]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  800000 non-null  int64  
 1   loanAmnt            800000 non-null  float64
 2   term                800000 non-null  int64  
 3   interestRate        800000 non-null  float64
 4   installment         800000 non-null  float64
 5   grade               800000 non-null  int64  
 6   subGrade            800000 non-null  int64  
 7   employmentTitle     800000 non-null  float64
 8   employmentLength    800000 non-null  float64
 9   homeOwnership       800000 non-null  int64  
 10  annualIncome        800000 non-null  float64
 11  verificationStatus  800000 non-null  int64  
 12  issueDate           800000 non-null  object 
 13  isDefault           800000 non-null  int64  
 14  purpose             800000 non-null  int64  
 15  postCode            800000 non-nul

## Change `numerical_serial` 


In [None]:
# Currently don't have anything to do. 

## Change `date_type` 


### Split the year and month

In [None]:
#转化成时间格式
for data in [data_train, data_test_a]:
    # data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    # #构造时间特征
    data['issueYear'] = data['issueDate'].apply(lambda x: int(x.split("-")[0]))
    data['issueMonth'] = data['issueDate'].apply(lambda x: int(x.split("-")[1]))

In [None]:
data_train[["issueYear", "issueMonth"]].head()

Unnamed: 0,issueYear,issueMonth
0,2014,7
1,2012,8
2,2015,10
3,2015,8
4,2016,3


In [None]:
#转化成时间格式
def monthMapping(monthStr):
    dic = {
        "Jan": 1, 
        "Feb": 2, 
        "Mar": 3,
        "Apr": 4,
        "May": 5, 
        "Jun": 6, 
        "Jul": 7, 
        "Aug": 8, 
        "Sep": 9,
        "Oct": 10, 
        "Nov": 11,
        "Dec": 12
    }
    return dic[monthStr]

for data in [data_train, data_test_a]:
    # data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    # #构造时间特征
    data['earliesCreditLineYear'] = data['earliesCreditLine'].apply(lambda x: int(x.split("-")[1]))
    data['earliesCreditLineMonth'] = data['earliesCreditLine'].apply(lambda x: monthMapping(x.split("-")[0]))

In [None]:
data_train[["earliesCreditLineYear", "earliesCreditLineMonth"]].head()

Unnamed: 0,earliesCreditLineYear,earliesCreditLineMonth
0,2001,8
1,2002,5
2,2006,5
3,1999,5
4,1977,8


### Change the original date

In [None]:
#转化成时间格式
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    #构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

In [None]:
startmonth = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    data["earliesCreditLine"] = data["earliesCreditLine"].apply(lambda x: datetime.datetime.strptime(x, '%b-%Y'))
    data["earliesCreditLineDT"] = data["earliesCreditLine"].apply(lambda x: x-startdate).dt.days

### Change the `date_type` list

In [None]:
date_type += [
    "issueDateDT", "earliesCreditLineDT", "earliesCreditLineYear", "earliesCreditLineMonth", 
    "issueYear", "issueMonth"
]
date_type

['issueDate',
 'earliesCreditLine',
 'issueDateDT',
 'earliesCreditLineDT',
 'earliesCreditLineYear',
 'earliesCreditLineMonth',
 'issueYear',
 'issueMonth']

In [None]:
data_train[date_type].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 8 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   issueDate               800000 non-null  datetime64[ns]
 1   earliesCreditLine       800000 non-null  datetime64[ns]
 2   issueDateDT             800000 non-null  int64         
 3   earliesCreditLineDT     800000 non-null  int64         
 4   earliesCreditLineYear   800000 non-null  int64         
 5   earliesCreditLineMonth  800000 non-null  int64         
 6   issueYear               800000 non-null  int64         
 7   issueMonth              800000 non-null  int64         
dtypes: datetime64[ns](2), int64(6)
memory usage: 48.8 MB


### Target and avg encoding

In [None]:
serials = object_serial + numerical_serial
import random
half_serials = random.sample(serials, len(serials)//4)

In [None]:
half_serials

['revolBal', 'n0', 'n8', 'dti', 'ficoRangeHigh', 'n6', 'n1']

In [None]:
for col in date_type: 
    for numFea in ["isDefault"]: # half_serials + ["isDefault"]: # object_serial + numerical_serial + ["isDefault"]:
        temp_dict = data_train.groupby([col])[numFea].agg(['mean']).reset_index().rename(columns={'mean': col + '_{}_mean'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_mean'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_mean'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_mean'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueYear,issueMonth,earliesCreditLineYear,earliesCreditLineMonth,issueDateDT,earliesCreditLineDT,issueDate_isDefault_mean,earliesCreditLine_isDefault_mean,issueDateDT_isDefault_mean,earliesCreditLineDT_isDefault_mean,earliesCreditLineYear_isDefault_mean,earliesCreditLineMonth_isDefault_mean,issueYear_isDefault_mean,issueMonth_isDefault_mean
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,2014,7,2001,8,2587,-2130,0.188505,0.202982,0.188505,0.202982,0.198575,0.195978,0.184278,0.20555
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,13.0,5.0,13.0,0.0,0.0,0.0,2.0,2012,8,2002,5,1888,-1857,0.164165,0.217822,0.164165,0.217822,0.204387,0.203027,0.159931,0.199575
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,2015,10,2006,5,3044,-396,0.191459,0.22043,0.191459,0.22043,0.221997,0.203027,0.202053,0.191226
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,2015,8,1999,5,2983,-2953,0.197707,0.198912,0.197707,0.198912,0.193196,0.203027,0.202053,0.199575
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,2016,3,1977,8,3196,-10896,0.217402,0.229268,0.217402,0.229268,0.180527,0.195978,0.233084,0.202784


In [None]:
for col in date_type: 
    for numFea in ["isDefault"]: #object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['max']).reset_index().rename(columns={'max': col + '_{}_max'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_max'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_max'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_max'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueYear,issueMonth,earliesCreditLineYear,earliesCreditLineMonth,issueDateDT,earliesCreditLineDT,issueDate_isDefault_mean,earliesCreditLine_isDefault_mean,issueDateDT_isDefault_mean,earliesCreditLineDT_isDefault_mean,earliesCreditLineYear_isDefault_mean,earliesCreditLineMonth_isDefault_mean,issueYear_isDefault_mean,issueMonth_isDefault_mean,issueDate_isDefault_max,earliesCreditLine_isDefault_max,issueDateDT_isDefault_max,earliesCreditLineDT_isDefault_max,earliesCreditLineYear_isDefault_max,earliesCreditLineMonth_isDefault_max,issueYear_isDefault_max,issueMonth_isDefault_max
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,2014,7,2001,8,2587,-2130,0.188505,0.202982,0.188505,0.202982,0.198575,0.195978,0.184278,0.20555,1,1,1,1,1,1,1,1
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,13.0,5.0,13.0,0.0,0.0,0.0,2.0,2012,8,2002,5,1888,-1857,0.164165,0.217822,0.164165,0.217822,0.204387,0.203027,0.159931,0.199575,1,1,1,1,1,1,1,1
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,2015,10,2006,5,3044,-396,0.191459,0.22043,0.191459,0.22043,0.221997,0.203027,0.202053,0.191226,1,1,1,1,1,1,1,1
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,2015,8,1999,5,2983,-2953,0.197707,0.198912,0.197707,0.198912,0.193196,0.203027,0.202053,0.199575,1,1,1,1,1,1,1,1
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,2016,3,1977,8,3196,-10896,0.217402,0.229268,0.217402,0.229268,0.180527,0.195978,0.233084,0.202784,1,1,1,1,1,1,1,1


In [None]:
for col in date_type: 
    for numFea in ["isDefault"]: #object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['min']).reset_index().rename(columns={'min': col + '_{}_min'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_min'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_min'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_min'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueYear,issueMonth,earliesCreditLineYear,earliesCreditLineMonth,issueDateDT,earliesCreditLineDT,issueDate_isDefault_mean,earliesCreditLine_isDefault_mean,issueDateDT_isDefault_mean,earliesCreditLineDT_isDefault_mean,earliesCreditLineYear_isDefault_mean,earliesCreditLineMonth_isDefault_mean,issueYear_isDefault_mean,issueMonth_isDefault_mean,issueDate_isDefault_max,earliesCreditLine_isDefault_max,issueDateDT_isDefault_max,earliesCreditLineDT_isDefault_max,earliesCreditLineYear_isDefault_max,earliesCreditLineMonth_isDefault_max,issueYear_isDefault_max,issueMonth_isDefault_max,issueDate_isDefault_min,earliesCreditLine_isDefault_min,issueDateDT_isDefault_min,earliesCreditLineDT_isDefault_min,earliesCreditLineYear_isDefault_min,earliesCreditLineMonth_isDefault_min,issueYear_isDefault_min,issueMonth_isDefault_min
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,2014,7,2001,8,2587,-2130,0.188505,0.202982,0.188505,0.202982,0.198575,0.195978,0.184278,0.20555,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,13.0,5.0,13.0,0.0,0.0,0.0,2.0,2012,8,2002,5,1888,-1857,0.164165,0.217822,0.164165,0.217822,0.204387,0.203027,0.159931,0.199575,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,2015,10,2006,5,3044,-396,0.191459,0.22043,0.191459,0.22043,0.221997,0.203027,0.202053,0.191226,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,2015,8,1999,5,2983,-2953,0.197707,0.198912,0.197707,0.198912,0.193196,0.203027,0.202053,0.199575,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,2016,3,1977,8,3196,-10896,0.217402,0.229268,0.217402,0.229268,0.180527,0.195978,0.233084,0.202784,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0


In [None]:
for col in date_type: 
    for numFea in ["isDefault"]: #half_serials + ["isDefault"]: # for numFea in object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['std']).reset_index().rename(columns={'std': col + '_{}_std'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_std'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_std'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_std'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,n13,n14,issueYear,issueMonth,earliesCreditLineYear,earliesCreditLineMonth,issueDateDT,earliesCreditLineDT,issueDate_isDefault_mean,earliesCreditLine_isDefault_mean,issueDateDT_isDefault_mean,earliesCreditLineDT_isDefault_mean,earliesCreditLineYear_isDefault_mean,earliesCreditLineMonth_isDefault_mean,issueYear_isDefault_mean,issueMonth_isDefault_mean,issueDate_isDefault_max,earliesCreditLine_isDefault_max,issueDateDT_isDefault_max,earliesCreditLineDT_isDefault_max,earliesCreditLineYear_isDefault_max,earliesCreditLineMonth_isDefault_max,issueYear_isDefault_max,issueMonth_isDefault_max,issueDate_isDefault_min,earliesCreditLine_isDefault_min,issueDateDT_isDefault_min,earliesCreditLineDT_isDefault_min,earliesCreditLineYear_isDefault_min,earliesCreditLineMonth_isDefault_min,issueYear_isDefault_min,issueMonth_isDefault_min,issueDate_isDefault_std,earliesCreditLine_isDefault_std,issueDateDT_isDefault_std,earliesCreditLineDT_isDefault_std,earliesCreditLineYear_isDefault_std,earliesCreditLineMonth_isDefault_std,issueYear_isDefault_std,issueMonth_isDefault_std
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,0.0,2.0,2014,7,2001,8,2587,-2130,0.188505,0.202982,0.188505,0.202982,0.198575,0.195978,0.184278,0.20555,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.391127,0.402255,0.391127,0.402255,0.398931,0.396954,0.387712,0.404106
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,0.0,2.0,2012,8,2002,5,1888,-1857,0.164165,0.217822,0.164165,0.217822,0.204387,0.203027,0.159931,0.199575,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.370482,0.412817,0.370482,0.412817,0.403257,0.402256,0.366548,0.399684
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,0.0,4.0,2015,10,2006,5,3044,-396,0.191459,0.22043,0.191459,0.22043,0.221997,0.203027,0.202053,0.191226,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.393457,0.414602,0.393457,0.414602,0.415594,0.402256,0.401532,0.393269
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,0.0,1.0,2015,8,1999,5,2983,-2953,0.197707,0.198912,0.197707,0.198912,0.193196,0.203027,0.202053,0.199575,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.39828,0.399242,0.39828,0.399242,0.39481,0.402256,0.401532,0.399684
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,0.0,4.0,2016,3,1977,8,3196,-10896,0.217402,0.229268,0.217402,0.229268,0.180527,0.195978,0.233084,0.202784,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.412485,0.421391,0.412485,0.421391,0.384706,0.396954,0.422797,0.402076


## Change `numerical_category_fewValues` 


### Target and avg encoding

In [None]:
for col in numerical_category_fewValues: 
    for numFea in ["isDefault"]: #half_serials + ["isDefault"]: # for numFea in object_serial + numerical_serial + ["isDefault"]:
        temp_dict = data_train.groupby([col])[numFea].agg(['mean']).reset_index().rename(columns={'mean': col + '_{}_mean'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_mean'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_mean'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_mean'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,issueDateDT,earliesCreditLineDT,issueDate_isDefault_mean,earliesCreditLine_isDefault_mean,issueDateDT_isDefault_mean,earliesCreditLineDT_isDefault_mean,earliesCreditLineYear_isDefault_mean,earliesCreditLineMonth_isDefault_mean,issueYear_isDefault_mean,issueMonth_isDefault_mean,issueDate_isDefault_max,earliesCreditLine_isDefault_max,issueDateDT_isDefault_max,earliesCreditLineDT_isDefault_max,earliesCreditLineYear_isDefault_max,earliesCreditLineMonth_isDefault_max,issueYear_isDefault_max,issueMonth_isDefault_max,issueDate_isDefault_min,earliesCreditLine_isDefault_min,issueDateDT_isDefault_min,earliesCreditLineDT_isDefault_min,earliesCreditLineYear_isDefault_min,earliesCreditLineMonth_isDefault_min,issueYear_isDefault_min,issueMonth_isDefault_min,issueDate_isDefault_std,earliesCreditLine_isDefault_std,issueDateDT_isDefault_std,earliesCreditLineDT_isDefault_std,earliesCreditLineYear_isDefault_std,earliesCreditLineMonth_isDefault_std,issueYear_isDefault_std,issueMonth_isDefault_std,homeOwnership_isDefault_mean,verificationStatus_isDefault_mean,initialListStatus_isDefault_mean,applicationType_isDefault_mean,n11_isDefault_mean,n12_isDefault_mean
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,2587,-2130,0.188505,0.202982,0.188505,0.202982,0.198575,0.195978,0.184278,0.20555,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.391127,0.402255,0.391127,0.402255,0.398931,0.396954,0.387712,0.404106,0.2078,0.237858,0.202008,0.198493,0.199514,0.199433
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,1888,-1857,0.164165,0.217822,0.164165,0.217822,0.204387,0.203027,0.159931,0.199575,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.370482,0.412817,0.370482,0.412817,0.403257,0.402256,0.366548,0.399684,0.171535,0.237858,0.196024,0.198493,0.199514,0.199433
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,3044,-396,0.191459,0.22043,0.191459,0.22043,0.221997,0.203027,0.202053,0.191226,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.393457,0.414602,0.393457,0.414602,0.415594,0.402256,0.401532,0.393269,0.171535,0.237858,0.202008,0.198493,0.199514,0.199433
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,2983,-2953,0.197707,0.198912,0.197707,0.198912,0.193196,0.203027,0.202053,0.199575,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.39828,0.399242,0.39828,0.399242,0.39481,0.402256,0.401532,0.399684,0.232107,0.209412,0.196024,0.198493,0.199514,0.199433
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,3196,-10896,0.217402,0.229268,0.217402,0.229268,0.180527,0.195978,0.233084,0.202784,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.412485,0.421391,0.412485,0.421391,0.384706,0.396954,0.422797,0.402076,0.232107,0.237858,0.202008,0.198493,0.199514,0.199433


In [None]:
for col in numerical_category_fewValues: 
    for numFea in ["isDefault"]: #object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['max']).reset_index().rename(columns={'max': col + '_{}_max'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_max'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_max'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_max'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,earliesCreditLineYear_isDefault_mean,earliesCreditLineMonth_isDefault_mean,issueYear_isDefault_mean,issueMonth_isDefault_mean,issueDate_isDefault_max,earliesCreditLine_isDefault_max,issueDateDT_isDefault_max,earliesCreditLineDT_isDefault_max,earliesCreditLineYear_isDefault_max,earliesCreditLineMonth_isDefault_max,issueYear_isDefault_max,issueMonth_isDefault_max,issueDate_isDefault_min,earliesCreditLine_isDefault_min,issueDateDT_isDefault_min,earliesCreditLineDT_isDefault_min,earliesCreditLineYear_isDefault_min,earliesCreditLineMonth_isDefault_min,issueYear_isDefault_min,issueMonth_isDefault_min,issueDate_isDefault_std,earliesCreditLine_isDefault_std,issueDateDT_isDefault_std,earliesCreditLineDT_isDefault_std,earliesCreditLineYear_isDefault_std,earliesCreditLineMonth_isDefault_std,issueYear_isDefault_std,issueMonth_isDefault_std,homeOwnership_isDefault_mean,verificationStatus_isDefault_mean,initialListStatus_isDefault_mean,applicationType_isDefault_mean,n11_isDefault_mean,n12_isDefault_mean,homeOwnership_isDefault_max,verificationStatus_isDefault_max,initialListStatus_isDefault_max,applicationType_isDefault_max,n11_isDefault_max,n12_isDefault_max
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,0.198575,0.195978,0.184278,0.20555,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.391127,0.402255,0.391127,0.402255,0.398931,0.396954,0.387712,0.404106,0.2078,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,0.204387,0.203027,0.159931,0.199575,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.370482,0.412817,0.370482,0.412817,0.403257,0.402256,0.366548,0.399684,0.171535,0.237858,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,0.221997,0.203027,0.202053,0.191226,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.393457,0.414602,0.393457,0.414602,0.415594,0.402256,0.401532,0.393269,0.171535,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,0.193196,0.203027,0.202053,0.199575,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.39828,0.399242,0.39828,0.399242,0.39481,0.402256,0.401532,0.399684,0.232107,0.209412,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,0.180527,0.195978,0.233084,0.202784,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.412485,0.421391,0.412485,0.421391,0.384706,0.396954,0.422797,0.402076,0.232107,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1


In [None]:
for col in numerical_category_fewValues: 
    for numFea in ["isDefault"]: #object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['min']).reset_index().rename(columns={'min': col + '_{}_min'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_min'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_min'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_min'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,issueDateDT_isDefault_max,earliesCreditLineDT_isDefault_max,earliesCreditLineYear_isDefault_max,earliesCreditLineMonth_isDefault_max,issueYear_isDefault_max,issueMonth_isDefault_max,issueDate_isDefault_min,earliesCreditLine_isDefault_min,issueDateDT_isDefault_min,earliesCreditLineDT_isDefault_min,earliesCreditLineYear_isDefault_min,earliesCreditLineMonth_isDefault_min,issueYear_isDefault_min,issueMonth_isDefault_min,issueDate_isDefault_std,earliesCreditLine_isDefault_std,issueDateDT_isDefault_std,earliesCreditLineDT_isDefault_std,earliesCreditLineYear_isDefault_std,earliesCreditLineMonth_isDefault_std,issueYear_isDefault_std,issueMonth_isDefault_std,homeOwnership_isDefault_mean,verificationStatus_isDefault_mean,initialListStatus_isDefault_mean,applicationType_isDefault_mean,n11_isDefault_mean,n12_isDefault_mean,homeOwnership_isDefault_max,verificationStatus_isDefault_max,initialListStatus_isDefault_max,applicationType_isDefault_max,n11_isDefault_max,n12_isDefault_max,homeOwnership_isDefault_min,verificationStatus_isDefault_min,initialListStatus_isDefault_min,applicationType_isDefault_min,n11_isDefault_min,n12_isDefault_min
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.391127,0.402255,0.391127,0.402255,0.398931,0.396954,0.387712,0.404106,0.2078,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.370482,0.412817,0.370482,0.412817,0.403257,0.402256,0.366548,0.399684,0.171535,0.237858,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.393457,0.414602,0.393457,0.414602,0.415594,0.402256,0.401532,0.393269,0.171535,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.39828,0.399242,0.39828,0.399242,0.39481,0.402256,0.401532,0.399684,0.232107,0.209412,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0.412485,0.421391,0.412485,0.421391,0.384706,0.396954,0.422797,0.402076,0.232107,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0


In [None]:
for col in numerical_category_fewValues: 
    for numFea in ["isDefault"]: #half_serials + ["isDefault"]: # for numFea in object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['std']).reset_index().rename(columns={'std': col + '_{}_std'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_std'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_std'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_std'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,issueDate_isDefault_min,earliesCreditLine_isDefault_min,issueDateDT_isDefault_min,earliesCreditLineDT_isDefault_min,earliesCreditLineYear_isDefault_min,earliesCreditLineMonth_isDefault_min,issueYear_isDefault_min,issueMonth_isDefault_min,issueDate_isDefault_std,earliesCreditLine_isDefault_std,issueDateDT_isDefault_std,earliesCreditLineDT_isDefault_std,earliesCreditLineYear_isDefault_std,earliesCreditLineMonth_isDefault_std,issueYear_isDefault_std,issueMonth_isDefault_std,homeOwnership_isDefault_mean,verificationStatus_isDefault_mean,initialListStatus_isDefault_mean,applicationType_isDefault_mean,n11_isDefault_mean,n12_isDefault_mean,homeOwnership_isDefault_max,verificationStatus_isDefault_max,initialListStatus_isDefault_max,applicationType_isDefault_max,n11_isDefault_max,n12_isDefault_max,homeOwnership_isDefault_min,verificationStatus_isDefault_min,initialListStatus_isDefault_min,applicationType_isDefault_min,n11_isDefault_min,n12_isDefault_min,homeOwnership_isDefault_std,verificationStatus_isDefault_std,initialListStatus_isDefault_std,applicationType_isDefault_std,n11_isDefault_std,n12_isDefault_std
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,0,0,0,0,0,0,0,0,0.391127,0.402255,0.391127,0.402255,0.398931,0.396954,0.387712,0.404106,0.2078,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.405735,0.425773,0.401498,0.398866,0.399635,0.399575
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,0,0,0,0,0,0,0,0,0.370482,0.412817,0.370482,0.412817,0.403257,0.402256,0.366548,0.399684,0.171535,0.237858,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.396987,0.398866,0.399635,0.399575
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,0,0,0,0,0,0,0,0,0.393457,0.414602,0.393457,0.414602,0.415594,0.402256,0.401532,0.393269,0.171535,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.401498,0.398866,0.399635,0.399575
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,0,0,0,0,0,0,0,0,0.39828,0.399242,0.39828,0.399242,0.39481,0.402256,0.401532,0.399684,0.232107,0.209412,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.40689,0.396987,0.398866,0.399635,0.399575
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,0,0,0,0,0,0,0,0,0.412485,0.421391,0.412485,0.421391,0.384706,0.396954,0.422797,0.402076,0.232107,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.425773,0.401498,0.398866,0.399635,0.399575


## Change `numerical_category_manyValues` 


### Target and avg encoding

In [None]:
for col in numerical_category_manyValues: 
    for numFea in ["isDefault"]: #half_serials + ["isDefault"]: # for numFea in object_serial + numerical_serial + ["isDefault"]:
        temp_dict = data_train.groupby([col])[numFea].agg(['mean']).reset_index().rename(columns={'mean': col + '_{}_mean'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_mean'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_mean'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_mean'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,earliesCreditLineMonth_isDefault_min,issueYear_isDefault_min,issueMonth_isDefault_min,issueDate_isDefault_std,earliesCreditLine_isDefault_std,issueDateDT_isDefault_std,earliesCreditLineDT_isDefault_std,earliesCreditLineYear_isDefault_std,earliesCreditLineMonth_isDefault_std,issueYear_isDefault_std,issueMonth_isDefault_std,homeOwnership_isDefault_mean,verificationStatus_isDefault_mean,initialListStatus_isDefault_mean,applicationType_isDefault_mean,n11_isDefault_mean,n12_isDefault_mean,homeOwnership_isDefault_max,verificationStatus_isDefault_max,initialListStatus_isDefault_max,applicationType_isDefault_max,n11_isDefault_max,n12_isDefault_max,homeOwnership_isDefault_min,verificationStatus_isDefault_min,initialListStatus_isDefault_min,applicationType_isDefault_min,n11_isDefault_min,n12_isDefault_min,homeOwnership_isDefault_std,verificationStatus_isDefault_std,initialListStatus_isDefault_std,applicationType_isDefault_std,n11_isDefault_std,n12_isDefault_std,regionCode_isDefault_mean,employmentTitle_isDefault_mean,purpose_isDefault_mean,postCode_isDefault_mean,title_isDefault_mean
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,0,0,0,0.391127,0.402255,0.391127,0.402255,0.398931,0.396954,0.387712,0.404106,0.2078,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.405735,0.425773,0.401498,0.398866,0.399635,0.399575,0.21177,0.165923,0.29519,0.193253,0.305024
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,0,0,0,0.370482,0.412817,0.370482,0.412817,0.403257,0.402256,0.366548,0.399684,0.171535,0.237858,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.396987,0.398866,0.399635,0.399575,0.159262,0.2,0.21137,0.154922,0.071429
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,0,0,0,0.393457,0.414602,0.393457,0.414602,0.415594,0.402256,0.401532,0.393269,0.171535,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.401498,0.398866,0.399635,0.399575,0.197254,0.0,0.21137,0.202395,0.217978
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,0,0,0,0.39828,0.399242,0.39828,0.399242,0.39481,0.402256,0.401532,0.399684,0.232107,0.209412,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.40689,0.396987,0.398866,0.399635,0.399575,0.211695,0.5,0.169278,0.23775,0.175291
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,0,0,0,0.412485,0.421391,0.412485,0.421391,0.384706,0.396954,0.422797,0.402076,0.232107,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.425773,0.401498,0.398866,0.399635,0.399575,0.215436,0.261745,0.229653,0.207317,0.241175


In [None]:
for col in numerical_category_manyValues: 
    for numFea in ["isDefault"]: #object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['max']).reset_index().rename(columns={'max': col + '_{}_max'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_max'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_max'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_max'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,issueDateDT_isDefault_std,earliesCreditLineDT_isDefault_std,earliesCreditLineYear_isDefault_std,earliesCreditLineMonth_isDefault_std,issueYear_isDefault_std,issueMonth_isDefault_std,homeOwnership_isDefault_mean,verificationStatus_isDefault_mean,initialListStatus_isDefault_mean,applicationType_isDefault_mean,n11_isDefault_mean,n12_isDefault_mean,homeOwnership_isDefault_max,verificationStatus_isDefault_max,initialListStatus_isDefault_max,applicationType_isDefault_max,n11_isDefault_max,n12_isDefault_max,homeOwnership_isDefault_min,verificationStatus_isDefault_min,initialListStatus_isDefault_min,applicationType_isDefault_min,n11_isDefault_min,n12_isDefault_min,homeOwnership_isDefault_std,verificationStatus_isDefault_std,initialListStatus_isDefault_std,applicationType_isDefault_std,n11_isDefault_std,n12_isDefault_std,regionCode_isDefault_mean,employmentTitle_isDefault_mean,purpose_isDefault_mean,postCode_isDefault_mean,title_isDefault_mean,regionCode_isDefault_max,employmentTitle_isDefault_max,purpose_isDefault_max,postCode_isDefault_max,title_isDefault_max
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,0.391127,0.402255,0.398931,0.396954,0.387712,0.404106,0.2078,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.405735,0.425773,0.401498,0.398866,0.399635,0.399575,0.21177,0.165923,0.29519,0.193253,0.305024,1,1,1,1,1
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,0.370482,0.412817,0.403257,0.402256,0.366548,0.399684,0.171535,0.237858,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.396987,0.398866,0.399635,0.399575,0.159262,0.2,0.21137,0.154922,0.071429,1,1,1,1,1
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,0.393457,0.414602,0.415594,0.402256,0.401532,0.393269,0.171535,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.401498,0.398866,0.399635,0.399575,0.197254,0.0,0.21137,0.202395,0.217978,1,0,1,1,1
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,0.39828,0.399242,0.39481,0.402256,0.401532,0.399684,0.232107,0.209412,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.40689,0.396987,0.398866,0.399635,0.399575,0.211695,0.5,0.169278,0.23775,0.175291,1,1,1,1,1
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,0.412485,0.421391,0.384706,0.396954,0.422797,0.402076,0.232107,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.425773,0.401498,0.398866,0.399635,0.399575,0.215436,0.261745,0.229653,0.207317,0.241175,1,1,1,1,1


In [None]:
for col in numerical_category_manyValues: 
    for numFea in ["isDefault"]: #object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['min']).reset_index().rename(columns={'min': col + '_{}_min'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_min'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_min'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_min'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,issueMonth_isDefault_std,homeOwnership_isDefault_mean,verificationStatus_isDefault_mean,initialListStatus_isDefault_mean,applicationType_isDefault_mean,n11_isDefault_mean,n12_isDefault_mean,homeOwnership_isDefault_max,verificationStatus_isDefault_max,initialListStatus_isDefault_max,applicationType_isDefault_max,n11_isDefault_max,n12_isDefault_max,homeOwnership_isDefault_min,verificationStatus_isDefault_min,initialListStatus_isDefault_min,applicationType_isDefault_min,n11_isDefault_min,n12_isDefault_min,homeOwnership_isDefault_std,verificationStatus_isDefault_std,initialListStatus_isDefault_std,applicationType_isDefault_std,n11_isDefault_std,n12_isDefault_std,regionCode_isDefault_mean,employmentTitle_isDefault_mean,purpose_isDefault_mean,postCode_isDefault_mean,title_isDefault_mean,regionCode_isDefault_max,employmentTitle_isDefault_max,purpose_isDefault_max,postCode_isDefault_max,title_isDefault_max,regionCode_isDefault_min,employmentTitle_isDefault_min,purpose_isDefault_min,postCode_isDefault_min,title_isDefault_min
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,0.404106,0.2078,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.405735,0.425773,0.401498,0.398866,0.399635,0.399575,0.21177,0.165923,0.29519,0.193253,0.305024,1,1,1,1,1,0,0,0,0,0
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,0.399684,0.171535,0.237858,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.396987,0.398866,0.399635,0.399575,0.159262,0.2,0.21137,0.154922,0.071429,1,1,1,1,1,0,0,0,0,0
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,0.393269,0.171535,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.401498,0.398866,0.399635,0.399575,0.197254,0.0,0.21137,0.202395,0.217978,1,0,1,1,1,0,0,0,0,0
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,0.399684,0.232107,0.209412,0.196024,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.40689,0.396987,0.398866,0.399635,0.399575,0.211695,0.5,0.169278,0.23775,0.175291,1,1,1,1,1,0,0,0,0,0
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,0.402076,0.232107,0.237858,0.202008,0.198493,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.425773,0.401498,0.398866,0.399635,0.399575,0.215436,0.261745,0.229653,0.207317,0.241175,1,1,1,1,1,0,0,0,0,0


In [None]:
for col in numerical_category_manyValues: 
    for numFea in ["isDefault"]: #half_serials + ["isDefault"]: # for numFea in object_serial + numerical_serial:
        temp_dict = data_train.groupby([col])[numFea].agg(['std']).reset_index().rename(columns={'std': col + '_{}_std'.format(numFea)})
        temp_dict.index = temp_dict[col].values
        # print(temp_dict.head())
        temp_dict = temp_dict[col + '_{}_std'.format(numFea)].to_dict()
        # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")
        data_train[col + '_{}_std'.format(numFea)] = data_train[col].map(temp_dict)
        data_test_a[col + '_{}_std'.format(numFea)] = data_test_a[col].map(temp_dict)
data_train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,...,n11_isDefault_mean,n12_isDefault_mean,homeOwnership_isDefault_max,verificationStatus_isDefault_max,initialListStatus_isDefault_max,applicationType_isDefault_max,n11_isDefault_max,n12_isDefault_max,homeOwnership_isDefault_min,verificationStatus_isDefault_min,initialListStatus_isDefault_min,applicationType_isDefault_min,n11_isDefault_min,n12_isDefault_min,homeOwnership_isDefault_std,verificationStatus_isDefault_std,initialListStatus_isDefault_std,applicationType_isDefault_std,n11_isDefault_std,n12_isDefault_std,regionCode_isDefault_mean,employmentTitle_isDefault_mean,purpose_isDefault_mean,postCode_isDefault_mean,title_isDefault_mean,regionCode_isDefault_max,employmentTitle_isDefault_max,purpose_isDefault_max,postCode_isDefault_max,title_isDefault_max,regionCode_isDefault_min,employmentTitle_isDefault_min,purpose_isDefault_min,postCode_isDefault_min,title_isDefault_min,regionCode_isDefault_std,employmentTitle_isDefault_std,purpose_isDefault_std,postCode_isDefault_std,title_isDefault_std
0,0,35000.0,5,19.52,917.97,4,21,320.0,2.0,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001-08-01,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,...,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.405735,0.425773,0.401498,0.398866,0.399635,0.399575,0.21177,0.165923,0.29519,0.193253,0.305024,1,1,1,1,1,0,0,0,0,0,0.408579,0.372178,0.456153,0.394945,0.460451
1,1,18000.0,5,18.49,461.9,3,16,219843.0,5.0,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002-05-01,1723.0,1.0,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,...,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.396987,0.398866,0.399635,0.399575,0.159262,0.2,0.21137,0.154922,0.071429,1,1,1,1,1,0,0,0,0,0,0.365931,0.40161,0.408281,0.361878,0.262265
2,2,12000.0,5,16.99,298.17,3,17,31698.0,8.0,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006-05-01,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,...,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.376977,0.425773,0.401498,0.398866,0.399635,0.399575,0.197254,0.0,0.21137,0.202395,0.217978,1,0,1,1,1,0,0,0,0,0,0.397929,,0.408281,0.401899,0.412873
3,3,11000.0,3,7.26,340.96,0,3,46854.0,10.0,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999-05-01,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,...,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.40689,0.396987,0.398866,0.399635,0.399575,0.211695,0.5,0.169278,0.23775,0.175291,1,1,1,1,1,0,0,0,0,0,0.408525,0.707107,0.374999,0.426092,0.380217
4,4,3000.0,3,12.99,101.07,2,11,54.0,5.965205,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977-08-01,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,...,0.199514,0.199433,1,1,1,1,1,1,0,0,0,0,0,0,0.422177,0.425773,0.401498,0.398866,0.399635,0.399575,0.215436,0.261745,0.229653,0.207317,0.241175,1,1,1,1,1,0,0,0,0,0,0.411128,0.439589,0.420647,0.405502,0.427842


## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

## Delete the features that will not be used to fit the model

No `xx_outliers` features, no labels, no original dates. 

In [None]:
features = [f for f in data_train.columns if f not in ['id', 'isDefault', "policyCode"] + numerical_category_fewValues + numerical_category_manyValues + date_type and '_outliers' not in f]
y_train = data_train['isDefault']
data_train = data_train[features]
data_test_a = data_test_a[features]

In [None]:
data_train.shape

(800000, 107)

In [None]:
# for data in [data_train, data_test_a]:
#     data.drop(["issueDate", "id"], axis = 1, inplace = True)

## Optimize the memory size of the dataset

No need to do this here. It is useless. 

You need to do it after you load the data for training model. So you have to use this method in other notebooks. 

In [None]:
# reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# data_train = reduce_mem_usage(data_train)
# data_test_a = reduce_mem_usage(data_test_a)

Memory usage of dataframe is 684800128.00 MB
Memory usage after optimization is: 141600128.00 MB
Decreased by 79.3%
Memory usage of dataframe is 171200128.00 MB
Memory usage after optimization is: 37800128.00 MB
Decreased by 77.9%


## Select some features

**Mind this**: other methods may be used. But I have tried the following code, it seems to degrade the performance. So I don't suggest to use the feature selection if we have enough computational resources. 

In [None]:
data_test_a = data_test_a.fillna(axis = 0, method = "ffill")
data_train = data_train.fillna(axis = 0, method = "ffill")

In [None]:
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
#选择K个最好的特征，返回选择特征后的数据
#第一个参数为计算评估特征是否好的函数，该函数输入特征矩阵和目标向量，
#输出二元组（评分，P值）的数组，数组第i项为第i个特征的评分和P值。在此定义为计算相关系数
#参数k为选择的特征个数

selector = SelectKBest(k=80)
selector.fit(
    data_train,
    y_train
)

SelectKBest(k=80, score_func=<function f_classif at 0x7fa732766598>)

In [None]:
colNums = selector.get_support(True)

In [None]:
selectedFeatures = []
for i, col in enumerate(list(data_train.columns)):
    if i in colNums:
        selectedFeatures.append(col)
len(selectedFeatures)

80

In [None]:
data_train = data_train[selectedFeatures]
data_test_a = data_test_a[selectedFeatures]

In [None]:
## Some other feature selection methods

# from sklearn.feature_selection import VarianceThreshold
# #其中参数threshold为方差的阈值
# VarianceThreshold(threshold=3).fit_transform(
#     data_train[['grade', 'subGrade', 'grade_target_mean', "subGrade_target_mean"]],
#     data_train["isDefault"]
# )

Not many empty values.

In [None]:
max(list(data_train.isnull().sum()))

0

## yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy

In [None]:
data_train.shape

(800000, 80)

## Training separate classifiers

In [None]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y.iloc[train_index], train_x.iloc[valid_index], train_y.iloc[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 50,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': "gpu_hist", #'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            # https://stackoverflow.com/questions/55579610/xgboost-attributeerror-dataframe-object-has-no-attribute-feature-names
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(xgb.DMatrix(test_x) , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        ## 下面这个test，可以考虑像如下这样进行，或者是求平均亦可。
        # test += test_pred / kf.n_splits ## Minke's invention, no basis or reference.
        test += test_pred
        ##
        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    print("what is kf.n_splits?", kf.n_splits)
    return train, test / kf.n_splits

In [None]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

### XGBoost

At least for XGBoost, the `test_pred` will be probabilities, not the classification results. See https://blog.csdn.net/weixin_42320576/article/details/88414238 

In [None]:
xgb_train, xgb_test = xgb_model(data_train, y_train, data_test_a)

************************************ 1 ************************************
[0]	train-auc:0.854859	eval-auc:0.855357
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.889598	eval-auc:0.888492
[400]	train-auc:0.893319	eval-auc:0.890639
[600]	train-auc:0.895652	eval-auc:0.891516
[800]	train-auc:0.897541	eval-auc:0.891916
[1000]	train-auc:0.899124	eval-auc:0.892168
[1200]	train-auc:0.900659	eval-auc:0.892273
[1400]	train-auc:0.902124	eval-auc:0.892358
[1600]	train-auc:0.903513	eval-auc:0.892396
[1800]	train-auc:0.904893	eval-auc:0.892437
Stopping. Best iteration:
[1769]	train-auc:0.904687	eval-auc:0.892451

[0.8924506147008837]
************************************ 2 ************************************
[0]	train-auc:0.856342	eval-auc:0.853643
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds

### LGB

In [None]:
lgb_train, lgb_test = lgb_model(x_train_small, y_train, x_test_small)

### CAT

In [None]:
cat_train, cat_test = cat_model(x_train_small, y_train, x_test_small)

## See results of individual model

In [None]:
testA_result = pd.read_csv('originalDataset/sample_submit.csv')
testA_result_pred = testA_result.copy()

In [None]:
testA_result_pred["isDefault"] = xgb_test
# testA_result_pred

In [None]:
testA_result_pred.to_csv("submissionResults/xgboost-1109-3.6.csv", index=False)

## zzzzzz(Stops here)zzzzzzzzzzz

## Deal with outliers

### Define a function here for finding outliers.

Especially for numerical data that are outside of the range (3 * standard error). 

In [None]:
def find_outliers_by_3segama(data,fea):
    stdError = np.std(data[fea])
    meanVal = np.mean(data[fea])
    lowerBound = meanVal - 3*stdError
    higherBound = meanVal + 3*stdError
    data[fea+'_outliers'] = data[fea].apply(lambda x: str('异常值') if x < lowerBound or x > higherBound else str('正常值'))
    return data

### See some of the labels' situation. 

`isDefault` is the label.

The meaning of the outputs of the following cell is that: _in each column, rows are grouped by `正常值` or `异常值`, and the `sum()` will be the sum of multiple '1's and '0's. E.g., in column `interestRate_outliers`, there are 150000+ 正常值 rows that are positive labeled, and there are 2916 异常值 rows that are positive labeled._

In [None]:
data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train,fea)
    print(data_train[fea+'_outliers'].value_counts())
    print("-"*10)
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*10)
    print()

In [None]:
data_train.head(3).append(data_train.tail(3))

### Delete 异常值, which is abnormal values. 

Only the rows that all numerical columns are normal values will be kept. Other rows will be given up. 

**Mind this**: sometimes abnormal values cannot be removed. They should also be kept, because sometimes the abnormal values can lead to discoveries. 

#### Actually delete abnormal values

In [None]:
#删除异常值
for fea in numerical_fea:
    data_train = data_train[data_train[fea+'_outliers']=='正常值']
    data_train = data_train.reset_index(drop=True) 

In [None]:
data_train.shape

(612742, 92)

#### Don't delete any abnormal values


In [None]:
## do nothing at all here. 

## Distribute the data into bins 

Following cells are fake codes. 

**Mind this**: Multiple methods can be used. 

### Some examples

In [None]:
# 通过除法映射到间隔均匀的分箱中，每个分箱的取值范围都是loanAmnt/1000
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)

In [None]:
## 通过对数函数映射到指数宽度分箱
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))

The `pd.qcut()` is cut the numbers into bins. 

The functionality of parameter `labels=False/True` can be seen from the following cells.  

In [None]:
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
data["loanAmnt_bin3"].head()

0    5
1    7
2    4
3    6
4    9
Name: loanAmnt_bin3, dtype: int64

In [None]:
data_train_tst = data_train.copy()
data_train_tst['loanAmnt_bin3'] = pd.qcut(data_train_tst['loanAmnt'], 10, labels=False)
data_train_tst["loanAmnt_bin3"].head()

0    9
1    7
2    4
3    0
4    4
Name: loanAmnt_bin3, dtype: int64

In [None]:
data_train_tst = data_train.copy()
data_train_tst['loanAmnt_bin3'] = pd.qcut(data_train_tst['loanAmnt'], 10)
data_train_tst["loanAmnt_bin3"].head()

0    (25000.0, 40000.0]
1    (17500.0, 20000.0]
2    (10000.0, 12000.0]
3     (499.999, 5000.0]
4    (10000.0, 12000.0]
Name: loanAmnt_bin3, dtype: category
Categories (10, interval[float64]): [(499.999, 5000.0] < (5000.0, 6500.0] < (6500.0, 8500.0] <
                                     (8500.0, 10000.0] ... (15000.0, 17500.0] < (17500.0, 20000.0] <
                                     (20000.0, 25000.0] < (25000.0, 40000.0]]

### Actually don't bin at all

In [None]:
## do nothing at all

## Combinatorial features

（交互特征）is the combination of original features. https://www.msra.cn/zh-cn/news/features/kdd-2018-xdeepfm#:~:text=%E7%89%B9%E5%BE%81%E4%BA%A4%E4%BA%92%E6%8C%87%E7%9A%84%E6%98%AF,user_id%2C%20item_id%5D%E7%9A%84%E8%81%94%E7%B3%BB%E3%80%82

**Mind this**: there may be multiple ways of combining features. Try more methods. 

Categorical features: use target encoding:  

In [None]:
## 这里的target mean, 就跟我之前那个kaggle项目用的target encoding在思路上有一定的类似之处.
## 但是这里的具体算法又不太一样. 
## 这里的算法是: 
### 将col进行分组, 求每一个分组里面的label的总和的平均值是多少
## 然后再映射一下, 构建新的列. 
## 这里就是一种combination的方法了, 可以学习一下. 
for col in ['subGrade', 'grade']: # , 'employmentTitle', 'postCode', 'title'
    temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    print(temp_dict.head())
    temp_dict = temp_dict[col + '_target_mean'].to_dict()
    print(col, " ", temp_dict, sum(temp_dict.values()), "\n")

    data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)

   subGrade  subGrade_target_mean
0         0              0.031919
1         1              0.045697
2         2              0.055882
3         3              0.067221
4         4              0.085399
subGrade   {0: 0.031919410243544714, 1: 0.04569698065449286, 2: 0.05588170381814169, 3: 0.06722064148991205, 4: 0.08539886975949533, 5: 0.10292105138974093, 6: 0.11226174056571778, 7: 0.12923868312757203, 8: 0.1486388238145246, 9: 0.16564893291126315, 10: 0.19135984870870515, 11: 0.20689215602957423, 12: 0.22457598712877924, 13: 0.25011293820021685, 14: 0.2615487780647725, 15: 0.27798153120702074, 16: 0.2975723763570567, 17: 0.30401537804357115, 18: 0.32286295472822746, 19: 0.3347348357439175, 20: 0.35523321956769055, 21: 0.37690255766514985, 22: 0.3874599542334096, 23: 0.4022430712822172, 24: 0.4191609846296082, 25: 0.4264978902953587, 26: 0.45599078341013827, 27: 0.45680738048644115, 28: 0.4774396642182581, 29: 0.48299319727891155, 30: 0.4661739624786811, 31: 0.4809098294069862, 32: 

In [None]:
data_train.shape

(800000, 51)

In [None]:
# data_train[['grade', 'subGrade', 'grade_target_mean', "subGrade_target_mean"]].head()

In [None]:
# 其他衍生变量 mean 和 std
## 我认为, 这一部的操作增加的是什么呢? 
## 除号右边那个部分得到的是: 按照nx分组, 每一组求一个平均数, 然后把这个平均数赋给每一行数据, 让每一行数据自己认领一个属于自己的平均数值. 
## 结合上除号, 就得到的是: 每一行数据的等级分除以每一行数据对应的平均分, 得到的商. 
## 学习一下, 如果要进行特征组合, 多采用这里面用到的方法, 诸如groupby, transform之类. 
for df in [data_train, data_test_a]:
    for item in ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')
        df['grade_to_max_' + item] = df['grade'] / df.groupby([item])['grade'].transform('max')
        df['grade_to_min_' + item] = df['grade'] / df.groupby([item])['grade'].transform('min')
        
        # df['subGrade_to_mean_' + item] = df['subGrade'] / df.groupby([item])['subGrade'].transform('mean')
        # df['subGrade_to_std_' + item] = df['subGrade'] / df.groupby([item])['subGrade'].transform('std')

In [None]:
data_train.shape

(800000, 111)

In [None]:
# df.groupby(["n0"])['grade'].transform('mean')

In [None]:
# df.groupby(["n0"])['grade'].mean()

In [None]:
# for col in ['grade', 'subGrade', 'issueDate', 'earliesCreditLine']: # ['grade', 'subGrade']: # , 'employmentTitle', 'postCode', 'title'
#     # temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
#     # temp_dict.index = temp_dict[col].values
#     # print(temp_dict.head())
#     # temp_dict = temp_dict[col + '_target_mean'].to_dict()
#     # print(col, " ", temp_dict, sum(temp_dict.values()), "\n")

#     data_train.drop(col, axis=1, inplace=True)
#     data_test_a.drop(col, axis=1, inplace = True)

In [None]:
data_train.shape

(800000, 111)

## ~Fill the empty cells again~

Use the row above the n/a row to fill the n/a cells. In another word, fill the n/a with previous value.

There are some cells used previously. I think after they are used, there is no need to run the empty cell again. 

**Mind this**: other methods may used. 

In [None]:
# data_train = data_train.fillna(axis = 0, method = "ffill")

## Delete the features that will not be used to fit the model

No `xx_outliers` features, no labels, no original dates. 

In [None]:
features = [f for f in data_train.columns if f not in ['id','issueDate', "earliesCreditLine", 'isDefault', "policyCode"] + numerical_noserial_fea and '_outliers' not in f]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']

In [None]:
x_train.shape

(800000, 99)

In [None]:
# for data in [data_train, data_test_a]:
#     data.drop(["issueDate", "id"], axis = 1, inplace = True)

## Change the distribution of the data

In [None]:
x_train_cp1 = x_train.copy()
x_test_cp1 = x_test.copy()

In [None]:
x_train = x_train_cp1.copy()
x_test = x_test.copy()

### Log all of the numerical data

In [None]:
for data in [x_train, x_test]:
    for fea in numerical_serial_fea:
        if fea in ["id", "purpose", "regionCode"]: ## 这几个, 没法进行log操作.
            continue
        data[fea] = data[fea].apply(np.log1p)

In [None]:
x_train.head()

Unnamed: 0,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,annualIncome,postCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDateDT,...,regionCode_43,regionCode_44,regionCode_45,regionCode_46,regionCode_47,regionCode_48,regionCode_49,regionCode_50,grade_target_mean,subGrade_target_mean,grade_to_mean_n0,grade_to_std_n0,grade_to_mean_n1,grade_to_std_n1,grade_to_mean_n2,grade_to_std_n2,grade_to_mean_n3,grade_to_std_n3,grade_to_mean_n4,grade_to_std_n4,grade_to_mean_n5,grade_to_std_n5,grade_to_mean_n6,grade_to_std_n6,grade_to_mean_n7,grade_to_std_n7,grade_to_mean_n8,grade_to_std_n8,grade_to_mean_n9,grade_to_std_n9,grade_to_mean_n10,grade_to_std_n10,grade_to_mean_n11,grade_to_std_n11,grade_to_mean_n12,grade_to_std_n12,grade_to_mean_n13,grade_to_std_n13,grade_to_mean_n14,grade_to_std_n14
0,10.463132,5,3.0214,6.823253,4,21,12.165391,1.098612,11.608245,3.806662,2.893146,0.0,6.594413,6.59987,2.079442,0.0,0.0,10.09324,3.910021,3.332205,0,0,1.098612,1.0,0.0,1.098612,1.098612,1.098612,1.609438,2.302585,2.197225,1.609438,2.564949,1.098612,2.079442,0.0,0.0,0.0,1.098612,2587,...,0,0,0,0,0,0,0,0,0.384291,0.376903,2.343157,3.08747,2.340208,3.132197,2.483341,3.09007,2.483341,3.09007,2.290512,3.12296,2.361338,3.124309,2.321539,3.098429,2.272681,3.148661,2.293558,3.124742,2.476104,3.091823,2.290444,3.184266,2.291804,3.090643,2.292831,3.091214,2.302359,3.088676,2.290853,3.168627
1,9.798183,5,2.969902,6.137511,3,16,11.559189,1.791759,10.736418,4.189655,3.361417,0.0,6.552508,6.558198,2.639057,0.0,0.0,9.622251,3.686376,2.944439,1,0,8.75274,1.0,,,1.575551,1.575551,2.397895,,2.803331,2.564964,,1.575551,2.639057,0.0,0.0,1.171916e-16,2.91778,1888,...,0,0,0,0,0,0,0,0,0.303852,0.297572,1.0,,1.0,,1.0,,1.0,,1.799617,2.244387,1.0,,1.0,,1.0,,1.0,,1.0,,1.721298,2.303035,1.718853,2.317982,1.719624,2.318411,1.0,,1.0,
2,9.392745,5,2.889816,5.701012,3,17,12.152202,2.197225,11.211834,5.587249,3.168424,0.0,6.516193,6.522093,2.484907,0.0,0.0,8.435332,3.966511,3.332205,0,0,0.693147,1.0,0.0,0.0,1.386294,1.386294,0.0,0.0,3.091042,1.609438,1.791759,1.386294,2.484907,0.0,0.0,0.0,1.609438,3044,...,0,0,0,0,0,0,0,0,0.303852,0.304015,1.757368,2.315603,1.53122,2.161958,1.816601,2.341786,1.816601,2.341786,1.369429,2.229075,1.217917,2.25057,1.660906,2.24876,1.704511,2.361495,1.577005,2.332637,1.813296,2.344317,1.707234,2.308753,1.718853,2.317982,1.719624,2.318411,1.726769,2.316507,1.444127,2.264665
3,9.305741,3,2.111425,5.834694,0,3,12.427747,2.397895,11.678448,4.043051,2.901971,0.0,6.530878,6.536692,2.302585,0.0,0.0,9.205227,3.981549,3.367296,1,0,10.173591,1.0,1.94591,1.609438,1.94591,1.94591,1.609438,2.833213,1.609438,2.079442,3.091042,1.94591,2.302585,0.0,0.0,0.0,0.693147,2983,...,0,0,0,0,0,0,0,0,0.060375,0.067221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.006701,3,2.638343,4.625659,2,11,12.453983,2.126529,10.275086,5.429346,3.501344,0.0,6.53814,6.543912,2.564949,0.0,0.0,7.987185,3.496508,3.332205,0,0,6.81564,1.0,0.693147,1.098612,2.079442,2.079442,1.098612,1.609438,2.302585,2.397895,2.772589,2.079442,2.564949,0.0,0.0,0.0,1.609438,3196,...,0,0,0,0,0,0,0,0,0.22502,0.206892,1.054598,1.575441,1.170104,1.566099,1.101956,1.569043,1.101956,1.569043,1.09681,1.57181,1.075744,1.566738,1.15491,1.534359,1.142249,1.54174,1.163517,1.554608,1.103551,1.566999,1.143978,1.543316,1.145902,1.545322,1.146416,1.545607,1.151179,1.544338,0.962751,1.509776


In [None]:
x_train_cp1.head()

Unnamed: 0,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,annualIncome,postCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDateDT,...,regionCode_43,regionCode_44,regionCode_45,regionCode_46,regionCode_47,regionCode_48,regionCode_49,regionCode_50,grade_target_mean,subGrade_target_mean,grade_to_mean_n0,grade_to_std_n0,grade_to_mean_n1,grade_to_std_n1,grade_to_mean_n2,grade_to_std_n2,grade_to_mean_n3,grade_to_std_n3,grade_to_mean_n4,grade_to_std_n4,grade_to_mean_n5,grade_to_std_n5,grade_to_mean_n6,grade_to_std_n6,grade_to_mean_n7,grade_to_std_n7,grade_to_mean_n8,grade_to_std_n8,grade_to_mean_n9,grade_to_std_n9,grade_to_mean_n10,grade_to_std_n10,grade_to_mean_n11,grade_to_std_n11,grade_to_mean_n12,grade_to_std_n12,grade_to_mean_n13,grade_to_std_n13,grade_to_mean_n14,grade_to_std_n14
0,35000.0,5,19.52,917.97,4,21,192026,2.0,110000.0,44,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,2587,...,0,0,0,0,0,0,0,0,0.384291,0.376903,2.343157,3.08747,2.340208,3.132197,2.483341,3.09007,2.483341,3.09007,2.290512,3.12296,2.361338,3.124309,2.321539,3.098429,2.272681,3.148661,2.293558,3.124742,2.476104,3.091823,2.290444,3.184266,2.291804,3.090643,2.292831,3.091214,2.302359,3.088676,2.290853,3.168627
1,18000.0,5,18.49,461.9,3,16,104734,5.0,46000.0,65,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,6327,1.0,-29.500403,-8.833409,3.833404,3.833404,10.0,-41.000365,15.499512,12.00019,-18.00007,3.833404,13.0,0.0,0.0,1.171916e-16,17.500178,1888,...,0,0,0,0,0,0,0,0,0.303852,0.297572,1.0,,1.0,,1.0,,1.0,,1.799617,2.244387,1.0,,1.0,,1.0,,1.0,,1.0,,1.721298,2.303035,1.718853,2.317982,1.719624,2.318411,1.0,,1.0,
2,12000.0,5,16.99,298.17,3,17,189510,8.0,74000.0,266,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,1,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,3044,...,0,0,0,0,0,0,0,0,0.303852,0.304015,1.757368,2.315603,1.53122,2.161958,1.816601,2.341786,1.816601,2.341786,1.369429,2.229075,1.217917,2.25057,1.660906,2.24876,1.704511,2.361495,1.577005,2.332637,1.813296,2.344317,1.707234,2.308753,1.718853,2.317982,1.719624,2.318411,1.726769,2.316507,1.444127,2.264665
3,11000.0,3,7.26,340.96,0,3,249632,10.0,118000.0,56,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,26201,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,2983,...,0,0,0,0,0,0,0,0,0.060375,0.067221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3000.0,3,12.99,101.07,2,11,256268,7.385708,29000.0,227,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,911,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,3196,...,0,0,0,0,0,0,0,0,0.22502,0.206892,1.054598,1.575441,1.170104,1.566099,1.101956,1.569043,1.101956,1.569043,1.09681,1.57181,1.075744,1.566738,1.15491,1.534359,1.142249,1.54174,1.163517,1.554608,1.103551,1.566999,1.143978,1.543316,1.145902,1.545322,1.146416,1.545607,1.151179,1.544338,0.962751,1.509776


### Log some of the feature

`interestRate`, `annualIncome`, `openAcc`, `revolBal`, `totalAcc`, `n1` to `n10`

In [None]:
for data in [x_train, x_test]:
    for fea in ["interestRate", "annualIncome", "openAcc", "revolBal", "totalAcc", 
                'n1','n2','n3','n4','n5','n6','n7','n8','n9','n10']:
        data[fea] = data[fea].apply(np.log1p)

## ~Calculating covariance~

The covariance is more useful when you want to select features. I guess. Perhaps we don't have to anything here. 

In [None]:
# x_train = data_train.drop(['isDefault'], axis=1)
# #计算协方差
# data_corr = x_train.corrwith(data_train["isDefault"]) #计算相关性
# data_corr

In [None]:
# result = pd.DataFrame(columns=['features', 'corr'])
# result['features'] = data_corr.index
# result['corr'] = data_corr.values
# result

Visualize the correlation: 

In [None]:
# numerical_fea

In [None]:
# # 当然也可以直接看图
# numerical_fea.remove("id")
# data_numeric = data_train[numerical_fea]
# correlation = data_numeric.corr()

# f, ax = plt.subplots(figsize = (7, 7))
# plt.title('Correlation of Numeric Features with Price',y=1,size=16)
# sns.heatmap(correlation,square = True,  vmax=0.8)

In [None]:
# list(data_train.columns)

# Save preprocessed data

In [None]:
x_train.head()

Unnamed: 0,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,annualIncome,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,earliesCreditLine,title,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDateDT,...,homeOwnership_5,verificationStatus_1,verificationStatus_2,applicationType_1,initialListStatus_1,loanAmnt_bin1,loanAmnt_bin2,loanAmnt_bin3,grade_target_mean,subGrade_target_mean,grade_to_mean_n0,grade_to_std_n0,grade_to_mean_n1,grade_to_std_n1,grade_to_mean_n2,grade_to_std_n2,grade_to_mean_n3,grade_to_std_n3,grade_to_mean_n4,grade_to_std_n4,grade_to_mean_n5,grade_to_std_n5,grade_to_mean_n6,grade_to_std_n6,grade_to_mean_n7,grade_to_std_n7,grade_to_mean_n8,grade_to_std_n8,grade_to_mean_n9,grade_to_std_n9,grade_to_mean_n10,grade_to_std_n10,grade_to_mean_n11,grade_to_std_n11,grade_to_mean_n12,grade_to_std_n12,grade_to_mean_n13,grade_to_std_n13,grade_to_mean_n14,grade_to_std_n14
0,35000.0,5,19.52,917.97,5,21,192025,2.0,110000.0,1,43,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,2001,1,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,2587,...,0,0,1,0,0,35.0,4.0,9,0.384291,0.376903,1.853364,3.848126,1.845529,3.915247,1.915171,3.862588,1.915171,3.862588,1.850437,3.842205,1.856007,3.905387,1.836213,3.873036,1.811571,3.935826,1.822148,3.905928,1.911723,3.864778,1.820574,3.980333,1.821304,3.863294,1.82198,3.863687,1.829339,3.85634,1.84548,3.899647
1,18000.0,5,18.49,461.9,4,16,104733,5.0,46000.0,0,64,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,2002,6326,0.0,3.0,5.0,5.0,10.0,7.0,7.0,7.0,13.0,5.0,13.0,0.0,0.0,0.0,2.0,1888,...,0,0,1,0,1,18.0,4.0,6,0.303852,0.297572,1.482691,3.0785,1.48592,3.095163,1.491092,3.09362,1.491092,3.09362,1.4998,2.992516,1.498132,3.054955,1.50872,3.026935,1.49086,3.061355,1.513533,3.02586,1.490842,3.093421,1.458326,3.070714,1.457043,3.090635,1.457584,3.090949,1.463471,3.085072,1.476384,3.119718
2,12000.0,5,16.99,298.17,4,17,189509,8.0,74000.0,0,265,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,2006,0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,3044,...,0,0,1,0,0,12.0,4.0,4,0.303852,0.304015,1.482691,3.0785,1.35066,2.88209,1.508617,3.122382,1.508617,3.122382,1.251507,2.977571,1.141414,3.045774,1.425393,2.998346,1.449257,3.148661,1.378198,3.110183,1.506906,3.125756,1.49359,3.014728,1.457043,3.090635,1.457584,3.090949,1.463471,3.085072,1.299807,3.019553
3,11000.0,3,7.26,340.96,1,3,249631,10.0,118000.0,4,55,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1999,26200,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,2983,...,0,1,0,0,1,11.0,4.0,4,0.060375,0.067221,0.345553,0.812803,0.364319,0.781221,0.361138,0.786672,0.361138,0.786672,0.370087,0.768441,0.383021,0.768889,0.365541,0.788355,0.372715,0.765339,0.371837,0.770467,0.360988,0.786035,0.36306,0.784788,0.364261,0.772659,0.364396,0.772737,0.365868,0.771268,0.391052,0.820342
4,3000.0,3,12.99,101.07,3,11,256267,10.0,29000.0,10,226,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,1977,910,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,3196,...,0,0,1,0,0,3.0,3.0,0,0.22502,0.206892,1.035748,2.363162,1.107318,2.349148,1.065737,2.353565,1.065737,2.353565,1.062523,2.357715,1.049253,2.350106,1.098202,2.301539,1.090539,2.31261,1.103377,2.331911,1.066731,2.350499,1.091589,2.314974,1.092782,2.317976,1.093188,2.318212,1.097603,2.313804,0.974855,2.264665


In [None]:
x_train.shape

(800000, 84)

In [None]:
x_train.isnull().sum()

loanAmnt             0
term                 0
interestRate         0
installment          0
grade                0
                    ..
grade_to_std_n12     0
grade_to_mean_n13    0
grade_to_std_n13     0
grade_to_mean_n14    0
grade_to_std_n14     0
Length: 84, dtype: int64

In [None]:
# x_train = x_train.fillna(axis = 0, method = "ffill")
# x_test = x_test.fillna(axis = 0, method = "ffill")

In [None]:
x_train.to_csv("preprocessedData/x_train-1110-3.7-1.csv", index=False)
x_test.to_csv("preprocessedData/x_test-1110-3.7-1.csv", index=False)
y_train.to_csv("preprocessedData/y_train-1110-3.7-1.csv", index=False)