## featuretools で特徴量作成

In [68]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import featuretools as ft
import featuretools.variable_types as vtypes
from boruta import BorutaPy

from sklearn.ensemble import RandomForestClassifier

import check_miss_value


## 学習データ

In [37]:
# Load the training data from the pickle file.
# NOTE(review): presumably produced by an earlier notebook under ../001 — pickle
# files should only be loaded from trusted sources.
train_pkl = pd.read_pickle('../001/train.pk1')

In [38]:
# Show the (rows, columns) of the freshly loaded training frame.
train_pkl.shape

(242150, 10)

In [39]:
# Remove the grade and purpose columns from the training frame.
train_pkl.drop(columns=['grade', 'purpose'], inplace=True)

In [40]:
# Preview the first rows after dropping grade and purpose.
train_pkl.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,employment_length,credit_score,application_type,loan_status
0,0,609.296068,3,8.421982,0,714.061803,0,0
1,1,1183.266999,5,10.286776,10,697.706701,0,1
2,2,695.783256,3,14.723425,1,656.419357,0,0
3,3,738.392546,3,14.260708,0,657.906852,0,0
4,4,1642.400654,5,25.217452,10,662.972297,0,0


In [41]:
# Keep a copy that still holds 'id' and 'loan_status'; train_pkl is later
# rebound to the generated feature matrix and these columns are restored from
# this copy.
train_pk2 = train_pkl.copy()
train_pkl.shape

(242150, 8)

In [42]:
# Create an empty EntitySet that will hold the training table.
es = ft.EntitySet(id='feature')

In [43]:
# Register the training features as a single entity.
# 'id' and 'loan_status' are dropped so that neither the identifier nor the
# target is used as an input to feature synthesis.
# The frame has no column named 'index', so make_index=True asks featuretools
# to create that integer index column explicitly instead of falling back to
# it with an "index not found" warning.
es.entity_from_dataframe(
    entity_id='feature_id',
    dataframe=train_pkl.drop(['id', 'loan_status'], axis=1),
    index='index',
    make_index=True,
    variable_types={}
)

  "integer column".format(index))


Entityset: feature
  Entities:
    feature_id [Rows: 242150, Columns: 7]
  Relationships:
    No relationships

In [44]:
# Run Deep Feature Synthesis on the single training entity.
# Only transform primitives can produce features here: aggregation primitives
# need a related child entity, and this EntitySet has no relationships.
# agg_primitives is passed as an explicit empty list (omitting it would select
# the library defaults), which avoids the "primitives were not used" warning
# without changing the generated features.
feature_matrix, features_dfs = ft.dfs(
    entityset=es,
    target_entity='feature_id',
    agg_primitives=[],
    trans_primitives=['add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric'],
    max_depth=1,
)

  agg_primitives: ['count', 'max', 'mean', 'skew', 'std', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


In [45]:
# Show the (rows, columns) of the generated training feature matrix.
feature_matrix.shape

(242150, 81)

In [46]:
# Preview the generated training features.
feature_matrix.head()

Unnamed: 0_level_0,loan_amnt,term,interest_rate,employment_length,credit_score,application_type,application_type + credit_score,application_type + employment_length,application_type + interest_rate,application_type + loan_amnt,...,credit_score - employment_length,credit_score - interest_rate,credit_score - loan_amnt,credit_score - term,employment_length - interest_rate,employment_length - loan_amnt,employment_length - term,interest_rate - loan_amnt,interest_rate - term,loan_amnt - term
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,609.296068,3,8.421982,0,714.061803,0,714.061803,0,8.421982,609.296068,...,714.061803,705.639821,104.765735,711.061803,-8.421982,-609.296068,-3,-600.874086,5.421982,606.296068
1,1183.266999,5,10.286776,10,697.706701,0,697.706701,10,10.286776,1183.266999,...,687.706701,687.419925,-485.560298,692.706701,-0.286776,-1173.266999,5,-1172.980223,5.286776,1178.266999
2,695.783256,3,14.723425,1,656.419357,0,656.419357,1,14.723425,695.783256,...,655.419357,641.695933,-39.363899,653.419357,-13.723425,-694.783256,-2,-681.059832,11.723425,692.783256
3,738.392546,3,14.260708,0,657.906852,0,657.906852,0,14.260708,738.392546,...,657.906852,643.646144,-80.485694,654.906852,-14.260708,-738.392546,-3,-724.131838,11.260708,735.392546
4,1642.400654,5,25.217452,10,662.972297,0,662.972297,10,25.217452,1642.400654,...,652.972297,637.754845,-979.428357,657.972297,-15.217452,-1632.400654,5,-1617.183202,20.217452,1637.400654


In [47]:
# Rebind train_pkl to an independent copy of the feature matrix.
train_pkl = feature_matrix.copy()

In [48]:
# Restore the target and the row identifier that were excluded from feature
# synthesis. Assignment aligns on the index — assumes train_pk2 keeps the
# default 0..n-1 index matching the generated 'index' column; TODO confirm.
for col in ('loan_status', 'id'):
    train_pkl[col] = train_pk2[col]

In [49]:
# Show the (rows, columns) after restoring 'loan_status' and 'id'.
train_pkl.shape

(242150, 83)

In [66]:
# Summary statistics of all columns before missing values are filled.
train_pkl.describe()

  x2 = take(ap, indices_above, axis=axis) * weights_above


Unnamed: 0,loan_amnt,term,interest_rate,employment_length,credit_score,application_type,application_type + credit_score,application_type + employment_length,application_type + interest_rate,application_type + loan_amnt,...,credit_score - loan_amnt,credit_score - term,employment_length - interest_rate,employment_length - loan_amnt,employment_length - term,interest_rate - loan_amnt,interest_rate - term,loan_amnt - term,loan_status,id
count,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,...,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0
mean,1520.389009,3.431658,13.801496,6.672298,683.575024,0.007359,683.582383,6.679657,13.808855,1520.396369,...,-836.813985,680.143366,-7.129197,-1513.716711,3.24064,-1506.587514,10.369838,1516.957351,0.174916,121074.5
std,830.250197,0.822794,4.588924,3.657411,29.554795,0.085469,29.556557,3.656696,4.590767,830.254507,...,827.868342,29.590202,5.864958,830.014131,3.725264,829.295029,4.269758,829.966347,0.379896,69902.82818
min,323.797279,3.0,5.704849,0.0,655.424269,0.0,655.424269,0.0,5.704849,323.797279,...,-3175.687333,650.510945,-27.52859,-3851.867974,-5.0,-3839.01803,0.812533,320.797279,0.0,0.0
25%,761.954545,3.0,10.876086,3.0,659.531106,0.0,659.535793,3.0,10.881283,761.954545,...,-1456.738754,656.161458,-11.415029,-2145.168102,0.0,-2137.691382,7.722855,758.919295,0.0,60537.25
50%,1212.680586,3.0,13.543833,9.0,678.672563,0.0,678.678756,9.0,13.547887,1212.686834,...,-543.566096,675.292875,-6.997864,-1206.503023,5.0,-1198.974968,10.27917,1209.357969,0.0,121074.5
75%,2152.21333,3.0,17.172395,10.0,698.59196,0.0,698.595488,10.0,17.176818,2152.22384,...,-94.138528,695.267101,-3.04824,-756.153564,7.0,-748.529252,12.980316,2148.261662,0.0,181611.75
max,3851.867974,5.0,27.980604,10.0,808.551641,1.0,808.551641,11.0,28.980604,3851.867974,...,462.497412,804.433596,4.295151,-313.797279,7.0,-316.593872,24.260526,3846.867974,1.0,242149.0


In [77]:
# List only the columns that contain missing values, with their NaN counts.
null_counts = train_pkl.isnull().sum()
null_counts[null_counts > 0]

application_type / employment_length    18851
employment_length / application_type    18851
dtype: int64

In [80]:
# Replace missing values with 0 (modifies train_pkl in place).
# NOTE(review): this only affects NaN. The later inf -> NaN check in this
# notebook reports many additional entries in the division features, so
# infinite values appear to remain after this step — confirm whether they
# should be handled before the frame is pickled.
train_pkl.fillna(0, inplace=True)

In [81]:
# Verify that no column has missing values left after filling.
null_counts = train_pkl.isnull().sum()
null_counts[null_counts > 0]

Series([], dtype: int64)

In [87]:
# Summary statistics of all columns after missing values were filled.
train_pkl.describe()

Unnamed: 0,loan_amnt,term,interest_rate,employment_length,credit_score,application_type,application_type + credit_score,application_type + employment_length,application_type + interest_rate,application_type + loan_amnt,...,credit_score - loan_amnt,credit_score - term,employment_length - interest_rate,employment_length - loan_amnt,employment_length - term,interest_rate - loan_amnt,interest_rate - term,loan_amnt - term,loan_status,id
count,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,...,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0,242150.0
mean,1520.389009,3.431658,13.801496,6.672298,683.575024,0.007359,683.582383,6.679657,13.808855,1520.396369,...,-836.813985,680.143366,-7.129197,-1513.716711,3.24064,-1506.587514,10.369838,1516.957351,0.174916,121074.5
std,830.250197,0.822794,4.588924,3.657411,29.554795,0.085469,29.556557,3.656696,4.590767,830.254507,...,827.868342,29.590202,5.864958,830.014131,3.725264,829.295029,4.269758,829.966347,0.379896,69902.82818
min,323.797279,3.0,5.704849,0.0,655.424269,0.0,655.424269,0.0,5.704849,323.797279,...,-3175.687333,650.510945,-27.52859,-3851.867974,-5.0,-3839.01803,0.812533,320.797279,0.0,0.0
25%,761.954545,3.0,10.876086,3.0,659.531106,0.0,659.535793,3.0,10.881283,761.954545,...,-1456.738754,656.161458,-11.415029,-2145.168102,0.0,-2137.691382,7.722855,758.919295,0.0,60537.25
50%,1212.680586,3.0,13.543833,9.0,678.672563,0.0,678.678756,9.0,13.547887,1212.686834,...,-543.566096,675.292875,-6.997864,-1206.503023,5.0,-1198.974968,10.27917,1209.357969,0.0,121074.5
75%,2152.21333,3.0,17.172395,10.0,698.59196,0.0,698.595488,10.0,17.176818,2152.22384,...,-94.138528,695.267101,-3.04824,-756.153564,7.0,-748.529252,12.980316,2148.261662,0.0,181611.75
max,3851.867974,5.0,27.980604,10.0,808.551641,1.0,808.551641,11.0,28.980604,3851.867974,...,462.497412,804.433596,4.295151,-313.797279,7.0,-316.593872,24.260526,3846.867974,1.0,242149.0


In [88]:
# Inspect column dtypes and non-null counts.
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242150 entries, 0 to 242149
Data columns (total 83 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   loan_amnt                             242150 non-null  float64
 1   term                                  242150 non-null  int16  
 2   interest_rate                         242150 non-null  float64
 3   employment_length                     242150 non-null  int16  
 4   credit_score                          242150 non-null  float64
 5   application_type                      242150 non-null  int8   
 6   application_type + credit_score       242150 non-null  float64
 7   application_type + employment_length  242150 non-null  int16  
 8   application_type + interest_rate      242150 non-null  float64
 9   application_type + loan_amnt          242150 non-null  float64
 10  application_type + term               242150 non-null  int16  
 11  

In [82]:
# Persist the engineered training frame to the current directory.
train_pkl.to_pickle('train.pk2')

In [89]:
# Build a separate frame in which infinite values are turned into NaN so they
# can be counted with isnull(); train_pkl itself is left untouched.
# Both +inf and -inf are mapped (the original handled only +inf), and
# replace() already returns a new frame, so no explicit copy is needed.
train_pk3 = train_pkl.replace([np.inf, -np.inf], np.nan)

In [90]:
# Count, per column, the entries that became NaN after the inf replacement.
null_counts = train_pk3.isnull().sum()
null_counts[null_counts > 0]

application_type / employment_length       522
credit_score / application_type         240368
credit_score / employment_length         19373
employment_length / application_type    221517
interest_rate / application_type        240368
interest_rate / employment_length        19373
loan_amnt / application_type            240368
loan_amnt / employment_length            19373
term / application_type                 240368
term / employment_length                 19373
dtype: int64

## 検証データ

In [51]:
# Load the test data from the pickle file.
# NOTE(review): presumably produced by an earlier notebook under ../001 — pickle
# files should only be loaded from trusted sources.
test_pkl = pd.read_pickle('../001/test.pk1')

In [52]:
# Show the (rows, columns) of the freshly loaded test frame.
test_pkl.shape

(26900, 9)

In [53]:
# Remove the grade and purpose columns from the test frame.
test_pkl.drop(columns=['grade', 'purpose'], inplace=True)

In [54]:
# Preview the first rows after dropping grade and purpose.
test_pkl.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,employment_length,credit_score,application_type
0,242150,1161.830205,3,13.105362,10,682.016083,0
1,242151,2257.465815,5,14.1962,10,668.240313,0
2,242152,2182.137002,3,8.307772,10,703.277694,0
3,242153,520.284912,3,6.956504,7,731.138779,0
4,242154,2303.349549,3,11.658497,6,698.368991,0


In [55]:
# Keep a copy that still holds 'id'; test_pkl is later rebound to the
# generated feature matrix.
test_pk2 = test_pkl.copy()
test_pkl.shape, test_pk2.shape

((26900, 7), (26900, 7))

In [56]:
# Start a fresh EntitySet for the test table (rebinds es, replacing the one
# that held the training table).
es = ft.EntitySet(id='feature')

In [57]:
# Register the test features as a single entity.
# 'id' is dropped so the identifier is not used as an input to feature
# synthesis.
# The frame has no column named 'index', so make_index=True asks featuretools
# to create that integer index column explicitly instead of falling back to
# it with an "index not found" warning.
es.entity_from_dataframe(
    entity_id='feature_id',
    dataframe=test_pkl.drop(['id'], axis=1),
    index='index',
    make_index=True,
    variable_types={}
)

  "integer column".format(index))


Entityset: feature
  Entities:
    feature_id [Rows: 26900, Columns: 7]
  Relationships:
    No relationships

In [58]:
# Run Deep Feature Synthesis on the single test entity with the same transform
# primitives as for the training data.
# Aggregation primitives need a related child entity, and this EntitySet has
# no relationships, so agg_primitives is passed as an explicit empty list
# (omitting it would select the library defaults). This avoids the
# "primitives were not used" warning without changing the generated features.
feature_matrix, features_dfs = ft.dfs(
    entityset=es,
    target_entity='feature_id',
    agg_primitives=[],
    trans_primitives=['add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric'],
    max_depth=1,
)

  agg_primitives: ['count', 'max', 'mean', 'skew', 'std', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


In [59]:
# Show the (rows, columns) of the generated test feature matrix.
feature_matrix.shape

(26900, 81)

In [60]:
# Preview the generated test features.
feature_matrix.head()

Unnamed: 0_level_0,loan_amnt,term,interest_rate,employment_length,credit_score,application_type,application_type + credit_score,application_type + employment_length,application_type + interest_rate,application_type + loan_amnt,...,credit_score - employment_length,credit_score - interest_rate,credit_score - loan_amnt,credit_score - term,employment_length - interest_rate,employment_length - loan_amnt,employment_length - term,interest_rate - loan_amnt,interest_rate - term,loan_amnt - term
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1161.830205,3,13.105362,10,682.016083,0,682.016083,10,13.105362,1161.830205,...,672.016083,668.910721,-479.814122,679.016083,-3.105362,-1151.830205,7,-1148.724843,10.105362,1158.830205
1,2257.465815,5,14.1962,10,668.240313,0,668.240313,10,14.1962,2257.465815,...,658.240313,654.044114,-1589.225502,663.240313,-4.1962,-2247.465815,5,-2243.269615,9.1962,2252.465815
2,2182.137002,3,8.307772,10,703.277694,0,703.277694,10,8.307772,2182.137002,...,693.277694,694.969921,-1478.859308,700.277694,1.692228,-2172.137002,7,-2173.829229,5.307772,2179.137002
3,520.284912,3,6.956504,7,731.138779,0,731.138779,7,6.956504,520.284912,...,724.138779,724.182275,210.853868,728.138779,0.043496,-513.284912,4,-513.328407,3.956504,517.284912
4,2303.349549,3,11.658497,6,698.368991,0,698.368991,6,11.658497,2303.349549,...,692.368991,686.710494,-1604.980558,695.368991,-5.658497,-2297.349549,3,-2291.691052,8.658497,2300.349549


In [61]:
# Rebind test_pkl to an independent copy of the feature matrix.
test_pkl = feature_matrix.copy()

In [62]:
# Restore the row identifier that was excluded from feature synthesis.
# The ids must come from the test copy (test_pk2). The original read them from
# train_pk2, which silently attached training ids to the test rows.
test_pkl['id'] = test_pk2['id']

In [64]:
# Show the (rows, columns) after restoring 'id'.
test_pkl.shape

(26900, 82)

In [83]:
# List only the columns that contain missing values, with their NaN counts.
null_counts = test_pkl.isnull().sum()
null_counts[null_counts > 0]

application_type / employment_length    2000
employment_length / application_type    2000
dtype: int64

In [84]:
# Replace missing values with 0 (modifies test_pkl in place).
# NOTE(review): this only affects NaN; infinite values produced by the
# division features presumably remain, as observed for the training frame —
# verify before using the saved file for modelling.
test_pkl.fillna(0, inplace=True)

In [85]:
# Verify that no column has missing values left after filling.
null_counts = test_pkl.isnull().sum()
null_counts[null_counts > 0]

Series([], dtype: int64)

In [86]:
# Persist the engineered test frame to the current directory.
test_pkl.to_pickle('test.pk2')