In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest

Using TensorFlow backend.


In [2]:
bank_df = pd.read_csv('../datasets/ric/bank.csv', sep=',')
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,36,technician,single,secondary,no,265,yes,yes,,5,may,348,1,-1,0,,no
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,may,365,1,-1,0,,no
3,53,technician,married,secondary,no,-3,no,no,,5,may,1666,1,-1,0,,no
4,24,technician,single,secondary,no,-103,yes,yes,,5,may,145,1,-1,0,,no


In [3]:
bank_df = bank_df.dropna(subset=['job', 'education'])
print(bank_df.shape)

(6935, 17)


## 文字列値の集約

In [4]:
bank_df.loc[(bank_df['job'] == 'management') | 
            (bank_df['job'] == 'technician') |
            (bank_df['job'] == 'blue-collar') | 
            (bank_df['job'] == 'admin') |
            (bank_df['job'] == 'services') |
            (bank_df['job'] == 'self-employed') |
            (bank_df['job'] == 'entrepreneur') |
            (bank_df['job'] == 'housemaid'), 'job2'] = 'worker'

bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,job2
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no,worker
1,36,technician,single,secondary,no,265,yes,yes,,5,may,348,1,-1,0,,no,worker
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,may,365,1,-1,0,,no,worker
3,53,technician,married,secondary,no,-3,no,no,,5,may,1666,1,-1,0,,no,worker
4,24,technician,single,secondary,no,-103,yes,yes,,5,may,145,1,-1,0,,no,worker


In [5]:
bank_df.loc[(bank_df['month'] == 'jan') |
            (bank_df['month'] == 'feb') |
            (bank_df['month'] == 'mar'), 'month2'] = '1Q'

bank_df.loc[(bank_df['month'] == 'apr') |
            (bank_df['month'] == 'may') |
            (bank_df['month'] == 'jun'), 'month2'] = '2Q'

bank_df.loc[(bank_df['month'] == 'jul') |
            (bank_df['month'] == 'aug') |
            (bank_df['month'] == 'sep'), 'month2'] = '3Q'

bank_df.loc[(bank_df['month'] == 'oct') |
            (bank_df['month'] == 'nov') |
            (bank_df['month'] == 'dec'), 'month2'] = '4Q'

## 数値の集約

In [6]:
bank_df.loc[bank_df['day'] <= 10, 'day2'] = 'early'
bank_df.loc[bank_df['day'] <= 20, 'day2'] = 'middle'
bank_df.loc[bank_df['day'] > 20, 'day2'] = 'late'
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,job2,month2,day2
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no,worker,2Q,middle
1,36,technician,single,secondary,no,265,yes,yes,,5,may,348,1,-1,0,,no,worker,2Q,middle
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,may,365,1,-1,0,,no,worker,2Q,middle
3,53,technician,married,secondary,no,-3,no,no,,5,may,1666,1,-1,0,,no,worker,2Q,middle
4,24,technician,single,secondary,no,-103,yes,yes,,5,may,145,1,-1,0,,no,worker,2Q,middle


In [7]:
bank_df.loc[bank_df['duration'] < 300, 'duration2'] = 'short'
bank_df.loc[bank_df['duration'] >= 300, 'duration2'] = 'long'
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,duration,campaign,pdays,previous,poutcome,y,job2,month2,day2,duration2
0,58,management,married,tertiary,no,2143,yes,no,,5,...,261,1,-1,0,,no,worker,2Q,middle,short
1,36,technician,single,secondary,no,265,yes,yes,,5,...,348,1,-1,0,,no,worker,2Q,middle,long
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,...,365,1,-1,0,,no,worker,2Q,middle,long
3,53,technician,married,secondary,no,-3,no,no,,5,...,1666,1,-1,0,,no,worker,2Q,middle,long
4,24,technician,single,secondary,no,-103,yes,yes,,5,...,145,1,-1,0,,no,worker,2Q,middle,short


In [8]:
bank_df.loc[bank_df['previous'] < 1, 'previous2'] = 'zero'
bank_df.loc[bank_df['previous'] >= 1, 'previous2'] = 'one-more'
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,campaign,pdays,previous,poutcome,y,job2,month2,day2,duration2,previous2
0,58,management,married,tertiary,no,2143,yes,no,,5,...,1,-1,0,,no,worker,2Q,middle,short,zero
1,36,technician,single,secondary,no,265,yes,yes,,5,...,1,-1,0,,no,worker,2Q,middle,long,zero
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,...,1,-1,0,,no,worker,2Q,middle,long,zero
3,53,technician,married,secondary,no,-3,no,no,,5,...,1,-1,0,,no,worker,2Q,middle,long,zero
4,24,technician,single,secondary,no,-103,yes,yes,,5,...,1,-1,0,,no,worker,2Q,middle,short,zero


In [9]:
bank_df.loc[bank_df['pdays'] < 0, 'pdays2'] = 'less'
bank_df.loc[bank_df['pdays'] >= 0, 'pdays2'] = 'more'
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,pdays,previous,poutcome,y,job2,month2,day2,duration2,previous2,pdays2
0,58,management,married,tertiary,no,2143,yes,no,,5,...,-1,0,,no,worker,2Q,middle,short,zero,less
1,36,technician,single,secondary,no,265,yes,yes,,5,...,-1,0,,no,worker,2Q,middle,long,zero,less
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,...,-1,0,,no,worker,2Q,middle,long,zero,less
3,53,technician,married,secondary,no,-3,no,no,,5,...,-1,0,,no,worker,2Q,middle,long,zero,less
4,24,technician,single,secondary,no,-103,yes,yes,,5,...,-1,0,,no,worker,2Q,middle,short,zero,less


In [10]:
bank_df = bank_df.replace('yes', 1)
bank_df = bank_df.replace('no', 0)

bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,pdays,previous,poutcome,y,job2,month2,day2,duration2,previous2,pdays2
0,58,management,married,tertiary,0,2143,1,0,,5,...,-1,0,,0,worker,2Q,middle,short,zero,less
1,36,technician,single,secondary,0,265,1,1,,5,...,-1,0,,0,worker,2Q,middle,long,zero,less
2,25,blue-collar,married,secondary,0,-7,1,0,,5,...,-1,0,,0,worker,2Q,middle,long,zero,less
3,53,technician,married,secondary,0,-3,0,0,,5,...,-1,0,,0,worker,2Q,middle,long,zero,less
4,24,technician,single,secondary,0,-103,1,1,,5,...,-1,0,,0,worker,2Q,middle,short,zero,less


In [11]:
bank_df_job = pd.get_dummies(bank_df['job'])
bank_df_job.head()

Unnamed: 0,admin.,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed
0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,1,0


In [12]:
bank_df_marital = pd.get_dummies(bank_df['marital'])
bank_df_education = pd.get_dummies(bank_df['education'])
bank_df_contact = pd.get_dummies(bank_df['contact'])
bank_df_month = pd.get_dummies(bank_df['month'])

In [13]:
tmp1 = bank_df[['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y']]
tmp1.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y
0,58,0,2143,1,0,5,261,1,-1,0,0
1,36,0,265,1,1,5,348,1,-1,0,0
2,25,0,-7,1,0,5,365,1,-1,0,0
3,53,0,-3,0,0,5,1666,1,-1,0,0
4,24,0,-103,1,1,5,145,1,-1,0,0


In [14]:
bank_df_job2 = pd.get_dummies(bank_df['job2'])
bank_df_month2 = pd.get_dummies(bank_df['month2'])
bank_df_day2 = pd.get_dummies(bank_df['day2'])
bank_df_duration2 = pd.get_dummies(bank_df['duration2'])
bank_df_previous2 = pd.get_dummies(bank_df['previous2'])
bank_df_pdays2 = pd.get_dummies(bank_df['pdays2'])

In [15]:
tmp2 = pd.concat([tmp1, bank_df_marital], axis=1)
tmp3 = pd.concat([tmp2, bank_df_education], axis=1)
tmp4 = pd.concat([tmp3, bank_df_contact], axis=1)
bank_df_new = pd.concat([tmp4, bank_df_month], axis=1)

bank_df_new.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,dec,feb,jan,jul,jun,mar,may,nov,oct,sep
0,58,0,2143,1,0,5,261,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
1,36,0,265,1,1,5,348,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
2,25,0,-7,1,0,5,365,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
3,53,0,-3,0,0,5,1666,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
4,24,0,-103,1,1,5,145,1,-1,0,...,0,0,0,0,0,0,1,0,0,0


In [16]:
tmp5 = pd.concat([bank_df_new, bank_df_job2], axis=1)
tmp6 = pd.concat([tmp5, bank_df_month2], axis=1)
tmp7 = pd.concat([tmp6, bank_df_day2], axis=1)
tmp8 = pd.concat([tmp7, bank_df_duration2], axis=1)
tmp9 = pd.concat([tmp8, bank_df_previous2], axis=1)
bank_df_new2 = pd.concat([tmp9, bank_df_pdays2], axis=1)

bank_df_new2.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,3Q,4Q,late,middle,long,short,one-more,zero,less,more
0,58,0,2143,1,0,5,261,1,-1,0,...,0,0,0,1,0,1,0,1,1,0
1,36,0,265,1,1,5,348,1,-1,0,...,0,0,0,1,1,0,0,1,1,0
2,25,0,-7,1,0,5,365,1,-1,0,...,0,0,0,1,1,0,0,1,1,0
3,53,0,-3,0,0,5,1666,1,-1,0,...,0,0,0,1,1,0,0,1,1,0
4,24,0,-103,1,1,5,145,1,-1,0,...,0,0,0,1,0,1,0,1,1,0


In [17]:
bank_df_new2.to_csv('../datasets/ric/bank-prep2.csv', index=False)

In [18]:
X = np.array(bank_df_new.drop('y', axis=1))
Y = np.array(bank_df_new[['y']])
print('サンプリング前')
print(np.sum(Y == 1), np.sum(Y == 0))

sampler =RandomUnderSampler(random_state = 42)
X, Y = sampler.fit_resample(X, Y)
print('サンプリング後')
print(np.sum(Y==1), np.sum(Y==0))

サンプリング前
820 6115
サンプリング後
820 820


## 特徴量選択

In [19]:
selector = SelectKBest(k=5)
selector.fit(X, Y)
mask = selector.get_support()

print(bank_df_new.drop('y', axis=1).columns)
print(mask)

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'divorced', 'married', 'single',
       'primary', 'secondary', 'tertiary', 'cellular', 'telephone', 'apr',
       'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct',
       'sep'],
      dtype='object')
[False False False  True False False  True False False  True False False
 False False False False  True False False False False False False False
 False False  True False False False]
