In [1]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
import gc
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold
warnings.simplefilter('ignore')

In [2]:
# Read the train/test tables and the sample submission (read without a header row)
train_df, test_df = (pd.read_csv(path) for path in ('input/train.csv', 'input/test.csv'))
submit_df = pd.read_csv('input/submit_sample.csv', header=None)

In [3]:
# Check the size (rows, columns) of each table
tuple(frame.shape for frame in (train_df, test_df, submit_df))

((27100, 18), (18050, 17), (18050, 2))

In [4]:
# Give the test rows a dummy target value (-999) so that training and test
# rows can still be told apart once both frames are concatenated.
test_df['y']=-999

In [5]:
# Preview the training data as loaded from the CSV
train_df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,31,services,married,secondary,no,12294,yes,no,cellular,21,nov,101,3,498,0,other,0
1,1,29,entrepreneur,single,tertiary,no,43027,no,no,cellular,22,aug,158,2,702,0,unknown,1
2,2,35,management,married,tertiary,no,12252,yes,no,cellular,11,nov,351,1,826,0,failure,0
3,3,31,technician,married,secondary,no,99121,yes,yes,unknown,16,may,658,2,120,0,failure,0
4,4,48,unemployed,married,primary,no,42005,yes,no,telephone,3,apr,177,1,273,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27095,27095,37,blue-collar,married,secondary,no,26661,yes,no,cellular,27,may,345,4,425,0,unknown,0
27096,27096,35,services,married,secondary,no,42150,yes,no,cellular,27,may,121,1,719,0,unknown,0
27097,27097,35,services,married,unknown,no,34531,no,no,cellular,28,jun,177,2,121,0,unknown,0
27098,27098,30,admin.,single,secondary,no,99621,yes,no,cellular,27,may,121,1,100,0,unknown,0


In [6]:
# Stack the training and test rows into a single frame
all_df = pd.concat(objs=[train_df, test_df])
# The separate frames are no longer needed; drop the names and collect garbage
del train_df
del test_df
gc.collect()

20

In [7]:
# Preview the combined train + test frame
all_df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,31,services,married,secondary,no,12294,yes,no,cellular,21,nov,101,3,498,0,other,0
1,1,29,entrepreneur,single,tertiary,no,43027,no,no,cellular,22,aug,158,2,702,0,unknown,1
2,2,35,management,married,tertiary,no,12252,yes,no,cellular,11,nov,351,1,826,0,failure,0
3,3,31,technician,married,secondary,no,99121,yes,yes,unknown,16,may,658,2,120,0,failure,0
4,4,48,unemployed,married,primary,no,42005,yes,no,telephone,3,apr,177,1,273,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18045,18045,49,self-employed,married,tertiary,no,98357,yes,no,cellular,6,jul,101,2,417,0,failure,-999
18046,18046,34,blue-collar,married,secondary,no,29621,yes,no,cellular,12,may,345,1,815,0,unknown,-999
18047,18047,34,admin.,single,secondary,no,94260,yes,no,unknown,16,may,121,2,370,0,unknown,-999
18048,18048,31,technician,single,secondary,no,65483,yes,no,unknown,15,may,345,2,41,0,unknown,-999


In [8]:
# Lookup tables that encode each categorical feature as an integer.
# NOTE: Series.map converts every value that is missing from the table into
# NaN, so each key must match the raw category string exactly.
marital_mapping = {'married': 3, 'single': 2, 'divorced': 1}
education_mapping = {'secondary': 4, 'tertiary': 3, 'primary': 2, 'unknown': 1}
default_mapping = {'no': 0, 'yes': 1}
housing_mapping = {'no': 0, 'yes': 1}
loan_mapping = {'no': 0, 'yes': 1}
month_mapping = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# Replace the string categories with their integer codes.
# This cell overwrites the columns in place: running it a second time would map
# the already-encoded integers through the tables again and produce NaN.
all_df['marital'] = all_df['marital'].map(marital_mapping)
all_df['education'] = all_df['education'].map(education_mapping)
all_df['default'] = all_df['default'].map(default_mapping)
all_df['housing'] = all_df['housing'].map(housing_mapping)
all_df['loan'] = all_df['loan'].map(loan_mapping)
all_df['month'] = all_df['month'].map(month_mapping)

In [9]:
# Preview the combined frame after the categorical columns were mapped to numbers
all_df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,31,services,3.0,4,0,12294,1,0,cellular,21,11,101,3,498,0,other,0
1,1,29,entrepreneur,2.0,3,0,43027,0,0,cellular,22,8,158,2,702,0,unknown,1
2,2,35,management,3.0,3,0,12252,1,0,cellular,11,11,351,1,826,0,failure,0
3,3,31,technician,3.0,4,0,99121,1,1,unknown,16,5,658,2,120,0,failure,0
4,4,48,unemployed,3.0,2,0,42005,1,0,telephone,3,4,177,1,273,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18045,18045,49,self-employed,3.0,3,0,98357,1,0,cellular,6,7,101,2,417,0,failure,-999
18046,18046,34,blue-collar,3.0,4,0,29621,1,0,cellular,12,5,345,1,815,0,unknown,-999
18047,18047,34,admin.,2.0,4,0,94260,1,0,unknown,16,5,121,2,370,0,unknown,-999
18048,18048,31,technician,2.0,4,0,65483,1,0,unknown,15,5,345,2,41,0,unknown,-999


In [10]:
# Split the combined frame back into training and test rows using the -999 sentinel
train_df = all_df.loc[all_df['y'].ne(-999)]
test_df = all_df.loc[all_df['y'].eq(-999)]

In [11]:
# Preview the training rows recovered from the combined frame
train_df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,31,services,3.0,4,0,12294,1,0,cellular,21,11,101,3,498,0,other,0
1,1,29,entrepreneur,2.0,3,0,43027,0,0,cellular,22,8,158,2,702,0,unknown,1
2,2,35,management,3.0,3,0,12252,1,0,cellular,11,11,351,1,826,0,failure,0
3,3,31,technician,3.0,4,0,99121,1,1,unknown,16,5,658,2,120,0,failure,0
4,4,48,unemployed,3.0,2,0,42005,1,0,telephone,3,4,177,1,273,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27095,27095,37,blue-collar,3.0,4,0,26661,1,0,cellular,27,5,345,4,425,0,unknown,0
27096,27096,35,services,3.0,4,0,42150,1,0,cellular,27,5,121,1,719,0,unknown,0
27097,27097,35,services,3.0,1,0,34531,0,0,cellular,28,6,177,2,121,0,unknown,0
27098,27098,30,admin.,2.0,4,0,99621,1,0,cellular,27,5,121,1,100,0,unknown,0


In [12]:
# Remove the 'contact' and 'poutcome' columns from both frames
train_df = train_df.drop(columns=['contact','poutcome'])
test_df = test_df.drop(columns=['contact','poutcome'])

In [13]:
# Preview the training frame after the column removal
train_df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,y
0,0,31,services,3.0,4,0,12294,1,0,21,11,101,3,498,0,0
1,1,29,entrepreneur,2.0,3,0,43027,0,0,22,8,158,2,702,0,1
2,2,35,management,3.0,3,0,12252,1,0,11,11,351,1,826,0,0
3,3,31,technician,3.0,4,0,99121,1,1,16,5,658,2,120,0,0
4,4,48,unemployed,3.0,2,0,42005,1,0,3,4,177,1,273,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27095,27095,37,blue-collar,3.0,4,0,26661,1,0,27,5,345,4,425,0,0
27096,27096,35,services,3.0,4,0,42150,1,0,27,5,121,1,719,0,0
27097,27097,35,services,3.0,1,0,34531,0,0,28,6,177,2,121,0,0
27098,27098,30,admin.,2.0,4,0,99621,1,0,27,5,121,1,100,0,0


In [14]:
# Preview the test frame (its rows carry the dummy target y == -999)
test_df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,y
0,0,35,technician,2.0,4,0,89043,0,0,7,2,101,2,184,2,-999
1,1,37,services,3.0,4,0,64372,1,0,7,7,158,3,241,0,-999
2,2,31,services,2.0,4,0,31606,1,0,15,5,152,2,47,0,-999
3,3,31,admin.,3.0,4,0,94826,1,0,27,5,345,2,490,0,-999
4,4,32,services,3.0,4,0,100401,0,0,7,1,126,1,686,0,-999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18045,18045,49,self-employed,3.0,3,0,98357,1,0,6,7,101,2,417,0,-999
18046,18046,34,blue-collar,3.0,4,0,29621,1,0,12,5,345,1,815,0,-999
18047,18047,34,admin.,2.0,4,0,94260,1,0,16,5,121,2,370,0,-999
18048,18048,31,technician,2.0,4,0,65483,1,0,15,5,345,2,41,0,-999


In [15]:
# Remove further columns that are not used as model features
train_df = train_df.drop(columns=['job','marital','day','campaign','pdays','previous'])
test_df = test_df.drop(columns=['job','marital','day','campaign','pdays','previous'])

In [16]:
# categorical_features = ['age','education','default','balance','housing','loan','month','duration']

In [17]:
# Preview the training frame with the reduced set of columns
train_df

Unnamed: 0,id,age,education,default,balance,housing,loan,month,duration,y
0,0,31,4,0,12294,1,0,11,101,0
1,1,29,3,0,43027,0,0,8,158,1
2,2,35,3,0,12252,1,0,11,351,0
3,3,31,4,0,99121,1,1,5,658,0
4,4,48,2,0,42005,1,0,4,177,0
...,...,...,...,...,...,...,...,...,...,...
27095,27095,37,4,0,26661,1,0,5,345,0
27096,27096,35,4,0,42150,1,0,5,121,0
27097,27097,35,1,0,34531,0,0,6,177,0
27098,27098,30,4,0,99621,1,0,5,121,0


In [18]:
# Accumulators for the per-fold validation scores
scores_accuracy, scores_logloss = [], []

# Modelを作成
class Model:
    """Thin wrapper around the native XGBoost training API.

    Attributes:
        model: The booster returned by ``xgb.train``; ``None`` until ``fit`` has run.
        params: Caller-supplied parameter overrides merged over the baseline in ``fit``.
    """

    def __init__(self, params=None):
        """Store optional parameter overrides.

        Args:
            params: Dict of XGBoost parameters that override the baseline
                values used by ``fit``. ``None`` means "no overrides".
        """
        self.model = None
        self.params = {} if params is None else params

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train a booster on the training split, early-stopping on the validation split.

        Args:
            tr_x: Training features.
            tr_y: Training labels.
            va_x: Validation features.
            va_y: Validation labels.

        The trained booster is stored in ``self.model``; nothing is returned.
        """
        # Baseline parameters; anything passed to __init__ takes precedence.
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eta': 0.2,
            'gamma': 0.0,
            'alpha': 0.0,
            'lambda': 1.0,
            'min_child_weight': 1,
            'max_depth': 8,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 71,
        }
        params.update(self.params)
        # Upper bound on boosting rounds; early stopping normally ends training sooner.
        num_round = 10000
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        # The validation set is listed last because xgb.train uses the last
        # entry of `evals` for early stopping.
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        # xgb.train has no `categorical_feature` keyword (that is LightGBM's
        # API), so only arguments it supports are passed here.
        self.model = xgb.train(params,
                               dtrain,
                               num_round,
                               evals=watchlist,
                               early_stopping_rounds=100)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``.

        Args:
            x: Feature data with the same columns that were used in ``fit``.

        Returns:
            The output of ``Booster.predict`` on ``x``.
        """
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        return pred

In [19]:
# Build the feature matrices (without the target and the 'id' column) and the target vector
train_x = train_df.drop(columns=['y','id'])
test_x = test_df.drop(columns=['y','id'])
train_y = train_df['y']

In [27]:
# Run cross-validation: the training data is split into 4 folds and each fold
# serves once as the validation set while the other three are used for training.

# Start from empty score lists so that re-running this cell does not append
# duplicate entries to results left over from an earlier run.
scores_accuracy = []
scores_logloss = []

kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    # Split the training data into a training part and a validation part
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # Train the model
    model = XGBClassifier(n_estimators=100, random_state=71)
    model.fit(tr_x, tr_y)

    # Predict the validation data as positive-class probabilities
    va_pred = model.predict_proba(va_x)[:, 1]

    # Score the validation predictions
    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred > 0.5)

    # Keep this fold's scores
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)

# NOTE: after the loop, `model`, `logloss` and `accuracy` refer to the final
# fold only; the per-fold history is in scores_logloss / scores_accuracy.

In [28]:
# Log loss of the final CV fold only (`logloss` is reassigned on every fold);
# the values of all folds are collected in scores_logloss.
logloss

0.23329219679007945

In [29]:
# Accuracy of the final CV fold only (`accuracy` is reassigned on every fold);
# the values of all folds are collected in scores_accuracy.
accuracy

0.9186715867158671

In [23]:
# Predict positive-class probabilities for the test features.
# NOTE(review): `model` is whichever estimator was fitted last in the CV loop
# (trained on three of the four folds), not a model refit on all training rows.
pred = model.predict_proba(test_x)[:, 1]

In [24]:
# Preview the predicted probabilities
pred

array([0.2168988 , 0.10047965, 0.10047965, ..., 0.10047965, 0.07642312,
       0.15636547], dtype=float32)

In [25]:
# Store the predictions in column 1 of the sample submission
# (the file was read with header=None, so its columns are labelled 0 and 1).
submit_df[1]=pred

In [26]:
# Write the submission file, omitting the index column and the header row
submit_df.to_csv('output/submission_v8.csv', header=None, index=False)