In [1]:
# データ解析のライブラリ
import pandas as pd
import numpy as np 

# Scikit-learn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix 

# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier

import time

## データの読み込みと準備

In [2]:
X_train = pd.read_csv('train_x.csv')
Y_train = pd.read_csv('train_y.csv')
X_test = pd.read_csv('test_x.csv')
Y_test = pd.read_csv('test_y.csv')
X_train_in = X_train.values
Y_train_in = Y_train.values.ravel()
X_test_in = X_test.values
Y_test_in = Y_test.values.ravel()

### GridSearchの実行

### estimatorの探索

In [22]:
params = { 'n_estimators':[200,300,400]}

In [23]:
s = time.time()
gs = GridSearchCV(estimator = XGBClassifier(seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

{'n_estimators': 400}
0.926125387027
elapsed time227.71946811676025


In [24]:
params = { 'n_estimators':[400,500,600]}

In [25]:
s = time.time()
gs = GridSearchCV(estimator = XGBClassifier(seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

{'n_estimators': 500}
0.926417717295
elapsed time294.14839792251587


In [27]:
# n_estimatorsは500に決定。

In [28]:
initial_params = {
    'n_estimators':500
}

In [29]:
params={
    'max_depth':[4,6,10],
    'min_child_weight':[4,6,10]
}

In [30]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

{'max_depth': 4, 'min_child_weight': 10}
0.926479684757
elapsed time6286.767390966415


In [32]:
initial_params = {
    'n_estimators':500,
    'min_child_weight':10
}
params={
    'max_depth':[3,4,5],
}

In [33]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

{'max_depth': 4}
0.926479684757
elapsed time6722.813516139984


In [34]:
initial_params = {
    'n_estimators':500,
    'max_depth':4,
}
params={
    'min_child_weight':[8,10,12],
}

In [35]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

{'min_child_weight': 10}
0.926479684757
elapsed time7191.437421798706


In [41]:
Y_test_pred = gs.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

0.909325469959
[[2324  174]
 [  72  143]]


In [36]:
initial_params = {
    'n_estimators':500,
    'max_depth':4,
}
params={
    'min_child_weight':[9,10,11],
}

In [42]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

Y_test_pred = gs.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

{'min_child_weight': 9}
0.926507244002
elapsed time7809.8395919799805
0.905270917803
[[2322  183]
 [  74  134]]


In [43]:
initial_params = {
    'n_estimators':500,
    'max_depth':4,
    'min_child_weight':9
}
params = {
    'gamma':[0.0,0.1,0.2]
}


In [44]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

Y_test_pred = gs.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

{'gamma': 0.0}
0.926507244002
elapsed time8733.908485889435
0.905270917803
[[2322  183]
 [  74  134]]


In [46]:
initial_params = {
    'n_estimators':500,
    'max_depth':4,
    'min_child_weight':9,
    'gamma':0
}
params = {
    'subsample':[0.5,0.6,0.7,0.8,0.9,1.0]
}

In [47]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

Y_test_pred = gs.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

{'subsample': 1.0}
0.926507244002
elapsed time9511.5060338974
0.905270917803
[[2322  183]
 [  74  134]]


In [49]:
initial_params = {
    'n_estimators':500,
    'max_depth':4,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
}
params = {
    'colsample_bytree':[0.6,0.7,0.8,0.9,1.0]
}

In [50]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

Y_test_pred = gs.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

{'colsample_bytree': 0.6}
0.926961051828
elapsed time10826.95302605629
0.906745300405
[[2327  184]
 [  69  133]]


In [51]:
initial_params = {
    'n_estimators':500,
    'max_depth':4,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
    'colsample_bytree':0.6
}
params = {
    'learning_rate':[0.5, 0.2, 0.1, 0.05]
}

In [52]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, scoring='roc_auc',cv=5,n_jobs=-1)
gs.fit(X_train_in, Y_train_in)
print(gs.best_params_)
print(gs.best_score_)
e = time.time()-s
print(f"elapsed time{e}")

Y_test_pred = gs.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

{'learning_rate': 0.1}
0.926961051828
elapsed time11388.557144165039
0.906745300405
[[2327  184]
 [  69  133]]


### 以上のパラメタでXGBoostの学習をする

In [3]:
params = {
    'n_estimators':500,
    'max_depth':4,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
    'colsample_bytree':0.6,
    'learning_rate':0.1
}

In [4]:
xgboost_opt = XGBClassifier(**params, seed=42)
xgboost_opt.fit(X_train_in, Y_train_in)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=9, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=1.0)

In [5]:
Y_test_pred = xgboost_opt.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

0.906745300405
[[2327  184]
 [  69  133]]


In [7]:
Y_train_pred = xgboost_opt.predict(X_train_in)
cmat = confusion_matrix(Y_train_pred, Y_train_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_train_pred))
print(cmat)

0.927667417571
[[21069  1277]
 [  489  1580]]


# 試しにRandomForestに同じデータを食わせてみる

In [10]:
rf = RandomForestClassifier(n_estimators = 300, random_state = 42)
rf.fit(X_train_in, Y_train_in)

1.0
[[21558     0]
 [    0  2857]]


In [11]:
Y_train_pred = rf.predict(X_train_in)
cmat = confusion_matrix(Y_train_pred, Y_train_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_train_pred))
print(cmat)

1.0
[[21558     0]
 [    0  2857]]


In [12]:
Y_test_pred = rf.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

0.909325469959
[[2338  188]
 [  58  129]]


## RandomForestで確率を取得してみる

In [27]:
X_test = pd.read_csv('pred_x.csv')
X_test_in = X_test.values

In [28]:
Y_test_pred = rf.predict_proba(X_test_in)

In [29]:
Y_test_pred[0:45,0:2]

array([[ 0.52333333,  0.47666667],
       [ 0.51      ,  0.49      ],
       [ 0.87666667,  0.12333333],
       [ 0.98666667,  0.01333333],
       [ 0.49      ,  0.51      ],
       [ 0.76666667,  0.23333333],
       [ 0.93666667,  0.06333333],
       [ 0.84      ,  0.16      ],
       [ 0.98666667,  0.01333333],
       [ 0.42666667,  0.57333333],
       [ 0.92666667,  0.07333333],
       [ 0.74      ,  0.26      ],
       [ 0.42666667,  0.57333333],
       [ 0.97333333,  0.02666667],
       [ 0.86333333,  0.13666667],
       [ 0.28666667,  0.71333333],
       [ 0.94666667,  0.05333333],
       [ 0.85333333,  0.14666667],
       [ 0.99666667,  0.00333333],
       [ 0.99333333,  0.00666667],
       [ 0.26666667,  0.73333333],
       [ 0.98333333,  0.01666667],
       [ 0.95      ,  0.05      ],
       [ 0.7       ,  0.3       ],
       [ 0.71333333,  0.28666667],
       [ 0.91      ,  0.09      ],
       [ 0.93666667,  0.06333333],
       [ 0.72333333,  0.27666667],
       [ 0.91333333,

In [30]:
submit_file = pd.read_csv('data/submit_sample.csv', names=['id','prob'])

In [31]:
t = Y_test_pred[:,1]

In [32]:
t.shape

(18083,)

In [33]:
submit_file.shape

(18083, 2)

In [34]:
submit_file['prob']=t

In [35]:
submit_file.head()

Unnamed: 0,id,prob
0,1,0.476667
1,2,0.49
2,3,0.123333
3,4,0.013333
4,5,0.51


In [38]:
submit_file.to_csv('submit9_RF_python.csv', columns=['id','prob'], header=False, index=False)