In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb 
from xgboost import XGBClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix 

#### 教師データを見込み

In [7]:
X_train = pd.read_csv('train_x.csv')
Y_train = pd.read_csv('train_y.csv')

#### 検証用データを読み込み

In [8]:
X_test = pd.read_csv('test_x.csv')
Y_test = pd.read_csv('test_y.csv')

#### ランダムフォレストでモデル作成と精度検証

In [9]:
X_train_in = X_train.values
Y_train_in = Y_train.values.ravel()

In [10]:
RF = RandomForestClassifier(n_estimators = 300, random_state = 42)
RF.fit(X_train_in, Y_train_in)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [36]:
Y_test_pred = RF.predict(X_test.values)

In [41]:
cmat = confusion_matrix(Y_test_pred, Y_test, labels=[0,1])
print(cmat)

[[2338  188]
 [  58  129]]


In [48]:
(cmat[0,0]+cmat[1,1])/len(Y_test_pred)

0.90932546995945451

#### XGBoostでモデル作成と精度検証

In [50]:
XGB = XGBClassifier(n_estimators=300, seed_size=42)
XGB.fit(X_train_in, Y_train_in)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       seed_size=42, silent=True, subsample=1)

In [51]:
Y_test_pred = XGB.predict(X_test.values)

In [52]:
cmat = confusion_matrix(Y_test_pred, Y_test, labels=[0,1])
print(cmat)

[[2336  175]
 [  60  142]]


In [53]:
(cmat[0,0]+cmat[1,1])/len(Y_test_pred)

0.91338002211573899

### GridSearchで探索する

In [5]:
from sklearn.grid_search import GridSearchCV

In [11]:
params = {
    'n_estimators':[400,500,600],
    'max_depth':[2,3,4],
    'min_child_weight':[1,2,5,10],
    'gamma':[0,0.1,0.2],
    'subsample':[0.7,0.8,0.9],
    'colsample_bytree':[0.5,0.6,0.7],
    }
grid = GridSearchCV(estimator=XGBClassifier(seed_size=42),param_grid=params,scoring='roc_auc',cv=5)
grid.fit(X_train_in, Y_train_in)

KeyboardInterrupt: 

#### XGBoostで確率を取得する

In [54]:
Y_test_prob = XGB.predict_proba(X_test.values)

In [55]:
Y_test_prob

array([[ 0.2882694 ,  0.7117306 ],
       [ 0.9896273 ,  0.01037269],
       [ 0.97251546,  0.02748452],
       ..., 
       [ 0.81028998,  0.18970999],
       [ 0.33791405,  0.66208595],
       [ 0.97226554,  0.02773447]], dtype=float32)

In [56]:
Y_test_prob.shape

(2713, 2)

In [57]:
type(Y_test_prob)

numpy.ndarray