In [7]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

from util import prepare_train_X_y
from util import prepare_test_X

In [8]:
train_X, train_y = prepare_train_X_y()


In [9]:
clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=350, learning_rate=0.03, nthread=4, subsample=0.95, colsample_bytree=0.85, seed=7171)

In [10]:
fit_X, eval_X, fit_y, eval_y = train_test_split(train_X, train_y, test_size=0.3)

In [11]:
clf.fit(train_X, train_y, early_stopping_rounds=20, eval_metric="auc", eval_set=[(eval_X, eval_y)])

Will train until validation_0 error hasn't decreased in 20 rounds.
[0]	validation_0-auc:0.816582
[1]	validation_0-auc:0.779287
[2]	validation_0-auc:0.755121
[3]	validation_0-auc:0.789165
[4]	validation_0-auc:0.774244
[5]	validation_0-auc:0.788026
[6]	validation_0-auc:0.798764
[7]	validation_0-auc:0.800394
[8]	validation_0-auc:0.803066
[9]	validation_0-auc:0.805444
[10]	validation_0-auc:0.806470
[11]	validation_0-auc:0.807320
[12]	validation_0-auc:0.809352
[13]	validation_0-auc:0.809517
[14]	validation_0-auc:0.810770
[15]	validation_0-auc:0.811794
[16]	validation_0-auc:0.809407
[17]	validation_0-auc:0.806220
[18]	validation_0-auc:0.806437
[19]	validation_0-auc:0.807251
[20]	validation_0-auc:0.807697
Stopping. Best iteration:
[0]	validation_0-auc:0.816582



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=350, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7171, silent=True, subsample=0.95)

In [12]:
print('Overall AUC:', roc_auc_score(train_y, clf.predict_proba(train_X)[:,1]))

Overall AUC: 0.837095684085


#### Making predictions

In [13]:
test_ids, test_X = prepare_test_X()
pred_y = clf.predict_proba(test_X)[:,1]

In [14]:
submission = pd.DataFrame({"ID":test_ids, "TARGET":pred_y})
submission.to_csv("submission.csv", index=False)

In [16]:
!head submission.csv

ID,TARGET
2,0.2809791564941406
5,0.2854905426502228
6,0.26829150319099426
7,0.27427324652671814
9,0.26829150319099426
11,0.3549484312534332
12,0.2890394926071167
15,0.3535606861114502
16,0.27775314450263977
