In [2]:
import numpy as np
import mltools as ml
import xgboost as xgb

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
np.random.seed(9)

In [5]:
X_train = np.genfromtxt('data/X_train.txt', delimiter=None)
Y_train = np.genfromtxt('data/Y_train.txt', delimiter=None)
X_test = np.genfromtxt('data/X_test.txt', delimiter=None)

#### Rescaling Data

In [6]:
X_train_rescaled, parameters = ml.rescale(X_train)
X_test_rescaled, _ = ml.rescale(X_test, parameters)

#### Split data into Train and Validate

In [7]:
X_split_train, X_split_validation, Y_split_train, Y_split_validation = \
                                                train_test_split(X_train_rescaled, Y_train, test_size = 0.3)

#### Applying XGBoost

In [8]:
xlf = xgb.XGBRegressor(max_depth=6,
                       objective='binary:logistic',
                       booster = 'gbtree',
                       subsample=1,
                       colsample_bytree=0.85,
                       eval_metric='auc') 

In [15]:
xlf.fit(X_split_train, Y_split_train, eval_metric='auc', verbose = True, \
        eval_set = [(X_split_train, Y_split_train)], early_stopping_rounds=100)

# xlf.fit(X_train_rescaled, Y_train, eval_metric='auc', verbose = True)

[0]	validation_0-auc:0.675584
Will train until validation_0-auc hasn't improved in 500 rounds.
[1]	validation_0-auc:0.68185
[2]	validation_0-auc:0.683596
[3]	validation_0-auc:0.685653
[4]	validation_0-auc:0.68712
[5]	validation_0-auc:0.688308
[6]	validation_0-auc:0.689007
[7]	validation_0-auc:0.690445
[8]	validation_0-auc:0.691957
[9]	validation_0-auc:0.693452
[10]	validation_0-auc:0.694373
[11]	validation_0-auc:0.69652
[12]	validation_0-auc:0.69807
[13]	validation_0-auc:0.700486
[14]	validation_0-auc:0.700962
[15]	validation_0-auc:0.701489
[16]	validation_0-auc:0.702226
[17]	validation_0-auc:0.703259
[18]	validation_0-auc:0.704764
[19]	validation_0-auc:0.705832
[20]	validation_0-auc:0.706881
[21]	validation_0-auc:0.708354
[22]	validation_0-auc:0.708941
[23]	validation_0-auc:0.709572
[24]	validation_0-auc:0.710333
[25]	validation_0-auc:0.711766
[26]	validation_0-auc:0.712421
[27]	validation_0-auc:0.713115
[28]	validation_0-auc:0.714492
[29]	validation_0-auc:0.716212
[30]	validation_0-a

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.85, eval_metric='auc', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [10]:
predictions = xlf.predict(X_test_rescaled)

In [11]:
predictions #1

array([ 0.57764763,  0.40613833,  0.1747202 , ...,  0.18124287,
        0.14698055,  0.175818  ], dtype=float32)

#### Saving txt file

In [12]:
Y_test = np.vstack((np.arange(X_test.shape[0]), predictions)).T

In [13]:
np.savetxt('Y_submit_XGBoost_v2.txt', Y_test, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')