In [1]:
import numpy as np
import mltools as ml
import xgboost as xgb

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
np.random.seed(9)

In [4]:
X_train = np.genfromtxt('data/X_train.txt', delimiter=None)
Y_train = np.genfromtxt('data/Y_train.txt', delimiter=None)
X_test = np.genfromtxt('data/X_test.txt', delimiter=None)

#### Rescaling Data

In [5]:
X_train_rescaled, parameters = ml.rescale(X_train)
X_test_rescaled, _ = ml.rescale(X_test, parameters)

#### Split data into Train and Validate

In [6]:
X_split_train, X_split_validation, Y_split_train, Y_split_validation = \
                                                train_test_split(X_train_rescaled, Y_train, test_size = 0.3)

#### Applying XGBoost

In [16]:
xlf = xgb.XGBRegressor(nthread=4, #when use hyperthread, xgboost may become slower
                        objective='binary:logistic',
                        learning_rate=0.15, #so called `eta` value
                        max_depth=8,
                        min_child_weight=3,
                        silent=1,
                        subsample=0.9,
                        colsample_bytree=0.5,
                        n_estimators=300, #number of trees
                        seed=1337)

In [17]:
xlf.fit(X_split_train, Y_split_train, eval_metric='auc', verbose = True, \
        eval_set = [(X_split_train, Y_split_train)], early_stopping_rounds=500)

# xlf.fit(X_train_rescaled, Y_train, eval_metric='auc', verbose = True)

[0]	validation_0-auc:0.69036
Will train until validation_0-auc hasn't improved in 500 rounds.
[1]	validation_0-auc:0.700098
[2]	validation_0-auc:0.703322
[3]	validation_0-auc:0.708441
[4]	validation_0-auc:0.710035
[5]	validation_0-auc:0.717068
[6]	validation_0-auc:0.71979
[7]	validation_0-auc:0.722433
[8]	validation_0-auc:0.727084
[9]	validation_0-auc:0.728973
[10]	validation_0-auc:0.733315
[11]	validation_0-auc:0.734714
[12]	validation_0-auc:0.740296
[13]	validation_0-auc:0.744943
[14]	validation_0-auc:0.748486
[15]	validation_0-auc:0.750838
[16]	validation_0-auc:0.752115
[17]	validation_0-auc:0.755977
[18]	validation_0-auc:0.758253
[19]	validation_0-auc:0.760315
[20]	validation_0-auc:0.761866
[21]	validation_0-auc:0.762621
[22]	validation_0-auc:0.762898
[23]	validation_0-auc:0.763481
[24]	validation_0-auc:0.766182
[25]	validation_0-auc:0.768098
[26]	validation_0-auc:0.768947
[27]	validation_0-auc:0.770864
[28]	validation_0-auc:0.771208
[29]	validation_0-auc:0.773469
[30]	validation_0

[259]	validation_0-auc:0.868503
[260]	validation_0-auc:0.868597
[261]	validation_0-auc:0.868808
[262]	validation_0-auc:0.868986
[263]	validation_0-auc:0.869149
[264]	validation_0-auc:0.869212
[265]	validation_0-auc:0.869404
[266]	validation_0-auc:0.869482
[267]	validation_0-auc:0.86962
[268]	validation_0-auc:0.869887
[269]	validation_0-auc:0.870036
[270]	validation_0-auc:0.870071
[271]	validation_0-auc:0.870209
[272]	validation_0-auc:0.870593
[273]	validation_0-auc:0.870561
[274]	validation_0-auc:0.870743
[275]	validation_0-auc:0.87097
[276]	validation_0-auc:0.871257
[277]	validation_0-auc:0.871441
[278]	validation_0-auc:0.871746
[279]	validation_0-auc:0.871952
[280]	validation_0-auc:0.87216
[281]	validation_0-auc:0.872311
[282]	validation_0-auc:0.872366
[283]	validation_0-auc:0.872552
[284]	validation_0-auc:0.872636
[285]	validation_0-auc:0.872812
[286]	validation_0-auc:0.872895
[287]	validation_0-auc:0.872985
[288]	validation_0-auc:0.873249
[289]	validation_0-auc:0.873345
[290]	valid

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.15, max_delta_step=0,
       max_depth=8, min_child_weight=3, missing=None, n_estimators=300,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1337, silent=1,
       subsample=0.9)

In [18]:
predictions = xlf.predict(X_test_rescaled)

In [19]:
predictions

array([ 0.78285754,  0.62050647,  0.20419091, ...,  0.26792434,
        0.15568559,  0.16630571], dtype=float32)

#### Saving txt file

In [20]:
Y_test = np.vstack((np.arange(X_test.shape[0]), predictions)).T

In [21]:
np.savetxt('Y_submit_XGBoost_v4.txt', Y_test, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')