In [1]:
"""
    Loading required packages
"""
import pandas as pd
import numpy as np
import xgboost as xgb
import datetime
from sklearn.model_selection import train_test_split
import timeit

In [2]:
"""
    Logging start date time
"""
Start_DateTime = datetime.datetime.now()

In [3]:
"""
    Read source files: train & test
    Get datasets from https://www.kaggle.com/c/santander-customer-transaction-prediction/data
"""
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [4]:
"""
    split train dataset for cross validation
"""
build, valid = train_test_split(train, test_size=0.2, random_state=2019)

In [5]:
train.columns

Index(['ID_code', 'target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4',
       'var_5', 'var_6', 'var_7',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=202)

In [6]:
"""
    ID_code - Id column, no meaning
    target - target variable for modeling
"""
non_feature_names = ['ID_code','target']
feature_names = [col for col in train.columns if col not in non_feature_names]

In [7]:
"""
    Build XGBoost DMatrix for build & valid & test
    We may not re-train with full train dataset here
"""
dbuild = xgb.DMatrix(build[feature_names], label=build['target'])
dvalid = xgb.DMatrix(valid[feature_names], label=valid['target'])

"""
    watchlist for cross validation monitoring
    here main focus to check the run time for performance monitoring rather folding, seeding
"""

watchlist = [ (dbuild,'build'), (dvalid, 'valid') ]

xgbtest = xgb.DMatrix(test[feature_names])


In [8]:
"""
    XGBoost parameters
"""
param = {}
param['seed'] = 201902
param['objective'] = 'reg:linear'
param['eta'] = 0.01
param['max_depth'] = 8
param['silent'] = 0
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['nthread'] = 10
param['print_every_n'] = 1
xgb_num_rounds = 500
xgbParameters = list(param.items())

In [9]:
%%time
"""
    Model training and log run time
"""
model = xgb.train(xgbParameters, 
                    dbuild, 
                    xgb_num_rounds, 
                    watchlist                                       
                  )

[0]	build-rmse:0.496663	valid-rmse:0.496746
[1]	build-rmse:0.493363	valid-rmse:0.493523
[2]	build-rmse:0.490109	valid-rmse:0.49035
[3]	build-rmse:0.486892	valid-rmse:0.487221
[4]	build-rmse:0.48374	valid-rmse:0.484148
[5]	build-rmse:0.480615	valid-rmse:0.4811
[6]	build-rmse:0.477521	valid-rmse:0.478084
[7]	build-rmse:0.474477	valid-rmse:0.475115
[8]	build-rmse:0.471469	valid-rmse:0.472184
[9]	build-rmse:0.468497	valid-rmse:0.469293
[10]	build-rmse:0.465561	valid-rmse:0.466435
[11]	build-rmse:0.462664	valid-rmse:0.463617
[12]	build-rmse:0.459816	valid-rmse:0.460857
[13]	build-rmse:0.456992	valid-rmse:0.458122
[14]	build-rmse:0.45421	valid-rmse:0.455427
[15]	build-rmse:0.451467	valid-rmse:0.45277
[16]	build-rmse:0.448768	valid-rmse:0.450153
[17]	build-rmse:0.446104	valid-rmse:0.447568
[18]	build-rmse:0.443456	valid-rmse:0.445013
[19]	build-rmse:0.440855	valid-rmse:0.442499
[20]	build-rmse:0.438277	valid-rmse:0.440005
[21]	build-rmse:0.435746	valid-rmse:0.437564
[22]	build-rmse:0.433243	v

[182]	build-rmse:0.276343	valid-rmse:0.294143
[183]	build-rmse:0.276059	valid-rmse:0.293951
[184]	build-rmse:0.275786	valid-rmse:0.293776
[185]	build-rmse:0.275523	valid-rmse:0.293587
[186]	build-rmse:0.275248	valid-rmse:0.293408
[187]	build-rmse:0.274984	valid-rmse:0.293234
[188]	build-rmse:0.274728	valid-rmse:0.29306
[189]	build-rmse:0.274473	valid-rmse:0.292884
[190]	build-rmse:0.274212	valid-rmse:0.292706
[191]	build-rmse:0.273954	valid-rmse:0.29254
[192]	build-rmse:0.273697	valid-rmse:0.292372
[193]	build-rmse:0.273467	valid-rmse:0.292213
[194]	build-rmse:0.273219	valid-rmse:0.292053
[195]	build-rmse:0.272979	valid-rmse:0.291903
[196]	build-rmse:0.272733	valid-rmse:0.291749
[197]	build-rmse:0.272502	valid-rmse:0.29159
[198]	build-rmse:0.272267	valid-rmse:0.291441
[199]	build-rmse:0.272038	valid-rmse:0.291295
[200]	build-rmse:0.271815	valid-rmse:0.291143
[201]	build-rmse:0.271595	valid-rmse:0.290998
[202]	build-rmse:0.27137	valid-rmse:0.290858
[203]	build-rmse:0.271141	valid-rmse:0

[361]	build-rmse:0.248732	valid-rmse:0.279818
[362]	build-rmse:0.248637	valid-rmse:0.279789
[363]	build-rmse:0.248535	valid-rmse:0.279756
[364]	build-rmse:0.24844	valid-rmse:0.279717
[365]	build-rmse:0.248335	valid-rmse:0.279682
[366]	build-rmse:0.248234	valid-rmse:0.279646
[367]	build-rmse:0.248136	valid-rmse:0.279607
[368]	build-rmse:0.248038	valid-rmse:0.279563
[369]	build-rmse:0.247928	valid-rmse:0.279523
[370]	build-rmse:0.247829	valid-rmse:0.279489
[371]	build-rmse:0.24772	valid-rmse:0.279442
[372]	build-rmse:0.247613	valid-rmse:0.279404
[373]	build-rmse:0.247514	valid-rmse:0.279367
[374]	build-rmse:0.247403	valid-rmse:0.279328
[375]	build-rmse:0.247295	valid-rmse:0.279294
[376]	build-rmse:0.247186	valid-rmse:0.27925
[377]	build-rmse:0.247087	valid-rmse:0.279211
[378]	build-rmse:0.246978	valid-rmse:0.279182
[379]	build-rmse:0.246879	valid-rmse:0.279147
[380]	build-rmse:0.246773	valid-rmse:0.279114
[381]	build-rmse:0.24667	valid-rmse:0.279081
[382]	build-rmse:0.246572	valid-rmse:0

In [10]:
"""
    Predict on test set, we are not interested in predictions here
"""
preds = model.predict(xgbtest)

In [11]:
preds

array([0.12881759, 0.16210675, 0.23539215, ..., 0.0243341 , 0.12761903,
       0.09670562], dtype=float32)

In [12]:
"""
    Logging end date time
"""
End_DateTime = datetime.datetime.now()

In [13]:
minutes_diff = (End_DateTime - Start_DateTime).total_seconds() / 60.0

In [14]:
"""
    Total run time
"""
minutes_diff

7.7587299666666665