In [7]:
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split  # for validation
import gc  # memory
from datetime import datetime  # train time checking
import time
import sys
from tqdm import tqdm

In [8]:
from veclib.solution import *
from veclib.modellib import *
from veclib.ptlib import *
from veclib.featurelib import *
from veclib.utils import *

In [9]:
TRAIN_FILE = 'dev.csv'
TEST_FILE= 'dev_test.csv'
SKIP_TRAIN_ROWS= None #range(1,109903891) #184903889 18790468
NUM_TRAIN_ROWS= None
NUM_TEST_ROWS= None
FEATURE_PPL = [f_base,f_hour,f_dfw,f_count,f_mean_hour]
METHOD = 'xgb'
TRANSDUCTIVE = True
PARA_TUNE = False
RELEASE = True
PARA_TUNE_CFG = './cfg/default.cfg'

In [10]:
d = DataSet()
d.train_file=TRAIN_FILE
d.test_file=TEST_FILE
d.skip_train_rows=SKIP_TRAIN_ROWS
d.num_train_rows = NUM_TRAIN_ROWS
d.num_test_rows= NUM_TEST_ROWS

s = Solution()
s.data_set = d
s.method = METHOD
s.f_ppl= FEATURE_PPL
s.para_tune_fcg = PARA_TUNE_CFG
s.transductive = TRANSDUCTIVE

In [11]:
s.load_dataset()


>>>>  Load Dataset Start 


>>>>  Load Dataset Done --------------------------------------------------------------------------- TC:0.1829



In [12]:
# dev_df = s.train_df.sample(100000)
# dev_df.to_csv(s.input_path+'dev.csv')

# dev_test_df = s.test_df.sample(10000)
# dev_test_df.to_csv(s.input_path+'dev_test.csv')

In [13]:
s.build_features()


>>>>  Build Features Start 

Feature Selected:	ip,os_device_hour_count,channel,dayofweek,os_device_mean_hour,ip_count,os_device_count,os,device,app,hour,app_channel_count,ip_mean_hour,ip_hour_count

>>>>  Build Features Done ------------------------------------------------------------------------- TC:1.417

       ip  os_device_hour_count  channel  dayofweek  os_device_mean_hour  \
0  163294                    91      125          3                   10   
1   78646                   294      178          2                    9   
2   18703                   235      466          3                    9   
3   63368                    46      459          1                    9   
4  255213                   894      435          2                    9   

   ip_count  os_device_count  os  device  app  hour  app_channel_count  \
0         3              935  13       2    6    10                554   
1         4             5200  18       1   12     9               2954   
2        31

In [14]:
param_test1 = {
               'max_depth':[1,2,3,4,5,6],
               'min_child_weight':[0.5,0.7,1,1.2]
               }

gsearch1 = GridSearchCV(
    estimator = XGBClassifier(learning_rate=0.3,
                      n_estimators=1000,
                      objective='binary:logistic',
                      nthread=8,
                      max_depth=5,
                      min_child_weight=0,
                      subsample=0.9,
                      colsample_bytree=0.7,
                      reg_alpha=4,
                      scale_pos_weight=0.9,
                      seed=99), 
                      param_grid = param_test1,     
                      scoring='roc_auc', 
                      n_jobs=4,
                      iid=False, 
                      cv=5)

gsearch1.fit(s.train_df,s.label_df)
para_dict = gsearch1.best_params_
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.94146, std: 0.01795, params: {'max_depth': 1, 'min_child_weight': 0.5},
  mean: 0.94146, std: 0.01795, params: {'max_depth': 1, 'min_child_weight': 0.7},
  mean: 0.94146, std: 0.01795, params: {'max_depth': 1, 'min_child_weight': 1},
  mean: 0.94146, std: 0.01795, params: {'max_depth': 1, 'min_child_weight': 1.2},
  mean: 0.93680, std: 0.02128, params: {'max_depth': 2, 'min_child_weight': 0.5},
  mean: 0.93751, std: 0.02224, params: {'max_depth': 2, 'min_child_weight': 0.7},
  mean: 0.93676, std: 0.02151, params: {'max_depth': 2, 'min_child_weight': 1},
  mean: 0.93783, std: 0.02034, params: {'max_depth': 2, 'min_child_weight': 1.2},
  mean: 0.93322, std: 0.02405, params: {'max_depth': 3, 'min_child_weight': 0.5},
  mean: 0.93151, std: 0.02668, params: {'max_depth': 3, 'min_child_weight': 0.7},
  mean: 0.93126, std: 0.02695, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.93161, std: 0.02573, params: {'max_depth': 3, 'min_child_weight': 1.2},
  mean: 0.93153, std: 

In [16]:
param_test2 = {
               'gamma':[i/10.0 for i in range(0,5)]
               }

gsearch2 = GridSearchCV(
    estimator = XGBClassifier(learning_rate=0.3,
                      n_estimators=1000,
                      objective='binary:logistic',
                      nthread=8,
                      max_depth=5,
                      min_child_weight=0,
                      subsample=0.9,
                      colsample_bytree=0.7,
                      reg_alpha=4,
                      scale_pos_weight=0.9,
                      seed=99), 
                      param_grid = param_test2,     
                      scoring='roc_auc', 
                      n_jobs=4,
                      iid=False, 
                      cv=5)

gsearch2.fit(s.train_df,s.label_df)
para_dict.update(gsearch2.best_params_)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.93897, std: 0.08212, params: {'gamma': 0.0},
  mean: 0.93256, std: 0.09379, params: {'gamma': 0.1},
  mean: 0.93132, std: 0.09524, params: {'gamma': 0.2},
  mean: 0.93045, std: 0.09324, params: {'gamma': 0.3},
  mean: 0.93436, std: 0.09119, params: {'gamma': 0.4}],
 {'gamma': 0.0},
 0.9389690112421432)

In [17]:
param_test3 = {
               'subsample':[i/10.0 for i in range(6,10)],
               'colsample_bytree':[i/10.0 for i in range(6,10)]
               }

gsearch3 = GridSearchCV(
    estimator = XGBClassifier(learning_rate=0.3,
                      n_estimators=1000,
                      objective='binary:logistic',
                      nthread=8,
                      max_depth=5,
                      min_child_weight=0,
                      subsample=0.9,
                      colsample_bytree=0.7,
                      reg_alpha=4,
                      scale_pos_weight=0.9,
                      seed=99), 
                      param_grid = param_test3,     
                      scoring='roc_auc', 
                      n_jobs=4,
                      iid=False, 
                      cv=5)

gsearch3.fit(s.train_df,s.label_df)
para_dict.update(gsearch3.best_params_)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.95650, std: 0.04210, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
  mean: 0.92728, std: 0.10051, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
  mean: 0.93697, std: 0.08380, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
  mean: 0.93870, std: 0.08342, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
  mean: 0.94939, std: 0.06019, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
  mean: 0.92765, std: 0.10350, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
  mean: 0.94508, std: 0.06823, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
  mean: 0.93897, std: 0.08212, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
  mean: 0.95116, std: 0.05926, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
  mean: 0.94846, std: 0.06430, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
  mean: 0.92588, std: 0.10549, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.93246, std: 0.09506, params: {'colsample_bytree': 0.8, 'subsample'

In [18]:
param_test4 = {
               'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
              }

gsearch4 = GridSearchCV(
    estimator = XGBClassifier(learning_rate=0.3,
                      n_estimators=1000,
                      objective='binary:logistic',
                      nthread=8,
                      max_depth=5,
                      min_child_weight=0,
                      subsample=0.9,
                      colsample_bytree=0.7,
                      reg_alpha=4,
                      scale_pos_weight=0.9,
                      seed=99), 
                      param_grid = param_test4,     
                      scoring='roc_auc', 
                      n_jobs=4,
                      iid=False, 
                      cv=5)

gsearch4.fit(s.train_df,s.label_df)
para_dict.update(gsearch4.best_params_)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.95266, std: 0.03821, params: {'reg_alpha': 1e-05},
  mean: 0.94925, std: 0.04309, params: {'reg_alpha': 0.01},
  mean: 0.95139, std: 0.03896, params: {'reg_alpha': 0.1},
  mean: 0.96833, std: 0.02466, params: {'reg_alpha': 1},
  mean: 0.81965, std: 0.15242, params: {'reg_alpha': 100}],
 {'reg_alpha': 1},
 0.9683268241598544)

In [19]:
save_json(para_dict, PARA_TUNE_CFG)

In [20]:
s.init_model()
s.train()


>>>>  Model training Start 

[0]	train-auc:0.538107+0.0191645	test-auc:0.4999+0.0002008
[1]	train-auc:0.538107+0.0191645	test-auc:0.4999+0.0002008
[2]	train-auc:0.61918+0.107721	test-auc:0.548397+0.0985535
[3]	train-auc:0.725051+0.0996928	test-auc:0.64599+0.131645
[4]	train-auc:0.793597+0.0528426	test-auc:0.677017+0.111193
[5]	train-auc:0.802545+0.0651831	test-auc:0.67558+0.112814
[6]	train-auc:0.811422+0.0798079	test-auc:0.673876+0.113636
[7]	train-auc:0.838191+0.0620873	test-auc:0.787267+0.195812
[8]	train-auc:0.902723+0.0385791	test-auc:0.786933+0.1966
[9]	train-auc:0.902267+0.0383817	test-auc:0.786933+0.1966
[10]	train-auc:0.92921+0.0456693	test-auc:0.787635+0.197212
[11]	train-auc:0.947461+0.01808	test-auc:0.845976+0.193241
[12]	train-auc:0.975655+0.0217186	test-auc:0.920695+0.122754
[13]	train-auc:0.976586+0.0220191	test-auc:0.916503+0.129462
[14]	train-auc:0.984004+0.00787127	test-auc:0.91774+0.130071
[15]	train-auc:0.984371+0.00704207	test-auc:0.917673+0.130021
[16]	train-auc:

In [21]:
s.para_tune_fcg = input("\nPlease specify a new cfg file:\n")
cfg = load_json(s.para_tune_fcg)
print("To double check, your cfg file is:\n{0}\n{1}".format(s.para_tune_fcg,cfg))


Please specify a new cfg file:
./cfg/default.cfg
To double check, your cfg file is:
./cfg/default.cfg
{'max_depth': 1, 'min_child_weight': 1, 'gamma': 0.0, 'colsample_bytree': 0.6, 'subsample': 0.6, 'reg_alpha': 1}
