# 3. XGBoost
**Start from the most basic features, and try to improve step by step.**
Reference:
- https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
- https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

## Run name

In [1]:
import time

project_name = 'TalkingdataAFD2018'
step_name = 'XGBoost'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = '%s_%s_%s' % (project_name, step_name, time_str)
print('run_name: %s' % run_name)
t0 = time.time()

run_name: TalkingdataAFD2018_XGBoost_20180425_102906


## Important params

In [2]:
date = 6
print('date: ', date)

test_n_rows = None
# test_n_rows = 18790469
# test_n_rows = 10*10000

date:  6


In [3]:
day_rows = {
    0: {
        'n_skiprows': 1,
        'n_rows': 10 * 10000
    },
    6: {
        'n_skiprows': 1,
        'n_rows': 9308568
    },
    7: {
        'n_skiprows': 1 + 9308568,
        'n_rows': 59633310
    },
    8: {
        'n_skiprows': 1 + 9308568 + 59633310,
        'n_rows': 62945075
    },
    9: {
        'n_skiprows': 1 + 9308568 + 59633310 + 62945075,
        'n_rows': 53016937
    }
}
n_skiprows = day_rows[date]['n_skiprows']
n_rows = day_rows[date]['n_rows']

## Import PKGs

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import os
import gc
import time
import random
import zipfile
import h5py
import pickle
import math
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing
from multiprocessing import cpu_count

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

random_num = np.random.randint(10000)
print('random_num: %s' % random_num)

random_num: 1272


  from ._conv import register_converters as _register_converters


## Project folders

In [5]:
cwd = os.getcwd()

input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
model_folder = os.path.join(cwd, 'model')
log_folder = os.path.join(cwd, 'log')
print('input_folder: \t\t\t%s' % input_folder)
print('output_folder: \t\t\t%s' % output_folder)
print('model_folder: \t\t\t%s' % model_folder)
print('log_folder: \t\t\t%s' % log_folder)

train_csv_file = os.path.join(input_folder, 'train.csv')
train_sample_csv_file = os.path.join(input_folder, 'train_sample.csv')
test_csv_file = os.path.join(input_folder, 'test.csv')
sample_submission_csv_file = os.path.join(input_folder, 'sample_submission.csv')

print('\ntrain_csv_file: \t\t%s' % train_csv_file)
print('train_sample_csv_file: \t\t%s' % train_sample_csv_file)
print('test_csv_file: \t\t\t%s' % test_csv_file)
print('sample_submission_csv_file: \t%s' % sample_submission_csv_file)

input_folder: 			D:\ref\talkingdata-adtracking-fraud-detection\input
output_folder: 			D:\ref\talkingdata-adtracking-fraud-detection\output
model_folder: 			D:\ref\talkingdata-adtracking-fraud-detection\model
log_folder: 			D:\ref\talkingdata-adtracking-fraud-detection\log

train_csv_file: 		D:\ref\talkingdata-adtracking-fraud-detection\input\train.csv
train_sample_csv_file: 		D:\ref\talkingdata-adtracking-fraud-detection\input\train_sample.csv
test_csv_file: 			D:\ref\talkingdata-adtracking-fraud-detection\input\test.csv
sample_submission_csv_file: 	D:\ref\talkingdata-adtracking-fraud-detection\input\sample_submission.csv


## Load data

In [6]:
%%time
train_csv = pd.read_csv(train_csv_file, skiprows=range(1, n_skiprows), nrows=n_rows, parse_dates=['click_time'])
test_csv = pd.read_csv(test_csv_file, nrows=test_n_rows, parse_dates=['click_time'])
sample_submission_csv = pd.read_csv(sample_submission_csv_file)

print('train_csv.shape: \t\t', train_csv.shape)
print('test_csv.shape: \t\t', test_csv.shape)
print('sample_submission_csv.shape: \t', sample_submission_csv.shape)
print('train_csv.dtypes: \n', train_csv.dtypes)

display(train_csv.head(2))
display(test_csv.head(2))
display(sample_submission_csv.head(2))

train_csv.shape: 		 (9308568, 8)
test_csv.shape: 		 (18790469, 7)
sample_submission_csv.shape: 	 (18790469, 2)
train_csv.dtypes: 
 ip                          int64
app                         int64
device                      int64
os                          int64
channel                     int64
click_time         datetime64[ns]
attributed_time            object
is_attributed               int64
dtype: object


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0


Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00


Unnamed: 0,click_id,is_attributed
0,0,0
1,1,0


Wall time: 25 s


In [7]:
y_data = train_csv['is_attributed']
train_csv.drop(['is_attributed'], axis=1, inplace=True)
display(y_data.head())

0    0
1    0
2    0
3    0
4    0
Name: is_attributed, dtype: int64

## Features

In [8]:
train_csv['day'] = train_csv['click_time'].dt.day.astype('uint8')
train_csv['hour'] = train_csv['click_time'].dt.hour.astype('uint8')
train_csv['minute'] = train_csv['click_time'].dt.minute.astype('uint8')
train_csv['second'] = train_csv['click_time'].dt.second.astype('uint8')
print('train_csv.shape: \t', train_csv.shape)
display(train_csv.head(2))

train_csv.shape: 	 (9308568, 11)


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,day,hour,minute,second
0,83230,3,1,13,379,2017-11-06 14:32:21,,6,14,32,21
1,17357,3,1,19,379,2017-11-06 14:33:34,,6,14,33,34


In [9]:
test_csv['day'] = test_csv['click_time'].dt.day.astype('uint8')
test_csv['hour'] = test_csv['click_time'].dt.hour.astype('uint8')
test_csv['minute'] = test_csv['click_time'].dt.minute.astype('uint8')
test_csv['second'] = test_csv['click_time'].dt.second.astype('uint8')
print('test_csv.shape: \t', test_csv.shape)
display(test_csv.head(2))

test_csv.shape: 	 (18790469, 11)


Unnamed: 0,click_id,ip,app,device,os,channel,click_time,day,hour,minute,second
0,0,5744,9,1,3,107,2017-11-10 04:00:00,10,4,0,0
1,1,119901,9,1,3,466,2017-11-10 04:00:00,10,4,0,0


In [10]:
arr = np.array([[3,6,6],[4,5,1]])
print(arr)
np.ravel_multi_index(arr, (7,6))
print(arr)
print(np.ravel_multi_index(arr, (7,6), order='F'))

[[3 6 6]
 [4 5 1]]
[[3 6 6]
 [4 5 1]]
[31 41 13]


In [11]:
def df_add_counts(df, cols, tag="_count"):
    arr_slice = df[cols].values
    unq, unqtags, counts = np.unique(np.ravel_multi_index(arr_slice.T, arr_slice.max(0) + 1), return_inverse=True, return_counts=True)
    df["_".join(cols) + tag] = counts[unqtags]
    return df

In [12]:
def df_add_uniques(df, cols, tag="_unique"):
    gp = df[cols] \
        .groupby(by=cols[0:len(cols) - 1])[cols[len(cols) - 1]] \
        .nunique() \
        .reset_index() \
        .rename(index=str, columns={cols[len(cols) - 1]: "_".join(cols)+tag})
    df = df.merge(gp, on=cols[0:len(cols) - 1], how='left')
    return df

In [13]:
train_csv = df_add_counts(train_csv, ['ip', 'day', 'hour'])
train_csv = df_add_counts(train_csv, ['ip', 'app'])
train_csv = df_add_counts(train_csv, ['ip', 'app', 'os'])
train_csv = df_add_counts(train_csv, ['ip', 'device'])
train_csv = df_add_counts(train_csv, ['app', 'channel'])
train_csv = df_add_uniques(train_csv, ['ip', 'channel'])

display(train_csv.head())

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,day,hour,minute,second,ip_day_hour_count,ip_app_count,ip_app_os_count,ip_device_count,app_channel_count,ip_channel_unique
0,83230,3,1,13,379,2017-11-06 14:32:21,,6,14,32,21,1,194,80,1243,48623,82
1,17357,3,1,19,379,2017-11-06 14:33:34,,6,14,33,34,1,122,22,922,48623,79
2,35810,3,1,13,379,2017-11-06 14:34:12,,6,14,34,12,1,35,5,375,48623,61
3,45745,14,1,13,478,2017-11-06 14:34:52,,6,14,34,52,1,459,105,8075,1518,95
4,161007,3,1,13,379,2017-11-06 14:35:08,,6,14,35,8,1,26,4,176,48623,53


In [14]:
test_csv = df_add_counts(test_csv, ['ip', 'day', 'hour'])
test_csv = df_add_counts(test_csv, ['ip', 'app'])
test_csv = df_add_counts(test_csv, ['ip', 'app', 'os'])
test_csv = df_add_counts(test_csv, ['ip', 'device'])
test_csv = df_add_counts(test_csv, ['app', 'channel'])
test_csv = df_add_uniques(test_csv, ['ip', 'channel'])

display(test_csv.head())

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,day,hour,minute,second,ip_day_hour_count,ip_app_count,ip_app_os_count,ip_device_count,app_channel_count,ip_channel_unique
0,0,5744,9,1,3,107,2017-11-10 04:00:00,10,4,0,0,34,28,1,91,269116,34
1,1,119901,9,1,3,466,2017-11-10 04:00:00,10,4,0,0,403,289,5,2069,372406,90
2,2,72287,21,1,19,128,2017-11-10 04:00:00,10,4,0,0,229,312,24,2092,219966,87
3,3,78477,15,1,13,111,2017-11-10 04:00:00,10,4,0,0,239,42,23,1190,67038,92
4,4,123080,12,1,13,328,2017-11-10 04:00:00,10,4,0,0,60,24,7,203,191986,58


## Prepare data

In [15]:
train_useless_features = ['click_time', 'attributed_time']
train_csv.drop(train_useless_features, axis=1, inplace=True)

test_useless_features = ['click_time', 'click_id']
test_csv.drop(test_useless_features, axis=1, inplace=True)

display(train_csv.head())
display(test_csv.head())

Unnamed: 0,ip,app,device,os,channel,day,hour,minute,second,ip_day_hour_count,ip_app_count,ip_app_os_count,ip_device_count,app_channel_count,ip_channel_unique
0,83230,3,1,13,379,6,14,32,21,1,194,80,1243,48623,82
1,17357,3,1,19,379,6,14,33,34,1,122,22,922,48623,79
2,35810,3,1,13,379,6,14,34,12,1,35,5,375,48623,61
3,45745,14,1,13,478,6,14,34,52,1,459,105,8075,1518,95
4,161007,3,1,13,379,6,14,35,8,1,26,4,176,48623,53


Unnamed: 0,ip,app,device,os,channel,day,hour,minute,second,ip_day_hour_count,ip_app_count,ip_app_os_count,ip_device_count,app_channel_count,ip_channel_unique
0,5744,9,1,3,107,10,4,0,0,34,28,1,91,269116,34
1,119901,9,1,3,466,10,4,0,0,403,289,5,2069,372406,90
2,72287,21,1,19,128,10,4,0,0,229,312,24,2092,219966,87
3,78477,15,1,13,111,10,4,0,0,239,42,23,1190,67038,92
4,123080,12,1,13,328,10,4,0,0,60,24,7,203,191986,58


In [16]:
x_train, x_val, y_train, y_val = train_test_split(train_csv, y_data, test_size=0.01, random_state=2017)
x_test = test_csv
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)
print(x_test.shape)

print('Time cost: %.2f s' % (time.time() - t0))

(9215482, 15)
(9215482,)
(93086, 15)
(93086,)
(18790469, 15)
Time cost: 91.63 s


## Train

In [17]:
%%time

import xgboost as xgb
from sklearn.metrics import roc_auc_score

xg_train = xgb.DMatrix(x_train, y_train)
xg_val = xgb.DMatrix(x_val, y_val)
xg_test = xgb.DMatrix(x_test)


watch_list = [(xg_train, "train"), (xg_val, 'val')]
param = {
    # General Parameters
    'booster': 'gbtree',
    'silent': 0,
    # 'nthread': 1, # defaultmaximum
    
    # Parameters for Tree Booster
    'eta': 0.3, # [0, 1]
    'gamma': 0, # [0, +inf]
#     'max_depth': 300, # [0, +inf]
#     'min_child_weight': 1, # [0, +inf]
#     'max_delta_step': 1, # [0, +inf]
#     'subsample': 0.7, # (0, 1]
#     'colsample_bytree': 0.7, # (0, 1]
#     'colsample_bylevel': 1, # (0, 1]
#     'lambda': 1,
#     'alpha': 0,
    'tree_method': 'gpu_hist', #  {'auto', 'exact', 'approx', 'hist', 'gpu_exact', 'gpu_hist'}, default 'auto'
#     'sketch_eps': 0.3,
    'scale_pos_weight': 97,
#     'updater': 'grow_colmaker, prune',
#     'refresh_leaf': 1,
#     'process_type': 'default',
#     'grow_policy': 'depthwise',
    # 'max_leaves': 0,
#     'max_bin': 256,
    'predictor': 'gpu_predictor', # {'cpu_predictor', 'gpu_predictor'}

    # Learning Task Parameters
    'objective': 'gpu:binary:logistic', # {'reg:linear', 'reg:logistic', 'binary:logistic', 'binary:logitraw', 'gpu:reg:linear', 'gpu:reg:logistic', 'gpu:binary:logistic', 'gpu:binary:logitraw'}
#     'base_score': 0.5,
    'eval_metric': ['auc', 'error'], # 'logloss'
#     'seed': 0,

    # Train Parameters
#     'num_boost_round': 100,
#     'evals': watch_list, 
#     'obj': None, 
#     'feval': None, 
#     'maximize': False, 
#     'early_stopping_rounds': 10, 
#     'evals_result': None, 
#     'verbose_eval': True, 
#     'xgb_model': None, 
#     'callbacks': None, 
#     'learning_rates': 0.01

}
params = {}
params['tree_method'] = 'gpu_hist'


bst = xgb.train(
    param, 
    xg_train, 
    num_boost_round=20, 
    evals=watch_list,
    early_stopping_rounds=10
)


print('*' * 80)
y_train_proba = bst.predict(xg_train)
y_train_pred = (y_train_proba>=0.5).astype(int)
acc_train = accuracy_score(y_train, y_train_pred)
roc_train = roc_auc_score(y_train, y_train_proba)
print('acc_train: %.4f \t roc_train: %.4f' % (acc_train, roc_train))

y_val_proba = bst.predict(xg_val)
y_val_pred = (y_val_proba>=0.5).astype(int)
acc_val = accuracy_score(y_val, y_val_pred)
roc_val = roc_auc_score(y_val, y_val_proba)
print('acc_val:   %.4f \t roc_val:   %.4f' % (acc_val, roc_val))

[0]	train-auc:0.944452	train-error:0.008973	val-auc:0.943057	val-error:0.008809
Multiple eval metrics have been passed: 'val-error' will be used for early stopping.

Will train until val-error hasn't improved in 10 rounds.
[1]	train-auc:0.947709	train-error:0.009942	val-auc:0.948157	val-error:0.00998
[2]	train-auc:0.951249	train-error:0.009489	val-auc:0.954818	val-error:0.009335
[3]	train-auc:0.956363	train-error:0.009296	val-auc:0.960372	val-error:0.009185
[4]	train-auc:0.960523	train-error:0.009128	val-auc:0.96244	val-error:0.009067
[5]	train-auc:0.962547	train-error:0.009506	val-auc:0.963778	val-error:0.009529
[6]	train-auc:0.963815	train-error:0.009194	val-auc:0.963427	val-error:0.009174
[7]	train-auc:0.965234	train-error:0.008947	val-auc:0.963519	val-error:0.00882
[8]	train-auc:0.96586	train-error:0.008918	val-auc:0.962829	val-error:0.008884
[9]	train-auc:0.967051	train-error:0.008856	val-auc:0.963848	val-error:0.008841
[10]	train-auc:0.968104	train-error:0.008784	val-auc:0.963639

## Predict

In [18]:
run_name_acc = run_name + '_' + str(int(roc_val*10000)).zfill(4)
print(run_name_acc)

TalkingdataAFD2018_XGBoost_20180425_102906_9636


In [19]:
y_test_proba = bst.predict(xg_test)
print(y_test_proba.shape)
print(y_test_proba[:20])

(18790469,)
[0.0787384  0.04576111 0.03353291 0.02428759 0.02390176 0.03024226
 0.04080065 0.04013658 0.05299309 0.02771144 0.06025707 0.02523745
 0.02250482 0.03903299 0.03918507 0.02593384 0.06384441 0.02593384
 0.07003337 0.02658433]


In [20]:
def save_proba(y_train_proba, y_train, y_val_proba, y_val, y_test_proba, click_ids, file_name):
    print(click_ids[:5])
    if os.path.exists(file_name):
        os.remove(file_name)
        print('File removed: \t%s' % file_name)
    with h5py.File(file_name) as h:
        h.create_dataset('y_train_proba', data=y_train_proba)
        h.create_dataset('y_train', data=y_train)
        h.create_dataset('y_val_proba', data=y_val_proba)
        h.create_dataset('y_val', data=y_val)
        h.create_dataset('y_test_proba', data=y_test_proba)
        h.create_dataset('click_ids', data=click_ids)
    print('File saved: \t%s' % file_name)

def load_proba(file_name):
    with h5py.File(file_name, 'r') as h:
        y_train_proba = np.array(h['y_train_proba'])
        y_train = np.array(h['y_train'])
        y_val_proba = np.array(h['y_val_proba'])
        y_val = np.array(h['y_val'])
        y_test_proba = np.array(h['y_test_proba'])
        click_ids = np.array(h['click_ids'])
    print('File loaded: \t%s' % file_name)
    print(click_ids[:5])
    
    return y_train_proba, y_train, y_val_proba, y_val, y_test_proba, click_ids


y_proba_file = os.path.join(model_folder, 'proba_%s.p' % run_name_acc)
save_proba(y_train_proba, y_train, y_val_proba, y_val, y_test_proba, np.array(sample_submission_csv['click_id']), y_proba_file)
y_train_proba, y_train, y_val_proba, y_val, y_test_proba, click_ids = load_proba(y_proba_file)

print(y_train_proba.shape)
print(y_train.shape)
print(y_val_proba.shape)
print(y_val.shape)
print(y_test_proba.shape)
print(len(click_ids))

[0 1 2 3 4]
File saved: 	D:\ref\talkingdata-adtracking-fraud-detection\model\proba_TalkingdataAFD2018_XGBoost_20180425_102906_9636.p
File loaded: 	D:\ref\talkingdata-adtracking-fraud-detection\model\proba_TalkingdataAFD2018_XGBoost_20180425_102906_9636.p
[0 1 2 3 4]
(9215482,)
(9215482,)
(93086,)
(93086,)
(18790469,)
18790469


In [21]:
%%time
submission_csv_file = os.path.join(output_folder, 'pred_%s.csv' % run_name_acc)
print(submission_csv_file)
submission_csv = pd.DataFrame({ 'click_id': click_ids , 'is_attributed': y_test_proba })
submission_csv.to_csv(submission_csv_file, index = False)

D:\ref\talkingdata-adtracking-fraud-detection\output\pred_TalkingdataAFD2018_XGBoost_20180425_102906_9636.csv
Wall time: 45 s


In [22]:
print('Time cost: %.2f s' % (time.time() - t0))

print('random_num: ', random_num)
print('date: ', date)
print(run_name_acc)
print('Done!')

Time cost: 167.08 s
random_num:  1272
date:  6
TalkingdataAFD2018_XGBoost_20180425_102906_9636
Done!
