# 3. XGBClassifier_GridSearchCV
**Start from the most basic features, and try to improve step by step.**

Kaggle score: 

References:
- XGBoost Parameters, http://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters
- Python API Reference, http://xgboost.readthedocs.io/en/latest/python/python_api.html
- sklearn.model_selection.GridSearchCV, http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
- https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
- https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

## Run name

In [None]:
import time

# Identify this run by project, pipeline step and launch timestamp.
project_name = 'TalkingdataAFD2018'
step_name = 'XGBClassifier_GridSearchCV'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = '_'.join((project_name, step_name, time_str))
print('run_name: ' + run_name)
# Wall-clock start, used by the "Time cost" reports further down.
t0 = time.time()

## Important params

In [None]:
# Selects which entry of `day_rows` is loaded as training data.
date = 7
print(f'date:  {date}')

# Cap on the number of test rows to read; None places no limit.
test_n_rows = None
# test_n_rows = 18790469
# test_n_rows = 10*10000

In [None]:
# Row counts per day, listed in the order the days appear in the training file.
_rows_per_day = ((6, 9308568), (7, 59633310), (8, 62945075), (9, 53016937))

# Key 0 is a small 100k-row window taken from the start of the file.
day_rows = {0: {'n_skiprows': 1, 'n_rows': 10 * 10000}}

# Each day's skip count is 1 plus the row counts of all preceding days.
_offset = 1
for _day, _count in _rows_per_day:
    day_rows[_day] = {'n_skiprows': _offset, 'n_rows': _count}
    _offset += _count
del _rows_per_day, _offset, _day, _count

# Window selected by `date`.
_selected = day_rows[date]
n_skiprows, n_rows = _selected['n_skiprows'], _selected['n_rows']
del _selected

## Import PKGs

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import os
import sys
import gc
import time
import random
import zipfile
import h5py
import pickle
import math
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing
from multiprocessing import cpu_count

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Random tag for this run, drawn from [0, 10000); echoed again in the final summary.
random_num = np.random.randint(10000)
print('random_num: {}'.format(random_num))

## Project folders

In [None]:
cwd = os.getcwd()

# Working directories, all located directly under the current working directory.
input_folder, output_folder, model_folder, log_folder = (
    os.path.join(cwd, sub) for sub in ('input', 'output', 'model', 'log')
)
print('input_folder: \t\t\t' + input_folder)
print('output_folder: \t\t\t' + output_folder)
print('model_folder: \t\t\t' + model_folder)
print('log_folder: \t\t\t' + log_folder)

# Data files expected inside the input folder.
train_csv_file = os.path.join(input_folder, 'train.csv')
train_sample_csv_file = os.path.join(input_folder, 'train_sample.csv')
test_csv_file = os.path.join(input_folder, 'test.csv')
sample_submission_csv_file = os.path.join(input_folder, 'sample_submission.csv')

print('\ntrain_csv_file: \t\t' + train_csv_file)
print('train_sample_csv_file: \t\t' + train_sample_csv_file)
print('test_csv_file: \t\t\t' + test_csv_file)
print('sample_submission_csv_file: \t' + sample_submission_csv_file)

## Load data

In [None]:
%%time

# Columns read from each file; the two lists differ only in their final entry.
train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']
# Compact unsigned dtypes requested for the integer columns.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Reader options shared by the train and test files.
read_options = dict(dtype=dtypes, parse_dates=['click_time'])

# Lines 1 .. n_skiprows-1 are skipped (line 0 is kept), then n_rows rows are read.
train_csv = pd.read_csv(
    train_csv_file,
    skiprows=range(1, n_skiprows),
    nrows=n_rows,
    usecols=train_columns,
    **read_options
)
test_csv = pd.read_csv(
    test_csv_file,
    nrows=test_n_rows,
    usecols=test_columns,
    **read_options
)
sample_submission_csv = pd.read_csv(sample_submission_csv_file)

# Report what was loaded.
print('train_csv.shape: \t\t', train_csv.shape)
print('test_csv.shape: \t\t', test_csv.shape)
print('sample_submission_csv.shape: \t', sample_submission_csv.shape)
print('train_csv.dtypes: \n', train_csv.dtypes)

for frame in (train_csv, test_csv, sample_submission_csv):
    display(frame.head(2))

print('train_csv: %.2f Mb' % (sys.getsizeof(train_csv)/1024./1024.))
print('test_csv:  %.2f Mb' % (sys.getsizeof(test_csv)/1024./1024.))

In [None]:
# Detach the label column from the training frame.
y_data = train_csv.pop('is_attributed')
display(y_data.head())

# Detach the submission ids from the test frame.
click_ids = test_csv.pop('click_id')
display(click_ids.head())


# Both frames now hold feature columns only.
for frame in (train_csv, test_csv):
    display(frame.head())

## Features

In [None]:
# Break click_time into calendar components, each stored as uint8.
for unit in ('day', 'hour', 'minute', 'second'):
    train_csv[unit] = getattr(train_csv['click_time'].dt, unit).astype('uint8')
print('train_csv.shape: \t', train_csv.shape)
display(train_csv.head(2))

In [None]:
# Same calendar components for the test frame, each stored as uint8.
for unit in ('day', 'hour', 'minute', 'second'):
    test_csv[unit] = getattr(test_csv['click_time'].dt, unit).astype('uint8')
print('test_csv.shape: \t', test_csv.shape)
display(test_csv.head(2))

In [None]:
# Demo of np.ravel_multi_index, which df_add_counts relies on: each column of
# `arr` is one (row, col) coordinate inside a 7x6 grid.
arr = np.array([[3,6,6],[4,5,1]])
print(arr)
# Row-major (C order) flat index: row * 6 + col.
# The result used to be computed and discarded; it is now kept and printed.
flat_c = np.ravel_multi_index(arr, (7,6))
print(flat_c)
# Column-major (Fortran order) flat index: row + col * 7.
flat_f = np.ravel_multi_index(arr, (7,6), order='F')
print(flat_f)

In [None]:
def df_add_counts(df, cols, tag="_count"):
    """Add a column with the number of rows sharing each row's `cols` values.

    The new column is named ``"_".join(cols) + tag``. `df` is modified in
    place and also returned.

    Args:
        df: DataFrame whose `cols` hold non-negative integers.
        cols: list of column names forming the combination to count.
        tag: suffix for the new column name.

    Returns:
        The same DataFrame with the count column added.
    """
    name = "_".join(cols) + tag
    # Widen to int64 before computing the grid shape: with narrow unsigned
    # dtypes `max + 1` wraps around (uint8 255 + 1 == 0) and produces an
    # invalid dimension for ravel_multi_index.
    keys = df[cols].values.astype(np.int64)
    if keys.shape[0] == 0:
        # No rows to count; max() of an empty array would raise.
        df[name] = np.zeros(0, dtype=np.int64)
        return df
    # Collapse every combination into a single integer key, then count keys.
    flat = np.ravel_multi_index(keys.T, keys.max(axis=0) + 1)
    _, inverse, counts = np.unique(flat, return_inverse=True, return_counts=True)
    df[name] = counts[inverse]
    return df

In [None]:
def df_add_uniques(df, cols, tag="_unique"):
    """Add the number of distinct values of the last column per group.

    Rows are grouped by every column in `cols` except the last; the new
    column, named ``"_".join(cols) + tag``, holds the number of distinct
    values of the last column within the row's group.

    Args:
        df: source DataFrame (not modified; a merged copy is returned).
        cols: list of at least two column names; the last one is counted.
        tag: suffix for the new column name.

    Returns:
        A new DataFrame: `df` left-merged with the per-group counts.

    Raises:
        ValueError: if fewer than two columns are given.
    """
    if len(cols) < 2:
        raise ValueError(
            "df_add_uniques needs at least one grouping column plus the counted column"
        )
    group_cols, counted = cols[:-1], cols[-1]
    gp = (
        df[cols]
        .groupby(by=group_cols)[counted]
        .nunique()
        .reset_index()
        .rename(columns={counted: "_".join(cols) + tag})
    )
    return df.merge(gp, on=group_cols, how='left')

In [None]:
# train_csv = df_add_counts(train_csv, ['ip', 'day', 'hour'])
# train_csv = df_add_counts(train_csv, ['ip', 'app'])
# train_csv = df_add_counts(train_csv, ['ip', 'app', 'os'])
# train_csv = df_add_counts(train_csv, ['ip', 'device'])
# train_csv = df_add_counts(train_csv, ['app', 'channel'])
# train_csv = df_add_uniques(train_csv, ['ip', 'channel'])

# display(train_csv.head())

In [None]:
# test_csv = df_add_counts(test_csv, ['ip', 'day', 'hour'])
# test_csv = df_add_counts(test_csv, ['ip', 'app'])
# test_csv = df_add_counts(test_csv, ['ip', 'app', 'os'])
# test_csv = df_add_counts(test_csv, ['ip', 'device'])
# test_csv = df_add_counts(test_csv, ['app', 'channel'])
# test_csv = df_add_uniques(test_csv, ['ip', 'channel'])

# display(test_csv.head())

In [None]:
def do_next_prev_Click(df, agg_suffix, agg_type='float32'):
    """Add time-to-next-click or time-since-previous-click features.

    For each grouping in GROUP_BY_NEXT_CLICKS a column named
    ``<group cols joined by "_">_<agg_suffix>`` is added, holding the number
    of seconds between a row's click_time and that of the neighbouring row
    of the same group. The first/last row of a group gets NaN.
    The hour/day/minute/second columns are (re)derived from click_time as int8.
    New feature names are recorded in the module-level `predictors` list.

    NOTE(review): neighbours are taken in row order (groupby + shift), so the
    frame is presumably expected to be sorted by click_time -- confirm.

    Args:
        df: DataFrame with a datetime `click_time` column and the group columns.
        agg_suffix: 'nextClick' or 'prevClick'.
        agg_type: dtype of the new columns.

    Returns:
        The same DataFrame, modified in place.

    Raises:
        ValueError: if `agg_suffix` is not one of the two supported values
            (previously a feature name was registered without a column).
    """
    if agg_suffix not in ('nextClick', 'prevClick'):
        raise ValueError(
            "agg_suffix must be 'nextClick' or 'prevClick', got %r" % (agg_suffix,)
        )

    print('Extracting new features...')
    # Parse once instead of once per derived column.
    click_dt = pd.to_datetime(df.click_time).dt
    for unit in ('hour', 'day', 'minute', 'second'):
        df[unit] = getattr(click_dt, unit).astype('int8')

    #### New added
    # The function is called once per direction; avoid duplicate entries.
    for unit in ('minute', 'second'):
        if unit not in predictors:
            predictors.append(unit)
    print(f">> \nExtracting {agg_suffix} time calculation features...\n")

    GROUP_BY_NEXT_CLICKS = [

    # V1
    # {'groupby': ['ip']},
    # {'groupby': ['ip', 'app']},
    # {'groupby': ['ip', 'channel']},
    # {'groupby': ['ip', 'os']},

    # V3
    {'groupby': ['ip', 'app', 'device', 'os', 'channel']},
    {'groupby': ['ip', 'os', 'device']},
    {'groupby': ['ip', 'os', 'device', 'app']}
    ]

    # Calculate the time to the neighbouring click for each group
    for spec in GROUP_BY_NEXT_CLICKS:
        group_cols = spec['groupby']

        # Name of new feature
        new_feature = '{}_{}'.format('_'.join(group_cols), agg_suffix)

        print(f">> Grouping by {group_cols}, and saving time to {agg_suffix} in: {new_feature}")
        grouped_time = df[group_cols + ['click_time']].groupby(group_cols).click_time
        if agg_suffix == "nextClick":
            delta = grouped_time.shift(-1) - df.click_time
        else:
            delta = df.click_time - grouped_time.shift(+1)
        # total_seconds() keeps whole days; `.dt.seconds` is only the
        # sub-day component and silently wraps for gaps of 24h or more.
        df[new_feature] = delta.dt.total_seconds().astype(agg_type)
        if new_feature not in predictors:
            predictors.append(new_feature)
        gc.collect()
    return df

In [None]:
## Below a function is written to extract count feature by aggregating different cols
def do_count( df, group_cols, agg_type='uint32', show_max=False, show_agg=True ):
    """Add the size of each `group_cols` group as a new column.

    The column is named ``"_".join(group_cols) + "count"`` and its name is
    appended to the module-level `predictors` list.

    Args:
        df: source DataFrame.
        group_cols: list of column names to group by.
        agg_type: dtype of the new column.
        show_max: print the maximum of the new column.
        show_agg: print a progress line.

    Returns:
        A new DataFrame: `df` left-merged with the group sizes.
    """
    agg_name = '{}count'.format('_'.join(group_cols))
    if show_agg:
        print( "\nAggregating by ", group_cols ,  '... and saved in', agg_name )
    # Select the grouping columns once; the previous double selection
    # `df[group_cols][group_cols]` made an extra copy for nothing.
    gp = df[group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    predictors.append(agg_name)
    gc.collect()
    return df
    
## Unique-count feature: distinct values of one column within each group
def do_countuniq( df, group_cols, counted, agg_type='uint32', show_max=False, show_agg=True ):
    """Add the number of distinct `counted` values per `group_cols` group.

    The new column is named ``<group cols>_by_<counted>_countuniq``, cast to
    `agg_type`, and its name is appended to the module-level `predictors`.
    Returns `df` left-merged with the per-group result.
    """
    agg_name = f"{'_'.join(group_cols)}_by_{counted}_countuniq"
    if show_agg:
        print( "\nCounting unqiue ", counted, " by ", group_cols ,  '... and saved in', agg_name )
    per_group = (
        df[group_cols + [counted]]
        .groupby(group_cols)[counted]
        .nunique()
        .rename(agg_name)
        .reset_index()
    )
    df = df.merge(per_group, on=group_cols, how='left')
    del per_group
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    predictors.append(agg_name)
    gc.collect()
    return df
### Cumulative-count feature: running row number within each group
def do_cumcount( df, group_cols, counted,agg_type='uint32', show_max=False, show_agg=True ):
    """Add the zero-based running position of each row inside its group.

    The new column is named ``<group cols>_by_<counted>_cumcount``, cast to
    `agg_type`, and its name is appended to the module-level `predictors`.
    `df` is modified in place and returned.
    """
    agg_name = f"{'_'.join(group_cols)}_by_{counted}_cumcount"
    if show_agg:
        print( "\nCumulative count by ", group_cols , '... and saved in', agg_name  )
    subset = df[group_cols + [counted]]
    running = subset.groupby(group_cols)[counted].cumcount()
    # Assign by position (.values) rather than by index label.
    df[agg_name] = running.values
    del subset, running
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    predictors.append(agg_name)
    gc.collect()
    return df
### Mean feature: average of one column within each group
def do_mean( df, group_cols, counted, agg_type='float32', show_max=False, show_agg=True ):
    """Add the mean of `counted` per `group_cols` group.

    The new column is named ``<group cols>_by_<counted>_mean``, cast to
    `agg_type`, and its name is appended to the module-level `predictors`.
    Returns `df` left-merged with the per-group result.
    """
    agg_name = f"{'_'.join(group_cols)}_by_{counted}_mean"
    if show_agg:
        print( "\nCalculating mean of ", counted, " by ", group_cols , '... and saved in', agg_name )
    per_group = (
        df[group_cols + [counted]]
        .groupby(group_cols)[counted]
        .mean()
        .rename(agg_name)
        .reset_index()
    )
    df = df.merge(per_group, on=group_cols, how='left')
    del per_group
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    predictors.append(agg_name)
    gc.collect()
    return df

def do_var( df, group_cols, counted, agg_type='float32', show_max=False, show_agg=True ):
    """Add the variance of `counted` per `group_cols` group.

    The new column is named ``<group cols>_by_<counted>_var``, cast to
    `agg_type`, and its name is appended to the module-level `predictors`.
    Returns `df` left-merged with the per-group result.
    """
    agg_name = f"{'_'.join(group_cols)}_by_{counted}_var"
    if show_agg:
        print( "\nCalculating variance of ", counted, " by ", group_cols , '... and saved in', agg_name )
    per_group = (
        df[group_cols + [counted]]
        .groupby(group_cols)[counted]
        .var()
        .rename(agg_name)
        .reset_index()
    )
    df = df.merge(per_group, on=group_cols, how='left')
    del per_group
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    predictors.append(agg_name)
    gc.collect()
    return df

In [None]:
# Remember where the training rows end so the frames can be split again later.
len_train = len(train_csv)
# Stack train and test so group statistics are computed over both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index gives the combined frame a unique 0..n-1 index instead of two
# overlapping ranges.
train_df = pd.concat([train_csv, test_csv], ignore_index=True)
# Names of engineered features, filled in by the do_* helpers.
predictors = []

In [None]:
gc.collect()

# Seconds to the next / previous click within each group.
# NOTE: the prevClick step used to carry a "removed temporarily due to RAM
# shortage" remark, but the call is active and does run.
for suffix in ('nextClick', 'prevClick'):
    train_df = do_next_prev_Click(train_df, agg_suffix=suffix, agg_type='float32')
    gc.collect()

# Number of distinct `target` values inside each `keys` group.
for keys, target in (
    (['ip'], 'channel'),
    (['ip', 'device', 'os'], 'app'),
    (['ip', 'day'], 'hour'),
    (['ip'], 'app'),
    (['ip', 'app'], 'os'),
    (['ip'], 'device'),
    (['app'], 'channel'),
):
    train_df = do_countuniq(train_df, keys, target)
    gc.collect()

# Running row number inside each group.
for keys, target in (
    (['ip'], 'os'),
    (['ip', 'device', 'os'], 'app'),
):
    train_df = do_cumcount(train_df, keys, target)
    gc.collect()

# Group sizes.
for keys in (
    ['ip', 'day', 'hour'],
    ['ip', 'app'],
    ['ip', 'app', 'os'],
):
    train_df = do_count(train_df, keys)
    gc.collect()

# Variance / mean features; the disabled variants are kept for reference.
# train_df = do_var( train_df, ['ip', 'day', 'channel'], 'hour'); gc.collect()
train_df = do_var(train_df, ['ip', 'app', 'os'], 'hour')
gc.collect()
# train_df = do_var( train_df, ['ip', 'app', 'channel'], 'day'); gc.collect()
# train_df = do_mean( train_df, ['ip', 'app', 'channel'], 'hour' ); gc.collect()

print(train_df.head(5))
gc.collect()

In [None]:
# Undo the earlier stacking: the first len_train rows are the training data,
# the remainder is the test data.
train_csv, test_csv = train_df[:len_train], train_df[len_train:]

del train_df
gc.collect()

for frame in (train_csv, test_csv):
    print(frame.shape)

## Prepare data

In [None]:
# Column listing of both frames, to check they carry the same features.
for frame in (train_csv, test_csv):
    print(frame.columns)

In [None]:
# The raw timestamp is dropped; the columns derived from it stay.
train_useless_features = ['click_time']
train_csv.drop(columns=train_useless_features, inplace=True)

test_useless_features = ['click_time']
test_csv.drop(columns=test_useless_features, inplace=True)

for frame in (train_csv, test_csv):
    display(frame.head())

In [None]:
# Hold out 5% of the training rows for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    train_csv,
    y_data,
    test_size=0.05,
    random_state=2017,
)
x_test = test_csv
for part in (x_train, y_train, x_val, y_val, x_test):
    print(part.shape)

print('Time cost: %.2f s' % (time.time() - t0))

In [None]:
# Column listing of the three feature sets.
for part in (x_train, x_val, x_test):
    print(part.columns)

In [None]:
# Types before conversion (pandas objects).
for part in (x_train, y_train, x_val, y_val, x_test):
    print(type(part))

# Convert to plain NumPy arrays for training and fold-wise prediction.
# `.as_matrix()` was removed in pandas 1.0; `.values` returns the same array
# and is available in both old and new pandas releases.
x_train = x_train.values
y_train = y_train.values
x_val = x_val.values
y_val = y_val.values
x_test = x_test.values

# Types after conversion (numpy.ndarray).
for part in (x_train, y_train, x_val, y_val, x_test):
    print(type(part))

In [None]:
# Run a garbage-collection pass before training starts.
gc.collect()

## Train

In [None]:
# %%time
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score


# Base estimator; the grid below overrides reg_alpha, reg_lambda and
# scale_pos_weight.
# NOTE(review): num_boost_round and early_stopping_rounds are passed to the
# constructor here; with the scikit-learn wrapper the number of rounds is
# normally n_estimators and early stopping needs an eval_set at fit time --
# confirm these two have the intended effect with the installed xgboost.
clf = xgb.XGBClassifier(
    max_depth=20, 
    learning_rate=0.1, 
    n_estimators=5000, 
    silent=False, 
    objective='gpu:binary:logistic', 
    booster='gbtree', 
#     n_jobs=1, 
    nthread=None, 
    gamma=0, 
    min_child_weight=1, 
    max_delta_step=0, 
    subsample=0.5, 
    colsample_bytree=0.7, 
    colsample_bylevel=0.7, 
    reg_alpha=0, 
    reg_lambda=1, 
    scale_pos_weight=97, 
    base_score=0.5, 
    random_state=0, 
    seed=None, 
    missing=None,
    # booster params
    num_boost_round=50,
    early_stopping_rounds=30,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    eval_metric=['auc'],

)

parameters = {
    'reg_alpha':[0.3], 
    'reg_lambda':[0.8],
    'scale_pos_weight': [1, 98, 200]
}

# Rank candidates by ROC AUC, the metric this notebook reports and encodes in
# the run name. Without `scoring`, GridSearchCV falls back to the
# classifier's default score (accuracy), which cannot meaningfully compare
# scale_pos_weight values on a heavily imbalanced target.
grid_search = GridSearchCV(clf, parameters, scoring='roc_auc', verbose=2, cv=3)
grid_search.fit(x_train, y_train)


In [None]:
print('*' * 80)
# predict_proba returns one column per class; column 1 is the probability of
# the positive class and is the one thresholded and scored below.
y_train_proba = grid_search.predict_proba(x_train)
print(y_train_proba.shape)
print(y_train_proba[:10])
y_train_pred = (y_train_proba[:, 1]>=0.5).astype(int)
acc_train = accuracy_score(y_train, y_train_pred)
roc_train = roc_auc_score(y_train, y_train_proba[:, 1])
print('acc_train: %.4f \t roc_train: %.4f' % (acc_train, roc_train))

y_val_proba = grid_search.predict_proba(x_val)
print(y_val_proba.shape)
print(y_val_proba[:10])
# Fixed: this used column 0 (negative class), which inverted the validation
# predictions and made acc_val incomparable with acc_train.
y_val_pred = (y_val_proba[:, 1]>=0.5).astype(int)
acc_val = accuracy_score(y_val, y_val_pred)
roc_val = roc_auc_score(y_val, y_val_proba[:, 1])
print('acc_val:   %.4f \t roc_val:   %.4f' % (acc_val, roc_val))

In [None]:
# Summary of the winning grid-search candidate.
for attr in ('best_estimator_', 'best_score_', 'best_params_'):
    print(getattr(grid_search, attr))

In [None]:
# Release the training and validation matrices.
del x_train
gc.collect()
del x_val
gc.collect()

## Predict

In [None]:
# Tag the run name with the validation ROC AUC scaled to an integer,
# zero-padded to four digits.
run_name_acc = '%s_%04d' % (run_name, int(roc_val*10000))
print(run_name_acc)

In [None]:
from sklearn.cross_validation import KFold

kf = KFold(105, n_folds=10)
for train_index, test_index in kf:
    print(test_index)

In [None]:
kf = KFold(len(x_test), n_folds=10)
y_test_proba = []
for train_index, test_index in kf:
    y_test_proba_fold = grid_search.predict_proba(x_test[test_index])
    y_test_proba.append(y_test_proba_fold)
    print(y_test_proba_fold.shape)
    
y_test_proba = np.concatenate(y_test_proba, axis=0)

print(y_test_proba.shape)
print(y_test_proba[:20])

In [None]:
def save_proba(y_train_proba, y_train, y_val_proba, y_val, y_test_proba, click_ids, file_name):
    """Write predictions, labels and click ids to an HDF5 file.

    Any existing file at `file_name` is removed first. Each argument is
    stored as a dataset named after the parameter.

    Args:
        y_train_proba, y_train, y_val_proba, y_val, y_test_proba, click_ids:
            array-likes to store.
        file_name: destination path.
    """
    print(click_ids[:5])
    if os.path.exists(file_name):
        os.remove(file_name)
        print('File removed: \t%s' % file_name)
    # Open explicitly for writing: h5py 3 defaults to read-only when no mode
    # is given, which fails here because the file does not exist yet.
    with h5py.File(file_name, 'w') as h:
        h.create_dataset('y_train_proba', data=y_train_proba)
        h.create_dataset('y_train', data=y_train)
        h.create_dataset('y_val_proba', data=y_val_proba)
        h.create_dataset('y_val', data=y_val)
        h.create_dataset('y_test_proba', data=y_test_proba)
        h.create_dataset('click_ids', data=click_ids)
    print('File saved: \t%s' % file_name)

def load_proba(file_name):
    """Read back the six arrays stored in an HDF5 prediction file.

    Returns a tuple
    (y_train_proba, y_train, y_val_proba, y_val, y_test_proba, click_ids)
    of NumPy arrays.
    """
    dataset_names = (
        'y_train_proba', 'y_train', 'y_val_proba', 'y_val', 'y_test_proba', 'click_ids',
    )
    with h5py.File(file_name, 'r') as h:
        # Materialize every dataset while the file is still open.
        loaded = tuple(np.array(h[name]) for name in dataset_names)
    print('File loaded: \t%s' % file_name)
    print(loaded[-1][:5])

    return loaded


y_proba_file = os.path.join(model_folder, 'proba_%s.p' % run_name_acc)
# Store only the positive-class column of each probability matrix.
# The ids come from the test rows that were actually scored (`click_ids`)
# rather than from the full sample submission, so they stay aligned with
# y_test_proba even when test_n_rows limits how much of the test file is read.
save_proba(
    y_train_proba[:, 1], 
    y_train, 
    y_val_proba[:, 1], 
    y_val, 
    y_test_proba[:, 1], 
    np.array(click_ids), 
    y_proba_file
)
# Read the file back as a round-trip check; the *_true names hold the 1-D
# positive-class probabilities.
y_train_proba_true, y_train, y_val_proba_true, y_val, y_test_proba_true, click_ids = load_proba(y_proba_file)

print(y_train_proba_true.shape)
print(y_train.shape)
print(y_val_proba_true.shape)
print(y_val.shape)
print(y_test_proba_true.shape)
print(len(click_ids))

In [None]:
# %%time
# Submission file: one row per click id with its predicted probability.
submission_csv_file = os.path.join(output_folder, 'pred_' + run_name_acc + '.csv')
print(submission_csv_file)
submission_columns = {
    'click_id': click_ids,
    'is_attributed': y_test_proba_true,
}
submission_csv = pd.DataFrame(submission_columns)
submission_csv.to_csv(submission_csv_file, index=False)
display(submission_csv.head())

In [None]:
# Closing summary: elapsed time plus the identifiers of this run.
elapsed = time.time() - t0
print('Time cost: %.2f s' % elapsed)

print('random_num: ', random_num)
print('date: ', date)
print(run_name_acc)
print('Done!')