In [101]:
import gc
import time
import numpy as np
import pandas as pd
from time import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import xgboost as xgb
from xgboost import plot_importance

import matplotlib.pyplot as plt

#from fastai.imports import *
#from fastai.structured import*

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [102]:
PATH = "/home/vishy/Desktop/Myfiles/AV/AMEX/"

In [103]:
# Change this for validation with 10% from train
is_valid = False

In [167]:
def handle_missing_inplace(df):
    df['age_level'].fillna(value=99.0, inplace=True)
    df['city_development_index'].fillna(value=99.0, inplace=True)
    df['gender'].fillna(value='Unknown', inplace=True)
    df['product_category_2'].fillna(value=999999.0, inplace=True)
    df['user_depth'].fillna(value=99.0, inplace=True)
    df['user_group_id'].fillna(value=99.0, inplace=True)
    return df

In [168]:
def timeFeatures(df):
    # Make some new features with click_time column
    df['day'] = df['DateTime'].dt.day.astype('uint8')
    df['hour'] = df['DateTime'].dt.hour.astype('uint8')
    df['minute'] = df['DateTime'].dt.minute.astype('uint8')
    df['dow'] = df['DateTime'].dt.dayofweek.astype('uint8')
    df.drop(['DateTime'], axis=1, inplace=True)
    return df

In [2]:
train = pd.read_csv(PATH+'train.csv', low_memory=False, parse_dates=['DateTime'])
train.rename(columns={'product':'prod'}, inplace=True)
print(train.shape)

NameError: name 'pd' is not defined

In [170]:
test = pd.read_csv(PATH+'test.csv', low_memory=False, parse_dates=['DateTime'])
test.rename(columns={'product':'prod'}, inplace=True)
print(test.shape)

(128858, 14)


In [171]:
train = handle_missing_inplace(train)
train = timeFeatures(train)

test = handle_missing_inplace(test)
test = timeFeatures(test)

In [173]:
CLICK_ATTR_CATS = [['prod', 'campaign_id'],['prod', 'webpage_id'], ['prod', 'product_category_1'],
                   ['user_group_id','gender'],['user_group_id','age_level'],['user_group_id', 'user_depth'],
                   ['prod','age_level'], ['prod','user_depth'],['product_category_1','age_level'],
                   ['product_category_1','user_depth']]

In [174]:
# Find frequency of is_attributed for each unique value in column
freqs = {}
for cols in CLICK_ATTR_CATS:
    
    # New feature name
    new_feature = '_'.join(cols)+'_confRate'    
    
    # Perform the groupby
    group_object = train.groupby(cols)
    
    # Group sizes    
    group_sizes = group_object.size()
    log_group = np.log(100000) # 1000 views -> 60% confidence, 100 views -> 40% confidence 
    print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature, 
        group_sizes.max(), 
        np.round(group_sizes.mean(), 2),
        np.round(group_sizes.median(), 2),
        group_sizes.min()
    ))
    
    # Aggregation function
    def rate_calculation(x):
        """Calculate the attributed rate. Scale by confidence"""
        rate = x.sum() / float(x.count())
        conf = np.min([1, np.log(x.count()) / log_group])
        return rate * conf
    
    # Perform the merge
    train = train.merge(
        group_object['is_click']. \
            apply(rate_calculation). \
            reset_index(). \
            rename( 
                index=str,
                columns={'is_click': new_feature}
            )[cols + [new_feature]],
        on=cols, how='left'
    )
    test = test.merge(
        group_object['is_click']. \
            apply(rate_calculation). \
            reset_index(). \
            rename( 
                index=str,
                columns={'is_click': new_feature}
            )[cols + [new_feature]],
        on=cols, how='left'
    )
    
print(train.shape, test.shape)

>> Calculating confidence-weighted rate for: ['prod', 'campaign_id'].
   Saving to: prod_campaign_id_confRate. Group Max /Mean / Median / Min: 64962 / 5939.63 / 2162.0 / 1
>> Calculating confidence-weighted rate for: ['prod', 'webpage_id'].
   Saving to: prod_webpage_id_confRate. Group Max /Mean / Median / Min: 81829 / 6346.45 / 1871.0 / 1
>> Calculating confidence-weighted rate for: ['prod', 'product_category_1'].
   Saving to: prod_product_category_1_confRate. Group Max /Mean / Median / Min: 70930 / 10774.21 / 4588.0 / 1
>> Calculating confidence-weighted rate for: ['user_group_id', 'gender'].
   Saving to: user_group_id_gender_confRate. Group Max /Mean / Median / Min: 140317 / 30886.07 / 13779.0 / 33
>> Calculating confidence-weighted rate for: ['user_group_id', 'age_level'].
   Saving to: user_group_id_age_level_confRate. Group Max /Mean / Median / Min: 140317 / 33092.21 / 16011.0 / 153
>> Calculating confidence-weighted rate for: ['user_group_id', 'user_depth'].
   Saving to: user

In [175]:
y = train['is_click']
train.drop(['is_click','session_id'], axis=1, inplace=True)

# Drop IP and ID from test rows
sub = pd.DataFrame()
sub['session_id'] = test['session_id'].astype('int')
test.drop(['session_id'], axis=1, inplace=True)
gc.collect()

nrow_train = train.shape[0]
merge = pd.concat([train, test])
print(merge.shape)

del train, test
gc.collect()

(592149, 28)


0

In [176]:
# Define all the groupby transformations
GROUPBY_AGGREGATIONS = [
    # Variance in day, for user_id-prod-campaign_id
    {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},
    # Variance in hour, for user_id-prod-product_category_1
    {'groupby': ['user_id','prod','product_category_1'], 'select': 'hour', 'agg': 'var'},
    # Variance in hour, for user_id-day-campaign_id
    {'groupby': ['user_id','day','campaign_id'], 'select': 'hour', 'agg': 'var'},
    # Count, for user_id-day-hour'dow','hour'
    {'groupby': ['user_id','day','hour'], 'select': 'campaign_id', 'agg': 'count'},
    # Count, for user_id-prod
    {'groupby': ['user_id', 'prod'], 'select': 'campaign_id', 'agg': 'count'},        
    # Count, for user_id-prod-webpage_id
    {'groupby': ['user_id', 'prod', 'webpage_id'], 'select': 'campaign_id', 'agg': 'count'},
    # Count, for user_id-prod-day-hour
    {'groupby': ['user_id','prod','day','hour'], 'select': 'campaign_id', 'agg': 'count'},
    # Mean hour, for user_id-prod-campaign_id
    {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, 
    
    {'groupby': ['prod'], 
     'select': 'user_id', 
     'agg': lambda x: float(len(x)) / len(x.unique()), 
     'agg_name': 'AvgprodPerDistinct'
    },
    # How popular is the app or channel?
    {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},
    {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},

    {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, 
    {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, 
    {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, 
    {'groupby': ['user_id','prod'], 'select': 'webpage_id', 'agg': 'nunique'}, 
    {'groupby': ['user_id'], 'select': 'product_category_1', 'agg': 'nunique'}, 
    {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'nunique'}, 
    {'groupby': ['user_id', 'product_category_1', 'webpage_id'], 'select': 'prod', 'agg': 'nunique'}, 
    {'groupby': ['user_id','product_category_1','webpage_id'], 'select': 'prod', 'agg': 'cumcount'}, 
    {'groupby': ['user_id'], 'select': 'prod', 'agg': 'cumcount'}, 
    {'groupby': ['user_id'], 'select': 'webpage_id', 'agg': 'cumcount'}
]

In [177]:
# Apply all the groupby transformations
for spec in GROUPBY_AGGREGATIONS:
    
    # Name of the aggregation we're applying
    agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']
    
    # Name of new feature
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])
    
    # Info
    print("Grouping by {}, and aggregating {} with {}".format(
        spec['groupby'], spec['select'], agg_name
    ))
    
    # Unique list of features to select
    all_features = list(set(spec['groupby'] + [spec['select']]))
    
    # Perform the groupby
    gp = merge[all_features]. \
        groupby(spec['groupby'])[spec['select']]. \
        agg(spec['agg']). \
        reset_index(). \
        rename(index=str, columns={spec['select']: new_feature})
        
    # Merge back to X_total
    if 'cumcount' == spec['agg']:
        merge[new_feature] = gp[0].values
    else:
        merge = merge.merge(gp, on=spec['groupby'], how='left')
        
     # Clear memory
    del gp
    gc.collect()

print(merge.shape)

Grouping by ['user_id', 'prod', 'campaign_id'], and aggregating day with var
Grouping by ['user_id', 'prod', 'product_category_1'], and aggregating hour with var
Grouping by ['user_id', 'day', 'campaign_id'], and aggregating hour with var
Grouping by ['user_id', 'day', 'hour'], and aggregating campaign_id with count
Grouping by ['user_id', 'prod'], and aggregating campaign_id with count
Grouping by ['user_id', 'prod', 'webpage_id'], and aggregating campaign_id with count
Grouping by ['user_id', 'prod', 'day', 'hour'], and aggregating campaign_id with count
Grouping by ['user_id', 'prod', 'campaign_id'], and aggregating hour with mean
Grouping by ['prod'], and aggregating user_id with AvgprodPerDistinct
Grouping by ['prod'], and aggregating campaign_id with count
Grouping by ['campaign_id'], and aggregating prod with count
Grouping by ['user_id'], and aggregating campaign_id with nunique
Grouping by ['user_id'], and aggregating prod with nunique
Grouping by ['user_id', 'day'], and aggre

In [178]:
merge.columns

Index(['user_id', 'prod', 'campaign_id', 'webpage_id', 'product_category_1',
       'product_category_2', 'user_group_id', 'gender', 'age_level',
       'user_depth', 'city_development_index', 'var_1', 'day', 'hour',
       'minute', 'dow', 'prod_campaign_id_confRate',
       'prod_webpage_id_confRate', 'prod_product_category_1_confRate',
       'user_group_id_gender_confRate', 'user_group_id_age_level_confRate',
       'user_group_id_user_depth_confRate', 'prod_age_level_confRate',
       'prod_user_depth_confRate', 'product_category_1_age_level_confRate',
       'product_category_1_user_depth_confRate', 'prod_dow_confRate',
       'product_category_1_dow_confRate', 'user_id_prod_campaign_id_var_day',
       'user_id_prod_product_category_1_var_hour',
       'user_id_day_campaign_id_var_hour',
       'user_id_day_hour_count_campaign_id', 'user_id_prod_count_campaign_id',
       'user_id_prod_webpage_id_count_campaign_id',
       'user_id_prod_day_hour_count_campaign_id',
       'user_

In [179]:
cat_vars = ['user_id', 'prod', 'campaign_id','webpage_id', 'product_category_1','product_category_2',
            'user_group_id', 'gender', 'age_level', 'user_depth','city_development_index', 'var_1',
            'day','hour','minute', 'dow']

contin_vars = ['prod_campaign_id_confRate',
       'prod_webpage_id_confRate', 'prod_product_category_1_confRate',
       'user_group_id_gender_confRate', 'user_group_id_age_level_confRate',
       'user_group_id_user_depth_confRate', 'prod_age_level_confRate',
       'prod_user_depth_confRate', 'product_category_1_age_level_confRate',
       'product_category_1_user_depth_confRate', 'prod_dow_confRate',
       'product_category_1_dow_confRate', 'user_id_prod_campaign_id_var_day',
       'user_id_prod_product_category_1_var_hour',
       'user_id_day_campaign_id_var_hour',
       'user_id_day_hour_count_campaign_id', 'user_id_prod_count_campaign_id',
       'user_id_prod_webpage_id_count_campaign_id',
       'user_id_prod_day_hour_count_campaign_id',
       'user_id_prod_campaign_id_mean_hour', 'prod_AvgprodPerDistinct_user_id',
       'prod_count_campaign_id', 'campaign_id_count_prod',
       'user_id_nunique_campaign_id', 'user_id_nunique_prod',
       'user_id_day_nunique_hour', 'user_id_prod_nunique_webpage_id',
       'user_id_nunique_product_category_1', 'prod_nunique_campaign_id',
       'user_id_product_category_1_webpage_id_nunique_prod',
       'user_id_product_category_1_webpage_id_cumcount_prod',
       'user_id_cumcount_prod', 'user_id_cumcount_webpage_id']

for v in cat_vars: 
    merge[v] = merge[v].astype('category')

for v in contin_vars: 
    merge[v] = merge[v].astype('float32')

In [180]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [181]:
merge.head()

Unnamed: 0,user_id,prod,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,day,hour,minute,dow,prod_campaign_id_confRate,prod_webpage_id_confRate,prod_product_category_1_confRate,user_group_id_gender_confRate,user_group_id_age_level_confRate,user_group_id_user_depth_confRate,prod_age_level_confRate,prod_user_depth_confRate,product_category_1_age_level_confRate,product_category_1_user_depth_confRate,prod_dow_confRate,product_category_1_dow_confRate,user_id_prod_campaign_id_var_day,user_id_prod_product_category_1_var_hour,user_id_day_campaign_id_var_hour,user_id_day_hour_count_campaign_id,user_id_prod_count_campaign_id,user_id_prod_webpage_id_count_campaign_id,user_id_prod_day_hour_count_campaign_id,user_id_prod_campaign_id_mean_hour,prod_AvgprodPerDistinct_user_id,prod_count_campaign_id,campaign_id_count_prod,user_id_nunique_campaign_id,user_id_nunique_prod,user_id_day_nunique_hour,user_id_prod_nunique_webpage_id,user_id_nunique_product_category_1,prod_nunique_campaign_id,user_id_product_category_1_webpage_id_nunique_prod,user_id_product_category_1_webpage_id_cumcount_prod,user_id_cumcount_prod,user_id_cumcount_webpage_id
0,858557,C,359520,13787,4,999999.0,10.0,Female,4.0,3.0,3.0,0,2,0,0,6,0.052104,0.0491,0.05067,0.046802,0.046802,0.046592,0.047969,0.069581,0.047981,0.059002,0.068583,0.054782,18.0,72.0,,1.0,2.0,2.0,1.0,6.0,2.1245,191188.0,127872.0,2.0,2.0,1.0,1.0,2.0,9.0,1.0,0.0,0.0,0.0
1,243253,C,105960,11085,5,999999.0,8.0,Female,2.0,2.0,99.0,0,2,0,0,6,0.055999,0.055999,0.044871,0.048408,0.048408,0.036015,0.07152,0.051159,0.046252,0.041873,0.068583,0.042707,,0.0,,3.0,5.0,1.0,3.0,0.0,2.1245,191188.0,40349.0,3.0,1.0,1.0,2.0,3.0,9.0,1.0,0.0,0.0,0.0
2,243253,C,359520,13787,4,999999.0,8.0,Female,2.0,2.0,99.0,0,2,0,0,6,0.052104,0.0491,0.05067,0.048408,0.048408,0.036015,0.07152,0.051159,0.055615,0.045323,0.068583,0.054782,24.5,0.5,,3.0,5.0,4.0,3.0,0.5,2.1245,191188.0,127872.0,3.0,1.0,1.0,2.0,3.0,9.0,1.0,0.0,1.0,1.0
3,1097446,I,359520,13787,3,999999.0,3.0,Male,3.0,3.0,2.0,1,2,0,0,6,0.044395,0.049024,0.040035,0.064668,0.064668,0.064879,0.04995,0.059989,0.070024,0.083187,0.059274,0.083642,,,,1.0,3.0,3.0,1.0,0.0,1.925924,76438.0,127872.0,3.0,5.0,1.0,1.0,4.0,9.0,1.0,0.0,0.0,0.0
4,663656,C,405490,60305,3,999999.0,2.0,Male,2.0,3.0,2.0,1,2,0,1,6,0.091613,0.091613,0.087606,0.071242,0.071242,0.071409,0.07152,0.069581,0.081638,0.083187,0.068583,0.083642,8.0,112.5,,1.0,2.0,2.0,1.0,7.5,2.1245,191188.0,107919.0,3.0,4.0,1.0,1.0,5.0,9.0,1.0,0.0,0.0,0.0


In [182]:
merge.shape

(592149, 49)

In [183]:
lb = LabelEncoder()
for v in cat_vars:
    merge[v] = lb.fit_transform(merge[v])
print(merge.shape)

(592149, 49)


In [184]:
merge.head()

Unnamed: 0,user_id,prod,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,day,hour,minute,dow,prod_campaign_id_confRate,prod_webpage_id_confRate,prod_product_category_1_confRate,user_group_id_gender_confRate,user_group_id_age_level_confRate,user_group_id_user_depth_confRate,prod_age_level_confRate,prod_user_depth_confRate,product_category_1_age_level_confRate,product_category_1_user_depth_confRate,prod_dow_confRate,product_category_1_dow_confRate,user_id_prod_campaign_id_var_day,user_id_prod_product_category_1_var_hour,user_id_day_campaign_id_var_hour,user_id_day_hour_count_campaign_id,user_id_prod_count_campaign_id,user_id_prod_webpage_id_count_campaign_id,user_id_prod_day_hour_count_campaign_id,user_id_prod_campaign_id_mean_hour,prod_AvgprodPerDistinct_user_id,prod_count_campaign_id,campaign_id_count_prod,user_id_nunique_campaign_id,user_id_nunique_prod,user_id_day_nunique_hour,user_id_prod_nunique_webpage_id,user_id_nunique_product_category_1,prod_nunique_campaign_id,user_id_product_category_1_webpage_id_nunique_prod,user_id_product_category_1_webpage_id_cumcount_prod,user_id_cumcount_prod,user_id_cumcount_webpage_id
0,137427,2,4,3,3,31,10,0,4,2,2,0,0,0,0,6,0.052104,0.0491,0.05067,0.046802,0.046802,0.046592,0.047969,0.069581,0.047981,0.059002,0.068583,0.054782,18.0,72.0,,1.0,2.0,2.0,1.0,6.0,2.1245,191188.0,127872.0,2.0,2.0,1.0,1.0,2.0,9.0,1.0,0.0,0.0,0.0
1,41508,2,2,2,4,31,8,0,2,1,4,0,0,0,0,6,0.055999,0.055999,0.044871,0.048408,0.048408,0.036015,0.07152,0.051159,0.046252,0.041873,0.068583,0.042707,,0.0,,3.0,5.0,1.0,3.0,0.0,2.1245,191188.0,40349.0,3.0,1.0,1.0,2.0,3.0,9.0,1.0,0.0,0.0,0.0
2,41508,2,4,3,3,31,8,0,2,1,4,0,0,0,0,6,0.052104,0.0491,0.05067,0.048408,0.048408,0.036015,0.07152,0.051159,0.055615,0.045323,0.068583,0.054782,24.5,0.5,,3.0,5.0,4.0,3.0,0.5,2.1245,191188.0,127872.0,3.0,1.0,1.0,2.0,3.0,9.0,1.0,0.0,1.0,1.0
3,171416,8,4,3,2,31,3,1,3,2,1,1,0,0,0,6,0.044395,0.049024,0.040035,0.064668,0.064668,0.064879,0.04995,0.059989,0.070024,0.083187,0.059274,0.083642,,,,1.0,3.0,3.0,1.0,0.0,1.925924,76438.0,127872.0,3.0,5.0,1.0,1.0,4.0,9.0,1.0,0.0,0.0,0.0
4,109538,2,8,8,2,31,2,1,2,2,1,1,0,0,1,6,0.091613,0.091613,0.087606,0.071242,0.071242,0.071409,0.07152,0.069581,0.081638,0.083187,0.068583,0.083642,8.0,112.5,,1.0,2.0,2.0,1.0,7.5,2.1245,191188.0,107919.0,3.0,4.0,1.0,1.0,5.0,9.0,1.0,0.0,0.0,0.0


In [185]:
train = merge[:nrow_train]
test = merge[nrow_train:]
print(train.shape, test.shape)

(463291, 49) (128858, 49)


# XGB for simple parameters

In [54]:
# Set the params(this params from Pranav kernel) for xgboost model
params = {'eta': 0.002,'tree_method': "hist",'subsample': 0.9,'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,'objective': 'binary:logistic','eval_metric': 'auc',
          'nthread':8,'random_state': 42, 'silent': True}

In [25]:
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=99)
dtrain = xgb.DMatrix(x1, y1)
dvalid = xgb.DMatrix(x2, y2)

del x1, y1, x2, y2 
gc.collect()
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params, dtrain, 10000, watchlist, maximize=True, early_stopping_rounds = 50, verbose_eval=100)

[15:54:12] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.602358	valid-auc:0.593116
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[100]	train-auc:0.633035	valid-auc:0.618447
[200]	train-auc:0.634775	valid-auc:0.61936
[300]	train-auc:0.636299	valid-auc:0.620077
[400]	train-auc:0.637436	valid-auc:0.620651
[500]	train-auc:0.638394	valid-auc:0.621114
[600]	train-auc:0.639101	valid-auc:0.621332
[700]	train-auc:0.639999	valid-auc:0.621889
[800]	train-auc:0.640718	valid-auc:0.622093
[900]	train-auc:0.641512	valid-auc:0.622516
[1000]	train-auc:0.642287	valid-auc:0.622805
[1100]	train-auc:0.643145	valid-auc:0.623157
[1200]	train-auc:0.644066	valid-auc:0.623527
[1300]	train-auc:0.645054	valid-auc:0.623994
[1400]	train-auc:0.646182	valid-auc:0.624465
[1500]	train-auc:0.647512	valid-auc:0.625073
[1600]	train-auc:0.648907	valid-auc:0.625633
[1700]	

In [26]:
dtest = xgb.DMatrix(test)
sub['is_click'] = model.predict(dtest, ntree_limit=model.best_ntree_limit)
sub.head()

Unnamed: 0,session_id,is_click
0,411705,0.111324
1,208263,0.031059
2,239450,0.027977
3,547761,0.053547
4,574275,0.101651


In [27]:
print(model.best_ntree_limit)

5993


In [28]:
#sub.to_csv('basic_xgb.csv', float_format='%.8f', index=False) #subsample': 0.9 - LB - 0.626989521070428
#sub.to_csv('basic_xgb_2.csv', float_format='%.8f', index=False) #subsample': 0.9 - LB - 0.626989521070428
sub.to_csv('basic_xgb_3.csv', float_format='%.8f', index=False) #subsample': 0.9 - LB - 0.626989521070428

# XBG for more parameters

In [186]:
# Set the params(this params from Pranav kernel) for xgboost model
params = {'eta': 0.002,'tree_method': "hist",'subsample': 0.7,'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,'objective': 'binary:logistic','eval_metric': 'auc',
          'nthread':8,'random_state': 42, 'silent': True}

In [187]:
x1 = train.head(370632)
x2 = train.tail(92659)
y1 = y.head(370632)
y2 = y.tail(92659)
print(x1.shape, x2.shape)
print(y1.shape, y2.shape)

(370632, 49) (92659, 49)
(370632,) (92659,)


In [188]:
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.2, random_state=99)
dtrain = xgb.DMatrix(x1, y1)
dvalid = xgb.DMatrix(x2, y2)

del x1, y1, x2, y2 
gc.collect()
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params, dtrain, 10000, watchlist, maximize=True, early_stopping_rounds = 50, verbose_eval=500)

[18:41:38] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.617581	valid-auc:0.606613
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[500]	train-auc:0.643508	valid-auc:0.622765
[1000]	train-auc:0.647884	valid-auc:0.624216
[1500]	train-auc:0.653757	valid-auc:0.626144
[2000]	train-auc:0.661499	valid-auc:0.628907
[2500]	train-auc:0.669651	valid-auc:0.631168
[3000]	train-auc:0.676954	valid-auc:0.632691
[3500]	train-auc:0.683529	valid-auc:0.63383
[4000]	train-auc:0.68934	valid-auc:0.634603
[4500]	train-auc:0.695099	valid-auc:0.63521
[5000]	train-auc:0.700667	valid-auc:0.635669
[5500]	train-auc:0.70572	valid-auc:0.636102
[6000]	train-auc:0.710719	valid-auc:0.636393
Stopping. Best iteration:
[6182]	train-auc:0.71253	valid-auc:0.636564



In [189]:
dtest = xgb.DMatrix(test)
sub['is_click'] = model.predict(dtest, ntree_limit=model.best_ntree_limit)
sub.head()

Unnamed: 0,session_id,is_click
0,411705,0.086418
1,208263,0.04439
2,239450,0.034996
3,547761,0.058579
4,574275,0.135259


In [190]:
sub.to_csv('more_xgb_3.csv', float_format='%.8f', index=False) #subsample': 0.9 - LB - 0.626989521070428

# Training Without train/valid split

In [165]:
dtrain = xgb.DMatrix(train, y)
model = xgb.train(params, dtrain, 6092)

[22:28:31] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.


In [166]:
dtest = xgb.DMatrix(test)
sub['is_click'] = model.predict(dtest, ntree_limit=model.best_ntree_limit)
sub.head()

Unnamed: 0,session_id,is_click
0,411705,0.116502
1,208263,0.02851
2,239450,0.028597
3,547761,0.049245
4,574275,0.096015


In [167]:
sub.to_csv('basic_xgb_3.csv', float_format='%.8f', index=False)

# XGB with K-Fold

In [168]:
target_train = y.values

train = np.array(train)
test = np.array(test)

xgb_preds = []

In [169]:
print(train.shape, test.shape)

(463291, 38) (128858, 38)


In [172]:
K = 5
kf = KFold(n_splits = K, random_state = 42, shuffle = True)

In [174]:
params = {'eta': 0.002,'tree_method': "hist",'grow_policy': "lossguide",'subsample': 0.8,
          'colsample_bytree': 0.7, 'colsample_bylevel':0.7,'objective': 'binary:logistic', 
          'eval_metric': 'auc', 'nthread':8,'random_state': 42, 'silent': True}

In [176]:
start = time()

for train_index, test_index in kf.split(train):
    train_X, valid_X = train[train_index], train[test_index]
    train_y, valid_y = target_train[train_index], target_train[test_index]

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    d_test = xgb.DMatrix(test)
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(params, d_train, 10000, watchlist, maximize=True, verbose_eval=500, early_stopping_rounds=50)
                        
    xgb_pred = model.predict(d_test)
    xgb_preds.append(list(xgb_pred))

end = time()
print ('Time taken is:', end-start)

[22:46:14] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.614875	valid-auc:0.611276
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[95]	train-auc:0.633068	valid-auc:0.623596

[22:46:22] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.613535	valid-auc:0.609268
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[500]	train-auc:0.638255	valid-auc:0.629219
[1000]	train-auc:0.642836	valid-auc:0.631267
[1500]	train-auc:0.648919	valid-auc:0.633771
[2000]	train-auc:0.656875	valid-auc:0.636353
[2500]	train-auc:0.665139	valid-auc:0.638303
[3000]	train-auc:0.673048	valid-auc:0.639727
[3500]	train-auc:0.680071	valid-auc:0.640727
[4000]	train-auc:0.686246	valid-auc:0.641466
[4500

In [177]:
preds=[]
for i in range(len(xgb_preds[0])):
    sum=0
    for j in range(K):
        sum+=xgb_preds[j][i]
    preds.append(sum / K)

In [178]:
sub['is_click']=preds

In [179]:
sub.head()

Unnamed: 0,session_id,is_click
0,411705,0.225929
1,208263,0.167614
2,239450,0.164765
3,547761,0.183155
4,574275,0.209627


In [180]:
sub.to_csv('xgb_5fold.csv', float_format='%.8f', index=False)

# Increased early stoping round

In [181]:
start = time()

for train_index, test_index in kf.split(train):
    train_X, valid_X = train[train_index], train[test_index]
    train_y, valid_y = target_train[train_index], target_train[test_index]

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    d_test = xgb.DMatrix(test)
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(params, d_train, 10000, watchlist, maximize=True, verbose_eval=500, early_stopping_rounds=100)
                        
    xgb_pred = model.predict(d_test)
    xgb_preds.append(list(xgb_pred))

end = time()
print ('Time taken is:', end-start)

[23:08:00] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.614875	valid-auc:0.611276
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[500]	train-auc:0.638537	valid-auc:0.625137
[1000]	train-auc:0.643268	valid-auc:0.627179
[1500]	train-auc:0.649013	valid-auc:0.629472
[2000]	train-auc:0.656605	valid-auc:0.632537
[2500]	train-auc:0.664546	valid-auc:0.63499
[3000]	train-auc:0.672091	valid-auc:0.636846
[3500]	train-auc:0.678702	valid-auc:0.638165
[4000]	train-auc:0.684781	valid-auc:0.639157
[4500]	train-auc:0.690434	valid-auc:0.639858
[5000]	train-auc:0.695739	valid-auc:0.640366
[5500]	train-auc:0.700721	valid-auc:0.640741
[6000]	train-auc:0.705516	valid-auc:0.641146
[6500]	train-auc:0.710135	valid-auc:0.641481
[7000]	train-auc:0.714583	valid-auc:0.641882
[7500]	train-auc:0.719097	valid-auc:0.642163
[8000]	train-auc:0.723339	valid-auc:0.64246

In [182]:
preds=[]
for i in range(len(xgb_preds[0])):
    sum=0
    for j in range(K):
        sum+=xgb_preds[j][i]
    preds.append(sum / K)

In [183]:
sub['is_click']=preds

In [184]:
sub.head()

Unnamed: 0,session_id,is_click
0,411705,0.225929
1,208263,0.167614
2,239450,0.164765
3,547761,0.183155
4,574275,0.209627


In [185]:
sub.to_csv('xgb_5fold_2.csv', float_format='%.8f', index=False)

In [1]:
# Tuning for Optimizing parameters

In [None]:
def xgb_evaluate(gamma,alpha):
#min_child_weight,
                 #colsample_bytree,
                 #max_depth,
                 #subsample,
    #params['min_child_weight'] = int(min_child_weight)
    #params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    #params['max_depth'] = int(max_depth)
    #params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)


    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=3, seed=random_state,
             callbacks=[xgb.callback.early_stop(50)])

    return cv_result['test-auc-mean'].values[-1]

In [None]:
xgtrain = xgb.DMatrix(train, label=y)

In [None]:
num_rounds = 2000
random_state = 42
num_iter = 25
init_points = 5
params = {'eta': 0.002,'silent': True, 'eval_metric': 'auc','verbose_eval': 100,'seed': random_state}

In [None]:
xgbBO = BayesianOptimization(xgb_evaluate, {#'min_child_weight': (1, 10),
                                            #'colsample_bytree': (0.1, 1),
                                            #'max_depth': (5, 15),
                                            #'subsample': (0.5, 1),
                                            'gamma': (0, 8),
                                            'alpha': (0, 8)},verbose=1)

In [None]:
xgbBO.maximize(init_points=init_points, n_iter=num_iter)