In [1]:
import gc
import time
import numpy as np
import pandas as pd
from time import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from bayes_opt import BayesianOptimization

import xgboost as xgb
from xgboost import plot_importance

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
PATH = "/home/vishy/Desktop/Myfiles/AV/AMEX/"

In [3]:
# Change this for validation with 10% from train
is_valid = False

In [4]:
def handle_missing_inplace(df):
    df['age_level'].fillna(value=99.0, inplace=True)
    df['city_development_index'].fillna(value=99.0, inplace=True)
    df['gender'].fillna(value='Unknown', inplace=True)
    df['product_category_2'].fillna(value=999999.0, inplace=True)
    df['user_depth'].fillna(value=99.0, inplace=True)
    df['user_group_id'].fillna(value=99.0, inplace=True)
    return df

In [5]:
def timeFeatures(df):
    # Make some new features with click_time column
    df['day'] = df['DateTime'].dt.day.astype('uint8')
    df['hour'] = df['DateTime'].dt.hour.astype('uint8')
    df['minute'] = df['DateTime'].dt.minute.astype('uint8')
    df['dow'] = df['DateTime'].dt.dayofweek.astype('uint8')
    df.drop(['DateTime'], axis=1, inplace=True)
    return df

In [6]:
train = pd.read_csv(PATH+'train.csv', low_memory=False, parse_dates=['DateTime'])
train.rename(columns={'product':'prod'}, inplace=True)
print(train.shape)

(463291, 15)


In [7]:
test = pd.read_csv(PATH+'test.csv', low_memory=False, parse_dates=['DateTime'])
test.rename(columns={'product':'prod'}, inplace=True)
print(test.shape)

(128858, 14)


In [8]:
train = handle_missing_inplace(train)
train = timeFeatures(train)

test = handle_missing_inplace(test)
test = timeFeatures(test)

In [9]:
CLICK_ATTR_CATS = [['prod', 'campaign_id'],['prod', 'webpage_id'], ['prod', 'product_category_1'],
                   ['user_group_id','gender'],['user_group_id','age_level'],['user_group_id', 'user_depth'],
                   ['prod','age_level'], ['prod','user_depth'],['product_category_1','age_level'],
                   ['product_category_1','user_depth']]

In [10]:
# Find frequency of is_attributed for each unique value in column
freqs = {}
for cols in CLICK_ATTR_CATS:
    
    # New feature name
    new_feature = '_'.join(cols)+'_confRate'    
    
    # Perform the groupby
    group_object = train.groupby(cols)
    
    # Group sizes    
    group_sizes = group_object.size()
    log_group = np.log(100000) # 1000 views -> 60% confidence, 100 views -> 40% confidence 
    print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature,group_sizes.max(), np.round(group_sizes.mean(), 2), np.round(group_sizes.median(), 2),
        group_sizes.min()))
    
    # Aggregation function
    def rate_calculation(x):
        """Calculate the click rate. Scale by confidence"""
        rate = x.sum() / float(x.count())
        conf = np.min([1, np.log(x.count()) / log_group])
        return rate * conf
    
    # Perform the merge
    train = train.merge(
        group_object['is_click']. \
            apply(rate_calculation). \
            reset_index(). \
            rename( 
                index=str,
                columns={'is_click': new_feature}
            )[cols + [new_feature]],
        on=cols, how='left'
    )
    test = test.merge(
        group_object['is_click']. \
            apply(rate_calculation). \
            reset_index(). \
            rename( 
                index=str,
                columns={'is_click': new_feature}
            )[cols + [new_feature]],
        on=cols, how='left'
    )
    
print(train.shape, test.shape)

>> Calculating confidence-weighted rate for: ['prod', 'campaign_id'].
   Saving to: prod_campaign_id_confRate. Group Max /Mean / Median / Min: 64962 / 5939.63 / 2162.0 / 1
>> Calculating confidence-weighted rate for: ['prod', 'webpage_id'].
   Saving to: prod_webpage_id_confRate. Group Max /Mean / Median / Min: 81829 / 6346.45 / 1871.0 / 1
>> Calculating confidence-weighted rate for: ['prod', 'product_category_1'].
   Saving to: prod_product_category_1_confRate. Group Max /Mean / Median / Min: 70930 / 10774.21 / 4588.0 / 1
>> Calculating confidence-weighted rate for: ['user_group_id', 'gender'].
   Saving to: user_group_id_gender_confRate. Group Max /Mean / Median / Min: 140317 / 30886.07 / 13779.0 / 33
>> Calculating confidence-weighted rate for: ['user_group_id', 'age_level'].
   Saving to: user_group_id_age_level_confRate. Group Max /Mean / Median / Min: 140317 / 33092.21 / 16011.0 / 153
>> Calculating confidence-weighted rate for: ['user_group_id', 'user_depth'].
   Saving to: user

In [11]:
# Isolate target
y = train['is_click']
# Drop is_click and session ID from train
train.drop(['is_click','session_id'], axis=1, inplace=True)

# Create Submission dataframe
sub = pd.DataFrame()
sub['session_id'] = test['session_id'].astype('int')

# Drop sessionID from test rows
test.drop(['session_id'], axis=1, inplace=True)
gc.collect()

# Create a pointer for train 
nrow_train = train.shape[0]

# Concatenate for counting
merge = pd.concat([train, test])
print(merge.shape)

del train, test
gc.collect()

(592149, 26)


0

In [12]:
# Identify the previous ads and history ads

HISTORY_ADS = {
    'identical_': ['user_id', 'prod', 'product_category_1', 'webpage_id', 'campaign_id'],
    'user_prods': ['user_id', 'prod']
}

# Go through different group-by combinations
for fname, fset in HISTORY_ADS.items():
    
    # Clicks in the past
    merge['prev_'+fname] = merge.groupby(fset).cumcount().rename('prev_'+fname)
        
    # Clicks in the future
    merge['future_'+fname] = merge.iloc[::-1].groupby(fset).cumcount().rename('future_'+fname).iloc[::-1]

# Count cumulative subsequent clicks
print(merge.shape)

(592149, 30)


In [13]:
# Define all the groupby transformations
GROUPBY_AGGREGATIONS = [
    # V1 - GroupBy Features #
    #########################    
    # Variance in day, for user_id-prod-campaign_id
    {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},
    # Variance in hour, for user_id-prod-product_category_1
    {'groupby': ['user_id','prod','product_category_1'], 'select': 'hour', 'agg': 'var'},
    # Variance in hour, for user_id-day-campaign_id
    {'groupby': ['user_id','day','campaign_id'], 'select': 'hour', 'agg': 'var'},
    # Count, for user_id-day-hour'dow','hour'
    {'groupby': ['user_id','day','hour'], 'select': 'campaign_id', 'agg': 'count'},
    # Count, for user_id-prod
    {'groupby': ['user_id', 'prod'], 'select': 'campaign_id', 'agg': 'count'},        
    # Count, for user_id-prod-webpage_id
    {'groupby': ['user_id', 'prod', 'webpage_id'], 'select': 'campaign_id', 'agg': 'count'},
    # Count, for user_id-prod-day-hour
    {'groupby': ['user_id','prod','day','hour'], 'select': 'campaign_id', 'agg': 'count'},
    # Mean hour, for user_id-prod-campaign_id
    {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, 
    
    # V2 - GroupBy Features #
    #########################
    # Average clicks on app by distinct users; is it an app they return to?
    {'groupby': ['prod'], 
     'select': 'user_id', 
     'agg': lambda x: float(len(x)) / len(x.unique()), 
     'agg_name': 'AvgprodPerDistinct'
    },
    # How popular is the app or channel?
    {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},
    {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},
    
    # V3 - GroupBy Features                                              #
    # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #
    ###################################################################### 
    {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, 
    {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, 
    {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, 
    {'groupby': ['user_id','prod'], 'select': 'webpage_id', 'agg': 'nunique'}, 
    {'groupby': ['user_id'], 'select': 'product_category_1', 'agg': 'nunique'}, 
    {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'nunique'}, 
    {'groupby': ['user_id', 'product_category_1', 'webpage_id'], 'select': 'prod', 'agg': 'nunique'}, 
    {'groupby': ['user_id','product_category_1','webpage_id'], 'select': 'prod', 'agg': 'cumcount'}, 
    {'groupby': ['user_id'], 'select': 'prod', 'agg': 'cumcount'}, 
    {'groupby': ['user_id'], 'select': 'webpage_id', 'agg': 'cumcount'}
]

In [14]:
# Apply all the groupby transformations
for spec in GROUPBY_AGGREGATIONS:
    
    # Name of the aggregation we're applying
    agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']
    
    # Name of new feature
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])
    
    # Info
    print("Grouping by {}, and aggregating {} with {}".format(
        spec['groupby'], spec['select'], agg_name
    ))
    
    # Unique list of features to select
    all_features = list(set(spec['groupby'] + [spec['select']]))
    
    # Perform the groupby
    gp = merge[all_features]. \
        groupby(spec['groupby'])[spec['select']]. \
        agg(spec['agg']). \
        reset_index(). \
        rename(index=str, columns={spec['select']: new_feature})
        
    # Merge back to X_total
    if 'cumcount' == spec['agg']:
        merge[new_feature] = gp[0].values
    else:
        merge = merge.merge(gp, on=spec['groupby'], how='left')
        
     # Clear memory
    del gp
    gc.collect()

print(merge.shape)

Grouping by ['user_id', 'prod', 'campaign_id'], and aggregating day with var
Grouping by ['user_id', 'prod', 'product_category_1'], and aggregating hour with var
Grouping by ['user_id', 'day', 'campaign_id'], and aggregating hour with var
Grouping by ['user_id', 'day', 'hour'], and aggregating campaign_id with count
Grouping by ['user_id', 'prod'], and aggregating campaign_id with count
Grouping by ['user_id', 'prod', 'webpage_id'], and aggregating campaign_id with count
Grouping by ['user_id', 'prod', 'day', 'hour'], and aggregating campaign_id with count
Grouping by ['user_id', 'prod', 'campaign_id'], and aggregating hour with mean
Grouping by ['prod'], and aggregating user_id with AvgprodPerDistinct
Grouping by ['prod'], and aggregating campaign_id with count
Grouping by ['campaign_id'], and aggregating prod with count
Grouping by ['user_id'], and aggregating campaign_id with nunique
Grouping by ['user_id'], and aggregating prod with nunique
Grouping by ['user_id', 'day'], and aggre

In [15]:
merge.columns

Index(['user_id', 'prod', 'campaign_id', 'webpage_id', 'product_category_1',
       'product_category_2', 'user_group_id', 'gender', 'age_level',
       'user_depth', 'city_development_index', 'var_1', 'day', 'hour',
       'minute', 'dow', 'prod_campaign_id_confRate',
       'prod_webpage_id_confRate', 'prod_product_category_1_confRate',
       'user_group_id_gender_confRate', 'user_group_id_age_level_confRate',
       'user_group_id_user_depth_confRate', 'prod_age_level_confRate',
       'prod_user_depth_confRate', 'product_category_1_age_level_confRate',
       'product_category_1_user_depth_confRate', 'prev_identical_',
       'future_identical_', 'prev_user_prods', 'future_user_prods',
       'user_id_prod_campaign_id_var_day',
       'user_id_prod_product_category_1_var_hour',
       'user_id_day_campaign_id_var_hour',
       'user_id_day_hour_count_campaign_id', 'user_id_prod_count_campaign_id',
       'user_id_prod_webpage_id_count_campaign_id',
       'user_id_prod_day_hour_co

In [16]:
cat_vars = ['user_id', 'prod', 'campaign_id','webpage_id', 'product_category_1','product_category_2',
            'user_group_id', 'gender', 'age_level', 'user_depth','city_development_index', 'var_1',
            'day','hour','minute', 'dow']

contin_vars = ['prod_campaign_id_confRate',
       'prod_webpage_id_confRate', 'prod_product_category_1_confRate',
       'user_group_id_gender_confRate', 'user_group_id_age_level_confRate',
       'user_group_id_user_depth_confRate', 'prod_age_level_confRate',
       'prod_user_depth_confRate', 'product_category_1_age_level_confRate',
       'product_category_1_user_depth_confRate', 'prev_identical_',
       'future_identical_', 'prev_user_prods', 'future_user_prods',
       'user_id_prod_campaign_id_var_day',
       'user_id_prod_product_category_1_var_hour',
       'user_id_day_campaign_id_var_hour',
       'user_id_day_hour_count_campaign_id', 'user_id_prod_count_campaign_id',
       'user_id_prod_webpage_id_count_campaign_id',
       'user_id_prod_day_hour_count_campaign_id',
       'user_id_prod_campaign_id_mean_hour', 'prod_AvgprodPerDistinct_user_id',
       'prod_count_campaign_id', 'campaign_id_count_prod',
       'user_id_nunique_campaign_id', 'user_id_nunique_prod',
       'user_id_day_nunique_hour', 'user_id_prod_nunique_webpage_id',
       'user_id_nunique_product_category_1', 'prod_nunique_campaign_id',
       'user_id_product_category_1_webpage_id_nunique_prod',
       'user_id_product_category_1_webpage_id_cumcount_prod',
       'user_id_cumcount_prod', 'user_id_cumcount_webpage_id']

for v in cat_vars: 
    merge[v] = merge[v].astype('category')

for v in contin_vars: 
    merge[v] = merge[v].astype('float32')

In [17]:
lb = LabelEncoder()
for v in cat_vars:
    merge[v] = lb.fit_transform(merge[v])
print(merge.shape)

(592149, 51)


In [18]:
train = merge[:nrow_train]
test = merge[nrow_train:]
print(train.shape, test.shape)

(463291, 51) (128858, 51)


# FInding best

In [129]:
#params = {'eta': 0.005,'tree_method': "hist",'grow_policy': "lossguide",'subsample': 0.8,
#          'colsample_bytree': 0.8, 'colsample_bylevel':0.7,'objective': 'binary:logistic', 
#          'eval_metric': 'auc', 'nthread':10,'random_state': 42, 'silent': True, 'max_depth':6,
#          'scale_pos_weight':13, 'gamma': 5, 'lambda': 50, 'alpha':70  }

In [19]:
"""
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=42)
dtrain = xgb.DMatrix(x1, y1)
dvalid = xgb.DMatrix(x2, y2)

del x1, y1, x2, y2 
gc.collect()
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params, dtrain, 10000, watchlist, maximize=True, early_stopping_rounds = 50, verbose_eval=500)
"""

"\nx1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=42)\ndtrain = xgb.DMatrix(x1, y1)\ndvalid = xgb.DMatrix(x2, y2)\n\ndel x1, y1, x2, y2 \ngc.collect()\nwatchlist = [(dtrain, 'train'), (dvalid, 'valid')]\nmodel = xgb.train(params, dtrain, 10000, watchlist, maximize=True, early_stopping_rounds = 50, verbose_eval=500)\n"

In [20]:
#dtest = xgb.DMatrix(test)
#sub['is_click'] = model.predict(dtest, ntree_limit=model.best_ntree_limit)
#sub.head()

In [21]:
#sub.to_csv('more_xgb_7.csv', float_format='%.8f', index=False) #subsample': 0.9 - LB - 0.626989521070428

# Apply K fold XGB

In [22]:
target_train = y.values

train = np.array(train)
test = np.array(test)

xgb_preds = []

In [23]:
print(train.shape, test.shape)

(463291, 51) (128858, 51)


In [24]:
K = 5
kf = KFold(n_splits = K, random_state = 2018, shuffle = True)

In [25]:
params = {'eta': 0.005,'tree_method': "hist",'grow_policy': "lossguide",'subsample': 0.8,
          'colsample_bytree': 0.8, 'colsample_bylevel':0.7,'objective': 'binary:logistic', 
          'eval_metric': 'auc', 'nthread':10,'random_state': 42, 'silent': True, 'max_depth':6,
          'scale_pos_weight':13, 'gamma': 5, 'lambda': 50, 'alpha':20}

In [26]:
start = time()

for train_index, test_index in kf.split(train):
    train_X, valid_X = train[train_index], train[test_index]
    train_y, valid_y = target_train[train_index], target_train[test_index]

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    d_test = xgb.DMatrix(test)
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(params, d_train, 10000, watchlist, maximize=True, verbose_eval=500, early_stopping_rounds=50)
                        
    xgb_pred = model.predict(d_test)
    xgb_preds.append(list(xgb_pred))

end = time()
print ('Time taken is:', end-start)

[22:02:40] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.623044	valid-auc:0.613185
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[500]	train-auc:0.657269	valid-auc:0.639317
[1000]	train-auc:0.671525	valid-auc:0.643225
[1500]	train-auc:0.682919	valid-auc:0.644983
[2000]	train-auc:0.693489	valid-auc:0.64604
[2500]	train-auc:0.702986	valid-auc:0.646798
Stopping. Best iteration:
[2887]	train-auc:0.71024	valid-auc:0.647171

[22:05:23] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.62671	valid-auc:0.616717
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[500]	train-auc:0.658562	valid-auc:0.635702
[1000]	train-auc:0.672966	valid-auc:0.639259
[1500]	train-auc:0.68419	valid-auc:0.640691
[2000]	t

In [27]:
preds=[]
for i in range(len(xgb_preds[0])):
    sum=0
    for j in range(K):
        sum+=xgb_preds[j][i]
    preds.append(sum / K)

In [28]:
sub['is_click']=preds

In [29]:
sub.head()

Unnamed: 0,session_id,is_click
0,411705,0.639891
1,208263,0.22904
2,239450,0.174025
3,547761,0.364239
4,574275,0.560393


In [30]:
sub.to_csv('xgb_5fold_v2.csv', float_format='%.8f', index=False)