In [13]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os

In [14]:
def do_count( df, group_cols, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Aggregating by ", group_cols , '...' )
    gp = df[group_cols][group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

def do_countuniq( df, group_cols, counted, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Counting unqiue ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )
    
def do_cumcount( df, group_cols, counted, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Cumulative count by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
    df[agg_name]=gp.values
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

def do_mean( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating mean of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].mean().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

def do_var( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating variance of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].var().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )

In [15]:
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                 feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric':metrics,
        'learning_rate': 0.2,
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 10,
        'verbose': 0,
        'metric':metrics
    }

    lgb_params.update(params)
    ################################################################################################################
    
    
    
    
    #################################################################################################################

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    #xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
    #                      feature_name=predictors,
    #                       categorical_feature=categorical_features
    #                      )

    evals_results = {}
    ########################################################################################################################
    
    from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
    from hyperopt.fmin import fmin
    from sklearn.metrics import roc_auc_score
    # objective function to optimize; loss is auroc
    

    def objective(params):

        bst = lgb.train(params, 
                         xgtrain, 
                         valid_sets= [xgtrain],  
                         num_boost_round=200,
                         #early_stopping_rounds=30,
                         verbose_eval=20)
    
        pred = bst.predict(dvalid[predictors])
        auc = roc_auc_score(dvalid[target], pred)

        del bst, pred
        gc.collect()
        print('**********************************************************')
        print(params)
        print("SCORE ............. : ",auc)
        print('**********************************************************')
        
        return { 'loss': 1-auc, 'status': STATUS_OK }

    space = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric':'auc',
            'learning_rate': 0.2,
            'num_leaves': hp.choice('num_leaves', np.arange(7, 500,10, dtype=int)),
            'max_depth': hp.choice('max_depth', np.arange(3, 12, dtype=int)),
            'min_child_samples': hp.choice('min_child_samples', np.arange(10, 500,50, dtype=int)),
            'max_bin': hp.choice('max_bin', np.arange(100, 5000,50, dtype=int)),
            #"drop_rate": 0.2,
            'subsample': 0.7,
            'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
            'colsample_bytree': hp.quniform('colsample_bytree', 0.2, 1, 0.1),
            'min_child_weight': hp.quniform('min_child_weight', 1, 100, 1),  # Minimum sum of instance weight(hessian) needed in a child(leaf)
            'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
            'nthread': 10,
            'verbose': 0,
            'scale_pos_weight': hp.choice('scale_pos_weight', np.arange(100, 500,10, dtype=int)),
            }

    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=300, # WARNING: increase number of evaluations (it's small for the sake of example)
        trials=trials
        )

    # best hyperparameters
    print("\n\n\n The best hyperparameters:")
    print(best)
    #########################################################################################################################
    return 


In [16]:
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32',
        }

print('loading train data...')
train_df = pd.read_csv("./train.csv.zip",compression='zip',skiprows=range(1,40000000) ,parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
#######################################################



#######################################################
print('loading test data...')

test_df = pd.read_csv("./test.csv.zip", compression='zip',parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])

len_train = len(train_df)
train_df=train_df.append(test_df)
del test_df
gc.collect()

loading train data...
loading test data...


14

In [17]:
ATTRIBUTION_CATEGORIES = [        
    # V1 Features #
    ###############
    ['app'],
    
    # V2 Features #
    ###############
    ['app', 'channel'],
    ['app', 'os'],
    
    # V3 Features #
    ###############
    ['os', 'device'],

]
# Find frequency of is_attributed for each unique value in column
freqs = {}
for cols in ATTRIBUTION_CATEGORIES:
    
    # New feature name
    new_feature = '_'.join(cols)+'_confRate'    
    
    # Perform the groupby
    group_object = train_df[:len_train].groupby(cols)
    
    # Group sizes    
    group_sizes = group_object.size()
    log_group = np.log(1000000) # 1000 views -> 60% confidence, 100 views -> 40% confidence 
    print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature, 
        group_sizes.max(), 
        np.round(group_sizes.mean(), 2),
        np.round(group_sizes.median(), 2),
        group_sizes.min()
    ))
    
    # Aggregation function
    def rate_calculation(x):
        """Calculate the attributed rate. Scale by confidence"""
        rate = x.sum() / float(x.count())
        conf = np.min([1, np.log(x.count()) / log_group])
        return rate * conf
    
    # Perform the merge
    train_df = train_df.merge(
        group_object['is_attributed']. \
            apply(rate_calculation). \
            reset_index(). \
            rename( 
                index=str,
                columns={'is_attributed': new_feature}
            )[cols + [new_feature]],
        on=cols, how='left'
    )
    
train_df.head()

>> Calculating confidence-weighted rate for: ['app'].
   Saving to: app_confRate. Group Max /Mean / Median / Min: 26562016 / 219219.2 / 29.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'channel'].
   Saving to: app_channel_confRate. Group Max /Mean / Median / Min: 11318288 / 108705.09 / 75.0 / 1
>> Calculating confidence-weighted rate for: ['app', 'device'].
   Saving to: app_device_confRate. Group Max /Mean / Median / Min: 25583655 / 13395.94 / 2.0 / 1


Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os,app_confRate,app_channel_confRate,app_device_confRate
0,13,477,,2017-11-07 09:39:40,1,147296,0.0,18,0.000161,0.000168,0.000165
1,8,145,,2017-11-07 09:39:40,1,48281,0.0,19,0.001768,0.001661,0.001951
2,27,153,,2017-11-07 09:39:40,1,5348,0.0,47,0.001729,0.000338,0.001738
3,26,121,,2017-11-07 09:39:40,1,118229,0.0,18,0.000417,0.00035,0.000418
4,2,219,,2017-11-07 09:39:40,1,67836,0.0,6,0.000244,0.000309,0.000255


In [18]:
print('Extracting new features...')

train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8');gc.collect()
train_df['min'] = pd.to_datetime(train_df.click_time).dt.minute.astype('uint8');gc.collect()
train_df['sec'] = pd.to_datetime(train_df.click_time).dt.second.astype('uint8');gc.collect()
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8');gc.collect()

train_df['click_time'] = (train_df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
train_df['nextClick'] = (train_df.groupby(['ip', 'app', 'device', 'os']).click_time.shift(-1) - train_df.click_time).astype(np.float32)
#train_df['nextClick_2'] = (train_df.groupby(['device', 'os','channel','app']).click_time.shift(-1) - train_df.click_time).astype(np.float32)
train_df.drop(['click_time'],axis=1,inplace=True)
#train_df = do_cumcount( train_df, ['ip', 'device', 'os'], 'channel', 'V_1', show_max=True ); gc.collect()
train_df = do_cumcount( train_df, ['ip', 'device', 'os'], 'app', 'V_2', show_max=True ); gc.collect()
# to do ip channel count unique
train_df = do_countuniq( train_df, ['app'], 'channel', 'V_3', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['channel'], 'app', 'V_4', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['day','hour','channel'], 'app', 'V_5', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['day','hour','app'], 'channel', 'V_6', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'app', 'V_7', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'channel', 'V_15', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['ip', 'app'], 'os', 'V_8', 'uint8', show_max=True ); gc.collect()
#train_df = do_countuniq( train_df, ['ip'], 'device', 'V_9', 'uint16', show_max=True ); gc.collect()

train_df = do_count( train_df, ['ip', 'day', 'hour'], 'V_10', show_max=True ); gc.collect()
train_df = do_count( train_df, ['app', 'day', 'hour'], 'V_18', show_max=True ); gc.collect()
train_df = do_count( train_df, ['channel', 'day', 'hour'], 'V_19', show_max=True ); gc.collect()
train_df = do_count( train_df, ['os', 'day', 'hour'], 'V_20', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app'], 'V_11', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app', 'os'], 'V_12', 'uint16', show_max=True ); gc.collect()

train_df = do_var( train_df, ['day', 'hour','app'],'channel' , 'V_13', show_max=True ); gc.collect()
train_df = do_var( train_df, ['day', 'hour', 'channel'], 'app' , 'V_14', show_max=True ); gc.collect()
#train_df = do_var( train_df, ['day', 'hour', 'os'], 'app' , 'V_16', show_max=True ); gc.collect()
#train_df = do_var( train_df, ['day', 'hour', 'os'], 'channel' , 'V_17', show_max=True ); gc.collect()
train_df.drop(['ip'],axis=1,inplace=True)

Extracting new features...
('Cumulative count by ', ['ip', 'device', 'os'], '...')
('V_2 max value = ', 248448)
('Counting unqiue ', 'channel', ' by ', ['app'], '...')
('V_3 max value = ', 48)
('Counting unqiue ', 'app', ' by ', ['channel'], '...')
('V_4 max value = ', 289)
('Counting unqiue ', 'app', ' by ', ['ip'], '...')
('V_7 max value = ', 263)
('Counting unqiue ', 'channel', ' by ', ['ip'], '...')
('V_15 max value = ', 162)
('Aggregating by ', ['ip', 'day', 'hour'], '...')
('V_10 max value = ', 44259)
('Aggregating by ', ['app', 'day', 'hour'], '...')
('V_18 max value = ', 895281)
('Aggregating by ', ['channel', 'day', 'hour'], '...')
('V_19 max value = ', 526788)
('Aggregating by ', ['os', 'day', 'hour'], '...')
('V_20 max value = ', 984911)
('Aggregating by ', ['ip', 'app'], '...')
('V_11 max value = ', 192442)
('Aggregating by ', ['ip', 'app', 'os'], '...')
('V_12 max value = ', 46279)
('Calculating variance of ', 'channel', ' by ', ['day', 'hour', 'app'], '...')
('V_13 max va

In [19]:
train_df.head()

Unnamed: 0,app,channel,click_id,device,is_attributed,os,app_confRate,app_channel_confRate,app_device_confRate,hour,...,V_7,V_15,V_10,V_18,V_19,V_20,V_11,V_12,V_13,V_14
0,13,477,,1,0.0,18,0.000161,0.000168,0.000165,9,...,11,13,9,31514,32252,49232,1,1,1181.209595,29.850218
1,8,145,,1,0.0,19,0.001768,0.001661,0.001951,9,...,29,74,23,16197,13968,239249,12,4,1474.756592,1.926109
2,27,153,,1,0.0,47,0.001729,0.000338,0.001738,9,...,7,162,7047,7930,32683,8575,9025,204,293.065765,89.350601
3,26,121,,1,0.0,18,0.000417,0.00035,0.000418,9,...,139,142,346,15335,26217,49232,1247,42,5029.855469,14.620737
4,2,219,,1,0.0,6,0.000244,0.000309,0.000255,9,...,44,89,4,110713,14359,25976,138,2,19382.59375,27.856543


In [20]:
test_df = train_df[len_train:]
val_size=20000000
val_df = train_df[(len_train-val_size):len_train]
train_df = train_df[:(len_train-val_size)]
print("train size: ", len(train_df))
print("valid size: ", len(val_df))
print("test size : ", len(test_df))

sub = pd.DataFrame()
sub['click_id'] = test_df['click_id'].astype('int')

gc.collect()
target = 'is_attributed'
predictors = list(set(train_df.columns) - set(['is_attributed','click_id'])) 
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
print('predictors',predictors)

('train size: ', 124903891)
('valid size: ', 20000000)
('test size : ', 18790469)
('predictors', ['V_14', 'V_15', 'V_12', 'V_13', 'V_10', 'V_11', 'app', 'V_18', 'V_19', 'nextClick', 'app_confRate', 'app_channel_confRate', 'app_device_confRate', 'min', 'channel', 'device', 'V_20', 'day', 'V_4', 'V_7', 'V_2', 'hour', 'V_3', 'sec', 'os'])


In [None]:
#*****************************************************************************************************************************
#*****************************************************************************************************************************

In [None]:
print("Training...")
start_time = time.time()

params = {
    'learning_rate': 0.2,
    #'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 12,  # 2^max_depth - 1
    'max_depth': 5,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 100,  # Number of bucketed bin for feature values
    'subsample': 0.95,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.7,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'scale_pos_weight':130 # because training data is extremely unbalanced 
}
print("Training...")
start_time = time.time()

(bst,best_iteration) = lgb_modelfit_nocv(params, 
                        train_df, 
                        val_df, 
                        predictors, 
                        target, 
                        objective='binary', 
                        metrics='auc',
                        early_stopping_rounds=50, 
                        verbose_eval=True, 
                        num_boost_round=200, 
                        categorical_features=categorical)

print('[{}]: model training time'.format(time.time() - start_time))
del train_df
del val_df
gc.collect()



Training...
Training...
preparing validation datasets
[20]	training's auc: 0.986582
[40]	training's auc: 0.989092
[60]	training's auc: 0.990876
[80]	training's auc: 0.992147
[100]	training's auc: 0.993108
[120]	training's auc: 0.99384
[140]	training's auc: 0.994487
[160]	training's auc: 0.994975
[180]	training's auc: 0.995404
[200]	training's auc: 0.995786
**********************************************************
{'num_leaves': 317, 'subsample_freq': 1, 'verbose': 0, 'scale_pos_weight': 450, 'learning_rate': 0.2, 'metric': 'auc', 'boosting_type': 'gbdt', 'colsample_bytree': 0.9, 'min_child_samples': 60, 'nthread': 10, 'min_child_weight': 15.0, 'min_split_gain': 0, 'subsample': 0.7, 'max_bin': 255, 'objective': 'binary', 'max_depth': 11}
('SCORE ............. : ', 0.98389297404859088)
**********************************************************
[20]	training's auc: 0.980404
[40]	training's auc: 0.982697
[60]	training's auc: 0.983703
[80]	training's auc: 0.984272
[100]	training's auc: 0.9

[80]	training's auc: 0.984329
[100]	training's auc: 0.984655
[120]	training's auc: 0.984936
[140]	training's auc: 0.985202
[160]	training's auc: 0.985393
[180]	training's auc: 0.985617
[200]	training's auc: 0.985768
**********************************************************
{'num_leaves': 137, 'subsample_freq': 1, 'verbose': 0, 'scale_pos_weight': 410, 'learning_rate': 0.2, 'metric': 'auc', 'boosting_type': 'gbdt', 'colsample_bytree': 0.7000000000000001, 'min_child_samples': 110, 'nthread': 10, 'min_child_weight': 13.0, 'min_split_gain': 0, 'subsample': 0.7, 'max_bin': 3150, 'objective': 'binary', 'max_depth': 4}
('SCORE ............. : ', 0.9851875526898064)
**********************************************************
[20]	training's auc: 0.985307
[40]	training's auc: 0.98765
[60]	training's auc: 0.989091
[80]	training's auc: 0.990396
[100]	training's auc: 0.991295
[120]	training's auc: 0.992096
[140]	training's auc: 0.992806
[160]	training's auc: 0.993409
[180]	training's auc: 0.993963