In [3]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

clicks = pd.read_csv('C://Users/hp/Downloads/train_sample.csv')

In [4]:
clicks.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,89489,3,1,13,379,11/6/2017 15:13,,0
1,204158,35,1,13,21,11/6/2017 15:41,11/7/2017 8:17,1
2,3437,6,1,13,459,11/6/2017 15:42,,0
3,167543,3,1,13,379,11/6/2017 15:56,,0
4,147509,3,1,13,379,11/6/2017 15:57,,0


In [5]:
def get_data_splits(dataframe, valid_fraction=0.1):
    """Splits a dataframe into train, validation, and test sets.

    First, orders by the column 'click_time'. Set the size of the 
    validation and test sets with the valid_fraction keyword argument.
    """

    dataframe = dataframe.sort_values('click_time')
    valid_rows = int(len(dataframe) * valid_fraction)
    train = dataframe[:-valid_rows * 2]
    # valid size == test size, last two sections of the data
    valid = dataframe[-valid_rows * 2:-valid_rows]
    test = dataframe[-valid_rows:]
    
    return train, valid, test

def train_model(train, valid, test=None, feature_cols=None):
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time',
                                           'is_attributed'])
    dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
    
    param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
    num_round = 1000
    bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], 
                    early_stopping_rounds=20, verbose_eval=False)
    
    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
    print(f"Validation AUC score: {valid_score}")
    
    if test is not None: 
        test_pred = bst.predict(test[feature_cols])
        test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
        return bst, valid_score, test_score
    else:
        return bst, valid_score

In [6]:
print("Baseline model")
train, valid, test = get_data_splits(clicks)
tm = train_model(train, valid)

Baseline model
[LightGBM] [Info] Number of positive: 165464, number of negative: 673397
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 925
[LightGBM] [Info] Number of data points in the train set: 838861, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197248 -> initscore=-1.403581
[LightGBM] [Info] Start training from score -1.403581
Validation AUC score: 0.9687875635640151


# Count encoding

In [35]:
import category_encoders as ce

cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

In [36]:
# Create the count encoder
count_enc = ce.CountEncoder(cols=cat_features)

# Learn encoding from the training set
count_encoded = count_enc.fit(train[cat_features])

# Apply encoding to the train and validation sets as new columns
# Make sure to add `_count` as a suffix to the new columns
train_encoded = train.join(count_enc.transform(train[cat_features]).add_suffix('_count'))
valid_encoded = valid.join(count_enc.transform(valid[cat_features]).add_suffix('_count'))


  elif pd.api.types.is_categorical(cols):


In [37]:
train_model(train_encoded, valid_encoded)

[LightGBM] [Info] Number of positive: 165464, number of negative: 673397
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1645
[LightGBM] [Info] Number of data points in the train set: 838861, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197248 -> initscore=-1.403581
[LightGBM] [Info] Start training from score -1.403581
Validation AUC score: 0.9704790959892806


(<lightgbm.basic.Booster at 0x1edb22ea460>, 0.9704790959892806)

In [38]:
train_encoded.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,ip_count,app_count,device_count,os_count,channel_count
0,89489,3,1,13,379,11/6/2017 15:13,,0,33,114111,746062,169949,12227
1,204158,35,1,13,21,11/6/2017 15:41,11/7/2017 8:17,1,4,27910,746062,169949,17240
2,3437,6,1,13,459,11/6/2017 15:42,,0,49,9745,746062,169949,15794
3,167543,3,1,13,379,11/6/2017 15:56,,0,13,114111,746062,169949,12227
4,147509,3,1,13,379,11/6/2017 15:57,,0,21,114111,746062,169949,12227


In [39]:
valid_encoded.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,ip_count,app_count,device_count,os_count,channel_count
497186,60265,64,1,19,459,11/7/2017 9:31,,0,11.0,9670.0,746062.0,191943.0,15794.0
497185,77415,3,1,13,173,11/7/2017 9:31,,0,11.0,114111.0,746062.0,169949.0,5607.0
497193,155239,3,1,22,489,11/7/2017 9:31,,0,3.0,114111.0,746062.0,30079.0,9912.0
497188,190090,11,1,32,173,11/7/2017 9:31,,0,9.0,12724.0,746062.0,8112.0,5607.0
497179,18695,1,1,14,178,11/7/2017 9:31,,0,47.0,24214.0,746062.0,10581.0,21646.0


# only encoded column

In [42]:
clicks = clicks.sort_values('click_time')
valid_rows = int(len(clicks) * 0.1)
train = clicks[:-valid_rows * 2]
# valid size == test size, last two sections of the data
valid = clicks[-valid_rows * 2:-valid_rows]
test = clicks[-valid_rows:]

In [43]:
feature_cols = train_encoded.columns.drop(['click_time', 'attributed_time','is_attributed','ip','app','device','os','channel'])

In [46]:
dtrain = lgb.Dataset(train_encoded[feature_cols], label=train['is_attributed'])
dvalid = lgb.Dataset(valid_encoded[feature_cols], label=valid['is_attributed'])
param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
num_round = 1000
bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], 
                    early_stopping_rounds=20, verbose_eval=False)

[LightGBM] [Info] Number of positive: 165464, number of negative: 673397
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 723
[LightGBM] [Info] Number of data points in the train set: 838861, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197248 -> initscore=-1.403581
[LightGBM] [Info] Start training from score -1.403581


In [49]:
valid_pred = bst.predict(valid_encoded[feature_cols])
valid_score = metrics.roc_auc_score(valid_encoded['is_attributed'], valid_pred)
print(f"Validation AUC score: {valid_score}")

Validation AUC score: 0.9685221249439797


# Target Encoding

In [24]:
# Create the target encoder. You can find this easily by using tab completion.
# Start typing ce. the press Tab to bring up a list of classes and functions.
target_enc = ce.TargetEncoder(cols=cat_features)

# Learn encoding from the training set. Use the 'is_attributed' column as the target.
target_enc.fit(train[cat_features], train['is_attributed'])

# Apply encoding to the train and validation sets as new columns
# Make sure to add `_target` as a suffix to the new columns
train_encoded = train.join(target_enc.transform(train[cat_features]).add_suffix("_target"))
valid_encoded = valid.join(target_enc.transform(valid[cat_features]).add_suffix("_target"))

  elif pd.api.types.is_categorical(cols):


In [25]:
train_model(train_encoded, valid_encoded)

[LightGBM] [Info] Number of positive: 165466, number of negative: 673395
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1731
[LightGBM] [Info] Number of data points in the train set: 838861, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197251 -> initscore=-1.403566
[LightGBM] [Info] Start training from score -1.403566
Validation AUC score: 0.956267895907397


(<lightgbm.basic.Booster at 0x1edb22ead90>, 0.956267895907397)

In [27]:
train_encoded.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,ip_target,app_target,device_target,os_target,channel_target
0,89489,3,1,13,379,11/6/2017 15:13,,0,0.121212,0.03522,0.151948,0.14228,0.034841
1,204158,35,1,13,21,11/6/2017 15:41,11/7/2017 8:17,1,0.247498,0.996417,0.151948,0.14228,0.936427
2,3437,6,1,13,459,11/6/2017 15:42,,0,0.122449,0.009236,0.151948,0.14228,0.017348
3,167543,3,1,13,379,11/6/2017 15:56,,0,0.153846,0.03522,0.151948,0.14228,0.034841
4,147509,3,1,13,379,11/6/2017 15:57,,0,0.047619,0.03522,0.151948,0.14228,0.034841


In [28]:
valid_encoded.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,ip_target,app_target,device_target,os_target,channel_target
496793,107091,15,1,13,245,11/7/2017 9:31,,0,0.181818,0.020538,0.151948,0.14228,0.009086
496794,96806,9,1,17,466,11/7/2017 9:31,,0,0.095238,0.106438,0.151948,0.115283,0.095649
496806,22226,2,1,19,477,11/7/2017 9:31,,0,2.4e-05,0.029551,0.151948,0.158024,0.014581
496796,112681,3,1,13,173,11/7/2017 9:31,,0,0.363629,0.03522,0.151948,0.14228,0.057785
496788,120805,14,1,6,480,11/7/2017 9:31,,0,0.317112,0.026476,0.151948,0.124836,0.027851


In [32]:
#remove ip
# Remove IP from the encoded features
cat_features = ['app', 'device', 'os', 'channel']

# Create the CatBoost encoder
t_enc = ce.TargetEncoder(cols=cat_features)

# Learn encoding from the training set
tip_enc=t_enc.fit(train[cat_features],train['is_attributed'])

# Apply encoding to the train and validation sets as new columns
# Make sure to add `_cb` as a suffix to the new columns
train_encoded = train.join(tip_enc.transform(train[cat_features]).add_suffix("_cb"))
valid_encoded = valid.join(tip_enc.transform(valid[cat_features]).add_suffix("_cb"))

  elif pd.api.types.is_categorical(cols):


In [33]:
train_model(train_encoded, valid_encoded)

[LightGBM] [Info] Number of positive: 165466, number of negative: 673395
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1477
[LightGBM] [Info] Number of data points in the train set: 838861, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197251 -> initscore=-1.403566
[LightGBM] [Info] Start training from score -1.403566
Validation AUC score: 0.9695418550329439


(<lightgbm.basic.Booster at 0x1edb1dacbe0>, 0.9695418550329439)

In [29]:
# Remove IP from the encoded features
cat_features = ['app', 'device', 'os', 'channel']

# Create the CatBoost encoder
cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)

# Learn encoding from the training set
catboost_enc=cb_enc.fit(train[cat_features],train['is_attributed'])

# Apply encoding to the train and validation sets as new columns
# Make sure to add `_cb` as a suffix to the new columns
train_encoded = train.join(catboost_enc.transform(train[cat_features]).add_suffix("_cb"))
valid_encoded = valid.join(catboost_enc.transform(valid[cat_features]).add_suffix("_cb"))

  elif pd.api.types.is_categorical(cols):


In [30]:
train_model(train_encoded, valid_encoded)

[LightGBM] [Info] Number of positive: 165466, number of negative: 673395
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1527
[LightGBM] [Info] Number of data points in the train set: 838861, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197251 -> initscore=-1.403566
[LightGBM] [Info] Start training from score -1.403566
Validation AUC score: 0.9696329482948687


(<lightgbm.basic.Booster at 0x1edb1f5e0d0>, 0.9696329482948687)

# Best encoder in this case is Count Encoding 