In [24]:
import pandas as pd
import numpy as np
import copy
import sys
import datetime as dt
import os

from sklearn.model_selection import train_test_split

import modelling_utils as mu

import lightgbm as lgb

In [20]:
target_var = 'transactionRevenue'

model_name = 'lgb'

catVarsDict = {'channelGrouping' : 'BinaryEncoder',
               'browser': 'LabelEncoder',
               'operatingSystem': 'LabelEncoder',
               'deviceCategory': 'OneHot',
               'continent': 'BinaryEncoder',
               'subContinent': 'LabelEncoder',
               'country': 'LabelEncoder',
               'region': 'LabelEncoder',
               'metro': 'LabelEncoder',
               'city': 'LabelEncoder',
               'networkDomain': 'LabelEncoder',
               'campaign': 'LabelEncoder',
               'source': 'LabelEncoder',
               'medium': 'LabelEncoder',
               'sourceMedium': 'LabelEncoder'}

params_lgb = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42}

In [3]:
train  = pd.read_csv(os.getcwd() + '/train_set_processed.csv', index_col=0)
test = pd.read_csv(os.getcwd() + '/test_set_processed.csv', index_col=0)

In [4]:
train['set'] = 'train'
test['set'] = 'test'
# Target variable for the test set is created just for data processing purpose
test[target_var]= 0

In [5]:
all = pd.concat([train,test],axis=0)
print('\nAll Data shape: {} Rows, {} Columns'.format(*all.shape))


All Data shape: 90365 Rows, 31 Columns


In [7]:
print("Categorical variables: " + str(list(all.select_dtypes(include=['object']))))

Categorical variables: ['channelGrouping', 'date', 'sessionId', 'browser', 'operatingSystem', 'deviceCategory', 'continent', 'subContinent', 'country', 'region', 'metro', 'city', 'networkDomain', 'campaign', 'source', 'medium', 'sourceMedium', 'set']


In [8]:
all = mu.processingPreModelling(df = all, catVarsDict = catVarsDict)

In [31]:
all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90365 entries, 1 to 18186
Data columns (total 38 columns):
continent_1               90365 non-null int64
continent_2               90365 non-null int64
continent_3               90365 non-null int64
deviceCategory_desktop    90365 non-null int64
deviceCategory_mobile     90365 non-null int64
deviceCategory_tablet     90365 non-null int64
channelGrouping_0         90365 non-null int64
channelGrouping_1         90365 non-null int64
channelGrouping_2         90365 non-null int64
channelGrouping_3         90365 non-null int64
date                      90365 non-null datetime64[ns]
fullVisitorId             90365 non-null float64
sessionId                 90365 non-null object
visitId                   90365 non-null int64
visitNumber               90365 non-null int64
visitStartTime            90365 non-null int64
browser                   90365 non-null int64
operatingSystem           90365 non-null int64
isMobile                  90365 n

In [32]:
train_all = all.loc[all['set'] == 'train']
X = train_all
y = train_all[target_var]

In [33]:
X = X.drop([target_var, 'set', 'sessionId', 'date'], axis=1)

In [34]:
train_X, val_X, train_y, val_y = train_test_split(X, y,
                                                test_size=0.15, random_state=1)

In [35]:
test_X = all.loc[all['set'] == 'test']
test_X = test_X.drop([target_var, 'set', 'sessionId', 'date'], axis=1)

In [36]:
print('\n Training set shape: {} Rows, {} Columns'.format(*train_X.shape))
print('\n Validation set shape: {} Rows, {} Columns'.format(*val_X.shape))
print('\n Test set shape: {} Rows, {} Columns'.format(*test_X.shape))


 Training set shape: 61352 Rows, 34 Columns

 Validation set shape: 10827 Rows, 34 Columns

 Test set shape: 18186 Rows, 34 Columns


In [37]:
do_training(train_X, train_y, val_X, val_y, test_X, model_name, params = params_lgb)

Training until validation scores don't improve for 100 rounds.
[50]	training's rmse: 2.87837e+07	valid_1's rmse: 4.69223e+07
[100]	training's rmse: 2.79631e+07	valid_1's rmse: 4.66699e+07
[150]	training's rmse: 2.72719e+07	valid_1's rmse: 4.65068e+07
[200]	training's rmse: 2.66966e+07	valid_1's rmse: 4.64182e+07
[250]	training's rmse: 2.62205e+07	valid_1's rmse: 4.63938e+07
[300]	training's rmse: 2.57899e+07	valid_1's rmse: 4.63916e+07
[350]	training's rmse: 2.53575e+07	valid_1's rmse: 4.63613e+07
[400]	training's rmse: 2.49753e+07	valid_1's rmse: 4.63403e+07
[450]	training's rmse: 2.45879e+07	valid_1's rmse: 4.63781e+07
[500]	training's rmse: 2.42414e+07	valid_1's rmse: 4.64036e+07
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 2.42414e+07	valid_1's rmse: 4.64036e+07


<lightgbm.basic.Booster at 0x10deca160>

In [28]:
def do_training(train_X, train_y, val_X, val_y, test_X, model_name, params):
    
    if model_name == 'lgb':
        
        lgb_train = lgb.Dataset(train_X, label = train_y)
        lgb_val = lgb.Dataset(val_X, label = val_y)
        
        model = lgb.train(params, 
                          lgb_train, 
                          num_boost_round=500,
                          valid_sets=[lgb_train, lgb_val],
                          early_stopping_rounds=100,
                          verbose_eval=50)
    
    return model