In [1]:
import pandas as pd
import numpy as np
import copy
import sys
import datetime as dt
import os

from sklearn.model_selection import train_test_split

import modelling_utils as mu

import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
target_var = 'transactionRevenue'

model_name = 'lgb'

catVarsDict = {'channelGrouping' : 'BinaryEncoder',
               'browser': 'LabelEncoder',
               'operatingSystem': 'LabelEncoder',
               'deviceCategory': 'OneHot',
               'continent': 'BinaryEncoder',
               'subContinent': 'LabelEncoder',
               'country': 'LabelEncoder',
               'region': 'LabelEncoder',
               'metro': 'LabelEncoder',
               'city': 'LabelEncoder',
               'networkDomain': 'LabelEncoder',
               'campaign': 'LabelEncoder',
               'source': 'LabelEncoder',
               'medium': 'LabelEncoder',
               'sourceMedium': 'LabelEncoder'}

params_lgb = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42}

In [3]:
train  = pd.read_csv(os.getcwd() + '/train_set_processed.csv', index_col=0)
test = pd.read_csv(os.getcwd() + '/test_set_processed.csv', index_col=0)

In [5]:
train['set'] = 'train'
test['set'] = 'test'
# Target variable for the test set is created just for data processing purpose
test[target_var]= 0

In [6]:
all = pd.concat([train,test],axis=0)
print('\nAll Data shape: {} Rows, {} Columns'.format(*all.shape))


All Data shape: 90365 Rows, 31 Columns


In [7]:
print("Categorical variables: " + str(list(all.select_dtypes(include=['object']))))

Categorical variables: ['channelGrouping', 'date', 'sessionId', 'browser', 'operatingSystem', 'deviceCategory', 'continent', 'subContinent', 'country', 'region', 'metro', 'city', 'networkDomain', 'campaign', 'source', 'medium', 'sourceMedium', 'set']


In [8]:
all = mu.processingPreModelling(df = all, catVarsDict = catVarsDict)

In [9]:
all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90365 entries, 1 to 18186
Data columns (total 38 columns):
continent_1               90365 non-null int64
continent_2               90365 non-null int64
continent_3               90365 non-null int64
deviceCategory_desktop    90365 non-null int64
deviceCategory_mobile     90365 non-null int64
deviceCategory_tablet     90365 non-null int64
channelGrouping_0         90365 non-null int64
channelGrouping_1         90365 non-null int64
channelGrouping_2         90365 non-null int64
channelGrouping_3         90365 non-null int64
date                      90365 non-null datetime64[ns]
fullVisitorId             90365 non-null float64
sessionId                 90365 non-null object
visitId                   90365 non-null int64
visitNumber               90365 non-null int64
visitStartTime            90365 non-null int64
browser                   90365 non-null int64
operatingSystem           90365 non-null int64
isMobile                  90365 n

In [18]:
train_all = all.loc[all['set'] == 'train']
X = train_all
y = train_all[target_var]
y = y.apply(lambda x: np.log1p(x))

In [22]:
y.loc[y  > 0].head(n=10)

45     19.538736
120    19.849220
283    17.228637
340    19.108765
411    17.945061
500    16.453138
549    17.909021
672    16.873278
882    18.314987
903    19.886525
Name: transactionRevenue, dtype: float64

In [23]:
X = X.drop([target_var, 'set', 'sessionId', 'date'], axis=1)

In [24]:
train_X, val_X, train_y, val_y = train_test_split(X, y,
                                                test_size=0.15, random_state=1)

In [43]:
test_X_orig = all.loc[all['set'] == 'test']
test_X = test_X_orig.drop([target_var, 'set', 'sessionId', 'date'], axis=1)

In [26]:
print('\n Training set shape: {} Rows, {} Columns'.format(*train_X.shape))
print('\n Validation set shape: {} Rows, {} Columns'.format(*val_X.shape))
print('\n Test set shape: {} Rows, {} Columns'.format(*test_X.shape))


 Training set shape: 61352 Rows, 34 Columns

 Validation set shape: 10827 Rows, 34 Columns

 Test set shape: 18186 Rows, 34 Columns


In [38]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)

def do_training(train_X, train_y, val_X, val_y, model_name, params):
    
    if model_name == 'lgb':
        
        lgb_train = lgb.Dataset(train_X, label = train_y)
        lgb_val = lgb.Dataset(val_X, label = val_y)
        
        model = lgb.train(params, 
                          lgb_train, 
                          num_boost_round=500,
                          valid_sets=[lgb_train, lgb_val],
                          early_stopping_rounds=100,
                          verbose_eval=50)
        
        train_y_pred = model.predict(train_X, num_iteration = model.best_iteration)
        val_y_pred = model.predict(val_X, num_iteration = model.best_iteration)
        print(f"LGBM: RMSE val: {rmse(val_y, val_y_pred)}  - RMSE train: {rmse(train_y, train_y_pred)}")
        
    return model

In [39]:
model = do_training(train_X, train_y, val_X, val_y, model_name, params = params_lgb)

Training until validation scores don't improve for 100 rounds.
[50]	training's rmse: 1.8591	valid_1's rmse: 1.83711
[100]	training's rmse: 1.76691	valid_1's rmse: 1.76746
[150]	training's rmse: 1.69294	valid_1's rmse: 1.71765
[200]	training's rmse: 1.63628	valid_1's rmse: 1.68485
[250]	training's rmse: 1.59651	valid_1's rmse: 1.66661
[300]	training's rmse: 1.5609	valid_1's rmse: 1.65251
[350]	training's rmse: 1.52825	valid_1's rmse: 1.64377
[400]	training's rmse: 1.50109	valid_1's rmse: 1.63778
[450]	training's rmse: 1.47682	valid_1's rmse: 1.63372
[500]	training's rmse: 1.45483	valid_1's rmse: 1.63045
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 1.45483	valid_1's rmse: 1.63045
LGBM: RMSE val: 1.63045  - RMSE train: 1.45483


In [40]:
test_y_pred = model.predict(test_X, num_iteration=model.best_iteration)

In [42]:
test_y_pred[1:10]

array([0.00917878, 0.08503638, 0.00628249, 0.00508158, 0.07352799,
       0.00508158, 0.02012852, 0.02896378, 0.00454976])

In [48]:
prepare_submission(test_X_orig, test_y_pred, filename = 'submit.csv')

In [46]:
def prepare_submission(test_X, test_y_pred, filename = 'submit.csv'):
    
    submission = test_X[['fullVisitorId']].copy()
    submission.loc[:, 'PredictedLogRevenue'] = test_y_pred
    grouped_submission = submission[['fullVisitorId', 'PredictedLogRevenue']].groupby('fullVisitorId').sum().reset_index()
    grouped_submission.to_csv(filename,index=False)

In [49]:
submission = test_X_orig[['fullVisitorId']].copy()

In [50]:
submission.shape

(18186, 1)

In [51]:
len(test_y_pred)

18186