In [8]:
import numpy as np
import pandas as pd 

x_train_reduced = pd.read_csv('x_train_reduced.csv')
x_test_reduced = pd.read_csv('x_test_reduced.csv')
y_train2 = pd.read_csv('y_train2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [10]:
x_train_reduced = x_train_reduced.loc[:, ~x_train_reduced.columns.str.contains('^Unnamed')]
x_test_reduced = x_test_reduced.loc[:, ~x_test_reduced.columns.str.contains('^Unnamed')]

In [11]:
y_train3 = y_train2.drop(labels=['fullVisitorId'], axis=1)

In [12]:
# Hyperparameter Tuning
# For higher accuracy, tuning max_bin, learning_rate, num_leaves, max_depth, min_data_in_leaf
from sklearn.model_selection import GridSearchCV

In [37]:
%%time
# GBDT
estimator = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', bagging_fraction = 0.7,
        feature_fraction = 0.7,
        bagging_frequency = 6,
        bagging_seed = 42,
        seed = 42)

param_grid = {
    'learning_rate': [0.0015, 0.0025, 0.005, 0.0075, 0.01],
    'num_leaves': [30, 40, 70, 100],
    'max_depth': [7, 10, 15, 20],
    'max_bin':[255, 300, 350],
    'min_data_in_leaf':[20, 50, 100]
    }

gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(x_train_reduced, y_train3)

print(gbm.best_params_)

{'learning_rate': 0.01, 'max_bin': 300, 'max_depth': 15, 'min_data_in_leaf': 50, 'num_leaves': 100}
CPU times: user 7d 9h 58min 35s, sys: 3h 49min 13s, total: 7d 13h 47min 49s
Wall time: 15h 37min 13s


In [58]:
%%time
# Dart
estimator1 = lgb.LGBMRegressor(boosting_type='dart', objective='regression', 
                               max_bin=300, min_data_in_leaf=50, max_depth=15,
                               bagging_fraction = 0.7, feature_fraction = 0.7, bagging_frequency = 6, 
                               bagging_seed = 42, seed = 42)

param_grid_dart = {
    'learning_rate': [0.0025, 0.005, 0.01],
    'num_leaves': [40, 70, 100],
    'max_drop':[30, 50, 70],
    'drop_rate':[0.05, 0.1, 0.2],
    'skip_drop':[0.3, 0.5, 0.7]
    }

gbm_dart = GridSearchCV(estimator1, param_grid_dart, cv=3)
gbm_dart.fit(x_train_reduced, y_train3)

print(gbm_dart.best_params_)

{'drop_rate': 0.05, 'learning_rate': 0.01, 'max_drop': 30, 'num_leaves': 100, 'skip_drop': 0.7}
CPU times: user 3d 21min 35s, sys: 1h 17min 33s, total: 3d 1h 39min 8s
Wall time: 5h 53min 43s


In [None]:
# Dart performance is no better than gbdt

In [65]:
%%time
# Further increase learning_rate and num_leaves, since other parameters fall in middle of the value interval 
# GBDT
estimator2 = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', bagging_fraction = 0.7,
        feature_fraction = 0.7,
        bagging_frequency = 6,
        bagging_seed = 42,
        seed = 42,
        max_bin=300,
        max_depth=15,
        min_data_in_leaf=50)

param_grid2 = {
    'learning_rate': [0.01, 0.015, 0.02, 0.03],
    'num_leaves': [100, 150, 200, 250]
    }

gbm2 = GridSearchCV(estimator2, param_grid2, cv=3)
gbm2.fit(x_train_reduced, y_train3)

print(gbm2.best_params_)

{'learning_rate': 0.03, 'num_leaves': 250}
CPU times: user 19h 57min 36s, sys: 13min 29s, total: 20h 11min 5s
Wall time: 1h 12min 13s


In [70]:
%%time
# Continue increase learning_rate and num_leaves 
# GBDT
estimator3 = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', bagging_fraction = 0.7,
        feature_fraction = 0.7,
        bagging_frequency = 6,
        bagging_seed = 42,
        seed = 42,
        max_bin=300,
        max_depth=15)

param_grid3 = {
    'learning_rate': [0.03, 0.05, 0.1, 0.15],
    'num_leaves': [250, 350, 500, 750],
    'min_data_in_leaf': [20, 50, 100, 200]
    }

gbm3 = GridSearchCV(estimator3, param_grid3, cv=3)
gbm3.fit(x_train_reduced, y_train3)

print(gbm3.best_params_)

{'learning_rate': 0.1, 'min_data_in_leaf': 50, 'num_leaves': 500}
CPU times: user 3d 3h 6min 53s, sys: 1h 2min 5s, total: 3d 4h 8min 59s
Wall time: 4h 25min 47s


In [97]:
%%time
# Final fine-tuning
estimator4 = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', bagging_fraction = 0.7,
        feature_fraction = 0.7,
        bagging_frequency = 6,
        bagging_seed = 42,
        seed = 42,
        max_bin=300,
        max_depth=15,
        min_data_in_leaf=50)

param_grid4 = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12],
    'num_leaves': [250, 280, 310, 340, 370, 400, 430, 460, 500, 530]
    }

gbm4 = GridSearchCV(estimator4, param_grid4, cv=3)
gbm4.fit(x_train_reduced, y_train3)

print(gbm4.best_params_)

{'learning_rate': 0.1, 'num_leaves': 500}
CPU times: user 2d 10h 54min 54s, sys: 1h 27min 54s, total: 2d 12h 22min 49s
Wall time: 3h 49min 57s


In [28]:
%%time
num_round = 7000
params = {"objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 500,
        "max_depth" : 15,
        "boosting" : "gbdt",
        "min_data_in_leaf": 100,
        "max_bin": 300,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "seed": 42}

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 14.5 µs


In [29]:
#Output

x_train, x_val, y_train, y_val = train_test_split(x_train_reduced, y_train3, test_size=0.1, random_state=1)
lgb_train_data = lgb.Dataset(x_train, label=y_train)
lgb_val_data = lgb.Dataset(x_val, label=y_val)
rmse_val = lgb.train(params, lgb_train_data, num_round, early_stopping_rounds=20, 
                         valid_sets=[lgb_train_data, lgb_val_data])
y_pred_train = rmse_val.predict(x_train, num_iteration=rmse_val.best_iteration)
y_pred_val = rmse_val.predict(x_val, num_iteration=rmse_val.best_iteration)
rmse_v = np.sqrt(mean_squared_error(y_val, y_pred_val))
rmse_t = np.sqrt(mean_squared_error(y_train, y_pred_train))

[1]	training's rmse: 1.74361	valid_1's rmse: 1.74545
Training until validation scores don't improve for 20 rounds.
[2]	training's rmse: 1.6246	valid_1's rmse: 1.63171
[3]	training's rmse: 1.52284	valid_1's rmse: 1.5338
[4]	training's rmse: 1.42686	valid_1's rmse: 1.44259
[5]	training's rmse: 1.34216	valid_1's rmse: 1.36308
[6]	training's rmse: 1.26613	valid_1's rmse: 1.29258
[7]	training's rmse: 1.19952	valid_1's rmse: 1.23182
[8]	training's rmse: 1.13944	valid_1's rmse: 1.17764
[9]	training's rmse: 1.08688	valid_1's rmse: 1.13163
[10]	training's rmse: 1.03842	valid_1's rmse: 1.08837
[11]	training's rmse: 0.996421	valid_1's rmse: 1.05178
[12]	training's rmse: 0.959042	valid_1's rmse: 1.02049
[13]	training's rmse: 0.92424	valid_1's rmse: 0.991022
[14]	training's rmse: 0.894705	valid_1's rmse: 0.967262
[15]	training's rmse: 0.866771	valid_1's rmse: 0.945768
[16]	training's rmse: 0.841143	valid_1's rmse: 0.925859
[17]	training's rmse: 0.818196	valid_1's rmse: 0.907417
[18]	training's rmse

[148]	training's rmse: 0.305287	valid_1's rmse: 0.666592
[149]	training's rmse: 0.304216	valid_1's rmse: 0.666397
[150]	training's rmse: 0.303082	valid_1's rmse: 0.666185
[151]	training's rmse: 0.302503	valid_1's rmse: 0.666079
[152]	training's rmse: 0.301262	valid_1's rmse: 0.665978
[153]	training's rmse: 0.299893	valid_1's rmse: 0.665703
[154]	training's rmse: 0.298392	valid_1's rmse: 0.6654
[155]	training's rmse: 0.297384	valid_1's rmse: 0.665123
[156]	training's rmse: 0.296163	valid_1's rmse: 0.664848
[157]	training's rmse: 0.294893	valid_1's rmse: 0.664701
[158]	training's rmse: 0.293443	valid_1's rmse: 0.664351
[159]	training's rmse: 0.29223	valid_1's rmse: 0.66404
[160]	training's rmse: 0.291264	valid_1's rmse: 0.663998
[161]	training's rmse: 0.28965	valid_1's rmse: 0.663625
[162]	training's rmse: 0.288731	valid_1's rmse: 0.66328
[163]	training's rmse: 0.287684	valid_1's rmse: 0.663091
[164]	training's rmse: 0.285989	valid_1's rmse: 0.662768
[165]	training's rmse: 0.28532	valid_

[293]	training's rmse: 0.185975	valid_1's rmse: 0.645429
[294]	training's rmse: 0.185308	valid_1's rmse: 0.645335
[295]	training's rmse: 0.185127	valid_1's rmse: 0.645311
[296]	training's rmse: 0.184681	valid_1's rmse: 0.645142
[297]	training's rmse: 0.184323	valid_1's rmse: 0.645071
[298]	training's rmse: 0.183495	valid_1's rmse: 0.644954
[299]	training's rmse: 0.183145	valid_1's rmse: 0.644873
[300]	training's rmse: 0.182786	valid_1's rmse: 0.644792
[301]	training's rmse: 0.182191	valid_1's rmse: 0.644682
[302]	training's rmse: 0.181558	valid_1's rmse: 0.644591
[303]	training's rmse: 0.181238	valid_1's rmse: 0.644522
[304]	training's rmse: 0.180585	valid_1's rmse: 0.644486
[305]	training's rmse: 0.180183	valid_1's rmse: 0.644411
[306]	training's rmse: 0.179891	valid_1's rmse: 0.644428
[307]	training's rmse: 0.179346	valid_1's rmse: 0.644438
[308]	training's rmse: 0.178788	valid_1's rmse: 0.644317
[309]	training's rmse: 0.178434	valid_1's rmse: 0.644213
[310]	training's rmse: 0.177858

[439]	training's rmse: 0.126353	valid_1's rmse: 0.636969
[440]	training's rmse: 0.126008	valid_1's rmse: 0.636894
[441]	training's rmse: 0.125773	valid_1's rmse: 0.636868
[442]	training's rmse: 0.125534	valid_1's rmse: 0.63684
[443]	training's rmse: 0.125293	valid_1's rmse: 0.636795
[444]	training's rmse: 0.124959	valid_1's rmse: 0.636767
[445]	training's rmse: 0.124759	valid_1's rmse: 0.636766
[446]	training's rmse: 0.124478	valid_1's rmse: 0.636732
[447]	training's rmse: 0.124263	valid_1's rmse: 0.636712
[448]	training's rmse: 0.124131	valid_1's rmse: 0.636692
[449]	training's rmse: 0.123657	valid_1's rmse: 0.636601
[450]	training's rmse: 0.12351	valid_1's rmse: 0.636582
[451]	training's rmse: 0.123094	valid_1's rmse: 0.636531
[452]	training's rmse: 0.122792	valid_1's rmse: 0.63649
[453]	training's rmse: 0.122415	valid_1's rmse: 0.636437
[454]	training's rmse: 0.122107	valid_1's rmse: 0.636379
[455]	training's rmse: 0.121762	valid_1's rmse: 0.636341
[456]	training's rmse: 0.121453	va

[584]	training's rmse: 0.0913997	valid_1's rmse: 0.632721
[585]	training's rmse: 0.0911527	valid_1's rmse: 0.632722
[586]	training's rmse: 0.091092	valid_1's rmse: 0.632725
[587]	training's rmse: 0.0908627	valid_1's rmse: 0.632683
[588]	training's rmse: 0.0906124	valid_1's rmse: 0.632651
[589]	training's rmse: 0.0904244	valid_1's rmse: 0.63265
[590]	training's rmse: 0.0903315	valid_1's rmse: 0.632609
[591]	training's rmse: 0.090116	valid_1's rmse: 0.632582
[592]	training's rmse: 0.0899129	valid_1's rmse: 0.632604
[593]	training's rmse: 0.0897307	valid_1's rmse: 0.632561
[594]	training's rmse: 0.0894875	valid_1's rmse: 0.632524
[595]	training's rmse: 0.089337	valid_1's rmse: 0.632514
[596]	training's rmse: 0.0891192	valid_1's rmse: 0.632491
[597]	training's rmse: 0.0888843	valid_1's rmse: 0.632449
[598]	training's rmse: 0.0887567	valid_1's rmse: 0.632446
[599]	training's rmse: 0.0885813	valid_1's rmse: 0.632411
[600]	training's rmse: 0.0883601	valid_1's rmse: 0.632379
[601]	training's r

[726]	training's rmse: 0.07012	valid_1's rmse: 0.630798
[727]	training's rmse: 0.0700566	valid_1's rmse: 0.630786
[728]	training's rmse: 0.0699012	valid_1's rmse: 0.630761
[729]	training's rmse: 0.0697802	valid_1's rmse: 0.630748
[730]	training's rmse: 0.0696102	valid_1's rmse: 0.630716
[731]	training's rmse: 0.0694943	valid_1's rmse: 0.630721
[732]	training's rmse: 0.0693503	valid_1's rmse: 0.630718
[733]	training's rmse: 0.0692184	valid_1's rmse: 0.630703
[734]	training's rmse: 0.0690591	valid_1's rmse: 0.630679
[735]	training's rmse: 0.068914	valid_1's rmse: 0.630656
[736]	training's rmse: 0.0687691	valid_1's rmse: 0.630647
[737]	training's rmse: 0.0686208	valid_1's rmse: 0.630624
[738]	training's rmse: 0.0685145	valid_1's rmse: 0.63061
[739]	training's rmse: 0.0683628	valid_1's rmse: 0.630596
[740]	training's rmse: 0.0682342	valid_1's rmse: 0.630597
[741]	training's rmse: 0.0680876	valid_1's rmse: 0.630562
[742]	training's rmse: 0.068014	valid_1's rmse: 0.630563
[743]	training's rm

[868]	training's rmse: 0.0556295	valid_1's rmse: 0.629586
[869]	training's rmse: 0.0555462	valid_1's rmse: 0.629599
[870]	training's rmse: 0.0554897	valid_1's rmse: 0.629593
[871]	training's rmse: 0.0554057	valid_1's rmse: 0.629601
[872]	training's rmse: 0.0553426	valid_1's rmse: 0.629591
[873]	training's rmse: 0.055258	valid_1's rmse: 0.629603
[874]	training's rmse: 0.0551826	valid_1's rmse: 0.629602
[875]	training's rmse: 0.0550864	valid_1's rmse: 0.629606
[876]	training's rmse: 0.055015	valid_1's rmse: 0.629595
[877]	training's rmse: 0.0549351	valid_1's rmse: 0.629582
[878]	training's rmse: 0.0548545	valid_1's rmse: 0.629587
[879]	training's rmse: 0.0547555	valid_1's rmse: 0.629588
[880]	training's rmse: 0.0547073	valid_1's rmse: 0.629581
[881]	training's rmse: 0.0546173	valid_1's rmse: 0.629563
[882]	training's rmse: 0.0545618	valid_1's rmse: 0.629566
[883]	training's rmse: 0.0545171	valid_1's rmse: 0.629565
[884]	training's rmse: 0.0544589	valid_1's rmse: 0.62956
[885]	training's 

[1011]	training's rmse: 0.0460831	valid_1's rmse: 0.629019
[1012]	training's rmse: 0.0460232	valid_1's rmse: 0.629009
[1013]	training's rmse: 0.0459615	valid_1's rmse: 0.629008
[1014]	training's rmse: 0.0459297	valid_1's rmse: 0.629007
[1015]	training's rmse: 0.045884	valid_1's rmse: 0.629004
[1016]	training's rmse: 0.0458281	valid_1's rmse: 0.629007
[1017]	training's rmse: 0.0457693	valid_1's rmse: 0.629005
[1018]	training's rmse: 0.0457098	valid_1's rmse: 0.62901
[1019]	training's rmse: 0.0456239	valid_1's rmse: 0.628997
[1020]	training's rmse: 0.0455705	valid_1's rmse: 0.628998
[1021]	training's rmse: 0.0455302	valid_1's rmse: 0.628995
[1022]	training's rmse: 0.0454708	valid_1's rmse: 0.62899
[1023]	training's rmse: 0.0454176	valid_1's rmse: 0.628994
[1024]	training's rmse: 0.0453568	valid_1's rmse: 0.629005
[1025]	training's rmse: 0.0452872	valid_1's rmse: 0.629019
[1026]	training's rmse: 0.0452215	valid_1's rmse: 0.629008
[1027]	training's rmse: 0.0451593	valid_1's rmse: 0.629004


[1152]	training's rmse: 0.0390279	valid_1's rmse: 0.62867
[1153]	training's rmse: 0.0389908	valid_1's rmse: 0.628668
[1154]	training's rmse: 0.0389447	valid_1's rmse: 0.628673
[1155]	training's rmse: 0.0388917	valid_1's rmse: 0.628666
[1156]	training's rmse: 0.0388584	valid_1's rmse: 0.628666
[1157]	training's rmse: 0.0387998	valid_1's rmse: 0.628661
[1158]	training's rmse: 0.0387683	valid_1's rmse: 0.628661
[1159]	training's rmse: 0.0387455	valid_1's rmse: 0.62866
[1160]	training's rmse: 0.0386948	valid_1's rmse: 0.628661
[1161]	training's rmse: 0.0386484	valid_1's rmse: 0.628659
[1162]	training's rmse: 0.0386027	valid_1's rmse: 0.628658
[1163]	training's rmse: 0.0385724	valid_1's rmse: 0.628657
[1164]	training's rmse: 0.0385399	valid_1's rmse: 0.628657
[1165]	training's rmse: 0.0384826	valid_1's rmse: 0.628653
[1166]	training's rmse: 0.0384253	valid_1's rmse: 0.628648
[1167]	training's rmse: 0.0383781	valid_1's rmse: 0.62865
[1168]	training's rmse: 0.0383259	valid_1's rmse: 0.62866
[

In [103]:
y_pred_submit = rmse_val.predict(x_test_reduced, num_iteration=rmse_val.best_iteration)

In [169]:
test_id = pd.read_csv('test_grouped_id.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [171]:
test_id = test_id.loc[:, ~test_id.columns.str.contains('^Unnamed')]

In [173]:
submission = pd.DataFrame({'fullVisitorId':test_id['fullVisitorId'],'PredictedLogTotalRevenue_perCustomer':y_pred_submit})
submission['fullVisitorId']= submission['fullVisitorId'].astype(str)
submission['PredictedLogTotalRevenue_perCustomer']=submission['PredictedLogTotalRevenue_perCustomer'].apply(lambda x: 0 if x<0 else x)

In [178]:
type(y_pred_submit)

numpy.ndarray

In [179]:
y_pred_submit.shape

(296530,)

In [175]:
submission.to_csv('submission_LGBM.csv')

In [176]:
rmse_v12

0.6106969969400216

In [177]:
rmse_t12

0.028971592404011762

In [91]:
rmse_v11

0.6106975016256623

In [92]:
rmse_t11

0.02897159240402142

In [88]:
rmse_v10

0.6110207603960199

In [89]:
rmse_t10

0.023738188273136025

In [85]:
rmse_t9

0.020294617196461973

In [86]:
rmse_v9

0.6273708900068489

In [73]:
rmse_v8

0.6607254141969766

In [74]:
rmse_t8

0.05903756234033532

In [68]:
rmse_v7

0.6581985771200763

In [69]:
rmse_t7

0.25507579206222797

In [61]:
rmse_t6

0.5168227779735912

In [62]:
rmse_v6

0.7167359094023436

In [55]:
rmse_v5

0.8671640628381363

In [56]:
rmse_t5

0.77272212017713

In [50]:
rmse_t4

0.7585244443135454

In [51]:
rmse_v4

0.8553213135732253

In [46]:
rmse_t3

0.5815554234060244

In [47]:
rmse_v3

0.7532258686645664

In [42]:
rmse_t2

0.5987741886405407

In [43]:
rmse_v2

0.7531989950098282

In [35]:
rmse_t1

0.9075212398674616

In [39]:
rmse_v1

0.9414817504736281