In [92]:

import pandas as pd
import numpy as np
import lightgbm as lgb
import sys
import math
import gc
import sklearn.metrics as skl_metrics
from sklearn.metrics import mean_squared_error

# import math
# import sklearn.metrics as skl_metrics
# from sklearn.metrics import mean_squared_error

from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

logger = getLogger(__name__)

pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../logs/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s]\
    [%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

2018-01-04 14:01:29,021 __main__ 35 [INFO]    [<module>] start 


In [62]:
train_out = pd.read_pickle('../data/storeitem_train_1s.p')
val_out = pd.read_pickle('../data/storeitem_val_1s.p')
X_test_out = pd.read_pickle('../data/storeitem_test_1s.p')

item_train_out = pd.read_pickle('../data/item_train_1s.p')
item_val_out = pd.read_pickle('../data/item_val_1s.p')
item_X_test_out = pd.read_pickle('../data/item_test_1s.p')

df_test = pd.read_csv(
    "../input/test_1s.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)


items = pd.read_csv(
    "../input/items.csv",
).set_index("item_nbr")
items = items.reindex(train_out.item_nbr)

items_val = pd.read_csv(
    "../input/items.csv",
).set_index("item_nbr")
items_val = items_val.reindex(val_out['item_nbr'])


In [63]:
all_columns = train_out.columns.tolist()

y_columns = ['day'+str(i) for i in range(1, 17)]
x_columns = [item for item in all_columns if item not in y_columns]


In [64]:
train_out.head(1)

Unnamed: 0,index,date,day_1_2017,item_nbr,mean_140_2017,mean_14_2017,mean_182_2017,mean_21_2017,mean_30_2017,mean_364_2017,...,day7,day8,day9,day10,day11,day12,day13,day14,day15,day16
0,0,2016-08-03,0.0,96995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
item_train_out.head(1)

Unnamed: 0,index,date,item_day_1_2017,item_mean_182_2017,item_mean_21_2017,item_mean_364_2017,item_mean_42_2017,item_mean_7_2017,item_mean_91_2017,item_nbr,...,item_dow_26_4_mean,item_dow_52_4_mean,item_dow_4_5_mean,item_dow_13_5_mean,item_dow_26_5_mean,item_dow_52_5_mean,item_dow_4_6_mean,item_dow_13_6_mean,item_dow_26_6_mean,item_dow_52_6_mean
0,0,2016-08-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
del train_out["index"]
del item_train_out["index"]

In [67]:
train_out = pd.merge(train_out, item_train_out, how='inner', on=['item_nbr','date'])
val_out = pd.merge(val_out, item_val_out, how='inner', on=['item_nbr','date'])
X_test_out = pd.merge(X_test_out, item_X_test_out, how='inner', on=['item_nbr','date'])


In [68]:
train_out.columns

Index(['date', 'day_1_2017', 'item_nbr', 'mean_140_2017', 'mean_14_2017',
       'mean_182_2017', 'mean_21_2017', 'mean_30_2017', 'mean_364_2017',
       'mean_3_2017',
       ...
       'item_dow_26_4_mean', 'item_dow_52_4_mean', 'item_dow_4_5_mean',
       'item_dow_13_5_mean', 'item_dow_26_5_mean', 'item_dow_52_5_mean',
       'item_dow_4_6_mean', 'item_dow_13_6_mean', 'item_dow_26_6_mean',
       'item_dow_52_6_mean'],
      dtype='object', length=178)

In [69]:
train_merge.shape

(46241, 194)

In [70]:
train_out.shape

(46241, 178)

In [71]:
item_train_out.shape

(46956, 37)

In [72]:
train_merge.head(2)

Unnamed: 0,date,day_1_2017,item_nbr,mean_140_2017,mean_14_2017,mean_182_2017,mean_21_2017,mean_30_2017,mean_364_2017,mean_3_2017,...,day7_y,day8_y,day9_y,day10_y,day11_y,day12_y,day13_y,day14_y,day15_y,day16_y
0,2016-08-03,0.0,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2016-08-03,0.0,99197,0.155837,0.0,0.119875,0.0,0.0,0.180301,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
all_columns = train_out.columns.tolist()

y_columns = ['day'+str(i) for i in range(1, 17)]
x_columns = [item for item in all_columns if item not in y_columns]

features_all = x_columns
features_all.remove("date") 
features_all.remove("item_nbr") 
features_all.remove("store_nbr") 


In [74]:

X_train_out = train_out[x_columns]

In [75]:
val_out.shape

(3557, 180)

In [76]:

X_val_out = val_out[x_columns]

In [77]:
x_columns

['day_1_2017',
 'mean_140_2017',
 'mean_14_2017',
 'mean_182_2017',
 'mean_21_2017',
 'mean_30_2017',
 'mean_364_2017',
 'mean_3_2017',
 'mean_42_2017',
 'mean_60_2017',
 'mean_7_2017',
 'mean_91_2017',
 'mean_ly_14_2017',
 'mean_ly_21_2017',
 'mean_ly_30_2017',
 'mean_ly_7_2017',
 'mean_ly_n16d_2017',
 'promo_140_2017',
 'promo_14_2017',
 'promo_60_2017',
 'ly_1d_d0',
 'l2y_1d_d0',
 'ly_1d_d1',
 'l2y_1d_d1',
 'ly_1d_d2',
 'l2y_1d_d2',
 'ly_1d_d3',
 'l2y_1d_d3',
 'ly_1d_d4',
 'l2y_1d_d4',
 'ly_1d_d5',
 'l2y_1d_d5',
 'ly_1d_d6',
 'l2y_1d_d6',
 'ly_1d_d7',
 'l2y_1d_d7',
 'ly_1d_d8',
 'l2y_1d_d8',
 'ly_1d_d9',
 'l2y_1d_d9',
 'ly_1d_d10',
 'l2y_1d_d10',
 'ly_1d_d11',
 'l2y_1d_d11',
 'ly_1d_d12',
 'l2y_1d_d12',
 'ly_1d_d13',
 'l2y_1d_d13',
 'ly_1d_d14',
 'l2y_1d_d14',
 'ly_1d_d15',
 'l2y_1d_d15',
 'dow_1_0_mean',
 'dow_4_0_mean',
 'dow_8_0_mean',
 'dow_13_0_mean',
 'dow_26_0_mean',
 'dow_52_0_mean',
 'dow_ly3w_0_mean',
 'dow_ly8w_0_mean',
 'dow_1_1_mean',
 'dow_4_1_mean',
 'dow_8_1_mean',
 

In [78]:
y_columns

['day1',
 'day2',
 'day3',
 'day4',
 'day5',
 'day6',
 'day7',
 'day8',
 'day9',
 'day10',
 'day11',
 'day12',
 'day13',
 'day14',
 'day15',
 'day16']

In [79]:
y_train = train_out[y_columns].values

In [90]:
y_train

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.69314718, ...,  0.        ,
         0.69314718,  0.69314718],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.09861229],
       [ 1.79175947,  1.60943791,  2.7080502 , ...,  0.        ,
         1.38629436,  1.60943791],
       [ 0.        ,  0.        ,  0.        , ...,  0.69314718,
         0.        ,  0.        ]])

In [80]:

y_train = train_out[y_columns].values
y_val = val_out[y_columns].values

In [83]:

X_train_allF = X_train_out[features_all]
X_val_allF = X_val_out[features_all]
X_test_allF = X_test_out[features_all]

#del train_out, val_out
#del X_train_out, X_val_out, X_test_out
gc.collect()

params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []


train_week_2017 = 7

features_all = X_train_allF.columns.tolist()


In [88]:

i = 0 

print("=" * 70)
print("=" * 70)
features_t = features_all.copy()

for j in range(16):
    if j != i:
        features_t.remove('ly_1d_d{}'.format(j))
        features_t.remove('l2y_1d_d{}'.format(j))

for j in range(7):
    if j != i%7:
        features_t.remove('dow_1_{}_mean'.format(j))
        features_t.remove('dow_4_{}_mean'.format(j))
        features_t.remove('dow_8_{}_mean'.format(j))
        features_t.remove('dow_13_{}_mean'.format(j))
        features_t.remove('dow_26_{}_mean'.format(j))
        features_t.remove('dow_52_{}_mean'.format(j))
        features_t.remove('dow_ly3w_{}_mean'.format(j))
        features_t.remove('dow_ly8w_{}_mean'.format(j))

        features_t.remove('item_dow_4_{}_mean'.format(j))
        features_t.remove('item_dow_13_{}_mean'.format(j))
        features_t.remove('item_dow_26_{}_mean'.format(j))
        features_t.remove('item_dow_52_{}_mean'.format(j))


X_train = X_train_allF[features_t]
X_val = X_val_allF[features_t]
X_test = X_test_allF[features_t]

dtrain = lgb.Dataset(
    X_train, label=y_train[:, i],
    categorical_feature=cate_vars,
    weight=pd.concat([items["perishable"]]) * 0.25 + 1
)




In [89]:

dval = lgb.Dataset(
     X_val, label=y_val[:, i], reference=dtrain,
     weight=items_val["perishable"] * 0.25 + 1,
     categorical_feature=cate_vars)

bst = lgb.train(
    params, dtrain, num_boost_round=MAX_ROUNDS,
    valid_sets=[dtrain, dval], early_stopping_rounds=50,
    verbose_eval=100
)




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.249349	valid_1's l2: 0.239901
[200]	training's l2: 0.230128	valid_1's l2: 0.222061
[300]	training's l2: 0.214812	valid_1's l2: 0.20676
[400]	training's l2: 0.201696	valid_1's l2: 0.193711
[500]	training's l2: 0.189965	valid_1's l2: 0.181778
