In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import sys

import math
import sklearn.metrics as skl_metrics

from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

logger = getLogger(__name__)
# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Directory that receives the log file (relative to the notebook's working dir).
DIR = '../logs/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')

# getLogger(__name__) hands back the same logger object every time this cell
# runs, so handlers attached by an earlier run are still present. Drop them
# first; otherwise every message is emitted once per run of the cell.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: INFO and above.
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

# File handler: everything from DEBUG up, appended to the existing log file.
handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

#------------------------------------------------------------------------------------#

def _log1p_positive(raw_value):
    """Converter for `unit_sales`: log1p of a positive value, 0 for anything else."""
    sales = float(raw_value)
    if sales > 0:
        return np.log1p(sales)
    return 0


# Training rows: columns 1-5 of the file, with `unit_sales` transformed while parsing.
df_train = pd.read_csv(
    '../input/train_1s.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_positive},
    parse_dates=["date"],
)

# Test rows: columns 0-4, keyed by (store_nbr, item_nbr, date).
df_test = pd.read_csv(
    "../input/test_1s.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

# Item metadata, keyed by item_nbr.
items = pd.read_csv("../input/items.csv")
items = items.set_index("item_nbr")




2018-01-02 11:03:05,263 __main__ 32 [INFO][<module>] start 
  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
df_train.shape

(2562153, 5)

In [3]:
# Keep only rows dated 2017-05-01 or later. pd.Timestamp is used because the
# `pd.datetime` alias is deprecated and absent from newer pandas releases.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp("2017-05-01")]
#del df_train

# Promotion flags in wide form: one row per (store_nbr, item_nbr), one column
# per date; combinations with no record are treated as "not on promotion".
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
# Drop the outer "onpromotion" column level so columns are plain dates.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align the test-period rows to the training rows; pairs that only exist in
# training get False for every test date.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
# Train-period and test-period date columns side by side.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
#del promo_2017_test, promo_2017_train

# Sales in the same wide layout; missing (store, item, date) cells become 0.
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

# Repeat/reorder the item metadata so it has one row per row of df_2017.
# NOTE(review): this overwrites `items`, so the cell is not safe to re-run
# without reloading `items` first.
items = items.reindex(df_2017.index.get_level_values(1))

In [118]:
# Flat copy of the wide sales frame with store_nbr / item_nbr as ordinary columns.
df_2017_nbr = df_2017.reset_index()

In [6]:
df_2017.head(2)

Unnamed: 0_level_0,date,2017-05-01 00:00:00,2017-05-02 00:00:00,2017-05-03 00:00:00,2017-05-04 00:00:00,2017-05-05 00:00:00,2017-05-06 00:00:00,2017-05-07 00:00:00,2017-05-08 00:00:00,2017-05-09 00:00:00,2017-05-10 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
promo_2017.head(2)

Unnamed: 0_level_0,date,2017-05-01 00:00:00,2017-05-02 00:00:00,2017-05-03 00:00:00,2017-05-04 00:00:00,2017-05-05 00:00:00,2017-05-06 00:00:00,2017-05-07 00:00:00,2017-05-08 00:00:00,2017-05-09 00:00:00,2017-05-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [135]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of `df` for a window that starts before `dt`.

    The window begins `minus` days before `dt` and contains `periods` dates
    spaced `freq` apart. `df` must have those dates as column labels.
    """
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window]


def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally the targets) for one forecast origin.

    Reads the module-level frames `df_2017` (wide sales), `promo_2017` (wide
    promotion flags) and `df_2017_nbr` (sales frame with store/item columns).

    Parameters
    ----------
    t2017 : datetime.date
        First day of the 16-day window being predicted. All look-back
        features use days strictly before this date.
    is_train : bool, default True
        When True, also return the target matrix.

    Returns
    -------
    X : pandas.DataFrame
        One row per row of `df_2017`.
    y : numpy.ndarray
        Only when `is_train` is True: the 16 columns of `df_2017` starting
        at `t2017`.
    """
    X = pd.DataFrame({
        "item_nbr": df_2017_nbr.item_nbr,
        "store_nbr": df_2017_nbr.store_nbr,
        # Record the origin that was actually passed in. The previous
        # `t2017 + delta` depended on a notebook-global `delta`, which
        # double-counted the offset for training rows and reused a stale
        # value for validation/test rows.
        "date": t2017,
        # Mean sales over the 3 / 7 / 14 days immediately before t2017.
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        # Number of promotion days in the 14 days before t2017.
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values
    })

    # Mean over four dates spaced 7 days apart, starting 28 - i days before
    # t2017, i.e. the weekday of t2017 + i in each of the previous four weeks.
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values

    # Promotion flag (0/1) for each of the 16 days starting at t2017.
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)

#    X['dow'] = X['date'].dt.dayofweek
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X


In [197]:

# Build the training set from four forecast origins, one week apart,
# starting at 2017-06-21.
print("Preparing dataset...")
t2017 = date(2017, 6, 21)
X_l, y_l = [], []
for i in range(4):
    # NOTE(review): `delta` is a module-level name here; prepare_dataset may
    # read it as a global, so it has to be assigned before each call — confirm.
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
# Stack the per-origin pieces row-wise. ignore_index is not passed, so the
# frame keeps each piece's own row labels.
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
#del X_l, y_l
# Validation uses the origin 2017-07-26; the test frame uses 2017-08-16 and
# is built without targets.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)


Preparing dataset...


In [139]:
y_train.shape

(13220, 16)

In [140]:
# Names for the 16 target columns: Day1 ... Day16.
y_columns = ['Day%d' % day for day in range(1, 17)]

In [141]:
# Wrap the target matrix in a frame with one named column per horizon day.
df_y_train = pd.DataFrame(y_train, columns=y_columns)

In [142]:
X_train.shape

(13220, 30)

In [143]:
df_y_train.shape

(13220, 16)

In [145]:
df_y_train.head(1)

Unnamed: 0,Day1,Day2,Day3,Day4,Day5,Day6,Day7,Day8,Day9,Day10,Day11,Day12,Day13,Day14,Day15,Day16
0,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
X_train.head(1)

Unnamed: 0,date,item_nbr,mean_14_2017,mean_3_2017,mean_7_2017,promo_14_2017,store_nbr,mean_4_dow0_2017,mean_4_dow1_2017,mean_4_dow2_2017,...,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,2017-06-21,96995,0.099021,0.0,0.099021,0,1,0.346574,0.173287,0.173287,...,0,0,0,0,0,0,0,0,0,0


In [162]:
df_y_train.index

RangeIndex(start=0, stop=13220, step=1)

In [176]:
range(len(X_train))

range(0, 13220)

In [199]:
# Replace the row labels with 0..n-1; the old labels are kept in a new
# leading column.
X_train = X_train.reset_index()

In [200]:
# Align X_train's rows to df_y_train's row labels before the two frames are
# joined column-wise; labels missing from X_train would become all-NaN rows.
X_train = X_train.reindex(index=df_y_train.index)

In [195]:
# Features and targets side by side, matched on row label.
train_out = pd.concat([X_train, df_y_train], axis="columns")

In [201]:
# Persist the combined feature/target frame to disk as a pickle.
train_out.to_pickle('../data/train_storeitem.p')

In [202]:
# Reload the combined feature/target frame from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file if it was produced locally.
train_out = pd.read_pickle('../data/train_storeitem.p')

In [218]:
# Every column label of the combined frame, in order.
all_columns = list(train_out.columns)

In [None]:
# Names of the 16 target columns: Day1 ... Day16.
y_columns = ['Day{}'.format(day) for day in range(1, 17)]

In [243]:
# Everything that is not a target column, in the frame's column order.
x_columns = [name for name in all_columns if name not in y_columns]

In [235]:
# Non-target part of the combined frame.
X_train = train_out.loc[:, x_columns]

In [212]:
# Target part of the combined frame as a plain 2-D array.
y_train = train_out.loc[:, y_columns].values

In [244]:
# Model input columns: x_columns minus the identifier / bookkeeping columns.
# A new list is built (rather than aliasing `x_columns` and calling
# .remove() on it) so that `x_columns` is left intact and the cell can be
# re-run without raising ValueError.
features_all = [
    column for column in x_columns
    if column not in ("index", "date", "item_nbr", "store_nbr")
]


In [245]:
print(features_all)

['mean_14_2017', 'mean_3_2017', 'mean_7_2017', 'promo_14_2017', 'mean_4_dow0_2017', 'mean_4_dow1_2017', 'mean_4_dow2_2017', 'mean_4_dow3_2017', 'mean_4_dow4_2017', 'mean_4_dow5_2017', 'mean_4_dow6_2017', 'promo_0', 'promo_1', 'promo_2', 'promo_3', 'promo_4', 'promo_5', 'promo_6', 'promo_7', 'promo_8', 'promo_9', 'promo_10', 'promo_11', 'promo_12', 'promo_13', 'promo_14', 'promo_15']


In [241]:
# Preview, for each of the 16 forecast steps, the first row of X_train
# restricted to the columns that step would use: every non-promo_N column
# plus that step's own promo flag.
features_all = X_train.columns.tolist()
for i in range(16):
    features_t = list(features_all)
    for j in range(16):
        if j == i:
            continue
        features_t.remove("promo_%d" % j)
    print(X_train[features_t].head(1))
#X_train.groupby(["dow"]).size()
#X_train["mean_4_dow{}_2017".format(0)].head(2)

   index  mean_14_2017  mean_3_2017  mean_7_2017  promo_14_2017  store_nbr  \
0      0      0.099021          0.0     0.099021              0          1   

   mean_4_dow0_2017  mean_4_dow1_2017  mean_4_dow2_2017  mean_4_dow3_2017  \
0          0.346574          0.173287          0.173287          0.173287   

   mean_4_dow4_2017  mean_4_dow5_2017  mean_4_dow6_2017  promo_0  
0          0.173287               0.0               0.0        0  
   index  mean_14_2017  mean_3_2017  mean_7_2017  promo_14_2017  store_nbr  \
0      0      0.099021          0.0     0.099021              0          1   

   mean_4_dow0_2017  mean_4_dow1_2017  mean_4_dow2_2017  mean_4_dow3_2017  \
0          0.346574          0.173287          0.173287          0.173287   

   mean_4_dow4_2017  mean_4_dow5_2017  mean_4_dow6_2017  promo_1  
0          0.173287               0.0               0.0        0  
   index  mean_14_2017  mean_3_2017  mean_7_2017  promo_14_2017  store_nbr  \
0      0      0.099021        

In [334]:
y_train

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.69314718,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.38629436],
       [ 0.69314718,  1.38629436,  1.38629436, ...,  0.        ,
         0.69314718,  1.94591015],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.69314718,
         1.79175947,  1.60943791],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [338]:
logger.info('Training and predicting models...')

# LightGBM hyper-parameters shared by every forecast step.
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

# Train and validation data must expose the same numeric feature columns.
# Identifier / bookkeeping columns are excluded: `date` holds date objects
# rather than numbers, and an `index` column may exist in X_train only
# (it is added by reset_index), which would make the two sets disagree.
non_feature_cols = ("index", "date", "item_nbr", "store_nbr")
feature_cols = [col for col in X_train.columns if col not in non_feature_cols]

# Forecast step being modelled: column `i` of the target matrices.
i = 0

print("=" * 50)
print("Step %d" % (i+1))
print("=" * 50)

dtrain = lgb.Dataset(
    X_train[feature_cols], label=y_train[:, i],
    categorical_feature=cate_vars
)
# Validation rows carry a weight of 1 + 0.25 * `perishable`; `reference`
# ties the validation set to the training Dataset.
dval = lgb.Dataset(
    X_val[feature_cols], label=y_val[:, i], reference=dtrain,
    weight=items["perishable"] * 0.25 + 1,
    categorical_feature=cate_vars)



2018-01-01 20:21:38,530 __main__ 1 [INFO][<module>] Training and predicting models... 
2018-01-01 20:21:38,530 __main__ 1 [INFO][<module>] Training and predicting models... 
2018-01-01 20:21:38,530 __main__ 1 [INFO][<module>] Training and predicting models... 


Step 1


In [341]:
y_train

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.69314718,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.38629436],
       [ 0.69314718,  1.38629436,  1.38629436, ...,  0.        ,
         0.69314718,  1.94591015],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.69314718,
         1.79175947,  1.60943791],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [339]:

# Fit a booster on `dtrain` for at most MAX_ROUNDS rounds, evaluating on both
# `dtrain` and `dval`; training stops once the validation score has not
# improved for 50 rounds.
# NOTE(review): `verbose_eval=100` presumably prints metrics every 100 rounds.
# Newer LightGBM releases take early stopping and logging as `callbacks=`
# instead of these keywords — confirm against the installed version.
bst = lgb.train(
    params, dtrain, num_boost_round=MAX_ROUNDS,
    valid_sets=[dtrain, dval], early_stopping_rounds=50,
    verbose_eval=100
)





Training until validation scores don't improve for 50 rounds.


TypeError: 'NoneType' object is not iterable

In [None]:
# Column subset for forecast step `i`: every column of X_train except the
# promo_N flags that belong to the other 15 steps.
features_all = X_train.columns.tolist()
i = 0
features_t = list(features_all)
for j in range(16):
    if j == i:
        continue
    features_t.remove("promo_%d" % j)
X_train[features_t].head(1)

Unnamed: 0,mean_14_2017,mean_3_2017,mean_7_2017,promo_14_2017,promo_0,promo_1,promo_2,promo_3,promo_4,promo_5,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,1.601666,1.595831,1.326757,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
