In [34]:

"""
This is an upgraded version of Ceshine's LGBM starter script, simply adding
more average features and weekly average features on it.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
import lightgbm as lgb
import sys
import math
import gc
import sklearn.metrics as skl_metrics

# import math
# import sklearn.metrics as skl_metrics
# from sklearn.metrics import mean_squared_error

from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

logger = getLogger(__name__)

pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../logs/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s]\
[%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')


df_train = pd.read_csv(
    '../input/train_2s.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
)

df_test = pd.read_csv(
    "../input/test_2s.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)


2018-01-04 20:42:49,548 __main__ 40 [INFO][<module>] start 
2018-01-04 20:42:49,548 __main__ 40 [INFO][<module>] start 
2018-01-04 20:42:49,548 __main__ 40 [INFO][<module>] start 
  interactivity=interactivity, compiler=compiler, result=result)


In [35]:
items = pd.read_csv("../input/items.csv",)
items.head(1)

Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,0


In [78]:

def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range
              (dt - timedelta(days=minus), periods=periods, freq=freq)]



def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "family": df_2017_nbr.family,
        "date": (t2017), 
        "s_f_day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "s_f_mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "s_f_mean_21_2017": get_timespan(df_2017, t2017, 21, 21).mean(axis=1).values,
        "s_f_mean_42_2017": get_timespan(df_2017, t2017, 42, 42).mean(axis=1).values,
        "s_f_mean_91_2017": get_timespan(df_2017, t2017, 91, 91).mean(axis=1).values,
        "s_f_mean_182_2017": get_timespan(df_2017, t2017, 182, 182).mean(axis=1).values,
        "s_f_mean_364_2017": get_timespan(df_2017, t2017, 364, 364).mean(axis=1).values,
    })
  
    for i in range(7):
        X['s_f_dow_4_{}_mean'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['s_f_dow_13_{}_mean'.format(i)] = get_timespan(df_2017, t2017, 91-i, 13, freq='7D').mean(axis=1).values
        X['s_f_dow_26_{}_mean'.format(i)] = get_timespan(df_2017, t2017, 182-i, 26, freq='7D').mean(axis=1).values
        X['s_f_dow_52_{}_mean'.format(i)] = get_timespan(df_2017, t2017, 364-i, 52, freq='7D').mean(axis=1).values        


    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [56]:
df_train_store_items = pd.merge(df_train, items, on =['item_nbr'], how = 'inner')

In [57]:
df_train_store_items.head(1)

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,family,class,perishable
0,2013-01-01,25,103665,2.079442,,BREAD/BAKERY,2712,1


In [58]:

# Aggregate to item level
df_train_store_family = df_train_store_items[['family','date', 'store_nbr', 'unit_sales']].groupby(['family','date'])\
    .agg({'unit_sales': 'sum', 'store_nbr':'count'}).reset_index()

In [59]:

df_train_store_family["item_avg_sales"] = df_train_store_family["unit_sales"] / df_train_store_family["store_nbr"]


In [60]:

df_2017 = df_train_store_family.set_index(
    ["family", "store_nbr", "date"])[["item_avg_sales"]].unstack(
        level=-1).fillna(0)



In [61]:
df_2017.columns = df_2017.columns.get_level_values(1)

df_2017_nbr = pd.DataFrame(df_2017.copy())
df_2017_nbr.reset_index(inplace = True)


In [62]:
df_2017_nbr.head(1)

date,family,store_nbr,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00,2013-01-07 00:00:00,2013-01-08 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
0,AUTOMOTIVE,1,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.0


In [63]:
    
df_2017[pd.datetime(2016, 12, 25)] = 0
df_2017[pd.datetime(2015, 12, 25)] = 0
df_2017[pd.datetime(2014, 12, 25)] = 0
df_2017[pd.datetime(2017, 1, 1)] = 0
df_2017[pd.datetime(2016, 1, 1)] = 0
df_2017[pd.datetime(2015, 1, 1)] = 0    
df_2017[pd.datetime(2015, 7, 7)] = 0

In [64]:
param_1 = '1s'

In [79]:
##########################################################################
logger.info('Preparing traing dataset...')

X_l, y_l = [], []

t2016 = date(2016, 8, 3)
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2016 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)

train_week_2017 = 7
if param_1 != "val":
    train_week_2017 = 9

t2017 = date(2017, 5, 31)
for i in range(train_week_2017):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)


X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

delta = timedelta(0)

X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

##########################################################################
logger.info('Save Store Item Features ...')

y_columns = ["day" + str(i) for i in range(1, 17)]

df_y_train = pd.DataFrame(data = y_train, columns = y_columns)
X_train.reset_index(inplace = True)
X_train.reindex(index = df_y_train.index)
#train_out = pd.concat([X_train, df_y_train], axis = 1) 
train_out = X_train

df_y_val = pd.DataFrame(data = y_val, columns = y_columns)
X_val.reset_index(inplace = True)
X_val.reindex(index = df_y_val.index)
#val_out = pd.concat([X_val, df_y_val], axis = 1)
val_out = X_val


2018-01-04 20:50:50,816 __main__ 2 [INFO][<module>] Preparing traing dataset... 
2018-01-04 20:50:50,816 __main__ 2 [INFO][<module>] Preparing traing dataset... 
2018-01-04 20:50:50,816 __main__ 2 [INFO][<module>] Preparing traing dataset... 
2018-01-04 20:50:52,065 __main__ 39 [INFO][<module>] Save Store Item Features ... 
2018-01-04 20:50:52,065 __main__ 39 [INFO][<module>] Save Store Item Features ... 
2018-01-04 20:50:52,065 __main__ 39 [INFO][<module>] Save Store Item Features ... 


In [80]:
train_out.info(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36088 entries, 0 to 36087
Data columns (total 38 columns):
index                36088 non-null int64
date                 36088 non-null object
family               36088 non-null object
s_f_day_1_2017       36088 non-null float64
s_f_mean_182_2017    36088 non-null float64
s_f_mean_21_2017     36088 non-null float64
s_f_mean_364_2017    36088 non-null float64
s_f_mean_42_2017     36088 non-null float64
s_f_mean_7_2017      36088 non-null float64
s_f_mean_91_2017     36088 non-null float64
s_f_dow_4_0_mean     36088 non-null float64
s_f_dow_13_0_mean    36088 non-null float64
s_f_dow_26_0_mean    36088 non-null float64
s_f_dow_52_0_mean    36088 non-null float64
s_f_dow_4_1_mean     36088 non-null float64
s_f_dow_13_1_mean    36088 non-null float64
s_f_dow_26_1_mean    36088 non-null float64
s_f_dow_52_1_mean    36088 non-null float64
s_f_dow_4_2_mean     36088 non-null float64
s_f_dow_13_2_mean    36088 non-null float64
s_f_dow_26_2_me

In [81]:
s_f_train_out = pd.merge(train_out, items[['item_nbr','family']], how = 'inner' )

In [84]:
del s_f_train_out['family']

KeyError: 'family'