In [26]:
import pandas as pd
import numpy as np
from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

import sklearn.metrics as skl_metrics
import math

logger = getLogger(__name__)

DIR = '../result_tmp/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')

# getLogger(__name__) hands back the same logger object every time this cell
# runs, so handlers attached by an earlier run are still registered on it.
# Detach and close them first; otherwise every message is emitted once per
# previous run of the cell (and each run leaks another open log file handle).
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: INFO and above.
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

# File handler: DEBUG and above, appended to the log file under DIR.
handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')


def NWRMSLE(y, pred, weights=None):
    """Return the root of the (optionally weighted) mean squared log error.

    Parameters
    ----------
    y : array-like
        Observed values.
    pred : array-like
        Predicted values.
    weights : array-like, optional
        Per-sample weights, forwarded to scikit-learn as ``sample_weight``.

    Returns
    -------
    float
        Square root of ``sklearn.metrics.mean_squared_log_error``.
    """
    return math.sqrt(
        skl_metrics.mean_squared_log_error(y_true=y, y_pred=pred, sample_weight=weights)
    )


# Explicit dtypes handed to read_csv for these three columns.
dtypes = {
    'item_nbr': 'int32',
    'store_nbr': 'int8',
    'unit_sales': 'float32',
}

# Test period 07-26 to 08-10
# train 35328
# Skip dates before 2016-08-01; the range starts at 1 so line 0 (the header) is kept.
rows_before_start = range(1, 26565)
train_all = pd.read_csv(
    '../../input/train_small.csv',
    dtype=dtypes,
    parse_dates=['date'],
    skiprows=rows_before_start,
)
logger.info('load data successful')



2017-11-29 17:05:34,490 __main__ 25 [INFO][<module>] start 
2017-11-29 17:05:34,490 __main__ 25 [INFO][<module>] start 
2017-11-29 17:05:34,490 __main__ 25 [INFO][<module>] start 
2017-11-29 17:05:34,515 __main__ 40 [INFO][<module>] load data successful 
2017-11-29 17:05:34,515 __main__ 40 [INFO][<module>] load data successful 
2017-11-29 17:05:34,515 __main__ 40 [INFO][<module>] load data successful 


In [27]:
# Hold-out split for store 9: rows dated up to and including 2017-07-25 go to
# `train`, later rows go to `test`.
is_store_9 = train_all.store_nbr == 9
in_train_period = train_all.date <= '20170725'
in_test_period = train_all.date > '20170725'

# .copy() makes both frames independent of train_all. Later cells assign
# columns on them; without the copy those writes target a slice of train_all
# and pandas raises SettingWithCopyWarning.
train = train_all.loc[in_train_period & is_store_9].copy()
test = train_all.loc[in_test_period & is_store_9].copy()

train.tail()


Unnamed: 0,date,store_nbr,item_nbr,unit_sales
8634,2017-07-21,9,103501,1.0
8659,2017-07-22,9,103501,2.0
8684,2017-07-23,9,103501,9.0
8711,2017-07-24,9,103501,5.0
8737,2017-07-25,9,103501,1.0


In [28]:
# Peek at the first five rows of the hold-out frame.
test.head(5)

Unnamed: 0,date,store_nbr,item_nbr,unit_sales
8763,2017-07-26,9,103501,3.0
8786,2017-07-27,9,103501,6.0
8809,2017-07-28,9,103501,5.0
8833,2017-07-29,9,103501,6.0
8858,2017-07-30,9,103501,10.0


In [29]:



# Clamp negative unit_sales to 0, then move to log1p space.
# - mask()/assign() build a new frame instead of writing into `train`, which
#   may still be a slice of train_all.
# - np.log1p is called directly: the `pd.np` alias was deprecated in pandas 1.0
#   and removed in pandas 2.0.
clipped_sales = train['unit_sales'].mask(train['unit_sales'] < 0, 0)  # eliminate negatives
train = train.assign(unit_sales=np.log1p(clipped_sales))  # logarithm conversion

# creating records for all items, in all markets on all dates
# for correct calculation of daily unit sales averages.
u_dates = train.date.unique()
u_stores = train.store_nbr.unique()
u_items = train.item_nbr.unique()
train = train.set_index(['date', 'store_nbr', 'item_nbr']).reindex(
    pd.MultiIndex.from_product(
        (u_dates, u_stores, u_items),
        names=['date', 'store_nbr', 'item_nbr']
    )
)

# Day of week is derived from the index AFTER reindexing. If it were computed
# beforehand, every row introduced by the reindex would carry dow = NaN and be
# silently dropped by a groupby on 'dow', so the day-of-week means would ignore
# exactly the zero-sales days this reindex is meant to add.
train['dow'] = np.asarray(train.index.get_level_values('date').dayofweek)

logger.info('reindex train data')

train.head(1)

2017-11-29 17:05:34,657 __main__ 22 [INFO][<module>] reindex train data 
2017-11-29 17:05:34,657 __main__ 22 [INFO][<module>] reindex train data 
2017-11-29 17:05:34,657 __main__ 22 [INFO][<module>] reindex train data 


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales,dow
date,store_nbr,item_nbr,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-08-01,9,103501,2.079442,0


In [30]:



# Rows created by the reindex have no sales value; treat them as zero sales.
# The result is assigned back rather than using fillna(inplace=True) on a
# .loc selection: that form is chained assignment and is not guaranteed to
# modify `train` (it does nothing under pandas Copy-on-Write).
train['unit_sales'] = train['unit_sales'].fillna(0)  # fill NaNs
train = train.reset_index()  # reset index and restoring unique columns

# Latest date present in the training frame. max() is used instead of reading
# the last row so the value does not depend on the rows being date-ordered.
lastdate = train['date'].max()


In [31]:
#Days of Week Means
#By tarobxl: https://www.kaggle.com/c/favorita-grocery-sales-forecasting/discussion/42948
dow_keys = ['item_nbr', 'store_nbr', 'dow']

# madw: mean unit_sales for every (item, store, day-of-week) combination.
ma_dw = (
    train[dow_keys + ['unit_sales']]
    .groupby(dow_keys)['unit_sales']
    .mean()
    .to_frame('madw')
    .reset_index()
)

# mawk: mean of the day-of-week means for every (store, item) pair.
ma_wk = (
    ma_dw[['item_nbr', 'store_nbr', 'madw']]
    .groupby(['store_nbr', 'item_nbr'])['madw']
    .mean()
    .to_frame('mawk')
    .reset_index()
)

ma_dw.head(5)

Unnamed: 0,item_nbr,store_nbr,dow,madw
0,103501,9,0,1.727702
1,103501,9,1,1.780197
2,103501,9,2,1.670718
3,103501,9,3,1.68987
4,103501,9,4,1.723627


In [34]:
# Show the complete day-of-week means table (item_nbr, store_nbr, dow, madw).
ma_dw

Unnamed: 0,item_nbr,store_nbr,dow,madw
0,103501,9,0,1.727702
1,103501,9,1,1.780197
2,103501,9,2,1.670718
3,103501,9,3,1.68987
4,103501,9,4,1.723627
5,103501,9,5,1.983202
6,103501,9,6,2.191067


In [33]:
#Moving Averages
logger.info('start calcualte moving average')
pair_keys = ['item_nbr', 'store_nbr']

# Baseline column: mean unit_sales per (item, store) over the whole training frame.
ma_is = (
    train[pair_keys + ['unit_sales']]
    .groupby(pair_keys)['unit_sales']
    .mean()
    .to_frame('mais226')
)

# One extra column per trailing window: the mean over rows dated strictly
# after (lastdate - window), left-joined onto the baseline.
for window_days in (112, 56, 28, 14, 7, 3, 1):
    window_start = lastdate - timedelta(days=window_days)
    recent = train[train.date > window_start]
    recent_mean = (
        recent.groupby(pair_keys)['unit_sales']
        .mean()
        .to_frame('mais' + str(window_days))
    )
    ma_is = ma_is.join(recent_mean, how='left')

# The training frame is released here; cells above must be re-run to get it back.
del recent, recent_mean, train

# mais: row-wise median across all the mean columns computed above.
ma_is['mais'] = ma_is.median(axis=1)
ma_is = ma_is.reset_index()
ma_is.head(5)

2017-11-29 17:05:52,510 __main__ 2 [INFO][<module>] start calcualte moving average 
2017-11-29 17:05:52,510 __main__ 2 [INFO][<module>] start calcualte moving average 
2017-11-29 17:05:52,510 __main__ 2 [INFO][<module>] start calcualte moving average 


Unnamed: 0,item_nbr,store_nbr,mais226,mais112,mais56,mais28,mais14,mais7,mais3,mais1,mais
0,103501,9,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645


In [39]:
# Show the complete moving-average table (one row per item_nbr/store_nbr pair).
ma_is

Unnamed: 0,item_nbr,store_nbr,mais226,mais112,mais56,mais28,mais14,mais7,mais3,mais1,mais
0,103501,9,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645


In [40]:
#Load test
# The evaluation frame is carved out of train_all (store 9, dates after
# 2017-07-25) so the actual unit_sales are available next to the forecast.
# .copy() makes it independent of train_all, so adding columns below does not
# write into a slice (SettingWithCopyWarning).
test = train_all.loc[(train_all.date > '20170725') & (train_all.store_nbr == 9)].copy()

# Attach the per-pair moving averages, the weekly mean and the day-of-week mean.
test['dow'] = test['date'].dt.dayofweek
test = pd.merge(test, ma_is, how='left', on=['item_nbr', 'store_nbr'])
test = pd.merge(test, ma_wk, how='left', on=['item_nbr', 'store_nbr'])
test = pd.merge(test, ma_dw, how='left', on=['item_nbr', 'store_nbr', 'dow'])

#Forecasting Test
# Base forecast is `mais`. Where the weekly mean `mawk` is positive it is
# rescaled by madw / mawk; rows where mawk is zero, negative or missing keep
# the plain `mais`. Anything still missing afterwards becomes 0.
pos_idx = test['mawk'] > 0
seasonal = test['mais'] * test['madw'] / test['mawk']
test['pred_sales'] = seasonal.where(pos_idx, test['mais']).fillna(0)

# np.expm1 is called directly: the `pd.np` alias was deprecated in pandas 1.0
# and removed in pandas 2.0.
test['pred_sales'] = np.expm1(test['pred_sales'])  # restoring unit values

#50% more for promotion items (currently disabled)
#test.loc[test['onpromotion'] == True, 'unit_sales'] *= 1.5

test.head(10)

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,dow,mais226,mais112,mais56,mais28,mais14,mais7,mais3,mais1,mais,mawk,madw,pred_sales
0,2017-07-26,9,103501,3.0,2,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.670718,3.46216
1,2017-07-27,9,103501,6.0,3,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.68987,3.539323
2,2017-07-28,9,103501,5.0,4,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.723627,3.678591
3,2017-07-29,9,103501,6.0,5,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.983202,4.902463
4,2017-07-30,9,103501,10.0,6,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,2.191067,6.109638
5,2017-07-31,9,103501,6.0,0,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.727702,3.695688
6,2017-08-01,9,103501,4.0,1,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.780197,3.921622
7,2017-08-02,9,103501,6.0,2,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.670718,3.46216
8,2017-08-03,9,103501,9.0,3,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.68987,3.539323
9,2017-08-04,9,103501,3.0,4,1.821337,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.723627,3.678591


In [67]:
# (rows, columns) of the evaluation frame after the merges and forecast columns.
test.shape

(20, 17)

In [72]:
# Uniform weights, one per evaluated row. The length is taken from `test`
# rather than hard-coded, so scoring keeps working when the hold-out size
# changes (a length mismatch makes the metric raise).
weights = np.ones(len(test))
result = NWRMSLE(test.unit_sales, test.pred_sales, weights)
result

0.49131943023621705