In [83]:
import pandas as pd
import numpy as np
from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

import sklearn.metrics as skl_metrics
import math

logger = getLogger(__name__)
pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../result_tmp/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')


def NWRMSLE(y, pred, weights=None):
    err2 = skl_metrics.mean_squared_log_error(y, pred, sample_weight=weights)
    return math.sqrt(err2)

def NWRMSLE_A(y, pred, weights):
    y = np.array(y)
    pred = np.array(pred)
    weights = np.array(weights)
    weighted_errors = np.dot(np.square(np.log1p(pred) - np.log1p(y)), np.transpose(weights))
    weights_sum = np.sum(weights)
    return math.sqrt(weighted_errors/weights_sum)


dtypes = {'item_nbr':'int32', 'store_nbr':'int8', 'unit_sales':'float32'}

# Test period 07-26 to 08-10
# train 35328
train_all = pd.read_csv('../../input/train_small.csv',  dtype=dtypes, parse_dates=['date'],
                    skiprows=range(1, 26565) #Skip dates before 2016-08-01
                    )
items = pd.read_csv('../../input/items.csv'  )
logger.info('load data successful')



2017-11-30 21:33:20,962 __main__ 26 [INFO][<module>] start 
2017-11-30 21:33:20,962 __main__ 26 [INFO][<module>] start 
2017-11-30 21:33:20,962 __main__ 26 [INFO][<module>] start 
2017-11-30 21:33:20,962 __main__ 26 [INFO][<module>] start 
2017-11-30 21:33:20,962 __main__ 26 [INFO][<module>] start 
2017-11-30 21:33:20,962 __main__ 26 [INFO][<module>] start 
2017-11-30 21:33:20,962 __main__ 26 [INFO][<module>] start 
2017-11-30 21:33:20,994 __main__ 50 [INFO][<module>] load data successful 
2017-11-30 21:33:20,994 __main__ 50 [INFO][<module>] load data successful 
2017-11-30 21:33:20,994 __main__ 50 [INFO][<module>] load data successful 
2017-11-30 21:33:20,994 __main__ 50 [INFO][<module>] load data successful 
2017-11-30 21:33:20,994 __main__ 50 [INFO][<module>] load data successful 
2017-11-30 21:33:20,994 __main__ 50 [INFO][<module>] load data successful 
2017-11-30 21:33:20,994 __main__ 50 [INFO][<module>] load data successful 


In [84]:
train = train_all.loc[(train_all.date <= '20170725'), ]
#train = train.loc[(train.store_nbr == 9), ]


test = train_all.loc[(train_all.date > '2017-07-25') & (train_all.date <= '2017-08-10' ), ]
#test = test.loc[(test.store_nbr == 9), ]

test.tail()


Unnamed: 0,date,store_nbr,item_nbr,unit_sales
9150,2017-08-10,34,103501,9.0
9151,2017-08-10,35,103501,2.0
9152,2017-08-10,36,103501,4.0
9153,2017-08-10,40,103501,4.0
9154,2017-08-10,43,103501,10.0


In [85]:
test.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales
8763,2017-07-26,9,103501,3.0
8764,2017-07-26,11,103501,1.0
8765,2017-07-26,14,103501,2.0
8766,2017-07-26,15,103501,2.0
8767,2017-07-26,16,103501,1.0


In [86]:



train.loc[(train.unit_sales<0),'unit_sales'] = 0 # eliminate negatives
train['unit_sales'] =  train['unit_sales'].apply(pd.np.log1p) #logarithm conversion
train['dow'] = train['date'].dt.dayofweek


# creating records for all items, in all markets on all dates
# for correct calculation of daily unit sales averages.
u_dates = train.date.unique()
u_stores = train.store_nbr.unique()
u_items = train.item_nbr.unique()
train.set_index(['date', 'store_nbr', 'item_nbr'], inplace=True)
train = train.reindex(
    pd.MultiIndex.from_product(
        (u_dates, u_stores, u_items),
        names=['date','store_nbr','item_nbr']
    )
)

logger.info('reindex train data')

train.head(1)

2017-11-30 21:33:21,145 __main__ 22 [INFO][<module>] reindex train data 
2017-11-30 21:33:21,145 __main__ 22 [INFO][<module>] reindex train data 
2017-11-30 21:33:21,145 __main__ 22 [INFO][<module>] reindex train data 
2017-11-30 21:33:21,145 __main__ 22 [INFO][<module>] reindex train data 
2017-11-30 21:33:21,145 __main__ 22 [INFO][<module>] reindex train data 
2017-11-30 21:33:21,145 __main__ 22 [INFO][<module>] reindex train data 
2017-11-30 21:33:21,145 __main__ 22 [INFO][<module>] reindex train data 


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales,dow
date,store_nbr,item_nbr,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-08-01,9,103501,2.079442,0.0


In [87]:



train.loc[:, 'unit_sales'].fillna(0, inplace=True) # fill NaNs
train.reset_index(inplace=True) # reset index and restoring unique columns
lastdate = train.iloc[train.shape[0]-1].date


In [88]:
#Days of Week Means
#By tarobxl: https://www.kaggle.com/c/favorita-grocery-sales-forecasting/discussion/42948
ma_dw = train[['item_nbr','store_nbr','dow','unit_sales']].groupby(['item_nbr','store_nbr','dow'])['unit_sales'].mean().to_frame('madw')
ma_dw.reset_index(inplace=True)
ma_wk = ma_dw[['item_nbr','store_nbr','madw']].groupby(['store_nbr', 'item_nbr'])['madw'].mean().to_frame('mawk')
ma_wk.reset_index(inplace=True)
ma_dw.head(5)

Unnamed: 0,item_nbr,store_nbr,dow,madw
0,103501,9,0.0,1.727702
1,103501,9,1.0,1.780197
2,103501,9,2.0,1.670718
3,103501,9,3.0,1.68987
4,103501,9,4.0,1.723627


In [89]:
#Moving Averages
logger.info('start calcualte moving average')
ma_is = train[['item_nbr','store_nbr','unit_sales']].groupby(['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais226')
for i in [112,56,28,14,7,3,1]:
    tmp = train[train.date>lastdate-timedelta(int(i))]
    tmpg = tmp.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais'+str(i))
    ma_is = ma_is.join(tmpg, how='left')

del tmp,tmpg,train

ma_is['mais']=ma_is.median(axis=1)
ma_is.reset_index(inplace=True)
ma_is.head(5)

2017-11-30 21:33:21,249 __main__ 2 [INFO][<module>] start calcualte moving average 
2017-11-30 21:33:21,249 __main__ 2 [INFO][<module>] start calcualte moving average 
2017-11-30 21:33:21,249 __main__ 2 [INFO][<module>] start calcualte moving average 
2017-11-30 21:33:21,249 __main__ 2 [INFO][<module>] start calcualte moving average 
2017-11-30 21:33:21,249 __main__ 2 [INFO][<module>] start calcualte moving average 
2017-11-30 21:33:21,249 __main__ 2 [INFO][<module>] start calcualte moving average 
2017-11-30 21:33:21,249 __main__ 2 [INFO][<module>] start calcualte moving average 


Unnamed: 0,item_nbr,store_nbr,mais226,mais112,mais56,mais28,mais14,mais7,mais3,mais1,mais
0,103501,9,1.80093,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645
1,103501,10,1.051843,1.061635,1.074589,1.110264,0.978075,1.007036,1.155245,0.693147,1.056739
2,103501,11,1.43629,1.204969,1.328972,1.181671,1.060936,1.510919,1.833753,1.609438,1.382631
3,103501,12,1.03644,0.765786,0.817835,0.688981,0.69695,0.683927,1.133732,1.098612,0.79181
4,103501,13,0.879902,0.698174,0.798217,0.720248,0.639914,0.65205,0.828302,0.693147,0.709211


In [90]:
#Load test
#logger.info('load test data')
#test = pd.read_csv('../../input/test_small.csv', dtype=dtypes, parse_dates=['date'])

#test = train_all.loc[(train_all.date > '20170725'), ]
#test = test.loc[(test.store_nbr == 9), ]

test['dow'] = test['date'].dt.dayofweek
test = pd.merge(test, ma_is, how='left', on=['item_nbr','store_nbr'])
test = pd.merge(test, ma_wk, how='left', on=['item_nbr','store_nbr'])
test = pd.merge(test, ma_dw, how='left', on=['item_nbr','store_nbr','dow'])

#del ma_is, ma_wk, ma_dw

#Forecasting Test
test['pred_sales'] = test.mais
pos_idx = test['mawk'] > 0
test_pos = test.loc[pos_idx]
test.loc[pos_idx, 'pred_sales'] = test_pos['mais'] * test_pos['madw'] / test_pos['mawk']
test.loc[:, "pred_sales"].fillna(0, inplace=True)
test['pred_sales'] = test['pred_sales'].apply(pd.np.expm1) # restoring unit values

test.loc[:, "unit_sales"].fillna(0, inplace=True)

#50% more for promotion items
#test.loc[test['onpromotion'] == True, 'unit_sales'] *= 1.5


test.head(1)

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,dow,mais226,mais112,mais56,mais28,mais14,mais7,mais3,mais1,mais,mawk,madw,pred_sales
0,2017-07-26,9,103501,3.0,2,1.80093,1.750748,1.7323,1.663622,1.601667,1.326757,1.595831,0.693147,1.632645,1.823769,1.670718,3.46216


In [91]:
print("Number of rows in test is", test.shape[0])


Number of rows in test is 392


In [92]:

test_e = pd.merge(test, items, on='item_nbr',how='inner')
test_e['weights'] = 1
test_e.loc[(test_e.perishable== 0), ('weights')] = 1.25
test_e.head(2)


Unnamed: 0,date,store_nbr,item_nbr,unit_sales,dow,mais226,mais112,mais56,mais28,mais14,...,mais3,mais1,mais,mawk,madw,pred_sales,family,class,perishable,weights
0,2017-07-26,9,103501,3.0,2,1.80093,1.750748,1.7323,1.663622,1.601667,...,1.595831,0.693147,1.632645,1.823769,1.670718,3.46216,CLEANING,3008,0,1.25
1,2017-07-26,11,103501,1.0,2,1.43629,1.204969,1.328972,1.181671,1.060936,...,1.833753,1.609438,1.382631,1.538973,1.534993,2.971148,CLEANING,3008,0,1.25


In [96]:
#weights = np.ones(test.shape[0])
result = NWRMSLE(test_e.unit_sales.astype(np.float64),test_e.pred_sales.astype(np.float64), test_e.weights)
print("Forecast Period From:", min(test_e.date)," To: ", max(test_e.date))
print("NWRMSLE = ",result)

Forecast Period From: 2017-07-26 00:00:00  To:  2017-08-10 00:00:00
NWRMSLE =  0.4504985981921225


In [98]:
#print(test.loc[:, "unit_sales"].isnull().values.any())
#print(test.loc[:, "pred_sales"].isnull().values.any())
weights = np.ones(test_e.shape[0])
result = NWRMSLE_A(test_e.unit_sales.astype(np.float64),test_e.pred_sales.astype(np.float64), test_e.weights)
print("Forecast Period From:", min(test.date)," To: ", max(test.date))
print("NWRMSLE_A = ",result)


Forecast Period From: 2017-07-26 00:00:00  To:  2017-08-10 00:00:00
NWRMSLE_A =  0.45049859819212257


In [100]:
#### check result on first 6 days.
test_p1 = test_e.loc[(test.date < '2017-08-01'), ]
#weights_p1 = np.ones(test_p1.shape[0])
result_p1 = NWRMSLE_A(test_p1.unit_sales.astype(np.float32),test_p1.pred_sales.astype(np.float32), test_p1.weights)
print("Number of rows in test is", test_p1.shape[0])
print("Forecast Period From:", min(test_p1.date)," To: ", max(test_p1.date))
print("NWRMSLE = ",result_p1)

Number of rows in test is 147
Forecast Period From: 2017-07-26 00:00:00  To:  2017-07-31 00:00:00
NWRMSLE =  0.4286617686087826
