In [47]:
import pandas as pd
import numpy as np
from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

import sklearn.metrics as skl_metrics
import xgboost as xgb
from sklearn.cross_validation import train_test_split

import math

logger = getLogger(__name__)
pd.options.mode.chained_assignment = None  # default='warn'

DIR = './result_tmp/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def NWRMSLE(y, pred, weights=None):
    err2 = skl_metrics.mean_squared_log_error(y, pred, sample_weight=weights)
    return math.sqrt(err2)

def NWRMSLE_A(y, pred, weights):
    y = np.array(y)
    pred = np.array(pred)
    weights = np.array(weights)
    weighted_errors = np.dot(np.square(np.log1p(pred) - np.log1p(y)), np.transpose(weights))
    weights_sum = np.sum(weights)
    return math.sqrt(weighted_errors/weights_sum)

def NWRMSLE_lgb(pred, dtrain):
    y = list(dtrain.get_label())
    score = NWRMSLE(y, pred)
    return 'NWRMSLE', score, False

def eval_test(test_e):

    test_e['weights'] = 1
    test_e.loc[(test_e.perishable == 1), ('weights')] = 1.25

    result = NWRMSLE(test_e.unit_sales.astype(np.float64),test_e.pred_sales.astype(np.float64), test_e.weights)

    print("Eval All, Number of rows in test is", test_e.shape[0])
    print("Eval all, Forecast Period From:", min(test_e.date)," To: ", max(test_e.date))

    #### check result on first 6 days.
    test_p1 = test_e.loc[(test_e.date < '2017-08-01'), ]
    result_p1 = NWRMSLE_A(test_p1.unit_sales.astype(np.float32),test_p1.pred_sales.astype(np.float32), test_p1.weights)

    print("Eval P1, Number of rows in test is", test_p1.shape[0])
    print("Eval P1, Forecast Period From:", min(test_p1.date)," To: ", max(test_p1.date))

    #### check result on last 10 days.
    test_p2 = test_e.loc[(test_e.date >= '2017-08-01'), ]
    result_p2 = NWRMSLE_A(test_p2.unit_sales.astype(np.float32),test_p2.pred_sales.astype(np.float32), test_p2.weights)

    print("Eval P2, Number of rows in test is", test_p2.shape[0])
    print("Eval P2, Forecast Period From:", min(test_p2.date)," To: ", max(test_p2.date))

    print("Eval All Weighted NWRMSLE = ",result)
    print("Eval P1  Weighted NWRMSLE = ",result_p1)
    print("Eval P2  Weighted NWRMSLE = ",result_p2)

#--------------------------------------------------------------------------------------------------

logger.info('start')

items = pd.read_csv('../input/items.csv'  )

dtypes = {'id':'uint32', 'item_nbr':'int32', 'store_nbr':'int8', 'unit_sales':'float32'}
#train_all = pd.read_csv('../input/train.csv', usecols=[1,2,3,4,5], dtype=dtypes, parse_dates=['date'] )
train_all = pd.read_csv('../input/train_small.csv', usecols=[1,2,3,4,5], dtype=dtypes, parse_dates=['date'] )

df_train = train_all.loc[((train_all.date >= '2016-06-01') & (train_all.date <= '2016-08-31' ))
                         |((train_all.date >= '2017-06-01') & (train_all.date <= '2017-08-31' )) , ]
del train_all

logger.info('load data successful')

#train['unit_sales'] =  train['unit_sales'].apply(pd.np.log1p) #logarithm conversion

df_train.loc[(df_train.unit_sales<0),'unit_sales'] = 0 # eliminate negatives
df_train.loc[:, 'unit_sales'].fillna(0, inplace=True) # fill NaNs

df_train['DOW'] = df_train['date'].dt.dayofweek
df_train['WOY'] = df_train['date'].dt.weekofyear
df_train['Year'] = df_train['date'].dt.year
df_train['Month'] = df_train['date'].dt.month
df_train['Day'] = df_train['date'].dt.day

print('training data prepared')

#-------------------------------------------------------------------------------------------------

train_m06 = df_train.loc[(df_train.Month == 6),]
    
ma_m06 = train_m06[['item_nbr','store_nbr','Year','unit_sales']].groupby(['item_nbr','store_nbr','Year'])['unit_sales'].mean().to_frame('avg_m06')
ma_m06.reset_index(inplace=True)

df_train = pd.merge(df_train, ma_m06, how='left', on=['item_nbr','store_nbr','Year'])

#features = (['DOW', 'WOY', 'Month', 'Day', 'avg_m06'])
#print(features)

#-------------------------------------------------------------------------------------------------

train_m07 = df_train.loc[(df_train.Month == 7),]
    
ma_m07 = train_m07[['item_nbr','store_nbr','Year','unit_sales']].groupby(['item_nbr','store_nbr','Year'])['unit_sales'].mean().to_frame('avg_m07')
ma_m07.reset_index(inplace=True)
df_train = pd.merge(df_train, ma_m07, how='left', on=['item_nbr','store_nbr','Year'])

features = (['DOW', 'WOY', 'Month', 'Day', 'avg_m06', 'avg_m07'])

#--------------------------------------------------------------------------------------------------
train = df_train.loc[(df_train.date >= '2016-07-01') & (df_train.date <= '2016-08-31' ), ]
test = df_train.loc[(df_train.date > '2017-07-25') & (df_train.date <= '2017-08-10' ), ]


print('training data processed')

params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.5, random_state=10)
y_train = np.log1p(X_train.unit_sales)
y_valid = np.log1p(X_valid.unit_sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model_xgb = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=20, verbose_eval=True)


create_feature_map(features)
importance = model_xgb.get_fscore(fmap='xgb.fmap')
print(importance)


#-------------------------------------------------------------------------------------
#Load test
#test = valid
test['pred_sales'] = np.exp(model_xgb.predict(xgb.DMatrix(test[features])))


#---------------------- test_e to evaluate the result --------------------------------
#weights = np.ones(test.shape[0])
test_e = pd.merge(test, items, on='item_nbr',how='inner')
eval_test(test_e)

test_e['error'] =  abs(test_e.pred_sales - test_e.unit_sales)
print("Bias =",  (test_e.pred_sales.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())
print("WMAPE =",  abs(test_e.error.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())

#-------------------------------------------------------------------------------------
logger.info('end')


FileNotFoundError: [Errno 2] No such file or directory: '/home/zyp/kaggle/FavoritaGrocery/script/jupyter/result_tmp/train.py.log'

In [15]:
df_train.groupby(['date']).size()
#df_train.info()
#np.random.seed(2013)
#df_1 = train.sample(frac=0.1)

date
2013-01-02    22
2013-01-03    22
2013-01-04    21
2013-01-05    22
2013-01-06    22
2013-01-07    22
2013-01-08    20
2013-01-09    20
2013-01-10    21
2013-01-11    20
2013-01-12    21
2013-01-13    22
2013-01-14    21
2013-01-15    22
2013-01-16    21
2013-01-17    21
2013-01-18    20
2013-01-19    22
2013-01-20    21
2013-01-21    22
2013-01-22    22
2013-01-23    21
2013-01-24    20
2013-01-25    18
2013-01-26    22
2013-01-27    20
2013-01-28    21
2013-01-29    21
2013-01-30    21
2013-01-31    21
              ..
2017-07-17    25
2017-07-18    27
2017-07-19    26
2017-07-20    26
2017-07-21    25
2017-07-22    25
2017-07-23    27
2017-07-24    26
2017-07-25    26
2017-07-26    23
2017-07-27    23
2017-07-28    24
2017-07-29    25
2017-07-30    27
2017-07-31    25
2017-08-01    26
2017-08-02    26
2017-08-03    26
2017-08-04    24
2017-08-05    24
2017-08-06    24
2017-08-07    23
2017-08-08    25
2017-08-09    27
2017-08-10    20
2017-08-11    24
2017-08-12    25
2017-08-1

In [16]:

df_train['mset'] = 1

#Load test
test = pd.read_csv('../../input/test_small.csv', dtype=dtypes, parse_dates=['date'])
test['mset'] = 0
data = pd.concat([df_train, test], ignore_index=True)
t = data.loc[data['mset'] == 0,]

In [17]:

t[['id']] = t[['id']].astype(np.int32)

In [18]:

#train['unit_sales'] =  train['unit_sales'].apply(pd.np.log1p) #logarithm conversion

df_train.loc[(df_train.unit_sales<0),'unit_sales'] = 0 # eliminate negatives
df_train.loc[:, 'unit_sales'].fillna(0, inplace=True) # fill NaNs

df_train['DOW'] = df_train['date'].dt.dayofweek
df_train['WOY'] = df_train['date'].dt.weekofyear
df_train['Year'] = df_train['date'].dt.year
df_train['Month'] = df_train['date'].dt.month
df_train['Day'] = df_train['date'].dt.day

print('training data processed')

df_train.head(1)


training data processed


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,mset,DOW,WOY,Year,Month,Day
0,9389,2013-01-02,9,103501,19.0,,1,2,1,2013,1,2


In [19]:
train_m06 = df_train.loc[(df_train.Month >= 6),]
    
ma_is = train_m06[['item_nbr','store_nbr','Year','unit_sales']].groupby(['item_nbr','store_nbr','Year'])['unit_sales'].mean().to_frame('avg_m06')
ma_is.reset_index(inplace=True)

df_train = pd.merge(df_train, ma_is, how='left', on=['item_nbr','store_nbr','Year'])

In [20]:
df_train.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,mset,DOW,WOY,Year,Month,Day,avg_m06
0,9389,2013-01-02,9,103501,19.0,,1,2,1,2013,1,2,7.596244
1,10410,2013-01-02,10,103501,8.0,,1,2,1,2013,1,2,3.517413
2,11087,2013-01-02,11,103501,13.0,,1,2,1,2013,1,2,6.550239
3,12049,2013-01-02,12,103501,7.0,,1,2,1,2013,1,2,3.956098
4,12764,2013-01-02,13,103501,13.0,,1,2,1,2013,1,2,3.202073


In [21]:

train = df_train.loc[(df_train.date >= '2016-07-01') & (df_train.date <= '2016-08-31' ), ]

valid = df_train.loc[(df_train.date > '2017-07-25') & (df_train.date <= '2017-08-10' ), ]


#train = train.loc[(train.store_nbr == 9), ]
#test = test.loc[(test.store_nbr == 9), ]

valid.tail(1)


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,mset,DOW,WOY,Year,Month,Day,avg_m06
35718,124948419,2017-08-10,43,103501,10.0,False,1,3,32,2017,8,10,5.25


In [22]:
train.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,mset,DOW,WOY,Year,Month,Day,avg_m06
25784,83675406,2016-07-01,9,103501,3.0,False,1,4,26,2016,7,1,5.809524
25785,83677540,2016-07-01,10,103501,1.0,False,1,4,26,2016,7,1,2.677419
25786,83678785,2016-07-01,11,103501,7.0,False,1,4,26,2016,7,1,4.45
25787,83680701,2016-07-01,12,103501,5.0,False,1,4,26,2016,7,1,3.277487
25788,83682114,2016-07-01,13,103501,4.0,False,1,4,26,2016,7,1,2.594286


In [23]:
#features = (['DOW', 'Month', 'Day', 'Year', 'WOY','unit_sales'])
features = (['DOW', 'WOY', 'Month', 'Day'])
print(features)

print('training data processed')

params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.unit_sales)
y_valid = np.log1p(X_valid.unit_sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)


['DOW', 'WOY', 'Month', 'Day']
training data processed
Train a XGBoost model


In [24]:
train.shape

(1530, 13)

In [25]:

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model_xgb = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, verbose_eval=True)

[0]	train-rmse:0.911555	eval-rmse:0.866949
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
[1]	train-rmse:0.742616	eval-rmse:0.6819
[2]	train-rmse:0.642238	eval-rmse:0.569312
[3]	train-rmse:0.578514	eval-rmse:0.480367
[4]	train-rmse:0.542974	eval-rmse:0.426973
[5]	train-rmse:0.52381	eval-rmse:0.400318
[6]	train-rmse:0.513386	eval-rmse:0.382363
[7]	train-rmse:0.508071	eval-rmse:0.371036
[8]	train-rmse:0.505314	eval-rmse:0.367946
[9]	train-rmse:0.504047	eval-rmse:0.365701
[10]	train-rmse:0.503277	eval-rmse:0.366072
[11]	train-rmse:0.502949	eval-rmse:0.363009
[12]	train-rmse:0.502763	eval-rmse:0.364069
[13]	train-rmse:0.502612	eval-rmse:0.365716
[14]	train-rmse:0.502562	eval-rmse:0.367437
[15]	train-rmse:0.502574	eval-rmse:0.369057
[16]	train-rmse:0.502547	eval-rmse:0.370524
[17]	train-rmse:0.502522	eval-rmse:0.370874
[18]	train-rmse:0.502507	eval-rmse:0.372989
[19]	train-rmse:0.502514	eval-rms

[182]	train-rmse:0.502501	eval-rmse:0.371063
[183]	train-rmse:0.502456	eval-rmse:0.368494
[184]	train-rmse:0.502494	eval-rmse:0.369278
[185]	train-rmse:0.502496	eval-rmse:0.371559
[186]	train-rmse:0.502483	eval-rmse:0.3742
[187]	train-rmse:0.502485	eval-rmse:0.373415
[188]	train-rmse:0.502492	eval-rmse:0.373314
[189]	train-rmse:0.502523	eval-rmse:0.371677
[190]	train-rmse:0.502541	eval-rmse:0.369805
[191]	train-rmse:0.502536	eval-rmse:0.36994
[192]	train-rmse:0.50258	eval-rmse:0.373807
[193]	train-rmse:0.502588	eval-rmse:0.373543
[194]	train-rmse:0.502526	eval-rmse:0.373496
[195]	train-rmse:0.502505	eval-rmse:0.370604
[196]	train-rmse:0.502508	eval-rmse:0.369542
[197]	train-rmse:0.50249	eval-rmse:0.36953
[198]	train-rmse:0.50249	eval-rmse:0.367424
[199]	train-rmse:0.502466	eval-rmse:0.364999
[200]	train-rmse:0.502488	eval-rmse:0.368735
[201]	train-rmse:0.5025	eval-rmse:0.369732
[202]	train-rmse:0.502499	eval-rmse:0.366237
[203]	train-rmse:0.502544	eval-rmse:0.366184
[204]	train-rmse:0.

In [26]:
create_feature_map(features)
importance = model_xgb.get_fscore(fmap='xgb.fmap')
print(importance)
#importance = sorted(importance.items(), key=operator.itemgetter(1))

{'Day': 4500, 'Month': 522, 'WOY': 1768, 'DOW': 2032}


In [27]:
print("Validating")
test = valid
test['pred_sales'] = np.exp(model_xgb.predict(xgb.DMatrix(test[features])))




Validating


In [28]:

test_e = pd.merge(test, df_items, on='item_nbr',how='inner')
test_e['weights'] = 1
test_e.loc[(test_e.perishable== 0), ('weights')] = 1.25
test_e.head(2)


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,mset,DOW,WOY,Year,Month,Day,avg_m06,pred_sales,family,class,perishable,weights
0,123314004,2017-07-26,9,103501,3.0,False,1,2,30,2017,7,26,5.44,4.082291,CLEANING,3008,0,1.25
1,123317129,2017-07-26,11,103501,1.0,False,1,2,30,2017,7,26,3.736111,4.082291,CLEANING,3008,0,1.25


In [45]:
#print(test.loc[:, "unit_sales"].isnull().values.any())
#print(test.loc[:, "pred_sales"].isnull().values.any())
weights = np.ones(test_e.shape[0])
result = NWRMSLE_A(test_e.unit_sales,test_e.pred_sales, test_e.weights)

test_e['error'] =  abs(test_e.pred_sales - test_e.unit_sales)
print("Bias =",  (test_e.pred_sales.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())
print("WMAPE =",  abs(test_e.error.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())
print("Forecast Period From:", min(test.date)," To: ", max(test.date))
print("NWRMSLE_A = ",result)


Bias = 0.20072
WMAPE = 0.425496
Forecast Period From: 2017-07-26 00:00:00  To:  2017-08-10 00:00:00
NWRMSLE_A =  0.5932419392289525


In [30]:
#### check result on first 6 days.
test_p1 = test_e.loc[(test_e.date < '2017-08-01'), ]
weights_p1 = np.ones(test_p1.shape[0])
result_p1 = NWRMSLE_A(test_p1.unit_sales.astype(np.float32),test_p1.pred_sales.astype(np.float32), test_p1.weights)
print("Number of rows in test is", test_p1.shape[0])
print("Forecast Period From:", min(test_p1.date)," To: ", max(test_p1.date))
print("NWRMSLE = ",result_p1)

Number of rows in test is 147
Forecast Period From: 2017-07-26 00:00:00  To:  2017-07-31 00:00:00
NWRMSLE =  0.569663667416176


In [40]:
test_e['error'] =  abs(test_e.pred_sales - test_e.unit_sales)
test_e

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,mset,DOW,WOY,Year,Month,Day,avg_m06,pred_sales,family,class,perishable,weights,error
0,123314004,2017-07-26,9,103501,3.0,False,1,2,30,2017,7,26,5.440000,4.082291,CLEANING,3008,0,1.25,1.082291
1,123317129,2017-07-26,11,103501,1.0,False,1,2,30,2017,7,26,3.736111,4.082291,CLEANING,3008,0,1.25,3.082291
2,123321668,2017-07-26,14,103501,2.0,False,1,2,30,2017,7,26,3.414286,4.082291,CLEANING,3008,0,1.25,2.082291
3,123322947,2017-07-26,15,103501,2.0,False,1,2,30,2017,7,26,4.253521,4.082291,CLEANING,3008,0,1.25,2.082291
4,123324366,2017-07-26,16,103501,1.0,True,1,2,30,2017,7,26,2.278688,4.082291,CLEANING,3008,0,1.25,3.082291
5,123325747,2017-07-26,17,103501,2.0,False,1,2,30,2017,7,26,4.095891,4.082291,CLEANING,3008,0,1.25,2.082291
6,123327421,2017-07-26,18,103501,3.0,False,1,2,30,2017,7,26,3.485714,4.082291,CLEANING,3008,0,1.25,1.082291
7,123329195,2017-07-26,19,103501,2.0,False,1,2,30,2017,7,26,3.408451,4.082291,CLEANING,3008,0,1.25,2.082291
8,123330557,2017-07-26,20,103501,3.0,False,1,2,30,2017,7,26,5.527027,4.082291,CLEANING,3008,0,1.25,1.082291
9,123332450,2017-07-26,21,103501,4.0,False,1,2,30,2017,7,26,5.733333,4.082291,CLEANING,3008,0,1.25,0.082291


In [46]:
#test_e.loc[(test_e.store_nbr == 9 ) & (test_e.item_nbr <= 103501 ), ]
#test_e.loc[(test_e.store_nbr == 9 ) , ]
test_e['error'] =  abs(test_e.pred_sales - test_e.unit_sales)
print("Bias =",  (test_e.pred_sales.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())
print("WMAPE =",  abs(test_e.error.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())

Bias = 0.20072
WMAPE = 0.425496
