In [41]:
import pandas as pd
import numpy as np
from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

import sklearn.metrics as skl_metrics
import xgboost as xgb
from sklearn.cross_validation import train_test_split

import math

logger = getLogger(__name__)
pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../result_tmp/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def NWRMSLE(y, pred, weights=None):
    err2 = skl_metrics.mean_squared_log_error(y, pred, sample_weight=weights)
    return math.sqrt(err2)

def NWRMSLE_A(y, pred, weights):
    y = np.array(y)
    pred = np.array(pred)
    weights = np.array(weights)
    weighted_errors = np.dot(np.square(np.log1p(pred) - np.log1p(y)), np.transpose(weights))
    weights_sum = np.sum(weights)
    return math.sqrt(weighted_errors/weights_sum)

def NWRMSLE_lgb(pred, dtrain):
    y = list(dtrain.get_label())
    score = NWRMSLE(y, pred)
    return 'NWRMSLE', score, False


dtypes = {'item_nbr':'int32', 'store_nbr':'int8', 'unit_sales':'float32'}

# Test period 07-26 to 08-10
# train 35328
train = pd.read_csv('../../input/train_small.csv',  dtype=dtypes, parse_dates=['date'])
items = pd.read_csv('../../input/items.csv'  )
logger.info('load data successful')

#train_all.loc[(train_all.onpromotion!='True'),]

2017-12-01 22:17:18,317 __main__ 29 [INFO][<module>] start 
2017-12-01 22:17:18,317 __main__ 29 [INFO][<module>] start 
2017-12-01 22:17:18,317 __main__ 29 [INFO][<module>] start 
2017-12-01 22:17:18,360 __main__ 61 [INFO][<module>] load data successful 
2017-12-01 22:17:18,360 __main__ 61 [INFO][<module>] load data successful 
2017-12-01 22:17:18,360 __main__ 61 [INFO][<module>] load data successful 


In [42]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35841 entries, 0 to 35840
Data columns (total 5 columns):
date           35841 non-null datetime64[ns]
store_nbr      35841 non-null int8
item_nbr       35841 non-null int32
unit_sales     35841 non-null float32
onpromotion    26874 non-null object
dtypes: datetime64[ns](1), float32(1), int32(1), int8(1), object(1)
memory usage: 875.1+ KB


In [43]:

#train['unit_sales'] =  train['unit_sales'].apply(pd.np.log1p) #logarithm conversion

train.loc[(train.unit_sales<0),'unit_sales'] = 0 # eliminate negatives
train.loc[:, 'unit_sales'].fillna(0, inplace=True) # fill NaNs

train['DOW'] = train['date'].dt.dayofweek
train['WOY'] = train['date'].dt.weekofyear
train['Year'] = train['date'].dt.year
train['Month'] = train['date'].dt.month
train['Day'] = train['date'].dt.day

print('training data processed')

train.head(1)


training data processed


Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,DOW,WOY,Year,Month,Day
0,2013-01-02,9,103501,19.0,,2,1,2013,1,2


In [44]:
train_all = train

train = train_all.loc[(train_all.date <= '2017-07-25'), ]

valid = train_all.loc[(train_all.date > '2017-07-25') & (train_all.date <= '2017-08-10' ), ]


#train = train.loc[(train.store_nbr == 9), ]
#test = test.loc[(test.store_nbr == 9), ]

valid.tail(1)


Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,DOW,WOY,Year,Month,Day
35718,2017-08-10,43,103501,10.0,False,3,32,2017,8,10


In [45]:
#features = (['DOW', 'Month', 'Day', 'Year', 'WOY','unit_sales'])
features = (['DOW', 'WOY','unit_sales'])
print(features)

print('training data processed')

params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.unit_sales)
y_valid = np.log1p(X_valid.unit_sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)


['DOW', 'WOY', 'unit_sales']
training data processed
Train a XGBoost model


In [46]:
train.shape

(35327, 10)

In [47]:

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, verbose_eval=True)

[0]	train-rmse:0.842225	eval-rmse:0.862482
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
[1]	train-rmse:0.647974	eval-rmse:0.665377
[2]	train-rmse:0.526309	eval-rmse:0.540369
[3]	train-rmse:0.370388	eval-rmse:0.379089
[4]	train-rmse:0.261896	eval-rmse:0.266882
[5]	train-rmse:0.184408	eval-rmse:0.187768
[6]	train-rmse:0.130449	eval-rmse:0.132593
[7]	train-rmse:0.093986	eval-rmse:0.094654
[8]	train-rmse:0.067839	eval-rmse:0.067835
[9]	train-rmse:0.05951	eval-rmse:0.059354
[10]	train-rmse:0.043969	eval-rmse:0.043453
[11]	train-rmse:0.033821	eval-rmse:0.032875
[12]	train-rmse:0.026908	eval-rmse:0.025899
[13]	train-rmse:0.024099	eval-rmse:0.023372
[14]	train-rmse:0.02259	eval-rmse:0.021983
[15]	train-rmse:0.018376	eval-rmse:0.017608
[16]	train-rmse:0.015563	eval-rmse:0.014917
[17]	train-rmse:0.014507	eval-rmse:0.013955
[18]	train-rmse:0.012741	eval-rmse:0.012431
[19]	train-rmse:0.01175	eval-rms

[183]	train-rmse:0.000552	eval-rmse:0.001549
[184]	train-rmse:0.000552	eval-rmse:0.001549
[185]	train-rmse:0.000552	eval-rmse:0.001549
[186]	train-rmse:0.000552	eval-rmse:0.001549
[187]	train-rmse:0.000552	eval-rmse:0.001549
[188]	train-rmse:0.000552	eval-rmse:0.001549
[189]	train-rmse:0.000552	eval-rmse:0.001549
[190]	train-rmse:0.000552	eval-rmse:0.001549
[191]	train-rmse:0.000552	eval-rmse:0.001549
[192]	train-rmse:0.000552	eval-rmse:0.001549
[193]	train-rmse:0.000552	eval-rmse:0.001549
[194]	train-rmse:0.000552	eval-rmse:0.001549
[195]	train-rmse:0.000552	eval-rmse:0.001549
[196]	train-rmse:0.000552	eval-rmse:0.001549
[197]	train-rmse:0.000552	eval-rmse:0.001549
[198]	train-rmse:0.000552	eval-rmse:0.001549
[199]	train-rmse:0.000552	eval-rmse:0.001549
[200]	train-rmse:0.000552	eval-rmse:0.001549
[201]	train-rmse:0.000552	eval-rmse:0.001549
[202]	train-rmse:0.000552	eval-rmse:0.001549
[203]	train-rmse:0.000552	eval-rmse:0.001549
[204]	train-rmse:0.000552	eval-rmse:0.001549
[205]	trai

In [48]:
create_feature_map(features)
importance = gbm.get_fscore(fmap='xgb.fmap')
importance
#importance = sorted(importance.items(), key=operator.itemgetter(1))

{'DOW': 1948, 'WOY': 5537, 'unit_sales': 3086}

In [49]:
print("Validating")
test = valid
test['pred_sales'] = gbm.predict(xgb.DMatrix(test[features]))




Validating


In [50]:

test_e = pd.merge(test, items, on='item_nbr',how='inner')
test_e['weights'] = 1
test_e.loc[(test_e.perishable== 0), ('weights')] = 1.25
test_e.head(2)


Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,DOW,WOY,Year,Month,Day,pred_sales,family,class,perishable,weights
0,2017-07-26,9,103501,3.0,False,2,30,2017,7,26,1.386789,CLEANING,3008,0,1.25
1,2017-07-26,11,103501,1.0,False,2,30,2017,7,26,0.693758,CLEANING,3008,0,1.25


In [51]:
#print(test.loc[:, "unit_sales"].isnull().values.any())
#print(test.loc[:, "pred_sales"].isnull().values.any())
weights = np.ones(test_e.shape[0])
result = NWRMSLE_A(test_e.unit_sales,test_e.pred_sales, test_e.weights)
print("Forecast Period From:", min(test.date)," To: ", max(test.date))
print("NWRMSLE_A = ",result)


Forecast Period From: 2017-07-26 00:00:00  To:  2017-08-10 00:00:00
NWRMSLE_A =  0.6973722969698247


In [52]:
#### check result on first 6 days.
test_p1 = test_e.loc[(test_e.date < '2017-08-01'), ]
weights_p1 = np.ones(test_p1.shape[0])
result_p1 = NWRMSLE_A(test_p1.unit_sales.astype(np.float32),test_p1.pred_sales.astype(np.float32), test_p1.weights)
print("Number of rows in test is", test_p1.shape[0])
print("Forecast Period From:", min(test_p1.date)," To: ", max(test_p1.date))
print("NWRMSLE = ",result_p1)

Number of rows in test is 147
Forecast Period From: 2017-07-26 00:00:00  To:  2017-07-31 00:00:00
NWRMSLE =  0.7215935018713918
