In [17]:
import pandas as pd
import numpy as np
from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

import sklearn.metrics as skl_metrics
import xgboost as xgb
from sklearn.cross_validation import train_test_split

import math

logger = getLogger(__name__)
pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../result_tmp/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def NWRMSLE(y, pred, weights=None):
    err2 = skl_metrics.mean_squared_log_error(y, pred, sample_weight=weights)
    return math.sqrt(err2)

def NWRMSLE_A(y, pred, weights):
    y = np.array(y)
    pred = np.array(pred)
    weights = np.array(weights)
    weighted_errors = np.dot(np.square(np.log1p(pred) - np.log1p(y)), np.transpose(weights))
    weights_sum = np.sum(weights)
    return math.sqrt(weighted_errors/weights_sum)

def NWRMSLE_lgb(pred, dtrain):
    y = list(dtrain.get_label())
    score = NWRMSLE(y, pred)
    return 'NWRMSLE', score, False

def eval_test(test_e):

    test_e['weights'] = 1
    test_e.loc[(test_e.perishable == 1), ('weights')] = 1.25

    result = NWRMSLE(test_e.unit_sales.astype(np.float64),test_e.pred_sales.astype(np.float64), test_e.weights)

    print("Eval All, Number of rows in test is", test_e.shape[0])
    print("Eval all, Forecast Period From:", min(test_e.date)," To: ", max(test_e.date))

    #### check result on first 6 days.
    test_p1 = test_e.loc[(test_e.date < '2017-08-01'), ]
    result_p1 = NWRMSLE_A(test_p1.unit_sales.astype(np.float32),test_p1.pred_sales.astype(np.float32), test_p1.weights)

    print("Eval P1, Number of rows in test is", test_p1.shape[0])
    print("Eval P1, Forecast Period From:", min(test_p1.date)," To: ", max(test_p1.date))

    #### check result on last 10 days.
    test_p2 = test_e.loc[(test_e.date >= '2017-08-01'), ]
    result_p2 = NWRMSLE_A(test_p2.unit_sales.astype(np.float32),test_p2.pred_sales.astype(np.float32), test_p2.weights)

    print("Eval P2, Number of rows in test is", test_p2.shape[0])
    print("Eval P2, Forecast Period From:", min(test_p2.date)," To: ", max(test_p2.date))

    print("Eval All Weighted NWRMSLE = ",result)
    print("Eval P1  Weighted NWRMSLE = ",result_p1)
    print("Eval P2  Weighted NWRMSLE = ",result_p2)

#--------------------------------------------------------------------------------------------------

logger.info('start')

items = pd.read_csv('../../input/items.csv'  )

dtypes = {'id':'uint32', 'item_nbr':'int32', 'store_nbr':'int8', 'unit_sales':'float32'}
#train_all = pd.read_csv('../input/train.csv', usecols=[1,2,3,4,5], dtype=dtypes, parse_dates=['date'] )
train_all = pd.read_csv('../../input/train_small.csv', usecols=[1,2,3,4,5], dtype=dtypes, parse_dates=['date'] )

train_16_17 = train_all.loc[((train_all.date >= '2016-06-01') & (train_all.date <= '2016-08-31' ))
                         |((train_all.date >= '2017-06-01') & (train_all.date <= '2017-08-31' )) , ]

del train_all

#Load test
train_16_17['mset'] = 1

#Load test
#test_all = pd.read_csv('../input/test.csv', dtype=dtypes, parse_dates=['date'])
test_all = pd.read_csv('../../input/test_small.csv', dtype=dtypes, parse_dates=['date'])
test_all['mset'] = 0

df_train = pd.concat([train_16_17, test_all], ignore_index=True)

#-------------------------------------------------------------------------------------------------

df_train.loc[(df_train.unit_sales<0),'unit_sales'] = 0 # eliminate negatives
df_train.loc[:, 'unit_sales'].fillna(0, inplace=True) # fill NaNs

df_train['DOW'] = df_train['date'].dt.dayofweek
df_train['WOY'] = df_train['date'].dt.weekofyear
df_train['Year'] = df_train['date'].dt.year
df_train['Month'] = df_train['date'].dt.month
df_train['Day'] = df_train['date'].dt.day

print('training data prepared')

#-------------------------------------------------------------------------------------------------

train_m06 = df_train.loc[(df_train.Month == 6) ,]
    
ma_m06 = train_m06[['item_nbr','store_nbr','Year','unit_sales']].groupby(['item_nbr','store_nbr','Year'])['unit_sales'].mean().to_frame('avg_m06')
ma_m06.reset_index(inplace=True)

df_train = pd.merge(df_train, ma_m06, how='left', on=['item_nbr','store_nbr','Year'])

#features = (['DOW', 'WOY', 'Month', 'Day', 'avg_m06'])
#print(features)

#-------------------------------------------------------------------------------------------------

train_m07 = df_train.loc[(df_train.Month == 7) ,]
    
ma_m07 = train_m07[['item_nbr','store_nbr','Year','unit_sales']].groupby(['item_nbr','store_nbr','Year'])['unit_sales'].mean().to_frame('avg_m07')
ma_m07.reset_index(inplace=True)
df_train = pd.merge(df_train, ma_m07, how='left', on=['item_nbr','store_nbr','Year'])

features = (['DOW', 'WOY', 'Month', 'Day', 'avg_m06', 'avg_m07'])

#--------------------------------------------------------------------------------------------------
train = df_train.loc[(df_train.date >= '2016-08-01') & (df_train.date <= '2016-08-31' ), ]
#test = df_train.loc[(df_train.date > '2017-07-25') & (df_train.date <= '2017-08-10' ), ]
test = df_train.loc[df_train['mset'] == 0,]

print('training data processed')



2017-12-12 22:07:51,444 __main__ 29 [INFO][<module>] start 
2017-12-12 22:07:51,444 __main__ 29 [INFO][<module>] start 
2017-12-12 22:07:51,451 __main__ 84 [INFO][<module>] start 
2017-12-12 22:07:51,451 __main__ 84 [INFO][<module>] start 


training data prepared
training data processed


In [18]:
df_train

Unnamed: 0,date,id,item_nbr,mset,onpromotion,store_nbr,unit_sales,DOW,WOY,Year,Month,Day,avg_m06,avg_m07
0,2016-06-01,,103501,1,False,9,6.0,2,22,2016,6,1,4.724138,5.451613
1,2016-06-01,,103501,1,False,10,1.0,2,22,2016,6,1,2.269231,2.740741
2,2016-06-01,,103501,1,False,11,6.0,2,22,2016,6,1,3.517241,4.357143
3,2016-06-01,,103501,1,False,13,1.0,2,22,2016,6,1,2.083333,2.892857
4,2016-06-01,,103501,1,False,14,6.0,2,22,2016,6,1,3.620690,3.040000
5,2016-06-01,,103501,1,False,15,5.0,2,22,2016,6,1,3.307692,3.774194
6,2016-06-01,,103501,1,False,16,3.0,2,22,2016,6,1,4.500000,4.066667
7,2016-06-01,,103501,1,False,17,7.0,2,22,2016,6,1,4.933333,5.064516
8,2016-06-01,,103501,1,False,18,4.0,2,22,2016,6,1,2.800000,2.615385
9,2016-06-01,,103501,1,False,19,6.0,2,22,2016,6,1,3.400000,4.419355


In [19]:
test

Unnamed: 0,date,id,item_nbr,mset,onpromotion,store_nbr,unit_sales,DOW,WOY,Year,Month,Day,avg_m06,avg_m07
4159,2017-08-16,125497042.0,103501,0,False,1,0.0,2,33,2017,8,16,,
4160,2017-08-16,125500943.0,103501,0,False,2,0.0,2,33,2017,8,16,,
4161,2017-08-16,125504844.0,103501,0,False,3,0.0,2,33,2017,8,16,,
4162,2017-08-16,125508745.0,103501,0,False,4,0.0,2,33,2017,8,16,,
4163,2017-08-16,125512646.0,103501,0,False,5,0.0,2,33,2017,8,16,,
4164,2017-08-16,125516547.0,103501,0,False,6,0.0,2,33,2017,8,16,,
4165,2017-08-16,125520448.0,103501,0,False,7,0.0,2,33,2017,8,16,,
4166,2017-08-16,125524349.0,103501,0,False,8,0.0,2,33,2017,8,16,,
4167,2017-08-16,125528250.0,103501,0,False,9,0.0,2,33,2017,8,16,5.466667,5.645161
4168,2017-08-16,125532151.0,103501,0,False,10,0.0,2,33,2017,8,16,2.750000,2.571429


In [20]:

params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.5, random_state=10)
y_train = np.log1p(X_train.unit_sales)
y_valid = np.log1p(X_valid.unit_sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model_xgb = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=20, verbose_eval=True)


create_feature_map(features)
importance = model_xgb.get_fscore(fmap='xgb.fmap')
print(importance)



Train a XGBoost model
[0]	train-rmse:0.892159	eval-rmse:0.877252
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 20 rounds.
[1]	train-rmse:0.688105	eval-rmse:0.693599
[2]	train-rmse:0.550685	eval-rmse:0.594721
[3]	train-rmse:0.452054	eval-rmse:0.539614
[4]	train-rmse:0.381113	eval-rmse:0.498497
[5]	train-rmse:0.34845	eval-rmse:0.472817
[6]	train-rmse:0.305938	eval-rmse:0.467332
[7]	train-rmse:0.272888	eval-rmse:0.461479
[8]	train-rmse:0.236785	eval-rmse:0.467187
[9]	train-rmse:0.207402	eval-rmse:0.469224
[10]	train-rmse:0.185984	eval-rmse:0.476761
[11]	train-rmse:0.170477	eval-rmse:0.47833
[12]	train-rmse:0.157788	eval-rmse:0.485381
[13]	train-rmse:0.147169	eval-rmse:0.489382
[14]	train-rmse:0.133598	eval-rmse:0.493025
[15]	train-rmse:0.121473	eval-rmse:0.495781
[16]	train-rmse:0.110743	eval-rmse:0.500483
[17]	train-rmse:0.098547	eval-rmse:0.503705
[18]	train-rmse:0.090438	eval-rmse:0.506459
[19]	train-

In [24]:
test.loc[:, 'avg_m06'].fillna(0, inplace=True) # fill NaNs
test.loc[:, 'avg_m07'].fillna(0, inplace=True) # fill NaNs

#-------------------------------------------------------------------------------------
#Load test
#test = valid

test['unit_sales'] = np.exp(model_xgb.predict(xgb.DMatrix(test[features])))

test.loc[ (test['avg_m07'] == 0) | (test['avg_m06'] == 0) , 'unit_sales'] = 0


#test[['id']] = test[['id']].astype(np.int32)
#test[['id','unit_sales']].to_csv('xgb_v01.csv.gz', index=False, float_format='%.3f', compression='gzip')

#---------------------- test_e to evaluate the result --------------------------------
#weights = np.ones(test.shape[0])
#test_e = pd.merge(test, items, on='item_nbr',how='inner')
#eval_test(test_e)

#-------------------------------------------------------------------------------------
logger.info('end')

2017-12-12 22:10:01,202 __main__ 22 [INFO][<module>] end 
2017-12-12 22:10:01,202 __main__ 22 [INFO][<module>] end 


In [25]:
test

Unnamed: 0,date,id,item_nbr,mset,onpromotion,store_nbr,unit_sales,DOW,WOY,Year,Month,Day,avg_m06,avg_m07
4159,2017-08-16,125497042.0,103501,0,False,1,0.000000,2,33,2017,8,16,0.000000,0.000000
4160,2017-08-16,125500943.0,103501,0,False,2,0.000000,2,33,2017,8,16,0.000000,0.000000
4161,2017-08-16,125504844.0,103501,0,False,3,0.000000,2,33,2017,8,16,0.000000,0.000000
4162,2017-08-16,125508745.0,103501,0,False,4,0.000000,2,33,2017,8,16,0.000000,0.000000
4163,2017-08-16,125512646.0,103501,0,False,5,0.000000,2,33,2017,8,16,0.000000,0.000000
4164,2017-08-16,125516547.0,103501,0,False,6,0.000000,2,33,2017,8,16,0.000000,0.000000
4165,2017-08-16,125520448.0,103501,0,False,7,0.000000,2,33,2017,8,16,0.000000,0.000000
4166,2017-08-16,125524349.0,103501,0,False,8,0.000000,2,33,2017,8,16,0.000000,0.000000
4167,2017-08-16,125528250.0,103501,0,False,9,7.210409,2,33,2017,8,16,5.466667,5.645161
4168,2017-08-16,125532151.0,103501,0,False,10,2.748190,2,33,2017,8,16,2.750000,2.571429
