In [1]:

"""
This is an upgraded version of Ceshine's LGBM starter script, simply adding more
average features and weekly average features on it.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import sys

import math
import sklearn.metrics as skl_metrics

from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

logger = getLogger(__name__)
pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../logs/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

#------------------------------------------------------------------------------------#

df_train = pd.read_csv(
    '../input/train_1s.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
    float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
)

df_test = pd.read_csv(
    "../input/test_1s.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)


items = pd.read_csv(
    "../input/items.csv",
).set_index("item_nbr")

df_2017 = df_train.loc[df_train.date>=pd.datetime(2017,1,1)]
del df_train

promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

items = items.reindex(df_2017.index.get_level_values(1))

df_2017[pd.datetime(2017, 1, 1)] = 0
df_2017[pd.datetime(2016, 12, 25)] = 0

#------------------------------------------------------------------------------------------#
# Functions

def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def NWRMSLE(y, pred, weights=None):
    err2 = skl_metrics.mean_squared_log_error(y, pred, sample_weight=weights)
    return math.sqrt(err2)

def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        #"mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        #"mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        #"mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        
        #"mean_21_2017": get_timespan(df_2017, t2017, 21, 21).mean(axis=1).values,
        #"mean_42_2017": get_timespan(df_2017, t2017, 42, 42).mean(axis=1).values,
        #"mean_91_2017": get_timespan(df_2017, t2017, 91, 91).mean(axis=1).values,
        #"mean_150_2017": get_timespan(df_2017, t2017, 150, 150).mean(axis=1).values,
        
        #"promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        #"promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        #"promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X


def eval_test(test_e):

    test_e['weights'] = 1
    test_e.loc[(test_e.perishable == 1), ('weights')] = 1.25

    result = NWRMSLE(test_e.unit_sales.astype(np.float64),test_e.pred_sales.astype(np.float64), test_e.weights)

    print("Eval All, Number of rows in test is", test_e.shape[0])
    print("Eval all, Forecast Period From:", min(test_e.date)," To: ", max(test_e.date))

    #### check result on first 6 days.
    test_p1 = test_e.loc[(test_e.date < '2017-08-01'), ]
    result_p1 = NWRMSLE(test_p1.unit_sales.astype(np.float32),test_p1.pred_sales.astype(np.float32), test_p1.weights)

    print("Eval P1, Number of rows in test is", test_p1.shape[0])
    print("Eval P1, Forecast Period From:", min(test_p1.date)," To: ", max(test_p1.date))

    #### check result on last 10 days.
    test_p2 = test_e.loc[(test_e.date >= '2017-08-01'), ]
    result_p2 = NWRMSLE(test_p2.unit_sales.astype(np.float32),test_p2.pred_sales.astype(np.float32), test_p2.weights)

    print("Eval P2, Number of rows in test is", test_p2.shape[0])
    print("Eval P2, Forecast Period From:", min(test_p2.date)," To: ", max(test_p2.date))

    print("Eval All Weighted NWRMSLE = ",result)
    print("Eval P1  Weighted NWRMSLE = ",result_p1)
    print("Eval P2  Weighted NWRMSLE = ",result_p2)

    
    test_e['error'] =  abs(test_e.pred_sales - test_e.unit_sales)
    print("Bias =",  (test_e.pred_sales.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())
    print("WMAPE =",  abs(test_e.error.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())
    
#------------------------------------------------------------------------------------------#
logger.info('Preparing datasetn...')

t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train_allF = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val_allF, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)


2018-01-02 11:01:02,485 __main__ 37 [INFO][<module>] start 
  interactivity=interactivity, compiler=compiler, result=result)
2018-01-02 11:01:08,620 __main__ 168 [INFO][<module>] Preparing datasetn... 


In [2]:
features_all = X_train_allF.columns.tolist()
i = 15

print("=" * 50)
print("Step %d" % (i+1))
print("=" * 50)

features_t = features_all.copy()

for j in range(16):
    if j != i:
        features_t.remove("promo_{}".format(j))

for j in range(7):
    if j != i%7:
        features_t.remove('mean_4_dow{}_2017'.format(j))
        features_t.remove('mean_20_dow{}_2017'.format(j))
        
X_train = X_train_allF[features_t]
X_val = X_val_allF[features_t]


Step 16


In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20754 entries, 0 to 3458
Data columns (total 7 columns):
day_1_2017           20754 non-null float64
mean_14_2017         20754 non-null float64
mean_3_2017          20754 non-null float64
mean_7_2017          20754 non-null float64
mean_4_dow1_2017     20754 non-null float64
mean_20_dow1_2017    20754 non-null float64
promo_15             20754 non-null uint8
dtypes: float64(6), uint8(1)
memory usage: 1.1 MB


In [4]:

#------------------------------------------------------------------------------------------#
logger.info('Training and predicting models...')

params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

features_all = X_train_allF.columns.tolist()

   
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    
    features_t = features_all.copy()
    
    for j in range(16):
        if j != i:
            features_t.remove("promo_{}".format(j))
    

    X_train = X_train_allF[features_t]
    X_val = X_val_allF[features_t]
    
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))


2018-01-02 11:01:08,983 __main__ 3 [INFO][<module>] Training and predicting models... 


Step 1
Training until validation scores don't improve for 50 rounds.




Early stopping, best iteration is:
[44]	training's l2: 0.285433	valid_1's l2: 0.284064
mean_7_2017: 32992.22
mean_14_2017: 21698.02
mean_20_dow0_2017: 6910.73
promo_0: 2657.10
mean_3_2017: 1411.59
mean_20_dow6_2017: 859.31
mean_4_dow0_2017: 671.51
day_1_2017: 561.85
mean_4_dow6_2017: 487.25
mean_4_dow5_2017: 386.60
mean_4_dow2_2017: 280.50
mean_20_dow5_2017: 270.05
mean_20_dow1_2017: 214.10
mean_4_dow1_2017: 194.33
mean_4_dow3_2017: 187.49
mean_20_dow2_2017: 143.05
mean_20_dow4_2017: 120.69
mean_4_dow4_2017: 94.70
mean_20_dow3_2017: 81.85
Step 2
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.27712	valid_1's l2: 0.306631
Early stopping, best iteration is:
[56]	training's l2: 0.292833	valid_1's l2: 0.304865
mean_7_2017: 20819.52
mean_14_2017: 20593.56
mean_20_dow1_2017: 6482.15
mean_4_dow1_2017: 1290.83
promo_1: 1125.96
mean_20_dow0_2017: 721.99
mean_3_2017: 500.73
mean_4_dow5_2017: 338.52
mean_4_dow3_2017: 333.04
day_1_2017: 314.28
mean_4_dow2_2017:

Early stopping, best iteration is:
[31]	training's l2: 0.327274	valid_1's l2: 0.340981
mean_14_2017: 21951.21
mean_7_2017: 15097.79
mean_20_dow5_2017: 7396.12
mean_4_dow5_2017: 2183.31
promo_12: 1921.69
mean_20_dow6_2017: 1679.24
mean_4_dow6_2017: 1380.27
mean_4_dow3_2017: 327.84
mean_4_dow1_2017: 260.95
mean_20_dow2_2017: 230.90
mean_3_2017: 228.06
mean_20_dow0_2017: 219.30
mean_4_dow4_2017: 188.98
mean_20_dow1_2017: 183.41
mean_20_dow4_2017: 173.15
mean_4_dow0_2017: 165.61
day_1_2017: 126.48
mean_4_dow2_2017: 122.17
mean_20_dow3_2017: 94.80
Step 14
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[32]	training's l2: 0.338146	valid_1's l2: 0.382329
mean_14_2017: 20104.40
mean_20_dow6_2017: 15740.98
mean_7_2017: 6077.04
promo_13: 1947.17
mean_20_dow5_2017: 1831.11
mean_4_dow0_2017: 1290.81
mean_4_dow6_2017: 1029.22
mean_4_dow5_2017: 996.19
mean_4_dow2_2017: 630.49
mean_20_dow0_2017: 480.64
mean_20_dow2_2017: 424.04
mean_20_dow1_2017: 385.

In [5]:

del X_train, y_train
#------------------------------------------------------------------------------------------#
# Validate 
#### Need to use expm1 when y is log1p
logger.info('validate accuracy ...')

valid = pd.DataFrame(
    np.expm1(y_val), index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("unit_sales")

pred = pd.DataFrame(
    np.expm1(np.array(val_pred).transpose()), index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("pred_sales")

valid = valid.reset_index()
pred = pred.reset_index()

test_e = pd.merge(valid, pred, on=['item_nbr','store_nbr', 'level_2'])
#items = items.reset_index()

#test_e = pd.merge(valid_m, items, on='item_nbr',how='inner')
test_e["date"] = test_e.level_2

#del valid, pred
#del X_val, y_val


test_e.to_pickle('./data/T006_lgb_val.p')

#------------------------------------------------------------------------------------------#
# Submit
logger.info('Making submission...')

y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('../submit/T006_lgb_moreWK.csv', float_format='%.4f', index=None)

####### PZ, Check overral result
print("SUM =",  submission.unit_sales.sum())
print("MEAN =",  submission.unit_sales.mean())

print(mean_squared_error(y_val, np.array(val_pred).transpose()))
print(submission.unit_sales.sum())
print(submission.unit_sales.mean())

2018-01-02 11:01:16,845 __main__ 6 [INFO][<module>] validate accuracy ... 


FileNotFoundError: [Errno 2] No such file or directory: './data/T006_lgb_val.p'