In [1]:

"""
This is an upgraded version of Ceshine's LGBM starter script, simply adding more
average features and weekly average features on it.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import sys

import math
import sklearn.metrics as skl_metrics

from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

logger = getLogger(__name__)
pd.options.mode.chained_assignment = None  # default='warn'


#------------------------------------------------------------------------------------#


df_train = pd.read_csv(
    '../input/train_p.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
    float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
   ###skiprows=range(1, 66458909)  # 2016-01-01
)

df_test = pd.read_csv(
    "../input/test_p.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)


items = pd.read_csv(
    "../input/items.csv",
).set_index("item_nbr")

df_2017 = df_train.loc[df_train.date>=pd.datetime(2016,8,16)]
del df_train

promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

items = items.reindex(df_2017.index.get_level_values(1))

#------------------------------------------------------------------------------------------#
# Functions

def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def NWRMSLE(y, pred, weights=None):
    err2 = skl_metrics.mean_squared_log_error(y, pred, sample_weight=weights)
    return math.sqrt(err2)

def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        
        "mean_21_2017": get_timespan(df_2017, t2017, 21, 21).mean(axis=1).values,
        "mean_42_2017": get_timespan(df_2017, t2017, 42, 42).mean(axis=1).values,
        "mean_91_2017": get_timespan(df_2017, t2017, 91, 91).mean(axis=1).values,
#        "mean_183_2017": get_timespan(df_2017, t2017, 183, 183).mean(axis=1).values,
        
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X


def eval_test(test_e):

    test_e['weights'] = 1
    test_e.loc[(test_e.perishable == 1), ('weights')] = 1.25

    result = NWRMSLE(test_e.unit_sales.astype(np.float64),test_e.pred_sales.astype(np.float64), test_e.weights)

    print("Eval All, Number of rows in test is", test_e.shape[0])
    print("Eval all, Forecast Period From:", min(test_e.date)," To: ", max(test_e.date))

    #### check result on first 6 days.
    test_p1 = test_e.loc[(test_e.date < '2017-08-01'), ]
    result_p1 = NWRMSLE(test_p1.unit_sales.astype(np.float32),test_p1.pred_sales.astype(np.float32), test_p1.weights)

    print("Eval P1, Number of rows in test is", test_p1.shape[0])
    print("Eval P1, Forecast Period From:", min(test_p1.date)," To: ", max(test_p1.date))

    #### check result on last 10 days.
    test_p2 = test_e.loc[(test_e.date >= '2017-08-01'), ]
    result_p2 = NWRMSLE(test_p2.unit_sales.astype(np.float32),test_p2.pred_sales.astype(np.float32), test_p2.weights)

    print("Eval P2, Number of rows in test is", test_p2.shape[0])
    print("Eval P2, Forecast Period From:", min(test_p2.date)," To: ", max(test_p2.date))

    print("Eval All Weighted NWRMSLE = ",result)
    print("Eval P1  Weighted NWRMSLE = ",result_p1)
    print("Eval P2  Weighted NWRMSLE = ",result_p2)

    
    test_e['error'] =  abs(test_e.pred_sales - test_e.unit_sales)
    print("Bias =",  (test_e.pred_sales.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())
    print("WMAPE =",  abs(test_e.error.sum() - test_e.unit_sales.sum()) /  test_e.unit_sales.sum())
    
#------------------------------------------------------------------------------------------#
logger.info('Preparing datasetn...')

t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

#------------------------------------------------------------------------------------------#
logger.info('Training and predicting models...')

params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))



Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.287325	valid_1's l2: 0.299561
Early stopping, best iteration is:
[147]	training's l2: 0.281521	valid_1's l2: 0.298549
mean_7_2017: 304957.52
mean_14_2017: 102140.10
mean_3_2017: 18009.70
promo_0: 18009.42
day_1_2017: 13548.25
mean_42_2017: 11187.76
mean_4_dow0_2017: 4840.04
promo_14_2017: 3809.64
mean_30_2017: 3052.78
mean_21_2017: 2092.91
promo_7: 1678.91
promo_60_2017: 1448.13
mean_4_dow2_2017: 1404.61
mean_4_dow6_2017: 1352.82
mean_60_2017: 1031.05
mean_20_dow0_2017: 1011.57
mean_20_dow4_2017: 992.73
promo_140_2017: 856.54
mean_4_dow5_2017: 697.07
mean_4_dow4_2017: 678.21
mean_4_dow1_2017: 661.28
mean_4_dow3_2017: 623.29
mean_20_dow2_2017: 560.80
mean_91_2017: 525.02
mean_20_dow1_2017: 493.85
promo_14: 478.50
mean_20_dow3_2017: 455.74
mean_20_dow6_2017: 447.47
mean_20_dow5_2017: 413.25
promo_9: 332.35
mean_140_2017: 293.55
promo_13: 191.48
promo_2: 140.74
promo_3: 135.25
promo_15: 128.41
promo_10: 8

Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.332124	valid_1's l2: 0.418741
Early stopping, best iteration is:
[71]	training's l2: 0.337356	valid_1's l2: 0.418025
mean_14_2017: 173973.67
mean_7_2017: 165593.47
promo_7: 29687.58
mean_42_2017: 25369.06
mean_30_2017: 14184.96
mean_4_dow0_2017: 11087.76
mean_21_2017: 7415.44
promo_0: 3795.70
day_1_2017: 3593.97
mean_3_2017: 3241.22
promo_14_2017: 2724.99
mean_60_2017: 2466.80
promo_14: 1839.89
promo_140_2017: 1452.50
promo_60_2017: 1367.06
mean_4_dow2_2017: 1134.08
mean_91_2017: 771.40
mean_20_dow0_2017: 770.63
mean_20_dow4_2017: 727.49
promo_8: 653.81
promo_6: 627.92
mean_4_dow6_2017: 507.83
mean_4_dow4_2017: 500.67
promo_5: 472.78
mean_20_dow3_2017: 437.23
promo_9: 423.59
mean_4_dow5_2017: 405.96
mean_4_dow1_2017: 384.32
promo_3: 366.01
mean_4_dow3_2017: 326.97
promo_15: 300.36
mean_20_dow1_2017: 295.39
mean_20_dow6_2017: 274.53
promo_10: 260.21
mean_20_dow2_2017: 246.07
mean_140_2017: 244.02
promo_

Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.352231	valid_1's l2: 0.373311
Early stopping, best iteration is:
[74]	training's l2: 0.357012	valid_1's l2: 0.372936
mean_21_2017: 161758.89
mean_14_2017: 71110.56
mean_7_2017: 62503.58
mean_42_2017: 36740.59
promo_14: 34859.12
mean_30_2017: 32998.33
mean_4_dow0_2017: 13016.40
mean_60_2017: 3393.20
mean_3_2017: 3267.29
promo_14_2017: 2619.42
day_1_2017: 2386.47
promo_7: 2380.98
mean_4_dow2_2017: 2267.43
promo_140_2017: 2222.43
promo_0: 1760.72
promo_60_2017: 1730.82
promo_13: 1569.68
mean_20_dow0_2017: 1042.85
promo_12: 953.55
mean_20_dow4_2017: 902.23
mean_4_dow5_2017: 753.23
mean_91_2017: 714.25
mean_4_dow4_2017: 677.74
promo_15: 648.67
mean_20_dow2_2017: 536.66
mean_20_dow3_2017: 535.93
mean_20_dow1_2017: 510.21
mean_20_dow6_2017: 405.09
promo_2: 403.00
mean_4_dow1_2017: 394.04
promo_9: 392.47
promo_10: 369.95
mean_4_dow6_2017: 345.01
mean_20_dow5_2017: 247.15
mean_140_2017: 224.03
mean_4_dow3_2017:

In [16]:

#------------------------------------------------------------------------------------------#
# Validate 
#### Need to use expm1 when y is log1p
logger.info('validate accuracy ...')

valid = pd.DataFrame(
    np.expm1(y_val), index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("unit_sales")

pred = pd.DataFrame(
    np.expm1(np.array(val_pred).transpose()), index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("pred_sales")

valid = valid.reset_index()
pred = pred.reset_index()

test_e = pd.merge(valid, pred, on=['item_nbr','store_nbr', 'level_2'])

test_e["date"] = test_e.level_2
test_e.drop('level_2', axis = 1, inplace = True ) 
#items = items.reset_index()

In [18]:
test_e.unit_sales.sum()

1936930.355999994

In [17]:
test_e.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 368224 entries, 0 to 368223
Data columns (total 5 columns):
store_nbr     368224 non-null int64
item_nbr      368224 non-null int64
unit_sales    368224 non-null float64
pred_sales    368224 non-null float64
date          368224 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 16.9 MB


In [19]:
test_e.pred_sales.sum()

1235094.4429989103

In [53]:
T006_val = pd.read_pickle('../practice/data/T006_lgb_val.p')

print("T006 pred_sales:", T006_val['pred_sales'].sum())

T006 pred_sales: 10019783.053320102


In [80]:
T006_val.shape

(2680240, 6)

In [74]:
val_new = pd.merge(T006_val, test_e,  on=['item_nbr','store_nbr', 'date'], how = 'left')

In [78]:
val_new.shape

(2680240, 8)

In [82]:
val_new.head(1)

Unnamed: 0,store_nbr,item_nbr,level_2,unit_sales_x,pred_sales_x,date,unit_sales_y,pred_sales_y
0,1,96995,2017-07-26,0.0,0.135491,2017-07-26,,


In [89]:
val_new['pred_sales'] = val_new.pred_sales_y.combine_first(val_new.pred_sales_x)

In [90]:
print("T006 Validation mse:", mean_squared_error(
    np.log1p(val_new.unit_sales_x),np.log1p(val_new.pred_sales_x)))

T006 Validation mse: 0.362248695799


In [91]:
print("T006 Validation mse:", mean_squared_error(
    np.log1p(val_new.unit_sales_x),np.log1p(val_new.pred_sales)))

T006 Validation mse: 0.361901295649


In [30]:
val_new.pred_sales_y.sum()

1152626.3060800293

In [31]:
val_new.pred_sales_x.sum()

1234586.9191219707

In [64]:
T011_val = T006_val[['item_nbr','store_nbr','date', 'unit_sales', 'pred_sales']]

In [65]:
T011_val['pred_sales'] = test_e.pred_sales.combine_first(T006_val.pred_sales)

In [73]:
test_e.loc[(test_e['item_nbr'] == 96995) & (test_e['store_nbr'] == 1),]

Unnamed: 0,store_nbr,item_nbr,unit_sales,pred_sales,date


In [66]:
T011_val.head(5)

Unnamed: 0,item_nbr,store_nbr,date,unit_sales,pred_sales
0,96995,1,2017-07-26,0.0,0.011361
1,96995,1,2017-07-27,0.0,0.016732
2,96995,1,2017-07-28,0.0,0.022911
3,96995,1,2017-07-29,0.0,0.041643
4,96995,1,2017-07-30,0.0,0.04771


In [95]:

#------------------------------------------------------------------------------------------#
# Submit
logger.info('Making submission...')

y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('../submit/T011_p_submit.csv', float_format='%.4f', index=None)


In [100]:
p_sub = pd.read_csv('../submit/T011_p_submit.csv')
t006 = pd.read_csv('../submit/T006_lgb_moreWK.csv')

In [97]:
p_sub.shape

(354672, 2)

In [107]:
t006.unit_sales.sum()

9954040.964900294

In [101]:
t011 = pd.merge(t006, p_sub,  on=['id'], how = 'left')

In [104]:
t011.head(1)

Unnamed: 0,id,unit_sales_x,unit_sales_y
0,125497040,0.211,


In [105]:
t011['unit_sales'] = t011.unit_sales_y.combine_first(t011.unit_sales_x)

In [108]:
t011.unit_sales.sum()

10018876.614398308

In [109]:
t011[['id','unit_sales']].to_csv('../submit/T011_merged_submit.csv', float_format='%.4f', index=None)