In [1]:
"""LGBM Starter

This is watered-down version of one of my earlier scripts. 
Only very basic features are retained so hopefully it won't ruin the fun for you.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

df_train = pd.read_csv(
    '../../input/train_small.csv', usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 21410)  # 2016-01-01
)

items = pd.read_csv("../../input/items.csv",).set_index("item_nbr")
#df_train = df_train.loc[(df_train.store_nbr == 9), ]

df_test = pd.read_csv(
    "../../input/test_small.csv", usecols=[  1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)


df_train = df_train.loc[(df_train.store_nbr == 9), ]
#df_test = df_test.loc[(df_test.store_nbr == 9), ].set_index(
#    ['store_nbr', 'item_nbr', 'date']
#)



In [3]:
df_train


Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-02,9,103501,2.564949,False
26,2016-01-03,9,103501,2.890372,False
51,2016-01-04,9,103501,2.079442,False
78,2016-01-05,9,103501,2.302585,False
103,2016-01-06,9,103501,1.386294,False
125,2016-01-07,9,103501,1.945910,False
179,2016-01-10,9,103501,0.693147,False
201,2016-01-11,9,103501,1.386294,False
222,2016-01-12,9,103501,1.386294,False
246,2016-01-13,9,103501,2.079442,False


In [4]:
df_2017 = df_train[df_train.date.isin(
    pd.date_range("2017-05-31", periods=7 * 11))].copy()
#del df_train

promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
#del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

items = items.reindex(df_2017.index.get_level_values(1))

In [5]:
df_2017

Unnamed: 0_level_0,date,2017-05-31 00:00:00,2017-06-01 00:00:00,2017-06-02 00:00:00,2017-06-03 00:00:00,2017-06-04 00:00:00,2017-06-05 00:00:00,2017-06-06 00:00:00,2017-06-07 00:00:00,2017-06-08 00:00:00,2017-06-09 00:00:00,...,2017-08-05 00:00:00,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
9,103501,1.94591,2.079442,2.079442,2.484907,2.772589,1.386294,1.791759,1.791759,1.098612,1.386294,...,1.791759,2.079442,0.693147,2.197225,2.197225,0.693147,1.386294,1.098612,1.791759,2.079442


In [6]:
def get_timespan(df, dt, minus, periods):
    return df[
        pd.date_range(dt - timedelta(days=minus), periods=periods)
    ]


def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values
    })
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X


In [7]:

print("Preparing dataset...")
t2017 = date(2017, 6, 21)
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
#del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
#X_test = prepare_dataset(date(2017, 8, 16), is_train=False)


Preparing dataset...


In [12]:
y_train

array([[ 1.60943791,  1.60943791,  1.60943791,  2.30258509,  1.94591015,
         1.60943791,  1.60943791,  1.60943791,  1.38629436,  2.07944154,
         2.19722458,  2.94443898,  1.60943791,  2.7080502 ,  1.60943791,
         2.19722458],
       [ 1.60943791,  1.38629436,  2.07944154,  2.19722458,  2.94443898,
         1.60943791,  2.7080502 ,  1.60943791,  2.19722458,  0.69314718,
         0.69314718,  0.69314718,  1.79175947,  1.94591015,  2.30258509,
         1.38629436],
       [ 1.60943791,  2.19722458,  0.69314718,  0.69314718,  0.69314718,
         1.79175947,  1.94591015,  2.30258509,  1.38629436,  0.69314718,
         2.19722458,  2.07944154,  2.39789527,  2.07944154,  1.09861229,
         1.60943791],
       [ 2.30258509,  1.38629436,  0.69314718,  2.19722458,  2.07944154,
         2.39789527,  2.07944154,  1.09861229,  1.60943791,  0.69314718,
         1.09861229,  2.30258509,  1.79175947,  0.69314718,  1.38629436,
         1.94591015]])

In [26]:

print("Training and predicting models...")
params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression_l2',
    'max_depth': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 1000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
#    test_pred.append(bst.predict(
#        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

print("Making submission...")
#y_test = np.array(test_pred).transpose()
#df_preds = pd.DataFrame(
#    y_test, index=df_2017.index,
#    columns=pd.date_range("2017-08-16", periods=16)
#).stack().to_frame("unit_sales")
#df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

#submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
#submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
#submission.to_csv('lgb.csv', float_format='%.4f', index=None)



Training and predicting models...
Step 1
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 3673.15	valid_1's l2: 3695.72
Early stopping, best iteration is:
[1]	training's l2: 0.397393	valid_1's l2: 0.350666
mean_14_2017: 0.00
mean_3_2017: 0.00
mean_7_2017: 0.00
promo_14_2017: 0.00
promo_0: 0.00
promo_1: 0.00
promo_2: 0.00
promo_3: 0.00
promo_4: 0.00
promo_5: 0.00
promo_6: 0.00
promo_7: 0.00
promo_8: 0.00
promo_9: 0.00
promo_10: 0.00
promo_11: 0.00
promo_12: 0.00
promo_13: 0.00
promo_14: 0.00
promo_15: 0.00
Step 2
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 3899.8	valid_1's l2: 3922.85
Early stopping, best iteration is:
[1]	training's l2: 0.360099	valid_1's l2: 0.364542
mean_14_2017: 0.00
mean_3_2017: 0.00
mean_7_2017: 0.00
promo_14_2017: 0.00
promo_0: 0.00
promo_1: 0.00
promo_2: 0.00
promo_3: 0.00
promo_4: 0.00
promo_5: 0.00
promo_6: 0.00
promo_7: 0.00
promo_8: 0.00
promo_9: 0.00
promo_10: 0.00
promo_11: 0.00
promo_1



Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 3366.29	valid_1's l2: 3360.32
Early stopping, best iteration is:
[1]	training's l2: 0.387169	valid_1's l2: 0.334072
mean_14_2017: 0.00
mean_3_2017: 0.00
mean_7_2017: 0.00
promo_14_2017: 0.00
promo_0: 0.00
promo_1: 0.00
promo_2: 0.00
promo_3: 0.00
promo_4: 0.00
promo_5: 0.00
promo_6: 0.00
promo_7: 0.00
promo_8: 0.00
promo_9: 0.00
promo_10: 0.00
promo_11: 0.00
promo_12: 0.00
promo_13: 0.00
promo_14: 0.00
promo_15: 0.00
Step 11
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 6224.47	valid_1's l2: 6244.67
Early stopping, best iteration is:
[1]	training's l2: 0.381792	valid_1's l2: 0.470667
mean_14_2017: 0.00
mean_3_2017: 0.00
mean_7_2017: 0.00
promo_14_2017: 0.00
promo_0: 0.00
promo_1: 0.00
promo_2: 0.00
promo_3: 0.00
promo_4: 0.00
promo_5: 0.00
promo_6: 0.00
promo_7: 0.00
promo_8: 0.00
promo_9: 0.00
promo_10: 0.00
promo_11: 0.00
promo_12: 0.00
promo_13: 0.00
promo_14: 0.00
p

Unnamed: 0,mean_14_2017,mean_3_2017,mean_7_2017,promo_14_2017,promo_0,promo_1,promo_2,promo_3,promo_4,promo_5,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,1.601666,1.595831,1.326757,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
