In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
def _log1p_positive(raw):
    """Parse one raw ``unit_sales`` cell: log1p(value) if value > 0, else 0."""
    value = float(raw)
    return np.log1p(value) if value > 0 else 0


# Read columns 1-5 (by position) of train.csv. Row 0 (the header) is kept,
# while data rows 1..66458908 are skipped; per the original note this
# corresponds to starting at 2016-01-01.
df_train = pd.read_csv(
    'D:/data mining/infor project/train.csv',
    usecols=[1, 2, 3, 4, 5],
    skiprows=range(1, 66458909),  # 2016-01-01
    parse_dates=["date"],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_positive},
)

In [3]:
# Read the first five columns (by position) of test.csv and key every row by
# (store_nbr, item_nbr, date) so it can later be unstacked and joined.
df_test = (
    pd.read_csv(
        "D:/data mining/infor project/test.csv",
        usecols=[0, 1, 2, 3, 4],
        parse_dates=["date"],
        dtype={'onpromotion': bool},
    )
    .set_index(['store_nbr', 'item_nbr', 'date'])
)

In [4]:
# Item metadata table, indexed by item_nbr.
items = pd.read_csv("D:/data mining/infor project/items.csv")
items = items.set_index("item_nbr")

In [5]:
# Keep only the rows dated 2017-01-01 or later.
# pd.Timestamp is used instead of the `pd.datetime` alias, which was
# deprecated in pandas 1.0 and removed in pandas 2.0.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
# The full training frame is no longer referenced below; drop the name so its
# memory can be reclaimed.
del df_train

In [6]:
# Pivot the promotion flag to wide form: one row per (store_nbr, item_nbr),
# one column per date. Combinations missing on a date are filled with False.
promo_2017_train = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)

In [7]:
# Flatten the two-level ("onpromotion", date) column index down to the date level.
promo_2017_train.columns = promo_2017_train.columns.get_level_values("date")

In [8]:
# Same wide pivot for the test period: df_test is already indexed by
# (store_nbr, item_nbr, date), so the last index level (date) moves to columns.
promo_2017_test = (
    df_test.loc[:, ["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_2017_test.columns = promo_2017_test.columns.get_level_values("date")

In [9]:
# Align the test promotions to the training (store_nbr, item_nbr) rows; rows
# that only exist in training get False for every test date.
promo_2017_test = promo_2017_test.reindex(index=promo_2017_train.index).fillna(False)
# Place train-period and test-period date columns side by side.
promo_2017 = pd.concat((promo_2017_train, promo_2017_test), axis=1)
# The two halves are no longer needed once merged.
del promo_2017_train
del promo_2017_test

In [10]:
# Rebind df_2017 from long form to wide form: rows are (store_nbr, item_nbr),
# columns are dates, values are the transformed unit_sales; gaps become 0.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Drop the outer "unit_sales" column level, leaving the dates as column labels.
df_2017.columns = df_2017.columns.get_level_values("date")

In [11]:
# Expand the item table to one row per df_2017 row by looking up each row's
# item_nbr (the second index level), so it lines up positionally with df_2017.
items = items.reindex(df_2017.index.get_level_values("item_nbr"))

In [12]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select a block of date-labelled columns from ``df``.

    The block starts ``minus`` days before ``dt`` and contains ``periods``
    column labels spaced ``freq`` apart.

    Args:
        df: DataFrame whose column labels are dates.
        dt: Reference date the window is measured back from.
        minus: Number of days before ``dt`` at which the window starts.
        periods: Number of columns to select.
        freq: Spacing between selected columns (pandas frequency string).

    Returns:
        The sub-frame of ``df`` restricted to the selected columns.
    """
    window_start = dt - timedelta(days=minus)
    wanted_columns = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_columns]

def prepare_dataset(t2017, is_train=True):
    """Build the feature matrix (and optionally targets) anchored at ``t2017``.

    Reads the module-level wide frames ``df_2017`` (sales) and ``promo_2017``
    (promotion flags). Features per row:

    * ``day_1_2017``: sales on the day before ``t2017``.
    * ``mean_{w}_2017``: mean sales over the ``w`` days ending the day before
      ``t2017`` (w in 3, 7, 14, 30, 60, 140).
    * ``promo_{w}_2017``: count of promotion days over the same kind of
      window (w in 14, 60, 140).
    * ``mean_4_dow{i}_2017`` / ``mean_20_dow{i}_2017``: mean of the last 4 /
      20 weekly observations on the weekday that lies ``i`` days after
      ``t2017``'s weekday.
    * ``promo_{i}``: promotion flag on day ``t2017 + i`` (i in 0..15), as uint8.

    Args:
        t2017: Anchor date; the first day to be predicted.
        is_train: When true, also return the targets.

    Returns:
        ``(X, y)`` when ``is_train`` is true, where ``y`` holds the 16 daily
        columns of ``df_2017`` starting at ``t2017``; otherwise just ``X``.
    """
    features = {
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
    }
    # Trailing-window sales means, in the same column order as before.
    for window in (3, 7, 14, 30, 60, 140):
        recent_sales = get_timespan(df_2017, t2017, window, window)
        features["mean_{}_2017".format(window)] = recent_sales.mean(axis=1).values
    # Trailing-window promotion counts.
    for window in (14, 60, 140):
        recent_promo = get_timespan(promo_2017, t2017, window, window)
        features["promo_{}_2017".format(window)] = recent_promo.sum(axis=1).values
    X = pd.DataFrame(features)

    # Same-weekday averages: step back in whole weeks from each weekday offset.
    for offset in range(7):
        last_4_weeks = get_timespan(df_2017, t2017, 28 - offset, 4, freq='7D')
        last_20_weeks = get_timespan(df_2017, t2017, 140 - offset, 20, freq='7D')
        X['mean_4_dow{}_2017'.format(offset)] = last_4_weeks.mean(axis=1).values
        X['mean_20_dow{}_2017'.format(offset)] = last_20_weeks.mean(axis=1).values

    # Promotion flags for each of the 16 days being predicted.
    for step in range(16):
        target_day = t2017 + timedelta(days=step)
        X["promo_{}".format(step)] = promo_2017[target_day].values.astype(np.uint8)

    if not is_train:
        return X
    y = df_2017[pd.date_range(t2017, periods=16)].values
    return X, y

In [13]:
# Training data: six anchor dates one week apart, starting 2017-05-31. Each
# anchor contributes one full copy of the (store, item) rows.
t2017 = date(2017, 5, 31)
weekly_sets = [
    prepare_dataset(t2017 + timedelta(days=7 * week))
    for week in range(6)
]
X_train = pd.concat([features for features, _ in weekly_sets], axis=0)
y_train = np.concatenate([targets for _, targets in weekly_sets], axis=0)
del weekly_sets

# Validation is anchored at 2017-07-26; test features at 2017-08-16 carry no
# targets.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

In [14]:
# LightGBM settings shared by every per-day model.
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 300,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

# Sample weights: 1 + 0.25 * perishable for each (store, item) row. They do
# not depend on the forecast day, so they are built once instead of inside
# the loop. X_train stacks several copies of the row set, so the weights are
# repeated once per copy; the count is derived from the data rather than
# hard-coded, which keeps it in step with how X_train was assembled.
item_weight = items["perishable"] * 0.25 + 1
n_train_blocks = len(X_train) // len(items)
train_weight = pd.concat([item_weight] * n_train_blocks)

# One independent model per forecast day (column i of y_train / y_val).
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=item_weight,
        categorical_feature=cate_vars)
    # NOTE(review): `early_stopping_rounds` and `verbose_eval` were removed
    # from lgb.train() in LightGBM 4.0 in favour of the lgb.early_stopping /
    # lgb.log_evaluation callbacks. They are kept here to match the LightGBM
    # version this notebook was run with; confirm before upgrading.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    # Feature importances by gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Predict with the best iteration; fall back to MAX_ROUNDS when
    # best_iteration is falsy.
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

# val_pred is a list of 16 per-day vectors; transpose to match y_val's layout.
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))


Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.301899	valid_1's l2: 0.293989
[200]	training's l2: 0.298325	valid_1's l2: 0.292727
[300]	training's l2: 0.295884	valid_1's l2: 0.292343
[400]	training's l2: 0.293793	valid_1's l2: 0.292075
[500]	training's l2: 0.292012	valid_1's l2: 0.291989
mean_7_2017: 1978514.90
mean_14_2017: 1138683.77
promo_0: 103431.63
mean_3_2017: 91914.78
day_1_2017: 88981.89
mean_20_dow0_2017: 83401.11
mean_4_dow0_2017: 63034.83
mean_30_2017: 61431.87
promo_14_2017: 28218.50
mean_60_2017: 25992.91
promo_7: 8841.28
promo_60_2017: 7454.01
mean_4_dow5_2017: 6793.31
mean_140_2017: 6127.87
mean_4_dow6_2017: 5793.50
mean_20_dow4_2017: 5611.08
promo_140_2017: 5594.21
mean_20_dow2_2017: 4230.85
mean_4_dow2_2017: 3870.97
promo_9: 3452.53
mean_20_dow3_2017: 3001.48
mean_4_dow1_2017: 2758.68
mean_4_dow3_2017: 2653.85
mean_20_dow1_2017: 2613.85
promo_14: 2413.99
mean_4_dow4_2017: 2363.34
mean_20_dow6_2017: 2243.62
mean_20_dow5_2017: 1938.

Step 8
Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.332548	valid_1's l2: 0.389649
[200]	training's l2: 0.328122	valid_1's l2: 0.388945
[300]	training's l2: 0.325173	valid_1's l2: 0.388443
Early stopping, best iteration is:
[328]	training's l2: 0.324566	valid_1's l2: 0.388373
mean_30_2017: 1183731.36
mean_14_2017: 1049243.91
mean_7_2017: 631079.91
promo_7: 181180.08
mean_20_dow0_2017: 161474.08
mean_60_2017: 100777.77
mean_4_dow0_2017: 72262.15
promo_0: 24062.59
mean_3_2017: 20006.49
day_1_2017: 19287.37
promo_14_2017: 12761.71
promo_60_2017: 11815.54
promo_14: 9988.27
promo_140_2017: 8988.26
mean_140_2017: 7812.30
mean_20_dow2_2017: 5697.92
mean_20_dow4_2017: 5452.09
promo_3: 5333.96
promo_5: 3449.94
promo_6: 3408.86
mean_20_dow1_2017: 3336.79
mean_4_dow6_2017: 2894.49
mean_4_dow5_2017: 2886.03
mean_20_dow3_2017: 2879.61
promo_9: 2866.80
mean_4_dow1_2017: 2503.73
mean_4_dow3_2017: 2299.88
mean_4_dow2_2017: 2184.03
mean_20_dow5_2017: 2174.62
promo

[300]	training's l2: 0.352241	valid_1's l2: 0.361233
Early stopping, best iteration is:
[342]	training's l2: 0.351064	valid_1's l2: 0.361192
mean_30_2017: 1472506.43
mean_14_2017: 432875.62
mean_7_2017: 375627.18
mean_60_2017: 281826.61
mean_20_dow6_2017: 215639.24
promo_13: 162050.25
mean_3_2017: 90129.56
mean_4_dow6_2017: 76470.08
day_1_2017: 24286.43
promo_14_2017: 16644.53
mean_20_dow5_2017: 11921.10
promo_14: 11022.72
mean_4_dow5_2017: 10589.03
promo_60_2017: 9941.11
promo_10: 8703.56
mean_20_dow1_2017: 8473.93
mean_140_2017: 7294.54
promo_140_2017: 6583.40
promo_6: 6478.54
mean_20_dow0_2017: 5783.16
promo_12: 5515.47
mean_4_dow1_2017: 3414.07
mean_4_dow0_2017: 3203.93
mean_20_dow3_2017: 3148.16
promo_0: 3120.02
mean_20_dow4_2017: 2884.00
promo_9: 2880.91
mean_20_dow2_2017: 2702.85
promo_11: 2696.21
mean_4_dow3_2017: 2443.59
mean_4_dow4_2017: 2377.83
promo_15: 2219.09
mean_4_dow2_2017: 1985.44
promo_7: 1768.63
promo_8: 1340.60
promo_2: 822.22
promo_1: 567.02
promo_4: 508.06
promo_

In [36]:
# Stack the 16 per-day test predictions into a (row, day) matrix.
y_test = np.array(test_pred).T
df_preds = pd.DataFrame(
    y_test,
    index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16),
)
# Long form: one row per (store_nbr, item_nbr, date).
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Attach predictions to the test ids; ids without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied at load time and clamp to [0, 1000].
submission["unit_sales"] = np.expm1(submission["unit_sales"]).clip(lower=0, upper=1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=None)

In [18]:
# Arrange the 16 per-day validation predictions as a (row, day) matrix.
# A dedicated name is used so the ground-truth labels in `y_val` are not
# overwritten; otherwise re-running the training cell would score the new
# predictions against old predictions instead of the true values.
val_pred_matrix = np.array(val_pred).transpose()
df_preds = pd.DataFrame(
    val_pred_matrix, index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# Undo the log1p transform applied at load time and clamp to [0, 1000].
df_preds["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
df_preds.reset_index().to_csv('lgb_cv.csv', float_format='%.4f', index=None)

In [17]:
# Preview only the first rows instead of displaying the whole prediction frame.
df_preds.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,96995,2017-07-26,0.156697
1,96995,2017-07-27,0.166952
1,96995,2017-07-28,0.223916
1,96995,2017-07-29,0.271277
1,96995,2017-07-30,0.183197
1,96995,2017-07-31,0.152104
1,96995,2017-08-01,0.148364
1,96995,2017-08-02,0.154628
1,96995,2017-08-03,0.169900
1,96995,2017-08-04,0.238883


In [34]:
# Inspect the feature names, in the order the models receive them.
X_train.columns

Index(['day_1_2017', 'mean_140_2017', 'mean_14_2017', 'mean_30_2017',
       'mean_3_2017', 'mean_60_2017', 'mean_7_2017', 'promo_140_2017',
       'promo_14_2017', 'promo_60_2017', 'mean_4_dow0_2017',
       'mean_20_dow0_2017', 'mean_4_dow1_2017', 'mean_20_dow1_2017',
       'mean_4_dow2_2017', 'mean_20_dow2_2017', 'mean_4_dow3_2017',
       'mean_20_dow3_2017', 'mean_4_dow4_2017', 'mean_20_dow4_2017',
       'mean_4_dow5_2017', 'mean_20_dow5_2017', 'mean_4_dow6_2017',
       'mean_20_dow6_2017', 'promo_0', 'promo_1', 'promo_2', 'promo_3',
       'promo_4', 'promo_5', 'promo_6', 'promo_7', 'promo_8', 'promo_9',
       'promo_10', 'promo_11', 'promo_12', 'promo_13', 'promo_14', 'promo_15'],
      dtype='object')

In [27]:
# `promo_2017_test` is deleted once it has been merged into `promo_2017`, so
# referencing it here fails on a fresh top-to-bottom run. Show the same
# test-period columns (2017-08-16 .. 2017-08-31) from the merged frame.
promo_2017[pd.date_range("2017-08-16", periods=16)].head()

Unnamed: 0_level_0,date,2017-08-16 00:00:00,2017-08-17 00:00:00,2017-08-18 00:00:00,2017-08-19 00:00:00,2017-08-20 00:00:00,2017-08-21 00:00:00,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103501,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
