In [1]:
import pandas as pd
import lightgbm as lgb

In [2]:
df = pd.read_parquet('m5_preprocessed_downcast.parquet')

In [3]:
# Apply label encoder to those categorical columns
from sklearn.preprocessing import LabelEncoder

for col in ['event_name_1', 'event_type_1', 'weekday']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

In [4]:
# Convert "d" into integers
df['d_int'] = df['d'].str.extract('d_(\d+)').astype(int)

In [5]:
df_ca1 = df[df['store_id'] == 0]
df_ca2 = df[df['store_id'] == 1]
df_ca3 = df[df['store_id'] == 2]
df_ca4 = df[df['store_id'] == 3]
df_tx1 = df[df['store_id'] == 4]
df_tx2 = df[df['store_id'] == 5]
df_tx3 = df[df['store_id'] == 6]
df_wi1 = df[df['store_id'] == 7]
df_wi2 = df[df['store_id'] == 8]
df_wi3 = df[df['store_id'] == 9]

In [3]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0


In [4]:
df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_7', 'lag_28',
       'lag_56', 'lag_91', 'lag_182', 'lag_364', 'rolling_sales_mean_7',
       'rolling_sales_std_7', 'rolling_sales_mean_30', 'rolling_sales_std_30',
       'rolling_sales_mean_90', 'rolling_sales_std_90', 'price',
       'price_change', 'price_max', 'price_norm', 'dayofweek', 'week',
       'is_weekend', 'days_since_first_sale', 'cumsum_sales'],
      dtype='object')

#### Recursive Training (per store)

**CA_1**

In [29]:
feature_cols = [
    'lag_7', 'lag_28',
    'rolling_sales_mean_7', 'rolling_sales_mean_30','dayofweek',
    'weekday', 'month', 'snap_CA', 'event_name_1','event_type_1',
    'sell_price', 'price', 'is_weekend', 'days_since_first_sale'
]

In [30]:
df_ca1[feature_cols].dtypes

lag_7                    float64
lag_28                   float64
rolling_sales_mean_7     float64
rolling_sales_mean_30    float64
dayofweek                  int32
weekday                    int32
month                      int32
snap_CA                     int8
event_name_1               int32
event_type_1               int32
sell_price               float32
price                    float32
is_weekend                  bool
days_since_first_sale      int64
dtype: object

In [None]:
df_ca1 = df[df['store_id'] == 0]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,d_1,0,2011-01-29,11101,...,,,,,,5,4,True,0,0


In [31]:
df_ca1.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales,d_int
59153644,FOODS_3_823_CA_1_evaluation,1432,2,0,0,0,d_1941,2,2016-05-22,11617,...,2.98,0.0,2.98,1.0,6,20,True,1940,1141,1941
59153645,FOODS_3_824_CA_1_evaluation,1433,2,0,0,0,d_1941,0,2016-05-22,11617,...,2.48,0.0,2.68,0.925373,6,20,True,1940,1393,1941
59153646,FOODS_3_825_CA_1_evaluation,1434,2,0,0,0,d_1941,1,2016-05-22,11617,...,3.98,0.0,4.38,0.908676,6,20,True,1940,1866,1941
59153647,FOODS_3_826_CA_1_evaluation,1435,2,0,0,0,d_1941,1,2016-05-22,11617,...,1.28,0.0,1.28,1.0,6,20,True,1940,1790,1941
59153648,FOODS_3_827_CA_1_evaluation,1436,2,0,0,0,d_1941,5,2016-05-22,11617,...,1.0,0.0,1.0,1.0,6,20,True,1940,1684,1941


In [42]:
# Split into training (1-1913) & validation set (1914-1941)
#train_df_ca1 = df_ca1[df_ca1['d_int'] <= 1913]
#valid_df_ca1 = df_ca1[(df_ca1['d_int'] > 1913) & (df_ca1['d_int'] <= 1941)]

train_mask_ca1 = df_ca1['d_int'] <= 1913
valid_mask_ca1 = (df_ca1['d_int'] > 1913) & (df_ca1['d_int'] <= 1941)

train_X_ca1 = df_ca1[train_mask_ca1][feature_cols]
train_y_ca1 = df_ca1[train_mask_ca1]['sales']

valid_X_ca1 = df_ca1[valid_mask_ca1][feature_cols]
valid_y_ca1 = df_ca1[valid_mask_ca1]['sales']

In [37]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_ca1, valid_y_ca1)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_ca1, train_y_ca1, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.748327
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.347741
[LightGBM] [Debug] init for col-wise cost 0.134118 seconds, init for row-wise cost 0.260046 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.172224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.319829
[LightGBM] [Debug] Trained a tree with leaves = 256 and depth = 13
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 256 and depth = 13
[LightGBM] [Debug] Trained a tree 

In [40]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_ca1, train_y_ca1)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.748327
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.347741
[LightGBM] [Debug] init for col-wise cost 0.114348 seconds, init for row-wise cost 0.141510 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.148126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.319829
[LightGBM] [Debug] Trained a tree with leaves = 256 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 256 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 256 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 256 and depth = 13
[LightGBM] [Debug] Tr

In [None]:
### Ignore this cell
# Train the model

from lightgbm import LGBMRegressor

lgb_train = lgb.Dataset(train_X_ca1, label=train_y_ca1)
lgb_valid = lgb.Dataset(valid_X_ca1, label=valid_y_ca1)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 128
}

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102943 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.319829
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.41621	valid_1's rmse: 2.28953
[200]	training's rmse: 2.35317	valid_1's rmse: 2.28325
[300]	training's rmse: 2.31315	valid_1's rmse: 2.28099
Early stopping, best iteration is:
[273]	training's rmse: 2.32303	valid_1's rmse: 2.28058


In [None]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
13  days_since_first_sale        5909
0                   lag_7        5827
10             sell_price        5255
1                  lag_28        5039
3   rolling_sales_mean_30        3912
2    rolling_sales_mean_7        2900
6                   month        2378
4               dayofweek        1289
7                 snap_CA         748
8            event_name_1         590
5                 weekday         579
9            event_type_1         237
12             is_weekend           8
11                  price           0


In [None]:
# Get the recursive prediction for all items of CA_1
forecast_dict = {}

for item_id in df_ca1['id'].unique():
    df_item = df_ca1[df_ca1['id'] == item_id].copy()

    temp_X = df_item[valid_mask_ca1][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

In [44]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,0.949648,0.719975,0.741060,0.783927,0.791131,1.017027,1.210970,1.081572,0.825659,...,0.857379,0.963952,1.105910,0.884418,0.818652,0.731925,0.715187,0.809846,0.990330,1.085300
1,HOBBIES_1_002_CA_1_evaluation,0.808001,0.908290,0.857706,0.909880,0.889086,1.090219,1.362639,1.231847,1.115872,...,1.058264,1.163598,1.228707,0.911091,0.917943,0.814785,0.848831,0.809482,1.110837,1.148197
2,HOBBIES_1_003_CA_1_evaluation,0.782861,0.839446,0.895024,0.932471,0.861113,1.112701,1.202509,0.947344,0.836134,...,0.858907,0.941025,1.309407,0.943724,0.961179,0.895024,0.880220,0.980671,1.054505,1.354384
3,HOBBIES_1_004_CA_1_evaluation,1.953269,1.327146,1.357941,1.226318,1.365785,1.432918,2.223053,1.642646,1.191773,...,1.102007,1.191070,1.518170,1.137344,1.128976,1.184713,0.987888,1.229453,1.344949,1.724273
4,HOBBIES_1_005_CA_1_evaluation,0.897900,1.078235,0.975187,1.038116,1.076633,1.195147,1.416190,1.133465,1.049059,...,1.073966,1.185356,1.437543,1.115941,1.037965,0.967159,1.039758,0.994278,1.109217,1.382855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_1_evaluation,1.021149,1.208010,1.091338,1.178295,1.229057,1.364006,2.642708,1.306505,1.246712,...,1.124644,1.646912,2.138609,1.181029,1.226454,1.179297,1.215135,1.131442,1.545012,2.905671
3045,FOODS_3_824_CA_1_evaluation,0.608424,1.105577,1.119022,1.018091,1.042345,1.246518,2.124735,1.232747,1.291905,...,1.042345,1.219933,1.940942,1.125461,1.163703,1.103760,1.222730,1.356924,1.254891,1.974734
3046,FOODS_3_825_CA_1_evaluation,0.959905,1.109581,1.092507,1.086591,1.208461,1.535916,1.840513,1.222852,1.189755,...,1.129204,1.534468,2.348139,1.279395,1.155204,1.192051,1.196938,1.168417,1.537162,1.930768
3047,FOODS_3_826_CA_1_evaluation,0.818365,1.106550,1.030073,1.126348,1.304430,1.420027,1.774762,1.365893,1.159822,...,1.305609,1.834723,1.823713,1.167211,1.207924,1.193252,1.110476,1.495337,1.707382,1.652014


In [45]:
forecast_df.to_csv('CA_1_best.csv', index=False)

**CA_2**

In [39]:
df_ca2 = df[df['store_id'] == 1]
df_ca2.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59156693,FOODS_3_823_CA_2_evaluation,1432,2,0,1,0,d_1941,2,2016-05-22,11617,...,6.264668,2.98,0.0,2.98,1.0,6,20,True,1940,3373
59156694,FOODS_3_824_CA_2_evaluation,1433,2,0,1,0,d_1941,0,2016-05-22,11617,...,6.264668,2.48,0.0,2.68,0.925373,6,20,True,1940,922
59156695,FOODS_3_825_CA_2_evaluation,1434,2,0,1,0,d_1941,3,2016-05-22,11617,...,6.246307,3.98,0.0,4.38,0.908676,6,20,True,1940,1237
59156696,FOODS_3_826_CA_2_evaluation,1435,2,0,1,0,d_1941,2,2016-05-22,11617,...,6.233132,1.28,0.0,1.28,1.0,6,20,True,1940,1050
59156697,FOODS_3_827_CA_2_evaluation,1436,2,0,1,0,d_1941,3,2016-05-22,11617,...,6.233132,1.0,0.0,1.0,1.0,6,20,True,1940,1099


In [46]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_ca2 = df_ca2['d_int'] <= 1913
valid_mask_ca2 = (df_ca2['d_int'] > 1913) & (df_ca2['d_int'] <= 1941)

train_X_ca2 = df_ca2[train_mask_ca2][feature_cols]
train_y_ca2 = df_ca2[train_mask_ca2]['sales']

valid_X_ca2 = df_ca2[valid_mask_ca2][feature_cols]
valid_y_ca2 = df_ca2[valid_mask_ca2]['sales']

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_ca2, valid_y_ca2)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_ca2, train_y_ca2, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

In [None]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_ca2, train_y_ca2)

In [42]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
10             sell_price        6031
13  days_since_first_sale        5717
0                   lag_7        5039
1                  lag_28        3981
3   rolling_sales_mean_30        3274
2    rolling_sales_mean_7        2351
6                   month        1685
4               dayofweek        1406
8            event_name_1         746
5                 weekday         483
7                 snap_CA         412
9            event_type_1         231
12             is_weekend          13
11                  price           0


In [49]:
# Get the recursive prediction for all items of CA_2
forecast_dict = {}

for item_id in df_ca2['id'].unique():
    df_item = df_ca2[df_ca2['id'] == item_id].copy()

    temp_X = df_item[valid_mask_ca2][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols].copy()
  temp_X = df_item[valid_mask_ca2][feature_cols]

In [50]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_2_evaluation,0.922344,0.812342,0.732020,0.863191,0.967444,1.298339,1.212961,0.896843,0.761854,...,0.967856,1.148815,1.506789,0.891842,0.879831,0.849328,0.828975,0.968618,1.148230,1.666899
1,HOBBIES_1_002_CA_2_evaluation,0.384475,0.901285,0.813460,0.938281,1.062477,1.337777,1.286306,0.999453,0.832498,...,1.062477,1.264941,1.500989,0.993456,0.969351,0.938895,0.904065,1.166581,1.264317,1.741964
2,HOBBIES_1_003_CA_2_evaluation,0.399663,0.972925,0.903657,1.004624,1.111755,1.391329,1.365530,1.054197,0.963186,...,1.127382,1.331483,1.523836,1.035068,1.026759,1.005239,0.975705,1.219864,1.316268,1.761946
3,HOBBIES_1_004_CA_2_evaluation,2.242841,1.677574,1.574395,1.731868,1.692279,1.975780,1.943643,1.952660,1.580016,...,1.823658,1.930202,2.156424,1.526160,1.752773,1.738151,1.683364,1.642473,1.928200,2.112719
4,HOBBIES_1_005_CA_2_evaluation,1.022545,1.092895,1.046810,1.112940,1.246394,1.543601,1.505627,1.175940,1.040932,...,1.231555,1.705079,1.931302,1.172312,1.141131,1.125834,1.050537,1.261690,1.704455,1.887597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_2_evaluation,1.427495,1.194133,1.181645,1.154496,1.292696,1.674355,1.525245,1.205101,1.171798,...,1.291223,1.775020,1.857322,1.198095,1.182062,1.183080,1.157891,1.257065,1.943885,1.898948
3045,FOODS_3_824_CA_2_evaluation,0.904116,1.146740,1.143111,1.113786,1.251847,1.582014,1.484135,1.169866,1.144983,...,1.252433,1.731745,1.453084,1.140363,1.133917,1.146155,1.129552,1.242552,1.872756,1.519586
3046,FOODS_3_825_CA_2_evaluation,0.771589,1.168351,1.145858,1.084847,1.249657,1.511201,1.487513,1.181533,1.116580,...,1.249657,1.761397,1.441885,1.164617,1.158512,1.159421,1.120456,1.228029,1.836286,1.443933
3047,FOODS_3_826_CA_2_evaluation,0.821200,1.037543,1.010018,1.071951,1.239454,1.536139,1.514434,1.106961,1.053194,...,1.224651,1.787280,1.801924,1.093678,1.081503,1.086532,1.101496,1.194047,1.844801,1.808235


In [51]:
forecast_df.to_csv('CA_2_best.csv', index=False)

**CA_3**

In [49]:
df_ca3 = df[df['store_id'] == 2]
df_ca3.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59159742,FOODS_3_823_CA_3_evaluation,1432,2,0,2,0,d_1941,2,2016-05-22,11617,...,5.729482,2.98,0.0,2.98,1.0,6,20,True,1940,4879
59159743,FOODS_3_824_CA_3_evaluation,1433,2,0,2,0,d_1941,0,2016-05-22,11617,...,5.734894,2.48,0.0,2.68,0.925373,6,20,True,1940,1577
59159744,FOODS_3_825_CA_3_evaluation,1434,2,0,2,0,d_1941,1,2016-05-22,11617,...,5.734894,3.98,0.0,4.38,0.908676,6,20,True,1940,1983
59159745,FOODS_3_826_CA_3_evaluation,1435,2,0,2,0,d_1941,0,2016-05-22,11617,...,5.734894,1.28,0.0,1.28,1.0,6,20,True,1940,1245
59159746,FOODS_3_827_CA_3_evaluation,1436,2,0,2,0,d_1941,0,2016-05-22,11617,...,5.797509,1.0,0.0,1.0,1.0,6,20,True,1940,2703


In [52]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_ca3 = df_ca3['d_int'] <= 1913
valid_mask_ca3 = (df_ca3['d_int'] > 1913) & (df_ca3['d_int'] <= 1941)

train_X_ca3 = df_ca3[train_mask_ca3][feature_cols]
train_y_ca3 = df_ca3[train_mask_ca3]['sales']

valid_X_ca3 = df_ca3[valid_mask_ca3][feature_cols]
valid_y_ca3 = df_ca3[valid_mask_ca3]['sales']

In [53]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_ca3, valid_y_ca3)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_ca3, train_y_ca3, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


2 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Rachel\anaconda3\envs\tf_directml\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Rachel\anaconda3\envs\tf_directml\lib\site-packages\lightgbm\sklearn.py", line 1398, in fit
    super().fit(
  File "c:\Users\Rachel\anaconda3\envs\tf_directml\lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    self._Booster = train(
  File "c:\Users\Rachel\anaconda3\envs\tf_directml\lib\site-packages\lightgbm\engine.py", line 297, in train
    booster = Booster(params=

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.805851
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.339483
[LightGBM] [Debug] init for col-wise cost 0.061327 seconds, init for row-wise cost 0.229487 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105504 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1561
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.918170
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained 

In [54]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_ca3, train_y_ca3)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.805851
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.339483
[LightGBM] [Debug] init for col-wise cost 0.080297 seconds, init for row-wise cost 0.193622 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.122584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1561
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.918170
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Tr

In [52]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
10             sell_price        2676
13  days_since_first_sale        2158
0                   lag_7        1912
1                  lag_28        1675
3   rolling_sales_mean_30        1092
2    rolling_sales_mean_7         974
6                   month         819
7                 snap_CA         461
4               dayofweek         427
8            event_name_1         309
5                 weekday         240
9            event_type_1          82
12             is_weekend           2
11                  price           0


In [55]:
# Get the recursive prediction for all items of CA_3
forecast_dict = {}

for item_id in df_ca3['id'].unique():
    df_item = df_ca3[df_ca3['id'] == item_id].copy()

    temp_X = df_item[valid_mask_ca3][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols].copy()
  temp_X = df_item[valid_mask_ca3][feature_cols]

In [56]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_3_evaluation,1.055717,1.150055,1.149324,1.013712,1.158637,1.223130,1.270852,1.220924,1.185475,...,1.129378,1.186742,1.260044,1.100264,1.087642,1.114549,1.075774,1.085400,1.228266,1.270673
1,HOBBIES_1_002_CA_3_evaluation,1.084376,1.113845,1.107600,1.018463,1.168072,1.255257,1.310961,1.180646,1.148880,...,1.151376,1.221919,1.284471,1.113836,1.099604,1.135410,1.087735,1.098972,1.262041,1.308084
2,HOBBIES_1_003_CA_3_evaluation,0.480065,1.201089,1.194844,1.099055,1.231703,1.320549,1.416536,1.279730,1.251693,...,1.229263,1.304342,1.377867,1.198253,1.184688,1.202581,1.172819,1.173216,1.346540,1.395262
3,HOBBIES_1_004_CA_3_evaluation,8.894363,7.322560,7.296628,7.144263,7.352941,7.636976,8.171869,7.958372,6.926175,...,6.423071,6.511175,6.566751,6.390001,6.388711,5.913933,5.911951,5.937457,6.495381,6.620123
4,HOBBIES_1_005_CA_3_evaluation,1.080271,1.217389,1.196699,1.081793,1.246062,1.351891,1.431466,1.280721,1.237876,...,1.226765,1.324348,1.407087,1.214711,1.177994,1.141419,1.135531,1.151303,1.349710,1.375799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_3_evaluation,1.894755,1.958232,1.954832,1.957820,1.974394,2.050223,2.224049,2.162339,2.137759,...,1.963413,2.057479,2.125293,1.923161,1.922640,1.946366,1.829965,1.968732,2.023126,2.123173
3045,FOODS_3_824_CA_3_evaluation,0.591074,1.295415,1.292014,1.295002,1.311577,1.363212,1.472497,1.415006,1.389953,...,1.300595,1.375773,1.443587,1.260344,1.259823,1.272447,1.205167,1.305915,1.323980,1.441467
3046,FOODS_3_825_CA_3_evaluation,1.894962,1.925013,1.918267,1.924128,1.941175,2.005592,2.163797,2.101111,2.062705,...,1.915746,2.012849,2.085967,1.885930,1.868635,1.894323,1.806077,1.932167,1.966359,2.083847
3047,FOODS_3_826_CA_3_evaluation,1.191664,1.552844,1.551457,1.554445,1.538381,1.566735,1.649098,1.625656,1.894537,...,1.723923,1.793458,1.914138,1.693783,1.711122,1.687294,1.353365,1.744139,1.759885,1.910420


In [57]:
forecast_df.to_csv('CA_3_best.csv', index=False)

**CA_4**

In [56]:
df_ca4 = df[df['store_id'] == 3]
df_ca4.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59162791,FOODS_3_823_CA_4_evaluation,1432,2,0,3,0,d_1941,0,2016-05-22,11617,...,3.328386,2.98,0.0,2.98,1.0,6,20,True,1940,1065
59162792,FOODS_3_824_CA_4_evaluation,1433,2,0,3,0,d_1941,0,2016-05-22,11617,...,3.323375,2.48,0.0,2.68,0.925373,6,20,True,1940,646
59162793,FOODS_3_825_CA_4_evaluation,1434,2,0,3,0,d_1941,0,2016-05-22,11617,...,3.328386,3.98,0.0,4.38,0.908676,6,20,True,1940,1244
59162794,FOODS_3_826_CA_4_evaluation,1435,2,0,3,0,d_1941,4,2016-05-22,11617,...,3.328386,1.28,0.0,1.28,1.0,6,20,True,1940,1893
59162795,FOODS_3_827_CA_4_evaluation,1436,2,0,3,0,d_1941,0,2016-05-22,11617,...,3.380195,1.0,0.0,1.0,1.0,6,20,True,1940,189


In [58]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_ca4 = df_ca4['d_int'] <= 1913
valid_mask_ca4 = (df_ca4['d_int'] > 1913) & (df_ca4['d_int'] <= 1941)

train_X_ca4 = df_ca4[train_mask_ca4][feature_cols]
train_y_ca4 = df_ca4[train_mask_ca4]['sales']

valid_X_ca4 = df_ca4[valid_mask_ca4][feature_cols]
valid_y_ca4 = df_ca4[valid_mask_ca4]['sales']

In [59]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_ca4, valid_y_ca4)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_ca4, train_y_ca4, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.775809
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.365442
[LightGBM] [Debug] init for col-wise cost 0.115154 seconds, init for row-wise cost 0.204095 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.153996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1184
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.703559
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
[LightGBM] [Debug] Trained a tree 

In [60]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_ca4, train_y_ca4)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.775809
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.365442
[LightGBM] [Debug] init for col-wise cost 0.080079 seconds, init for row-wise cost 0.187896 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.117721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1184
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.703559
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
[LightGBM] [Debug] Tr

In [59]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
10             sell_price        4854
13  days_since_first_sale        3922
0                   lag_7        3580
1                  lag_28        2721
3   rolling_sales_mean_30        2395
2    rolling_sales_mean_7        1383
6                   month        1223
4               dayofweek         693
8            event_name_1         487
5                 weekday         385
7                 snap_CA         305
9            event_type_1         149
12             is_weekend           1
11                  price           0


In [61]:
# Get the recursive prediction for all items of CA_4
forecast_dict = {}

for item_id in df_ca4['id'].unique():
    df_item = df_ca4[df_ca4['id'] == item_id].copy()

    temp_X = df_item[valid_mask_ca4][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols].copy()
  temp_X = df_item[valid_mask_ca4][feature_cols]

In [62]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_4_evaluation,0.774245,0.559504,0.595336,0.565014,0.572170,0.617310,0.711231,0.656244,0.681069,...,0.642450,0.653871,0.707009,0.609981,0.657869,0.608863,0.622833,0.599351,0.649407,0.707009
1,HOBBIES_1_002_CA_4_evaluation,0.317392,0.643742,0.732477,0.643876,0.668803,0.862762,0.796298,0.793917,0.902462,...,0.843144,0.884933,0.845160,0.844120,0.867984,0.824543,0.885424,0.844891,0.899189,0.809316
2,HOBBIES_1_003_CA_4_evaluation,0.357697,0.614914,0.692181,0.616587,0.698969,0.815520,0.755936,0.726641,0.879631,...,0.791450,0.830071,0.874599,0.808409,0.772008,0.763965,0.832202,0.793320,0.815837,0.796932
3,HOBBIES_1_004_CA_4_evaluation,1.415441,0.795832,0.842911,0.837364,0.851409,0.876084,1.005640,0.899619,0.909355,...,0.859634,0.896834,0.960557,0.881756,0.827389,0.827660,0.884846,0.851409,0.908655,0.985336
4,HOBBIES_1_005_CA_4_evaluation,1.181505,1.047041,1.111836,1.136465,1.005580,1.010101,1.222872,1.185974,1.029503,...,0.997457,1.036078,1.038107,0.989980,0.981473,0.947452,0.987737,0.992330,1.013949,1.151304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_4_evaluation,1.055771,0.987218,1.010024,0.992872,0.989041,1.023385,1.106491,1.101088,1.007092,...,0.973134,1.029917,1.068153,0.932990,0.972866,1.068167,0.975937,1.044382,1.051250,1.073906
3045,FOODS_3_824_CA_4_evaluation,0.859672,1.041967,1.029664,0.966207,0.985330,1.020368,1.085906,1.053208,1.042533,...,0.949769,1.033750,1.069836,0.974312,0.976648,1.088214,0.979718,1.062005,1.070790,1.073364
3046,FOODS_3_825_CA_4_evaluation,1.033140,0.974478,0.995395,0.960290,0.961067,0.982571,1.017450,0.981217,0.937039,...,0.898202,1.012407,1.001767,0.918644,0.926584,0.942272,0.955742,0.987026,1.028471,1.022497
3047,FOODS_3_826_CA_4_evaluation,1.267526,1.187583,1.322871,1.094494,1.066890,1.127527,1.145618,1.221091,1.469239,...,1.032314,1.100122,1.092584,1.030242,1.091893,1.127919,1.070376,1.110305,1.123231,1.159173


In [63]:
forecast_df.to_csv('CA_4_best.csv', index=False)

**TX_1**

In [64]:
feature_cols = [
    'lag_7', 'lag_28',
    'rolling_sales_mean_7', 'rolling_sales_mean_30','dayofweek',
    'weekday', 'month', 'snap_TX', 'event_name_1','event_type_1',
    'sell_price', 'price', 'is_weekend', 'days_since_first_sale'
]

In [64]:
df_tx1 = df[df['store_id'] == 4]
df_tx1.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59165840,FOODS_3_823_TX_1_evaluation,1432,2,0,4,1,d_1941,0,2016-05-22,11617,...,4.470405,2.98,0.0,2.98,1.0,6,20,True,1940,978
59165841,FOODS_3_824_TX_1_evaluation,1433,2,0,4,1,d_1941,0,2016-05-22,11617,...,4.475052,2.48,0.0,2.48,1.0,6,20,True,1940,548
59165842,FOODS_3_825_TX_1_evaluation,1434,2,0,4,1,d_1941,0,2016-05-22,11617,...,4.481283,3.98,0.0,4.38,0.908676,6,20,True,1940,730
59165843,FOODS_3_826_TX_1_evaluation,1435,2,0,4,1,d_1941,0,2016-05-22,11617,...,4.485808,1.28,0.0,1.28,1.0,6,20,True,1940,923
59165844,FOODS_3_827_TX_1_evaluation,1436,2,0,4,1,d_1941,1,2016-05-22,11617,...,4.485808,1.0,0.0,1.0,1.0,6,20,True,1940,436


In [65]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_tx1 = df_tx1['d_int'] <= 1913
valid_mask_tx1 = (df_tx1['d_int'] > 1913) & (df_tx1['d_int'] <= 1941)

train_X_tx1 = df_tx1[train_mask_tx1][feature_cols]
train_y_tx1 = df_tx1[train_mask_tx1]['sales']

valid_X_tx1 = df_tx1[valid_mask_tx1][feature_cols]
valid_y_tx1 = df_tx1[valid_mask_tx1]['sales']

In [66]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_tx1, valid_y_tx1)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_tx1, train_y_tx1, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.770942
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.361192
[LightGBM] [Debug] init for col-wise cost 0.114412 seconds, init for row-wise cost 0.211267 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.155144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.959291
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree 

In [67]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_tx1, train_y_tx1)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.770942
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.361192
[LightGBM] [Debug] init for col-wise cost 0.080997 seconds, init for row-wise cost 0.192830 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.959291
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Tr

In [67]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
10             sell_price        4330
13  days_since_first_sale        3962
0                   lag_7        3807
1                  lag_28        3037
3   rolling_sales_mean_30        2847
2    rolling_sales_mean_7        2123
6                   month        1595
4               dayofweek         820
7                 snap_TX         644
8            event_name_1         581
5                 weekday         353
9            event_type_1         153
12             is_weekend           5
11                  price           0


In [68]:
# Get the recursive prediction for all items of TX_1
forecast_dict = {}

for item_id in df_tx1['id'].unique():
    df_item = df_tx1[df_tx1['id'] == item_id].copy()

    temp_X = df_item[valid_mask_tx1][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols].copy()
  temp_X = df_item[valid_mask_tx1][feature_cols]

In [69]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_TX_1_evaluation,0.564291,0.802587,0.845100,0.816255,0.802319,0.888519,0.896268,0.807415,0.831893,...,0.830395,0.871421,0.854785,0.759714,0.796625,0.796625,0.814392,0.838942,0.899775,0.920524
1,HOBBIES_1_002_TX_1_evaluation,0.282447,0.760230,0.769268,0.771141,0.761740,0.850355,0.886358,0.769253,0.816377,...,0.809692,0.837522,0.839541,0.720209,0.758463,0.758463,0.768180,0.787439,0.857630,0.873957
2,HOBBIES_1_003_TX_1_evaluation,0.323669,0.862431,0.877252,0.879124,0.872696,0.957907,0.997351,0.880208,0.939572,...,0.928341,0.935109,0.934337,0.817503,0.868345,0.865904,0.875732,0.898394,0.956615,0.984912
3,HOBBIES_1_004_TX_1_evaluation,0.344819,0.793808,0.839947,0.827206,0.826312,0.921534,0.957542,0.846290,0.873918,...,0.871278,0.890286,0.881838,0.769953,0.828947,0.821081,0.833760,0.859179,0.920372,0.946368
4,HOBBIES_1_005_TX_1_evaluation,0.319041,0.843776,0.876953,0.875719,0.872924,0.965475,1.004805,0.889750,0.916909,...,0.921498,0.933842,0.954553,0.814365,0.869594,0.870667,0.882294,0.896954,0.974672,0.986224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_TX_1_evaluation,0.340751,0.932483,0.916219,0.914648,0.922562,1.004747,1.083995,0.922258,1.020294,...,1.029231,1.000424,1.067898,0.918127,0.911216,0.911216,0.911864,0.919431,1.002865,1.014208
3045,FOODS_3_824_TX_1_evaluation,0.320008,0.903669,0.891066,0.889193,0.895553,0.973944,1.070867,0.894601,0.977978,...,0.984713,0.985990,1.086516,0.890470,0.885761,0.885761,0.886409,0.891774,0.988431,0.986449
3046,FOODS_3_825_TX_1_evaluation,0.841132,0.875668,0.878558,0.876987,0.879594,0.962952,1.017489,0.875174,0.965208,...,0.980980,0.966042,1.024424,0.879871,0.874203,0.874203,0.875925,0.882492,0.963601,0.977385
3047,FOODS_3_826_TX_1_evaluation,0.417353,1.094919,1.063546,1.064652,1.110795,1.172841,1.217437,1.049755,1.168956,...,1.193122,1.147362,1.260334,1.045460,1.075792,1.044282,1.093719,1.050673,1.139905,1.169783


In [70]:
forecast_df.to_csv('TX_1_best.csv', index=False)

**TX_2**

In [71]:
df_tx2 = df[df['store_id'] == 5]
df_tx2.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59168889,FOODS_3_823_TX_2_evaluation,1432,2,0,5,1,d_1941,3,2016-05-22,11617,...,5.434309,2.98,0.0,2.98,1.0,6,20,True,1940,956
59168890,FOODS_3_824_TX_2_evaluation,1433,2,0,5,1,d_1941,0,2016-05-22,11617,...,5.434309,2.48,0.0,2.48,1.0,6,20,True,1940,809
59168891,FOODS_3_825_TX_2_evaluation,1434,2,0,5,1,d_1941,0,2016-05-22,11617,...,5.43517,3.98,0.0,4.38,0.908676,6,20,True,1940,1846
59168892,FOODS_3_826_TX_2_evaluation,1435,2,0,5,1,d_1941,0,2016-05-22,11617,...,5.43517,1.28,0.0,1.28,1.0,6,20,True,1940,1023
59168893,FOODS_3_827_TX_2_evaluation,1436,2,0,5,1,d_1941,1,2016-05-22,11617,...,5.43517,1.0,0.0,1.0,1.0,6,20,True,1940,194


In [71]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_tx2 = df_tx2['d_int'] <= 1913
valid_mask_tx2 = (df_tx2['d_int'] > 1913) & (df_tx2['d_int'] <= 1941)

train_X_tx2 = df_tx2[train_mask_tx2][feature_cols]
train_y_tx2 = df_tx2[train_mask_tx2]['sales']

valid_X_tx2 = df_tx2[valid_mask_tx2][feature_cols]
valid_y_tx2 = df_tx2[valid_mask_tx2]['sales']

In [72]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_tx2, valid_y_tx2)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_tx2, train_y_tx2, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.756500
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.351471
[LightGBM] [Debug] init for col-wise cost 0.119104 seconds, init for row-wise cost 0.169286 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.158077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1472
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.236878
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree 

In [73]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_tx2, train_y_tx2)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.756500
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.351471
[LightGBM] [Debug] init for col-wise cost 0.100624 seconds, init for row-wise cost 0.162421 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.137184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1472
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.236878
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Tr

In [76]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
10             sell_price        3781
0                   lag_7        3347
13  days_since_first_sale        3114
1                  lag_28        2753
3   rolling_sales_mean_30        2262
2    rolling_sales_mean_7        1825
6                   month        1398
4               dayofweek         784
8            event_name_1         658
7                 snap_TX         519
5                 weekday         320
9            event_type_1         184
12             is_weekend          10
11                  price           0


In [74]:
# Get the recursive prediction for all items of TX_2
forecast_dict = {}

for item_id in df_tx2['id'].unique():
    df_item = df_tx2[df_tx2['id'] == item_id].copy()

    temp_X = df_item[valid_mask_tx2][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols].copy()
  temp_X = df_item[valid_mask_tx2][feature_cols]

In [75]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_TX_2_evaluation,0.295645,0.825788,0.809715,0.825554,0.831469,0.882715,0.950825,0.813621,0.899977,...,0.905069,0.905683,0.973263,0.839386,0.820966,0.825788,0.817356,0.840164,0.900521,0.916373
1,HOBBIES_1_002_TX_2_evaluation,0.976124,0.850664,0.839443,0.850430,0.857088,0.913045,0.999629,0.839241,0.940191,...,0.941391,0.938102,1.034090,0.867034,0.845842,0.851825,0.839706,0.867812,0.935429,0.951778
2,HOBBIES_1_003_TX_2_evaluation,0.369812,0.899466,0.883060,0.905133,0.916614,0.958262,1.048649,0.895468,0.990859,...,0.974429,0.983632,1.086501,0.925399,0.899466,0.905367,0.899231,0.927337,0.984585,0.997308
3,HOBBIES_1_004_TX_2_evaluation,1.024285,0.921814,0.905408,0.922077,0.939459,0.987541,1.066836,0.917816,1.009046,...,0.989757,1.004820,1.088177,0.948244,0.921814,0.921814,0.922077,0.938962,1.007430,1.023779
4,HOBBIES_1_005_TX_2_evaluation,0.385902,0.912635,0.896229,0.912898,0.930280,0.985740,1.061818,0.908637,1.004028,...,0.984739,0.995641,1.083159,0.939065,0.912635,0.912635,0.912898,0.929783,1.000975,1.014600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_TX_2_evaluation,0.416896,0.924039,0.936548,0.936810,0.965134,1.036227,1.116566,0.954857,1.057558,...,1.069267,1.020024,1.137410,0.954857,0.937320,0.937320,0.936864,0.958971,1.022201,1.044025
3045,FOODS_3_824_TX_2_evaluation,1.029917,0.913748,0.926257,0.927292,0.955542,1.025936,1.109348,0.944566,1.053604,...,1.062050,1.009734,1.130193,0.944566,0.927030,0.927030,0.926573,0.946505,1.011910,1.033734
3046,FOODS_3_825_TX_2_evaluation,0.693611,0.948273,0.960158,0.961817,0.983206,1.049434,1.132402,0.979091,1.068710,...,1.087280,1.044259,1.155423,0.979091,0.960836,0.961555,0.961098,0.980533,1.044259,1.068259
3047,FOODS_3_826_TX_2_evaluation,1.221763,1.062304,1.071481,1.074848,1.137691,1.208275,1.343874,1.112564,1.246807,...,1.263480,1.186701,1.365557,1.093714,1.111342,1.073866,1.074129,1.137220,1.186701,1.208499


In [76]:
forecast_df.to_csv('TX_2_best.csv', index=False)

**TX_3**

In [80]:
df_tx3 = df[df['store_id'] == 6]
df_tx3.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59171938,FOODS_3_823_TX_3_evaluation,1432,2,0,6,1,d_1941,1,2016-05-22,11617,...,7.443914,2.98,0.0,2.98,1.0,6,20,True,1940,406
59171939,FOODS_3_824_TX_3_evaluation,1433,2,0,6,1,d_1941,0,2016-05-22,11617,...,7.443914,2.48,0.0,2.68,0.925373,6,20,True,1940,428
59171940,FOODS_3_825_TX_3_evaluation,1434,2,0,6,1,d_1941,3,2016-05-22,11617,...,7.424493,3.98,0.0,4.38,0.908676,6,20,True,1940,1236
59171941,FOODS_3_826_TX_3_evaluation,1435,2,0,6,1,d_1941,2,2016-05-22,11617,...,7.422181,1.28,0.0,1.28,1.0,6,20,True,1940,1180
59171942,FOODS_3_827_TX_3_evaluation,1436,2,0,6,1,d_1941,2,2016-05-22,11617,...,7.415138,1.0,0.0,1.0,1.0,6,20,True,1940,711


In [77]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_tx3 = df_tx3['d_int'] <= 1913
valid_mask_tx3 = (df_tx3['d_int'] > 1913) & (df_tx3['d_int'] <= 1941)

train_X_tx3 = df_tx3[train_mask_tx3][feature_cols]
train_y_tx3 = df_tx3[train_mask_tx3]['sales']

valid_X_tx3 = df_tx3[valid_mask_tx3][feature_cols]
valid_y_tx3 = df_tx3[valid_mask_tx3]['sales']

In [78]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_tx3, valid_y_tx3)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_tx3, train_y_tx3, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.768227
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.359236
[LightGBM] [Debug] init for col-wise cost 0.116951 seconds, init for row-wise cost 0.233834 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1422
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.043992
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
[LightGBM] [Debug] Trained a tree 

In [79]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_tx3, train_y_tx3)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.768227
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.359236
[LightGBM] [Debug] init for col-wise cost 0.074590 seconds, init for row-wise cost 0.257676 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.111633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1422
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.043992
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 14
[LightGBM] [Debug] Tr

In [83]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
0                   lag_7        7418
13  days_since_first_sale        7143
10             sell_price        6078
1                  lag_28        5709
3   rolling_sales_mean_30        4879
2    rolling_sales_mean_7        4045
6                   month        2711
4               dayofweek        1500
5                 weekday         939
7                 snap_TX         765
8            event_name_1         723
9            event_type_1         246
12             is_weekend           8
11                  price           0


In [80]:
# Get the recursive prediction for all items of TX_3
forecast_dict = {}

for item_id in df_tx3['id'].unique():
    df_item = df_tx3[df_tx3['id'] == item_id].copy()

    temp_X = df_item[valid_mask_tx3][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols].copy()
  temp_X = df_item[valid_mask_tx3][feature_cols]

In [81]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_TX_3_evaluation,0.278478,0.840155,0.833069,0.837125,0.806453,0.844257,0.933949,0.820552,0.930730,...,0.905011,0.864042,0.932873,0.854229,0.783567,0.833549,0.835038,0.800844,0.859346,0.879110
1,HOBBIES_1_002_TX_3_evaluation,0.312731,0.834596,0.831516,0.839540,0.836999,0.884247,0.973538,0.860676,0.937748,...,0.952660,0.906675,0.984357,0.851098,0.821645,0.843762,0.831906,0.836999,0.897311,0.916216
2,HOBBIES_1_003_TX_3_evaluation,0.724077,0.951526,0.944145,0.953053,0.949629,1.001503,1.104773,0.973305,1.068983,...,1.083895,1.019304,1.115592,0.965752,0.932462,0.958667,0.944536,0.963727,1.009940,1.028845
3,HOBBIES_1_004_TX_3_evaluation,0.350474,0.896825,0.891720,0.894188,0.899228,0.956395,1.039280,0.925828,1.000086,...,1.014113,0.964832,1.037414,0.911355,0.894135,0.904302,0.892110,0.911301,0.961351,0.979900
4,HOBBIES_1_005_TX_3_evaluation,0.709664,0.948772,0.943416,0.952943,0.954142,1.013355,1.092848,0.974116,1.042268,...,1.057105,1.007701,1.095799,0.957704,0.945831,0.957937,0.943806,0.957651,1.013228,1.028296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_TX_3_evaluation,0.708563,0.956024,0.954962,0.954709,0.970747,1.033311,1.142413,0.970747,1.065273,...,1.087131,1.023812,1.118775,0.976488,0.955352,0.951219,0.955352,0.957763,1.009673,1.024712
3045,FOODS_3_824_TX_3_evaluation,0.333522,0.954069,0.953006,0.952753,0.963444,1.026008,1.144950,0.963444,1.074307,...,1.091799,1.016510,1.134415,0.969185,0.953396,0.949263,0.953396,0.970952,1.019350,1.034389
3046,FOODS_3_825_TX_3_evaluation,1.336709,0.954541,0.950261,0.957178,0.973679,1.031922,1.133572,0.973679,1.065089,...,1.087694,1.024699,1.118356,0.977375,0.956858,0.952404,0.956239,0.963261,1.010560,1.017176
3047,FOODS_3_826_TX_3_evaluation,1.116973,1.193764,1.186476,1.183723,1.191223,1.243976,1.380052,1.192549,1.324673,...,1.340277,1.247720,1.342994,1.194462,1.199960,1.200590,1.198047,1.138910,1.201800,1.219388


In [82]:
forecast_df.to_csv('TX_3_best.csv', index=False)

**WI_1**

In [7]:
feature_cols = [
    'lag_7', 'lag_28',
    'rolling_sales_mean_7', 'rolling_sales_mean_30','dayofweek',
    'weekday', 'month', 'snap_WI', 'event_name_1','event_type_1',
    'sell_price', 'price', 'is_weekend', 'days_since_first_sale'
]

In [89]:
df_wi1 = df[df['store_id'] == 7]
df_wi1.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59174987,FOODS_3_823_WI_1_evaluation,1432,2,0,7,2,d_1941,2,2016-05-22,11617,...,4.103849,2.98,0.0,2.98,1.0,6,20,True,1940,949
59174988,FOODS_3_824_WI_1_evaluation,1433,2,0,7,2,d_1941,1,2016-05-22,11617,...,4.097074,2.48,0.0,2.68,0.925373,6,20,True,1940,635
59174989,FOODS_3_825_WI_1_evaluation,1434,2,0,7,2,d_1941,5,2016-05-22,11617,...,4.085601,3.98,0.0,4.38,0.908676,6,20,True,1940,813
59174990,FOODS_3_826_WI_1_evaluation,1435,2,0,7,2,d_1941,3,2016-05-22,11617,...,4.074892,1.28,0.0,1.28,1.0,6,20,True,1940,1517
59174991,FOODS_3_827_WI_1_evaluation,1436,2,0,7,2,d_1941,10,2016-05-22,11617,...,4.074892,1.0,0.0,1.0,1.0,6,20,True,1940,2836


In [84]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_wi1 = df_wi1['d_int'] <= 1913
valid_mask_wi1 = (df_wi1['d_int'] > 1913) & (df_wi1['d_int'] <= 1941)

train_X_wi1 = df_wi1[train_mask_wi1][feature_cols]
train_y_wi1 = df_wi1[train_mask_wi1]['sales']

valid_X_wi1 = df_wi1[valid_mask_wi1][feature_cols]
valid_y_wi1 = df_wi1[valid_mask_wi1]['sales']

In [85]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_wi1, valid_y_wi1)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_wi1, train_y_wi1, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


1 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Rachel\anaconda3\envs\tf_directml\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Rachel\anaconda3\envs\tf_directml\lib\site-packages\lightgbm\sklearn.py", line 1398, in fit
    super().fit(
  File "c:\Users\Rachel\anaconda3\envs\tf_directml\lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    self._Booster = train(
  File "c:\Users\Rachel\anaconda3\envs\tf_directml\lib\site-packages\lightgbm\engine.py", line 297, in train
    booster = Booster(params=

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.764914
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.360900
[LightGBM] [Debug] init for col-wise cost 0.105896 seconds, init for row-wise cost 0.156564 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144339 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1205
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.882787
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained 

In [86]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_wi1, train_y_wi1)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.764914
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.360900
[LightGBM] [Debug] init for col-wise cost 0.106974 seconds, init for row-wise cost 0.228401 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.145529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1205
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.882787
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Tr

In [92]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
10             sell_price        5834
13  days_since_first_sale        4948
0                   lag_7        4487
1                  lag_28        3447
3   rolling_sales_mean_30        2753
6                   month        1761
2    rolling_sales_mean_7        1611
4               dayofweek        1413
8            event_name_1         672
5                 weekday         535
7                 snap_WI         376
9            event_type_1         219
12             is_weekend          11
11                  price           0


In [87]:
# Get the recursive prediction for all items of WI_1
forecast_dict = {}

for item_id in df_wi1['id'].unique():
    df_item = df_wi1[df_wi1['id'] == item_id].copy()

    temp_X = df_item[valid_mask_wi1][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols].copy()
  temp_X = df_item[valid_mask_wi1][feature_cols]

In [88]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_WI_1_evaluation,0.191765,0.635264,0.705101,0.653941,0.668893,0.732521,0.859434,0.695380,0.706033,...,0.658022,0.795509,0.998538,0.651900,0.683414,0.696177,0.717527,0.624332,0.738533,0.894640
1,HOBBIES_1_002_WI_1_evaluation,0.736531,0.914680,0.991861,0.944457,0.989766,1.118587,1.196937,1.015954,1.017563,...,0.976344,1.160360,1.359248,0.959005,1.018040,1.004096,1.024439,1.009583,1.058386,1.279215
2,HOBBIES_1_003_WI_1_evaluation,0.959240,0.908767,0.983405,0.949577,0.990021,1.095561,1.206094,1.011932,1.034464,...,1.002297,1.179185,1.462544,0.934071,0.977499,0.968616,1.011400,0.981113,1.051380,1.256787
3,HOBBIES_1_004_WI_1_evaluation,0.277592,0.733927,0.822948,0.752339,0.771815,0.870496,1.013594,0.823986,0.835919,...,0.790250,0.966979,1.231742,0.747543,0.788993,0.755542,0.814317,0.860902,0.974660,1.074961
4,HOBBIES_1_005_WI_1_evaluation,0.845894,0.981156,1.044163,1.013562,1.050881,1.139055,1.252850,1.067971,1.085436,...,1.053272,1.254572,1.529034,0.980281,1.021340,1.045504,1.066099,1.136348,1.129879,1.650933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_WI_1_evaluation,0.871105,1.046162,1.056745,1.097604,1.114858,1.249700,1.440616,1.162407,1.207781,...,1.162266,1.341429,1.538674,1.079584,1.065307,1.197942,1.132432,1.142119,1.232847,1.692397
3045,FOODS_3_824_WI_1_evaluation,0.453816,1.125148,1.145229,1.158924,1.201748,1.327869,1.528389,1.254776,1.298030,...,1.221684,1.427645,1.944869,1.146137,1.122540,1.273168,1.193400,1.205377,1.310488,1.776594
3046,FOODS_3_825_WI_1_evaluation,0.450766,0.986358,0.979390,1.022885,1.019841,1.173720,1.397501,1.087714,1.124841,...,1.094775,1.260859,1.438390,1.015256,0.982190,1.102372,1.144194,1.054938,1.125119,1.400828
3047,FOODS_3_826_WI_1_evaluation,0.201581,0.854027,0.836816,0.890384,0.903906,1.114872,1.231739,0.998897,1.045494,...,0.969374,1.178178,1.337573,0.915880,0.877472,0.931192,0.929037,0.938019,1.018445,1.267249


In [89]:
forecast_df.to_csv('WI_1_best.csv', index=False)

**WI_2**

In [97]:
df_wi2 = df[df['store_id'] == 8]
df_wi2.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59178036,FOODS_3_823_WI_2_evaluation,1432,2,0,8,2,d_1941,1,2016-05-22,11617,...,6.195685,2.98,0.0,2.98,1.0,6,20,True,1940,920
59178037,FOODS_3_824_WI_2_evaluation,1433,2,0,8,2,d_1941,0,2016-05-22,11617,...,6.20057,2.48,0.0,2.68,0.925373,6,20,True,1940,785
59178038,FOODS_3_825_WI_2_evaluation,1434,2,0,8,2,d_1941,6,2016-05-22,11617,...,6.201808,3.98,0.0,4.38,0.908676,6,20,True,1940,1178
59178039,FOODS_3_826_WI_2_evaluation,1435,2,0,8,2,d_1941,2,2016-05-22,11617,...,6.204837,1.28,0.0,1.28,1.0,6,20,True,1940,1188
59178040,FOODS_3_827_WI_2_evaluation,1436,2,0,8,2,d_1941,4,2016-05-22,11617,...,6.196945,1.0,0.0,1.0,1.0,6,20,True,1940,926


In [8]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_wi2 = df_wi2['d_int'] <= 1913
valid_mask_wi2 = (df_wi2['d_int'] > 1913) & (df_wi2['d_int'] <= 1941)

train_X_wi2 = df_wi2[train_mask_wi2][feature_cols]
train_y_wi2 = df_wi2[train_mask_wi2]['sales']

valid_X_wi2 = df_wi2[valid_mask_wi2][feature_cols]
valid_y_wi2 = df_wi2[valid_mask_wi2]['sales']

In [9]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_wi2, valid_y_wi2)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_wi2, train_y_wi2, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.770936
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.364282
[LightGBM] [Debug] init for col-wise cost 0.125399 seconds, init for row-wise cost 0.234910 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.167291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.121945
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree 

In [10]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_wi2, train_y_wi2)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.770936
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.364282
[LightGBM] [Debug] init for col-wise cost 0.071861 seconds, init for row-wise cost 0.216446 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.110062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.121945
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Tr

In [100]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
13  days_since_first_sale        6938
0                   lag_7        6263
3   rolling_sales_mean_30        5696
1                  lag_28        5177
10             sell_price        4681
2    rolling_sales_mean_7        4259
6                   month        2560
7                 snap_WI        1475
4               dayofweek        1470
5                 weekday         925
8            event_name_1         507
9            event_type_1         180
12             is_weekend           1
11                  price           0


In [11]:
# Get the recursive prediction for all items of WI_2
forecast_dict = {}

for item_id in df_wi2['id'].unique():
    df_item = df_wi2[df_wi2['id'] == item_id].copy()

    temp_X = df_item[valid_mask_wi2][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols].copy()
  temp_X = df_item[valid_mask_wi2][feature_cols]

In [12]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_WI_2_evaluation,0.250902,0.825936,0.725977,0.827809,0.810496,0.962855,0.991672,1.113052,1.137896,...,0.889710,1.212050,1.191172,0.839600,0.829515,0.853559,0.843703,0.854533,0.888947,0.990428
1,HOBBIES_1_002_WI_2_evaluation,0.328013,0.919801,0.817250,0.919082,0.890831,1.063868,1.093058,1.221542,1.240852,...,0.963150,1.307274,1.273799,0.909841,0.921837,0.940316,0.935063,0.939185,0.988310,1.078507
2,HOBBIES_1_003_WI_2_evaluation,0.381151,0.989370,0.924420,0.997561,0.952209,1.149253,1.181414,1.329151,1.341961,...,1.042932,1.393014,1.382080,0.989990,1.020059,1.026636,1.026468,1.027316,1.082275,1.166864
3,HOBBIES_1_004_WI_2_evaluation,0.693178,0.799478,0.720234,0.811697,0.781371,0.973791,1.002981,1.136118,1.093669,...,0.874836,1.208578,1.186282,0.841228,0.840529,0.855187,0.846938,0.827485,0.900184,1.003676
4,HOBBIES_1_005_WI_2_evaluation,1.190394,0.927109,0.855240,0.934662,0.889948,1.071531,1.099980,1.256042,1.221674,...,0.997048,1.331013,1.313549,0.945608,0.957797,0.964375,0.936164,0.946994,1.029280,1.096962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_WI_2_evaluation,0.480196,1.035243,1.032915,1.027777,1.245044,1.296678,1.316221,1.447247,1.522557,...,1.243833,1.450326,1.453950,1.201362,1.024940,1.043567,1.208690,1.197205,1.335582,1.380681
3045,FOODS_3_824_WI_2_evaluation,0.447021,1.091459,1.076600,1.077731,1.306074,1.297967,1.493774,1.349934,1.717066,...,1.403632,1.659504,1.950657,1.420926,1.079564,1.099844,1.263315,1.258234,1.708769,1.617023
3046,FOODS_3_825_WI_2_evaluation,1.148674,1.036167,1.050626,1.042823,1.266570,1.144721,1.340948,1.328046,1.570887,...,1.207417,1.488153,1.500930,1.339274,1.017860,1.056149,1.237075,1.298545,1.353464,1.522180
3047,FOODS_3_826_WI_2_evaluation,1.710062,1.796190,1.838206,1.782063,2.030695,2.096632,2.164222,2.232074,2.587715,...,1.926816,2.515187,2.488486,1.956865,1.407158,2.125756,1.947064,1.979813,2.538297,2.484253


In [13]:
forecast_df.to_csv('WI_2_best.csv', index=False)

**WI_3**

In [104]:
df_wi3 = df[df['store_id'] == 9]
df_wi3.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,rolling_sales_std_90,price,price_change,price_max,price_norm,dayofweek,week,is_weekend,days_since_first_sale,cumsum_sales
59181085,FOODS_3_823_WI_3_evaluation,1432,2,0,9,2,d_1941,1,2016-05-22,11617,...,4.347571,2.98,0.0,2.98,1.0,6,20,True,1940,1040
59181086,FOODS_3_824_WI_3_evaluation,1433,2,0,9,2,d_1941,0,2016-05-22,11617,...,4.354973,2.48,0.0,2.68,0.925373,6,20,True,1940,728
59181087,FOODS_3_825_WI_3_evaluation,1434,2,0,9,2,d_1941,2,2016-05-22,11617,...,4.345503,3.98,0.0,4.38,0.908676,6,20,True,1940,1734
59181088,FOODS_3_826_WI_3_evaluation,1435,2,0,9,2,d_1941,0,2016-05-22,11617,...,4.338732,1.28,0.0,1.28,1.0,6,20,True,1940,739
59181089,FOODS_3_827_WI_3_evaluation,1436,2,0,9,2,d_1941,1,2016-05-22,11617,...,4.338732,1.0,0.0,1.0,1.0,6,20,True,1940,1364


In [14]:
# Split into training (1-1913) & validation set (1914-1941)

train_mask_wi3 = df_wi3['d_int'] <= 1913
valid_mask_wi3 = (df_wi3['d_int'] > 1913) & (df_wi3['d_int'] <= 1941)

train_X_wi3 = df_wi3[train_mask_wi3][feature_cols]
train_y_wi3 = df_wi3[train_mask_wi3]['sales']

valid_X_wi3 = df_wi3[valid_mask_wi3][feature_cols]
valid_y_wi3 = df_wi3[valid_mask_wi3]['sales']

In [15]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
lgb_model = LGBMRegressor(objective='regression', verbose=100)

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1],  
    'num_leaves': [128, 256],     
    'max_depth': [10, -1],         
    'n_estimators': [1000]         
}
# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # RMSE
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
fit_params = {
    "eval_set": [(valid_X_wi3, valid_y_wi3)],
    "eval_metric": "rmse",
    "callbacks": [lgb.early_stopping(50)]
}

grid_search.fit(train_X_wi3, train_y_wi3, **fit_params)

# Best model
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.768914
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.360053
[LightGBM] [Debug] init for col-wise cost 0.112218 seconds, init for row-wise cost 0.162479 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.148449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1418
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.102018
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 9
[LightGBM] [Debug] Trained a tree w

In [16]:
best_params = grid_search.best_params_

final_model = LGBMRegressor(**best_params)

final_model.fit(train_X_wi3, train_y_wi3)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.768914
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.360053
[LightGBM] [Debug] init for col-wise cost 0.101109 seconds, init for row-wise cost 0.149855 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.137172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1418
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.102018
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 128 and depth = 10
[LightGBM] [Debug] Tra

In [107]:
## Feature importance ##

# Get feature names
feature_names = model.feature_name()

# Get importance values
importances = model.feature_importance(importance_type='split')  # 預設是 split

# Convert into dataframe
fi_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(fi_df)

                  feature  importance
13  days_since_first_sale        4406
0                   lag_7        4006
10             sell_price        3652
1                  lag_28        3234
3   rolling_sales_mean_30        2793
2    rolling_sales_mean_7        2146
6                   month        1457
7                 snap_WI         903
4               dayofweek         823
8            event_name_1         479
5                 weekday         472
9            event_type_1         136
12             is_weekend           4
11                  price           0


In [17]:
# Get the recursive prediction for all items of WI_3
forecast_dict = {}

for item_id in df_wi3['id'].unique():
    df_item = df_wi3[df_wi3['id'] == item_id].copy()

    temp_X = df_item[valid_mask_wi3][feature_cols].copy()
    preds = []

    for day in range(28):
        pred = final_model.predict(temp_X.iloc[[day]])[0]
        preds.append(pred)

        # 更新 lag_7, lag_28
        if day + 1 < len(temp_X):
            temp_X.loc[temp_X.index[day + 1], 'lag_7'] = preds[max(0, day - 6)]
            temp_X.loc[temp_X.index[day + 1], 'lag_28'] = preds[max(0, day - 27)]

    forecast_dict[item_id] = preds

  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols].copy()
  temp_X = df_item[valid_mask_wi3][feature_cols]

In [18]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_WI_3_evaluation,0.260776,0.787078,0.786974,0.797914,0.810591,0.876049,0.881759,0.961507,1.020135,...,0.810591,0.988519,1.007338,0.811328,0.802772,0.802667,0.802772,0.810591,0.877403,0.891046
1,HOBBIES_1_002_WI_3_evaluation,0.349516,0.829348,0.829244,0.847991,0.863841,0.940285,0.970377,1.056267,1.111721,...,0.863841,1.094263,1.113082,0.861405,0.852849,0.852744,0.852849,0.863841,0.938465,0.952107
2,HOBBIES_1_003_WI_3_evaluation,0.386111,0.882445,0.882341,0.882445,0.897220,0.984804,0.990513,1.090563,1.107074,...,0.897728,1.142504,1.153113,0.880842,0.872794,0.875719,0.872794,0.883279,0.958969,0.977788
3,HOBBIES_1_004_WI_3_evaluation,0.609298,0.824113,0.840815,0.840919,0.881546,0.954312,0.946584,1.079169,1.082801,...,0.881546,1.111989,1.119424,0.865169,0.856612,0.840815,0.856612,0.864431,0.937052,0.955871
4,HOBBIES_1_005_WI_3_evaluation,0.330843,0.783932,0.785676,0.785781,0.833961,0.907425,0.927255,1.008885,1.005361,...,0.833961,1.047948,1.058557,0.817584,0.809027,0.783828,0.809535,0.869501,0.895710,0.914529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_WI_3_evaluation,0.390956,0.864581,0.864985,0.880918,0.896412,0.983502,0.989212,1.108809,1.085890,...,0.905851,1.141103,1.100975,0.881837,0.880918,0.880305,0.880918,0.896412,0.967092,0.978391
3045,FOODS_3_824_WI_3_evaluation,0.391740,0.945509,0.945912,0.965562,0.990774,1.094803,1.069541,1.198030,1.180818,...,1.003930,1.236030,1.193956,0.973376,0.965562,0.964949,0.965562,0.990774,1.065171,1.051799
3046,FOODS_3_825_WI_3_evaluation,1.164487,0.906381,0.902192,0.931037,0.933330,1.039465,1.045175,1.156613,1.117258,...,0.959322,1.175199,1.083160,0.902028,0.907168,0.937216,0.931037,0.937703,0.993997,1.004893
3047,FOODS_3_826_WI_3_evaluation,1.109791,1.138912,1.136953,1.142724,1.187543,1.259744,1.283724,1.340205,1.362677,...,1.152692,1.419061,1.420560,1.142759,1.120044,1.111237,1.145358,1.185458,1.268228,1.278920


In [19]:
forecast_df.to_csv('WI_3_best.csv', index=False)

#### Non-Recursive Training (per store)

In [126]:
top_3_stores = (
    df.groupby('store_id')['sales']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .index.tolist()
)

print("Top 3 stores by sales volume:", top_3_stores)

Top 3 stores by sales volume: [2, 0, 5]


Top 3 sales performance stores are **CA_1**, **CA_3**, **TX_2**.  
Only train the non-recursive models (per store) based on these 3 stores, to get an idea of how to ensemble recursive and non-recursive predictions together.

**CA_1**

In [None]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_ca1[f'target_{i}'] = df_ca1.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_ca1 = df_ca1.dropna(subset=[f'target_{i}' for i in range(1,29)])

In [115]:
valid_df_ca1 = df_ca1[(df_ca1['d_int'] >= 1914) & (df_ca1['d_int'] <= 1941)].copy()

In [10]:
feature_cols_ca = [
    'lag_7', 'lag_28',
    'rolling_sales_mean_7', 'rolling_sales_mean_30','dayofweek',
    'weekday', 'month', 'snap_CA', 'event_name_1','event_type_1',
    'sell_price', 'price', 'is_weekend', 'days_since_first_sale'
]

In [120]:
item_id_list_ca1 = valid_df_ca1['id'].unique()

In [10]:
## Define the model training + predicting function (28 predictions in one shot)

from lightgbm import LGBMRegressor

def train_predict_28_models(train_df, valid_df, feature_cols, item_id_list, model_params=None):
    if model_params is None:
        model_params = {
            'objective': 'regression',
            'learning_rate': 0.05,
            'num_leaves': 128,
            'n_estimators': 1000
        }

    models = {}
    
    # train 28 models (1 model for exactly one horizon (aka. day))
    for horizon in range(1, 29):
        print(f"Training model for horizon F{horizon}...")
        
        train_X = train_df[feature_cols]
        train_y = train_df[f'target_{horizon}']
        
        model = LGBMRegressor(**model_params)
        model.fit(train_X, train_y, eval_metric='rmse')
        
        models[f'F{horizon}'] = model
    
    print("Training done. Start predicting...")

    forecast_dict = {}
    
    for item_id in item_id_list:
        preds = []
        
        X_valid = valid_df[(valid_df['id'] == item_id)][feature_cols]
        
        for horizon in range(1, 29):
            pred = models[f'F{horizon}'].predict(X_valid)[0]
            preds.append(pred)
        
        forecast_dict[item_id] = preds

    return models, forecast_dict


In [121]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_ca1,
    valid_df = valid_df_ca1,
    feature_cols = feature_cols_ca,
    item_id_list = item_id_list_ca1
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.150295 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.319852
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.189684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.319775
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [123]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# column 改成 F1 ~ F28
forecast_df.columns = [f'F{i}' for i in range(1, 29)]

# reset index → id 變 column
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

In [125]:
forecast_df.to_csv('non-recursive_CA_1.csv', index=False)

**CA_2**

In [6]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_ca2[f'target_{i}'] = df_ca2.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_ca2 = df_ca2.dropna(subset=[f'target_{i}' for i in range(1,29)])

  df_ca2[f'target_{i}'] = df_ca2.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ca2[f'target_{i}'] = df_ca2.groupby('id')['sales'].shift(-i)
  df_ca2[f'target_{i}'] = df_ca2.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ca2[f'target_{i}'] = df_ca2.groupby('id')['sales'].shift(-i)
  df_ca2[f'target_{i}'] = df_ca2.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [7]:
valid_df_ca2 = df_ca2[(df_ca2['d_int'] >= 1914) & (df_ca2['d_int'] <= 1941)].copy()

In [8]:
item_id_list_ca2 = valid_df_ca2['id'].unique()

In [11]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_ca2,
    valid_df = valid_df_ca2,
    feature_cols = feature_cols_ca,
    item_id_list = item_id_list_ca2
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1241
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.974827
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.155341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1241
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.974909
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [12]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_2_evaluation,0.986143,0.792850,0.926140,1.234130,1.439280,1.540021,0.870360,0.912572,0.922437,...,1.464527,1.287189,1.047282,1.054415,1.004118,0.939540,1.132286,1.467108,1.575051,1.056145
1,HOBBIES_1_002_CA_2_evaluation,0.469787,0.408931,0.449151,0.641140,0.742310,0.768166,0.474345,0.484127,0.470755,...,0.775168,0.886889,0.573774,0.620304,0.594654,0.565830,0.719328,0.915242,0.957303,0.499388
2,HOBBIES_1_003_CA_2_evaluation,0.475441,0.447451,0.470937,0.683403,0.815835,0.857019,0.548990,0.563256,0.519757,...,0.910262,0.909101,0.604779,0.667761,0.633370,0.618555,0.720549,1.016617,1.090856,0.543186
3,HOBBIES_1_004_CA_2_evaluation,1.834301,1.705974,1.745730,2.312278,2.894756,2.873720,1.716099,1.968250,1.752172,...,3.046895,2.562642,1.987769,1.847852,1.824838,1.857179,2.176314,3.018988,2.907113,2.029714
4,HOBBIES_1_005_CA_2_evaluation,1.067608,1.025390,1.003823,1.312655,1.721818,1.780193,1.129821,1.158024,1.051452,...,1.727043,1.824243,1.256331,1.245035,1.170382,1.147832,1.326500,1.754822,1.858897,1.210624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_2_evaluation,1.527006,1.371080,1.466281,1.716228,2.425145,2.598773,1.579443,1.511421,1.552308,...,2.375030,2.315705,1.563221,1.522599,1.485424,1.456423,1.685198,2.452619,2.486937,1.605882
3045,FOODS_3_824_CA_2_evaluation,0.971176,0.979405,0.927352,1.085414,1.537557,1.583084,0.991770,0.991552,0.979754,...,1.587481,1.522190,1.059636,1.056578,1.088709,1.077909,1.240664,1.659145,1.727327,1.067964
3046,FOODS_3_825_CA_2_evaluation,0.807787,0.815203,0.804295,1.015674,1.267591,1.337256,0.831814,0.865656,0.832552,...,1.357990,1.346040,0.948162,0.900238,0.928637,0.888538,1.072262,1.398754,1.329856,0.876180
3047,FOODS_3_826_CA_2_evaluation,0.972414,0.953226,1.043907,1.347755,1.741618,1.789435,0.985924,1.042280,1.048587,...,1.477452,1.673886,0.928347,0.984039,1.074459,0.954651,1.110852,1.487819,1.598472,0.802338


In [13]:
forecast_df.to_csv('non-recursive_CA_2.csv', index=False)

**CA_3**

In [None]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_ca3[f'target_{i}'] = df_ca3.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_ca3 = df_ca3.dropna(subset=[f'target_{i}' for i in range(1,29)])

In [128]:
valid_df_ca3 = df_ca3[(df_ca3['d_int'] >= 1914) & (df_ca3['d_int'] <= 1941)].copy()

In [130]:
item_id_list_ca3 = valid_df_ca3['id'].unique()

In [131]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_ca3,
    valid_df = valid_df_ca3,
    feature_cols = feature_cols_ca,
    item_id_list = item_id_list_ca3
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.116043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1561
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.918448
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.118266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1561
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.918527
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [132]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_3_evaluation,0.945107,0.993302,0.984380,1.049392,1.174371,1.233396,1.110417,1.049462,0.937549,...,1.161316,1.240699,1.043207,0.944222,0.963392,0.887834,1.081603,1.206855,1.229884,0.975567
1,HOBBIES_1_002_CA_3_evaluation,1.060778,0.984867,0.996577,1.021919,1.345452,1.424399,1.146992,1.134346,1.069417,...,1.397079,1.494390,1.227560,1.072646,1.010616,0.973001,1.102140,1.368659,1.487666,1.113276
2,HOBBIES_1_003_CA_3_evaluation,0.468800,0.433009,0.472804,0.487670,0.681109,0.703805,0.557868,0.541880,0.551992,...,0.798962,0.809547,0.713052,0.658283,0.656609,0.631934,0.680851,0.837178,0.874759,0.648677
3,HOBBIES_1_004_CA_3_evaluation,6.835809,7.786350,8.636765,7.157695,9.641088,10.156330,7.862150,7.855222,9.011671,...,11.486362,11.475313,8.622261,9.082927,7.863623,8.838212,8.365821,8.312971,9.268687,8.306550
4,HOBBIES_1_005_CA_3_evaluation,1.075003,0.985724,0.907222,0.981678,1.380480,1.472746,1.099452,1.069808,1.061896,...,1.447457,1.398146,1.174933,1.017397,0.947815,0.908546,0.961289,1.337674,1.396883,1.050041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_3_evaluation,1.811102,1.682046,1.617543,1.715828,2.226603,2.326017,1.894057,1.874125,1.791521,...,2.242425,2.284734,1.986811,1.721550,1.681039,1.623661,1.740244,2.164968,2.350819,1.900213
3045,FOODS_3_824_CA_3_evaluation,0.608951,0.570602,0.620367,0.615125,0.820279,0.844499,0.730381,0.726420,0.750515,...,0.994532,1.062937,1.141035,0.899126,0.908547,0.863955,0.940801,1.089470,1.146553,0.943346
3046,FOODS_3_825_CA_3_evaluation,1.820718,1.677931,1.619522,1.724258,2.182318,2.363360,1.921839,1.887121,1.820495,...,2.309433,2.378193,2.068199,1.793462,1.741748,1.713563,1.799479,2.248097,2.424027,1.956799
3047,FOODS_3_826_CA_3_evaluation,1.248515,1.223151,1.131214,1.287921,1.508498,1.776701,1.346805,1.330760,1.420110,...,1.598763,1.756886,1.639618,1.398489,1.242983,1.146683,1.294457,1.818276,1.549983,1.299624


In [133]:
forecast_df.to_csv('non-recursive_CA_3.csv', index=False)

**CA_4**

In [14]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_ca4[f'target_{i}'] = df_ca4.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_ca4 = df_ca4.dropna(subset=[f'target_{i}' for i in range(1,29)])

  df_ca4[f'target_{i}'] = df_ca4.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ca4[f'target_{i}'] = df_ca4.groupby('id')['sales'].shift(-i)
  df_ca4[f'target_{i}'] = df_ca4.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ca4[f'target_{i}'] = df_ca4.groupby('id')['sales'].shift(-i)
  df_ca4[f'target_{i}'] = df_ca4.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [15]:
valid_df_ca4 = df_ca4[(df_ca4['d_int'] >= 1914) & (df_ca4['d_int'] <= 1941)].copy()

In [16]:
item_id_list_ca4 = valid_df_ca4['id'].unique()

In [18]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_ca4,
    valid_df = valid_df_ca4,
    feature_cols = feature_cols_ca,
    item_id_list = item_id_list_ca4
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.163883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1184
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.703755
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.239065 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1184
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.703878
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [19]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_4_evaluation,0.821292,0.703081,0.665997,0.655718,0.900914,1.010811,0.648763,0.805900,0.989350,...,0.737356,0.781305,0.785821,0.811819,0.638956,0.602013,0.679974,0.852319,0.837931,0.839069
1,HOBBIES_1_002_CA_4_evaluation,0.340938,0.365820,0.271735,0.340150,0.411119,0.421127,0.253372,0.178631,0.488411,...,0.290155,0.347667,0.295139,0.255909,0.207583,0.311706,0.313174,0.362207,0.438720,0.411611
2,HOBBIES_1_003_CA_4_evaluation,0.383620,0.351057,0.396768,0.326415,0.493933,0.459678,0.366329,0.297244,0.521024,...,0.431470,0.394133,0.416470,0.361575,0.317056,0.348192,0.397131,0.343318,0.537674,0.475347
3,HOBBIES_1_004_CA_4_evaluation,1.428043,1.370453,1.321145,1.341639,1.665223,1.816597,1.351566,1.263857,1.623014,...,1.547189,1.343994,1.466555,1.416203,1.220587,1.179414,1.203331,1.691429,1.501338,1.584380
4,HOBBIES_1_005_CA_4_evaluation,1.396753,1.425564,1.485605,1.391746,1.411971,1.650245,1.693034,1.110083,1.694029,...,1.497696,1.882145,1.612641,1.434751,1.458502,1.128389,1.478107,1.431159,1.608503,1.460078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_4_evaluation,0.994741,1.010649,0.993660,1.019131,1.092066,1.191090,1.061663,1.062966,1.127697,...,1.165958,1.151770,1.145857,1.002517,1.005001,0.963768,1.018366,1.208445,1.209310,1.067890
3045,FOODS_3_824_CA_4_evaluation,0.800446,0.775407,0.808151,0.787409,0.918797,0.900512,0.829180,0.878237,0.779162,...,0.899941,0.951070,0.897157,0.834573,0.784849,0.783150,0.839248,0.964418,0.976838,0.855009
3046,FOODS_3_825_CA_4_evaluation,0.937981,1.037341,0.908101,0.925126,1.127793,1.119621,0.973818,1.006507,0.961775,...,1.135971,1.125752,1.067301,0.980844,0.926059,0.914499,0.982816,1.102565,1.129085,1.012318
3047,FOODS_3_826_CA_4_evaluation,1.064020,1.120209,1.109976,1.206896,1.227242,1.395750,0.890612,0.936106,0.802258,...,1.043979,1.079466,1.012068,0.873285,0.865500,0.969448,0.964180,1.187171,1.316794,1.011251


In [20]:
forecast_df.to_csv('non-recursive_CA_4.csv', index=False)

**TX_1**

In [21]:
feature_cols_tx = [
    'lag_7', 'lag_28',
    'rolling_sales_mean_7', 'rolling_sales_mean_30','dayofweek',
    'weekday', 'month', 'snap_TX', 'event_name_1','event_type_1',
    'sell_price', 'price', 'is_weekend', 'days_since_first_sale'
]

In [None]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_tx1[f'target_{i}'] = df_tx1.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_tx1 = df_tx1.dropna(subset=[f'target_{i}' for i in range(1,29)])

In [23]:
valid_df_tx1 = df_tx1[(df_tx1['d_int'] >= 1914) & (df_tx1['d_int'] <= 1941)].copy()

In [24]:
item_id_list_tx1 = valid_df_tx1['id'].unique()

In [25]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_tx1,
    valid_df = valid_df_tx1,
    feature_cols = feature_cols_tx,
    item_id_list = item_id_list_tx1
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.155051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.959380
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.161821 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.959409
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [26]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_TX_1_evaluation,0.517796,0.430208,0.498792,0.518258,0.608594,0.704289,0.561777,0.545359,0.500436,...,0.682382,0.666161,0.594835,0.541311,0.563448,0.540523,0.587673,0.704299,0.788581,0.601272
1,HOBBIES_1_002_TX_1_evaluation,0.297004,0.277740,0.301334,0.315057,0.431449,0.430368,0.370243,0.297052,0.323871,...,0.500337,0.487305,0.399323,0.377703,0.420385,0.412173,0.416367,0.476292,0.572225,0.421900
2,HOBBIES_1_003_TX_1_evaluation,0.327951,0.307815,0.323660,0.352531,0.476661,0.477694,0.405837,0.342061,0.361734,...,0.572291,0.538716,0.460139,0.436293,0.469540,0.453230,0.461549,0.545944,0.647024,0.492084
3,HOBBIES_1_004_TX_1_evaluation,0.353853,0.323173,0.327057,0.359222,0.470300,0.484048,0.408925,0.335184,0.388966,...,0.551801,0.529523,0.458383,0.415237,0.480485,0.451812,0.477945,0.532154,0.645064,0.485390
4,HOBBIES_1_005_TX_1_evaluation,0.324888,0.302710,0.315885,0.356952,0.501697,0.506438,0.396232,0.347467,0.371125,...,0.579074,0.558097,0.470383,0.442902,0.535158,0.462916,0.483931,0.541150,0.642863,0.495141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_TX_1_evaluation,0.346907,0.324099,0.344272,0.363843,0.482651,0.504431,0.416130,0.368037,0.381304,...,0.577650,0.562294,0.486617,0.470630,0.500543,0.484306,0.479818,0.575290,0.660757,0.511377
3045,FOODS_3_824_TX_1_evaluation,0.342583,0.320768,0.334078,0.366475,0.456067,0.487965,0.416380,0.365669,0.366841,...,0.576817,0.559330,0.489097,0.474291,0.508039,0.480427,0.479821,0.562895,0.671340,0.512380
3046,FOODS_3_825_TX_1_evaluation,0.855061,0.800947,0.804935,0.828194,1.097481,1.136469,0.929232,0.858606,0.833419,...,1.157026,1.165803,1.003871,0.896175,0.909396,0.910478,0.942139,1.111622,1.231485,0.980987
3047,FOODS_3_826_TX_1_evaluation,0.450932,0.504540,0.437236,0.554929,0.665643,0.732176,0.601154,0.564769,0.541121,...,0.921847,0.822233,0.896901,0.738549,0.828405,0.792243,0.813305,0.907193,0.944850,0.820455


In [27]:
forecast_df.to_csv('non-recursive_TX_1.csv', index=False)

**TX_2**

In [28]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_tx2[f'target_{i}'] = df_tx2.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_tx2 = df_tx2.dropna(subset=[f'target_{i}' for i in range(1,29)])

  df_tx2[f'target_{i}'] = df_tx2.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tx2[f'target_{i}'] = df_tx2.groupby('id')['sales'].shift(-i)
  df_tx2[f'target_{i}'] = df_tx2.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tx2[f'target_{i}'] = df_tx2.groupby('id')['sales'].shift(-i)
  df_tx2[f'target_{i}'] = df_tx2.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [29]:
valid_df_tx2 = df_tx2[(df_tx2['d_int'] >= 1914) & (df_tx2['d_int'] <= 1941)].copy()

In [138]:
item_id_list_tx2 = valid_df_tx2['id'].unique()

In [139]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_tx2,
    valid_df = valid_df_tx2,
    feature_cols = feature_cols_tx,
    item_id_list = item_id_list_tx2
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.148044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1472
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.236883
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1472
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.236809
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [140]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_TX_2_evaluation,0.283017,0.289738,0.294513,0.330855,0.392002,0.430837,0.356930,0.307497,0.314311,...,0.485194,0.497294,0.446877,0.399167,0.428748,0.438405,0.449720,0.488340,0.590630,0.440743
1,HOBBIES_1_002_TX_2_evaluation,0.907255,0.877223,0.849226,0.967127,1.132521,1.234043,0.961289,0.876174,0.859719,...,1.164809,1.177978,0.988981,0.904607,0.922837,0.898113,0.990860,1.151174,1.256521,0.935470
2,HOBBIES_1_003_TX_2_evaluation,0.354641,0.360726,0.398068,0.427039,0.500305,0.533563,0.441752,0.393091,0.414759,...,0.625388,0.609385,0.540395,0.501414,0.528190,0.523299,0.547681,0.594028,0.722409,0.539498
3,HOBBIES_1_004_TX_2_evaluation,0.943900,0.931662,0.934872,1.012892,1.240606,1.294519,1.026258,0.939390,0.909628,...,1.207473,1.248844,1.027769,0.964283,0.977665,0.937159,1.067631,1.231695,1.337450,1.003484
4,HOBBIES_1_005_TX_2_evaluation,0.349462,0.379899,0.408876,0.459395,0.498513,0.551255,0.475196,0.433695,0.433598,...,0.644479,0.660132,0.568925,0.534491,0.575344,0.549289,0.592994,0.621756,0.757801,0.568858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_TX_2_evaluation,0.406170,0.408681,0.439565,0.452373,0.560482,0.573379,0.476959,0.445417,0.457453,...,0.693635,0.662466,0.590948,0.551926,0.583771,0.573162,0.587911,0.680443,0.770591,0.601998
3045,FOODS_3_824_TX_2_evaluation,0.971903,0.930780,0.931434,1.008337,1.209136,1.306056,1.037779,0.933704,0.928487,...,1.203992,1.186223,0.991980,0.973244,1.021293,0.980060,1.082473,1.233800,1.360460,0.996615
3046,FOODS_3_825_TX_2_evaluation,0.656945,0.641522,0.684301,0.699659,0.870265,0.897801,0.733807,0.681361,0.718512,...,0.960911,0.899550,0.787105,0.744326,0.771208,0.744406,0.773482,0.890872,0.993629,0.786700
3047,FOODS_3_826_TX_2_evaluation,1.346208,1.406083,1.226832,1.163381,1.724302,1.459332,1.446561,1.241940,1.230659,...,1.864743,1.813194,1.867002,2.022707,1.383452,1.361227,1.622980,1.712842,1.699036,1.584322


In [141]:
forecast_df.to_csv('non-recursive_TX_2.csv', index=False)

**TX_3**

In [30]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_tx3[f'target_{i}'] = df_tx3.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_tx3 = df_tx3.dropna(subset=[f'target_{i}' for i in range(1,29)])

  df_tx3[f'target_{i}'] = df_tx3.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tx3[f'target_{i}'] = df_tx3.groupby('id')['sales'].shift(-i)
  df_tx3[f'target_{i}'] = df_tx3.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tx3[f'target_{i}'] = df_tx3.groupby('id')['sales'].shift(-i)
  df_tx3[f'target_{i}'] = df_tx3.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [31]:
valid_df_tx3 = df_tx3[(df_tx3['d_int'] >= 1914) & (df_tx3['d_int'] <= 1941)].copy()

In [32]:
item_id_list_tx3 = valid_df_tx3['id'].unique()

In [33]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_tx3,
    valid_df = valid_df_tx3,
    feature_cols = feature_cols_tx,
    item_id_list = item_id_list_tx3
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.177757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1422
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.044107
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.156353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1422
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.044205
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [34]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_TX_3_evaluation,0.254445,0.258593,0.284519,0.321173,0.332507,0.407296,0.387431,0.276348,0.403579,...,0.452464,0.457493,0.426314,0.374691,0.436390,0.417007,0.418817,0.465413,0.474613,0.462528
1,HOBBIES_1_002_TX_3_evaluation,0.270645,0.268547,0.286955,0.318985,0.330650,0.428786,0.415993,0.287637,0.439217,...,0.479254,0.489623,0.448163,0.378156,0.439584,0.402471,0.406893,0.463839,0.519746,0.462392
2,HOBBIES_1_003_TX_3_evaluation,0.663010,0.629784,0.622476,0.690173,0.793466,0.869343,0.826747,0.658692,0.764017,...,0.899722,0.910900,0.806654,0.730286,0.781291,0.765022,0.753372,0.868633,0.923805,0.807773
3,HOBBIES_1_004_TX_3_evaluation,0.307349,0.314748,0.339815,0.373435,0.395021,0.476480,0.467969,0.345255,0.473968,...,0.553500,0.564530,0.517733,0.443921,0.492189,0.477266,0.472297,0.529898,0.583153,0.524916
4,HOBBIES_1_005_TX_3_evaluation,0.659107,0.626236,0.627479,0.668522,0.752871,0.876809,0.809629,0.683012,0.735171,...,0.912794,0.919840,0.828957,0.748399,0.833692,0.813536,0.799229,0.886365,0.968779,0.848885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_TX_3_evaluation,0.676695,0.634444,0.625871,0.688367,0.792303,0.885280,0.814341,0.681037,0.836697,...,0.904326,0.935363,0.813231,0.732545,0.828478,0.772756,0.762431,0.883801,0.935739,0.810089
3045,FOODS_3_824_TX_3_evaluation,0.331374,0.246586,0.327858,0.377578,0.545486,0.482795,0.577675,0.368942,0.496364,...,0.575940,0.613750,0.664304,0.526189,0.609127,0.553898,0.550319,0.587581,0.693637,0.625867
3046,FOODS_3_825_TX_3_evaluation,1.207340,1.044448,1.196799,1.333034,1.475919,1.609449,1.792472,1.223134,1.433739,...,1.539328,1.574793,1.366858,1.254356,1.632330,1.354679,1.317634,1.466616,1.554736,1.874703
3047,FOODS_3_826_TX_3_evaluation,1.218949,1.101148,1.001208,1.121181,1.267531,1.466478,1.239317,1.202475,1.178663,...,1.398476,1.456533,1.253118,1.242428,1.335720,1.113978,1.253267,1.361011,1.410825,1.269860


In [35]:
forecast_df.to_csv('non-recursive_TX_3.csv', index=False)

**WI_1**

In [6]:
feature_cols_wi = [
    'lag_7', 'lag_28',
    'rolling_sales_mean_7', 'rolling_sales_mean_30','dayofweek',
    'weekday', 'month', 'snap_WI', 'event_name_1','event_type_1',
    'sell_price', 'price', 'is_weekend', 'days_since_first_sale'
]

In [None]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_wi1[f'target_{i}'] = df_wi1.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_wi1 = df_wi1.dropna(subset=[f'target_{i}' for i in range(1,29)])

In [38]:
valid_df_wi1 = df_wi1[(df_wi1['d_int'] >= 1914) & (df_wi1['d_int'] <= 1941)].copy()

In [39]:
item_id_list_wi1 = valid_df_wi1['id'].unique()

In [40]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_wi1,
    valid_df = valid_df_wi1,
    feature_cols = feature_cols_wi,
    item_id_list = item_id_list_wi1
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.116227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1205
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.882866
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.171762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1205
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 0.883037
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [41]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_WI_1_evaluation,0.296018,0.433576,0.391900,0.453613,0.524365,0.482396,0.283549,0.354674,0.385665,...,0.544898,0.409977,0.426520,0.454925,0.385537,0.170660,0.672498,0.649639,0.664441,0.328036
1,HOBBIES_1_002_WI_1_evaluation,0.757252,0.907706,0.814991,0.933302,1.183126,1.184301,0.730979,0.850655,0.820334,...,1.342397,1.076945,1.033019,0.969377,0.874134,0.721957,1.109885,1.471695,1.298360,0.864832
2,HOBBIES_1_003_WI_1_evaluation,1.124336,1.144721,1.109724,1.303743,1.857141,1.789810,1.050215,1.189146,1.180952,...,1.786810,1.615540,1.330030,1.231941,1.245117,0.985000,1.446496,1.827785,1.805531,1.159147
3,HOBBIES_1_004_WI_1_evaluation,0.369396,0.440775,0.378252,0.466149,0.555689,0.574938,0.333688,0.405703,0.417851,...,0.637276,0.538546,0.535326,0.525715,0.458852,0.326117,0.576787,0.777209,0.731837,0.365733
4,HOBBIES_1_005_WI_1_evaluation,1.023769,0.987157,0.963984,1.067955,1.666644,1.590395,0.916677,1.027358,1.071483,...,1.599109,1.416041,1.169294,1.164475,1.171416,0.925208,1.267929,1.631332,1.607145,1.034303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_WI_1_evaluation,0.910318,0.937143,0.860679,1.076236,1.329521,1.384214,0.886203,0.898284,0.949459,...,1.430162,1.258365,1.088556,0.960214,1.061510,0.967098,1.139477,1.452384,1.475979,0.943528
3045,FOODS_3_824_WI_1_evaluation,0.476550,0.522757,0.491105,0.613374,0.794655,0.842790,0.536839,0.565780,0.562905,...,1.041743,0.922137,0.795142,0.724069,0.861351,0.714611,0.862067,1.171323,1.125239,0.695040
3046,FOODS_3_825_WI_1_evaluation,0.457310,0.498533,0.499262,0.585131,0.722019,0.752061,0.463513,0.504553,0.513702,...,0.929404,0.710139,0.631191,0.562534,0.636528,0.570117,0.687992,0.923496,0.909402,0.550508
3047,FOODS_3_826_WI_1_evaluation,0.345553,0.293242,0.401580,0.385765,0.597269,0.606875,0.338767,0.357488,0.326350,...,1.242237,1.023676,0.781775,0.839762,1.065862,0.952007,1.169526,1.417682,1.172499,0.879199


In [42]:
forecast_df.to_csv('non-recursive_WI_1.csv', index=False)

**WI_2**

In [None]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_wi2[f'target_{i}'] = df_wi2.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_wi2 = df_wi2.dropna(subset=[f'target_{i}' for i in range(1,29)])

In [8]:
valid_df_wi2 = df_wi2[(df_wi2['d_int'] >= 1914) & (df_wi2['d_int'] <= 1941)].copy()

In [9]:
item_id_list_wi2 = valid_df_wi2['id'].unique()

In [11]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_wi2,
    valid_df = valid_df_wi2,
    feature_cols = feature_cols_wi,
    item_id_list = item_id_list_wi2
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.122275
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.122656
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [12]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_WI_2_evaluation,0.232952,0.220603,0.232287,0.317262,0.319194,0.294472,0.358888,0.245622,0.336003,...,0.453585,0.427302,0.403973,0.346915,0.419540,0.381180,0.374406,0.409705,0.377247,0.320195
1,HOBBIES_1_002_WI_2_evaluation,0.293087,0.295215,0.315242,0.391035,0.383572,0.400524,0.423548,0.332748,0.403114,...,0.509418,0.479935,0.436270,0.420354,0.480093,0.432121,0.428363,0.522180,0.538617,0.394378
2,HOBBIES_1_003_WI_2_evaluation,0.385523,0.394659,0.406392,0.483093,0.481409,0.485296,0.536658,0.443991,0.495183,...,0.645376,0.607794,0.571026,0.558073,0.564319,0.498665,0.555282,0.646545,0.700714,0.544105
3,HOBBIES_1_004_WI_2_evaluation,0.635486,0.616664,0.678402,0.798327,0.882012,0.908898,0.717473,0.695492,0.705264,...,1.070886,0.936278,0.888966,0.841904,0.886591,0.786717,0.909136,0.974093,0.978220,0.740025
4,HOBBIES_1_005_WI_2_evaluation,1.156628,1.114111,1.193966,1.555054,1.635984,1.432077,1.427156,1.351110,1.230704,...,1.551627,1.415613,1.549349,1.435915,1.323499,1.209416,1.267041,1.475501,1.671491,1.205128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_WI_2_evaluation,0.511922,0.530602,0.488100,0.511372,0.553378,0.578524,0.603652,0.490733,0.666252,...,0.871180,0.724899,0.888733,0.835329,0.905168,0.671792,0.674518,0.928448,0.942468,0.675264
3045,FOODS_3_824_WI_2_evaluation,0.462872,0.474585,0.408533,0.444861,0.507734,0.552659,0.567942,0.464625,0.619033,...,0.906337,0.739189,0.926169,0.895729,0.933307,0.734778,0.721216,0.962678,0.990353,0.722245
3046,FOODS_3_825_WI_2_evaluation,1.087105,1.092221,1.055314,1.133367,1.334851,1.305630,1.154474,1.044275,1.154920,...,1.529056,1.479028,1.562550,1.516094,1.543151,1.273804,1.304170,1.628679,1.585121,1.274120
3047,FOODS_3_826_WI_2_evaluation,1.817266,1.935903,1.909637,1.918752,2.411513,2.170858,1.643667,1.891855,2.077652,...,2.556185,2.324886,2.425135,2.264502,2.466917,2.171178,2.308057,2.726237,2.392382,1.988091


In [13]:
forecast_df.to_csv('non-recursive_WI_2.csv', index=False)

**WI_3**

In [14]:
## Prepare input data ##

# 產生 target_1 ~ target_28
for i in range(1, 29):
    df_wi3[f'target_{i}'] = df_wi3.groupby('id')['sales'].shift(-i)

# 過濾出有完整 target 的 rows
train_df_wi3 = df_wi3.dropna(subset=[f'target_{i}' for i in range(1,29)])

  df_wi3[f'target_{i}'] = df_wi3.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wi3[f'target_{i}'] = df_wi3.groupby('id')['sales'].shift(-i)
  df_wi3[f'target_{i}'] = df_wi3.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wi3[f'target_{i}'] = df_wi3.groupby('id')['sales'].shift(-i)
  df_wi3[f'target_{i}'] = df_wi3.groupby('id')['sales'].shift(-i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [15]:
valid_df_wi3 = df_wi3[(df_wi3['d_int'] >= 1914) & (df_wi3['d_int'] <= 1941)].copy()

In [16]:
item_id_list_wi3 = valid_df_wi3['id'].unique()

In [17]:
## Apply the function

models, forecast_dict = train_predict_28_models(
    train_df = train_df_wi3,
    valid_df = valid_df_wi3,
    feature_cols = feature_cols_wi,
    item_id_list = item_id_list_wi3
)

Training model for horizon F1...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.190498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1418
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.101885
Training model for horizon F2...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.160513 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1418
[LightGBM] [Info] Number of data points in the train set: 5832737, number of used features: 14
[LightGBM] [Info] Start training from score 1.101714
Training model for horizon F3...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [18]:
# 轉成 DataFrame
forecast_df = pd.DataFrame.from_dict(forecast_dict, orient='index')

# rename columns to F1 ~ F28
forecast_df.columns = [f'F{col+1}' for col in forecast_df.columns]

# reset index (id 變成 column)
forecast_df = forecast_df.reset_index().rename(columns={'index': 'id'})

forecast_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_WI_3_evaluation,0.315030,0.255883,0.243258,0.284703,0.310933,0.311998,0.229238,0.236087,0.309784,...,0.394766,0.423553,0.390803,0.375177,0.388156,0.301903,0.339317,0.481008,0.525644,0.327804
1,HOBBIES_1_002_WI_3_evaluation,0.398586,0.336298,0.329649,0.391578,0.438403,0.440823,0.346816,0.335282,0.427255,...,0.604717,0.584404,0.515938,0.499010,0.493982,0.452745,0.498551,0.656548,0.784612,0.449720
2,HOBBIES_1_003_WI_3_evaluation,0.468322,0.400960,0.372412,0.475213,0.536764,0.532669,0.438306,0.422758,0.516836,...,0.682231,0.709865,0.639928,0.607789,0.574956,0.529855,0.560140,0.748149,0.832164,0.544777
3,HOBBIES_1_004_WI_3_evaluation,0.698472,0.587395,0.570758,0.624552,0.789107,0.904067,0.587318,0.612414,0.672675,...,0.925303,0.978335,0.780530,0.759331,0.720766,0.666610,0.735178,0.995180,1.114526,0.700154
4,HOBBIES_1_005_WI_3_evaluation,0.384682,0.318303,0.290017,0.391474,0.439886,0.433615,0.349761,0.333719,0.412098,...,0.628022,0.613490,0.572593,0.565255,0.567173,0.488188,0.525643,0.759656,0.838997,0.576239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_WI_3_evaluation,0.479745,0.428062,0.404370,0.495983,0.545917,0.556714,0.452739,0.449348,0.524948,...,0.715421,0.715594,0.658400,0.645012,0.625291,0.584306,0.600309,0.783738,0.872910,0.573580
3045,FOODS_3_824_WI_3_evaluation,0.453821,0.399898,0.388306,0.475989,0.541814,0.532344,0.432967,0.435301,0.515962,...,0.811776,0.724375,0.689312,0.714619,0.707362,0.645903,0.680578,0.919539,0.941162,0.722030
3046,FOODS_3_825_WI_3_evaluation,1.191975,1.032659,1.077508,1.248079,1.381227,1.410353,1.169232,1.102001,1.119151,...,1.556520,1.537664,1.294060,1.214185,1.208190,1.077097,1.243892,1.585378,1.633272,1.166964
3047,FOODS_3_826_WI_3_evaluation,1.236273,1.135907,1.023405,1.401932,1.634798,1.711979,1.135742,1.298614,1.156275,...,2.141579,1.938433,1.614885,1.545789,1.536829,1.518546,1.553780,2.669841,2.489891,1.657189


In [19]:
forecast_df.to_csv('non-recursive_WI_3.csv', index=False)

#### Ensemble the predictions from **recursive** and **non-recursive** models

In [20]:
non_recursive_df = pd.read_csv(r"D:\UMN MSBA Go!\Courses\Spring 2025\Predictive Analytics\Project\Predictions\Non-recursive (per store)\non-recursive_combined.csv")
recursive_df = pd.read_csv(r"D:\UMN MSBA Go!\Courses\Spring 2025\Predictive Analytics\Project\Predictions\Recursive (best param)\recursive_best_combined.csv")

In [None]:
ratio_df = non_recursive_df.copy()
ratio_df.iloc[:, 1:] = non_recursive_df.iloc[:, 1:] / recursive_df[recursive_df['id'].isin(non_recursive_df['id'])].iloc[:, 1:]

# average across all items
proxy_ratio = ratio_df.iloc[:, 1:].mean(axis=0).values  # array of shape (28,)

In [None]:
# version 1
final_records = []

for idx, row in recursive_df.iterrows():
    id_ = row['id']
    
    if id_ in non_recursive_df['id'].values:
        # 有 non-recursive → weighted average
        nonrec_row = non_recursive_df[non_recursive_df['id'] == id_].iloc[0]
        preds = 0.7 * row.iloc[1:].values + 0.3 * nonrec_row.iloc[1:].values
    else:
        # 沒有 non-recursive → 用 proxy ratio 修正 recursive
        preds = row.iloc[1:].values * proxy_ratio  # element-wise multiply

    final_records.append([id_] + preds.tolist())

In [4]:
# version 2
final_records_2 = []

for idx, row in recursive_df.iterrows():
    id_ = row['id']
    
    if id_ in non_recursive_df['id'].values:
        # 有 non-recursive → weighted average
        nonrec_row = non_recursive_df[non_recursive_df['id'] == id_].iloc[0]
        preds = 0.7 * row.iloc[1:].values + 0.3 * nonrec_row.iloc[1:].values
    else:
        # 沒有 non-recursive → 維持原本recursive的結果
        preds = row.iloc[1:].values

    final_records_2.append([id_] + preds.tolist())

In [21]:
# version 3
final_records_3 = []

for idx, row in recursive_df.iterrows():
    id_ = row['id']
    
    # weighted average
    nonrec_row = non_recursive_df[non_recursive_df['id'] == id_].iloc[0]
    preds = 0.7 * row.iloc[1:].values + 0.3 * nonrec_row.iloc[1:].values

    final_records_3.append([id_] + preds.tolist())

In [26]:
# version 4
final_records_4 = []

for idx, row in recursive_df.iterrows():
    id_ = row['id']
    
    # weighted average
    nonrec_row = non_recursive_df[non_recursive_df['id'] == id_].iloc[0]
    preds = 0.8 * row.iloc[1:].values + 0.2 * nonrec_row.iloc[1:].values

    final_records_4.append([id_] + preds.tolist())

In [22]:
# version 5
final_records_5 = []

for idx, row in recursive_df.iterrows():
    id_ = row['id']
    
    # weighted average
    nonrec_row = non_recursive_df[non_recursive_df['id'] == id_].iloc[0]
    preds = 0.8 * row.iloc[1:].values + 0.2 * nonrec_row.iloc[1:].values

    final_records_5.append([id_] + preds.tolist())

In [24]:
# version 6
final_records_6 = []

for idx, row in recursive_df.iterrows():
    id_ = row['id']
    
    # weighted average
    nonrec_row = non_recursive_df[non_recursive_df['id'] == id_].iloc[0]
    preds = 0.7 * row.iloc[1:].values + 0.3 * nonrec_row.iloc[1:].values

    final_records_6.append([id_] + preds.tolist())

In [26]:
# version 7
final_records_7 = []

for idx, row in recursive_df.iterrows():
    id_ = row['id']
    
    # weighted average
    nonrec_row = non_recursive_df[non_recursive_df['id'] == id_].iloc[0]
    preds = 0.6 * row.iloc[1:].values + 0.4 * nonrec_row.iloc[1:].values

    final_records_7.append([id_] + preds.tolist())

In [28]:
# version 8
final_records_8 = []

for idx, row in recursive_df.iterrows():
    id_ = row['id']
    
    # weighted average
    nonrec_row = non_recursive_df[non_recursive_df['id'] == id_].iloc[0]
    preds = 0.5 * row.iloc[1:].values + 0.5 * nonrec_row.iloc[1:].values

    final_records_8.append([id_] + preds.tolist())

In [146]:
final_df = pd.DataFrame(final_records, columns=['id'] + [f'F{i}' for i in range(1,29)])

In [149]:
final_df.to_csv('lgbm_ensembled_per_store.csv', index=False)

In [5]:
final_df_2 = pd.DataFrame(final_records_2, columns=['id'] + [f'F{i}' for i in range(1,29)])
final_df_2.to_csv('lgbm_ensembled_per_store_v2.csv', index=False)

In [24]:
final_df_3 = pd.DataFrame(final_records_3, columns=['id'] + [f'F{i}' for i in range(1,29)])
final_df_3.to_csv('lgbm_ensembled_per_store_v3.csv', index=False)

In [27]:
final_df_4 = pd.DataFrame(final_records_4, columns=['id'] + [f'F{i}' for i in range(1,29)])
final_df_4.to_csv('lgbm_ensembled_per_store_v4.csv', index=False)

In [23]:
final_df_5 = pd.DataFrame(final_records_5, columns=['id'] + [f'F{i}' for i in range(1,29)])
final_df_5.to_csv('lgbm_ensembled_per_store_v5.csv', index=False)

In [25]:
final_df_6 = pd.DataFrame(final_records_6, columns=['id'] + [f'F{i}' for i in range(1,29)])
final_df_6.to_csv('lgbm_ensembled_per_store_v6.csv', index=False)

In [27]:
final_df_7 = pd.DataFrame(final_records_7, columns=['id'] + [f'F{i}' for i in range(1,29)])
final_df_7.to_csv('lgbm_ensembled_per_store_v7.csv', index=False)

In [29]:
final_df_8 = pd.DataFrame(final_records_7, columns=['id'] + [f'F{i}' for i in range(1,29)])
final_df_8.to_csv('lgbm_ensembled_per_store_v8.csv', index=False)