In [1]:
import numpy as np
import pandas as pd
import os
import random
import math
from decimal import Decimal as dec
import datetime
import time
import gc
import lightgbm as lgb
import pickle

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Data Preparation and Sales Prediction Pipeline

This code snippet outlines the pipeline for data preparation and sales prediction for the M5 Forecasting Accuracy competition using LightGBM. The pipeline involves the following steps:

1. **Load Necessary Packages and Define Parameters:**
   - The required packages, such as LightGBM, are imported.
   - Lists for `store_id_set_list`, `end_train_day_x_list`, and `prediction_horizon_list` are defined, specifying the stores to predict for, the end of training days, and the prediction horizons, respectively.

2. **Predicting Sales for Each Store and Prediction Horizon:**
   - The code iterates through each combination of `end_train_day_x` and `prediction_horizon`.
   - For each store and prediction horizon, the code loads the trained LightGBM model (a pickled file under `model-bin/`) together with the matching test features (a feather file under `feather/`), and predicts sales for the days after the previous prediction horizon up to and including the current one.
   - The predictions for each store and prediction horizon are stored in a DataFrame `pred_v_df`.

3. **Combining Predictions for All Stores and Horizons:**
   - The code concatenates the predictions for all stores and horizons into a single DataFrame `pred_v_all_df`.

4. **Merging Predictions with the Sample Submission File:**
   - The code loads the sample submission file which contains the required rows to be predicted and the columns format.
   - The absolute day numbers in the 'd' column of `pred_v_all_df` are converted into day offsets relative to the end of training by subtracting the `end_train_day_x_list` value.
   - The DataFrame `pred_v_all_df` is pivoted to create a wide format with 'id' as the index and days as columns.
   - The predictions are merged with the sample submission file based on 'id'.
   - Any missing values in the merged DataFrame are filled with 0.
   - The final predictions are saved to a CSV file named "m5_predictions_public_leaderboard.csv".

**Competition Performance:**
According to the original solution, this approach scores around 0.54907 on the private leaderboard, which corresponds to 12th place in the final ranking, within the gold medal area.

**Note:**
The original solution also notes that reverting to different LightGBM parameters, such as `'gbdt'` instead of `'goss'` for the boosting type, could potentially yield even better scores, but doing so requires running the training on a local computer or on Google Cloud Platform.


In [2]:
# Stores to forecast, last training day(s), and the forecast horizons (in days)
# for which a separate model and test-feature file exist.
store_id_set_list = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
end_train_day_x_list = [1913]
prediction_horizon_list = [7, 14, 21, 28]

# Long-format ('id', 'd', 'sales') predictions, one frame per (horizon, store).
# They are collected in a list and concatenated once at the end instead of
# growing a DataFrame inside the loop.
pred_v_frames = []

for end_train_day_x in end_train_day_x_list:
    previous_prediction_horizon = 0
    for prediction_horizon in prediction_horizon_list:
        for store_index, store_id in enumerate(store_id_set_list):

            model_path = f'model-bin/lgb_model_{store_id}_{prediction_horizon}.bin'
            print(f'loading {model_path}')
            # SECURITY: unpickling can execute arbitrary code. Only load model
            # files that you produced yourself.
            with open(model_path, 'rb') as model_file:  # ensures the handle is closed
                estimator = pickle.load(model_file)
            base_test = pd.read_feather(f"feather/test_{store_id}_{prediction_horizon}.feather")
            # Every column except the identifiers and the target is fed to the model.
            enable_features = [col for col in base_test.columns if col not in ['id', 'd', 'sales']]

            # The model for this horizon only fills the days that were not
            # already covered by the previous (shorter) horizon.
            for predict_day in range(previous_prediction_horizon + 1, prediction_horizon + 1):
                print('[{3} -> {4}] predict {0}/{1} {2} day {5}'.format(
                    store_index + 1, len(store_id_set_list), store_id,
                    previous_prediction_horizon + 1, prediction_horizon, predict_day))
                mask = base_test['d'] == (end_train_day_x + predict_day)
                # Single .loc lookup (rows and columns at once) instead of chained indexing.
                base_test.loc[mask, 'sales'] = estimator.predict(base_test.loc[mask, enable_features])

            # Keep only the rows of the window that was just predicted.
            in_window = (
                (base_test['d'] >= end_train_day_x + previous_prediction_horizon + 1) &
                (base_test['d'] < end_train_day_x + prediction_horizon + 1)
            )
            store_pred_df = base_test.loc[in_window, ['id', 'd', 'sales']].copy()

            if end_train_day_x == 1913:
                # Predictions made from the 1913 cut-off are reported under the
                # "validation" ids. regex=False makes this a literal replacement
                # regardless of the pandas version's default.
                store_pred_df['id'] = store_pred_df['id'].str.replace(
                    "evaluation", "validation", regex=False)

            pred_v_frames.append(store_pred_df)
            gc.collect()

        previous_prediction_horizon = prediction_horizon

# Single concatenation of all (horizon, store) frames into one long-format frame.
pred_v_all_df = pd.concat(pred_v_frames)

loading model-bin/lgb_model_CA_1_7.bin
[1 -> 7] predict 1/10 CA_1 day 1
[1 -> 7] predict 1/10 CA_1 day 2
[1 -> 7] predict 1/10 CA_1 day 3
[1 -> 7] predict 1/10 CA_1 day 4
[1 -> 7] predict 1/10 CA_1 day 5
[1 -> 7] predict 1/10 CA_1 day 6
[1 -> 7] predict 1/10 CA_1 day 7
loading model-bin/lgb_model_CA_2_7.bin
[1 -> 7] predict 2/10 CA_2 day 1
[1 -> 7] predict 2/10 CA_2 day 2
[1 -> 7] predict 2/10 CA_2 day 3
[1 -> 7] predict 2/10 CA_2 day 4
[1 -> 7] predict 2/10 CA_2 day 5
[1 -> 7] predict 2/10 CA_2 day 6
[1 -> 7] predict 2/10 CA_2 day 7
loading model-bin/lgb_model_CA_3_7.bin
[1 -> 7] predict 3/10 CA_3 day 1
[1 -> 7] predict 3/10 CA_3 day 2
[1 -> 7] predict 3/10 CA_3 day 3
[1 -> 7] predict 3/10 CA_3 day 4
[1 -> 7] predict 3/10 CA_3 day 5
[1 -> 7] predict 3/10 CA_3 day 6
[1 -> 7] predict 3/10 CA_3 day 7
loading model-bin/lgb_model_CA_4_7.bin
[1 -> 7] predict 4/10 CA_4 day 1
[1 -> 7] predict 4/10 CA_4 day 2
[1 -> 7] predict 4/10 CA_4 day 3
[1 -> 7] predict 4/10 CA_4 day 4
[1 -> 7] predict 4/

In [3]:
# Inspect the combined long-format predictions (columns: 'id', 'd', 'sales').
print(pred_v_all_df)

                                   id     d     sales
304900  HOBBIES_1_001_CA_1_validation  1914  0.834398
304901  HOBBIES_1_002_CA_1_validation  1914  0.199196
304902  HOBBIES_1_003_CA_1_validation  1914  0.506929
304903  HOBBIES_1_004_CA_1_validation  1914  1.581996
304904  HOBBIES_1_005_CA_1_validation  1914  1.084607
...                               ...   ...       ...
390267    FOODS_3_823_WI_3_validation  1941  0.378189
390268    FOODS_3_824_WI_3_validation  1941  0.295663
390269    FOODS_3_825_WI_3_validation  1941  0.767562
390270    FOODS_3_826_WI_3_validation  1941  1.301386
390271    FOODS_3_827_WI_3_validation  1941  1.757029

[853720 rows x 3 columns]


In [5]:
# Load the sample submission; later cells use its 'id' column and its column
# names as the template for the output file.
sample_submission_path = "../data/m5-forecasting-accuracy/sample_submission.csv"
submission = pd.read_csv(sample_submission_path)

In [6]:
# Convert absolute day numbers into offsets relative to the end of training.
# Subtracting the list itself only worked because it holds a single element
# (NumPy broadcasting), so take the scalar explicitly and fail with a clear
# message if several cut-off days are configured.
if len(end_train_day_x_list) != 1:
    raise ValueError(
        "Expected exactly one end-of-training day, got "
        f"{end_train_day_x_list!r}; day offsets cannot be derived for several cut-offs at once."
    )
end_train_day = end_train_day_x_list[0]

# Build the wide frame from a derived copy instead of overwriting
# `pred_v_all_df.d`, so re-running this cell does not shift the days a second time.
pred_h_all_df = (
    pred_v_all_df
    .assign(d=pred_v_all_df['d'] - end_train_day)
    .pivot(index='id', columns='d', values='sales')  # one row per id, one column per day offset
    .reset_index()
)
# Column names are taken positionally from the sample submission. pandas raises
# if the number of columns differs.
# NOTE(review): assumes the sample submission columns are 'id' followed by one
# column per forecast day in ascending order — confirm against the file.
pred_h_all_df.columns = submission.columns

In [7]:
# Left-join the wide predictions onto the ids of the sample submission so that
# every requested row is kept; ids without a prediction end up as 0.
submission = (
    submission.loc[:, ['id']]
    .merge(pred_h_all_df, how='left', on=['id'])
    .fillna(0)
)
# Write the submission file without the DataFrame index.
submission.to_csv("m5_predictions_public_leaderboard.csv", index=False)