In [None]:
# Importing Libraries
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
from  datetime import datetime, timedelta
import os
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split



In [None]:
# Loading and Preprocessing calendar.csv
# Explicit dtypes for calendar.csv; columns tagged "category" are replaced by
# integer codes further down.
calendarDTypes = dict(
    event_name_1="category",
    event_name_2="category",
    event_type_1="category",
    event_type_2="category",
    weekday="category",
    wm_yr_wk="int16",
    wday="int16",
    month="int16",
    year="int16",
    snap_CA="float32",
    snap_TX="float32",
    snap_WI="float32",
)

# Read csv file
calendar = pd.read_csv(
    "../input/m5-forecasting-accuracy/calendar.csv",
    dtype=calendarDTypes,
)
calendar["date"] = pd.to_datetime(calendar["date"])

# Transforming Categorical Features
# pandas encodes missing categories as -1; subtracting the column minimum
# shifts the codes so that the smallest code in each column is 0.
calendarCatCols = [name for name, kind in calendarDTypes.items() if kind == "category"]
for calendarCol in calendarCatCols:
    calendar[calendarCol] = calendar[calendarCol].cat.codes.astype("int16")
    calendar[calendarCol] = calendar[calendarCol] - calendar[calendarCol].min()

calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,2,1,1,2011,d_1,0,0,0,0,0.0,0.0,0.0
1,2011-01-30,11101,3,2,1,2011,d_2,0,0,0,0,0.0,0.0,0.0
2,2011-01-31,11101,1,3,1,2011,d_3,0,0,0,0,0.0,0.0,0.0
3,2011-02-01,11101,5,4,2,2011,d_4,0,0,0,0,1.0,1.0,0.0
4,2011-02-02,11101,6,5,2,2011,d_5,0,0,0,0,1.0,0.0,1.0


In [None]:
# Loading and Preprocessing sell_prices.csv
# Explicit dtypes for sell_prices.csv; the two "category" columns are replaced
# by integer codes below so they can be used as merge keys.
priceDTypes = dict(
    store_id="category",
    item_id="category",
    wm_yr_wk="int16",
    sell_price="float32",
)

# Read csv file
prices = pd.read_csv(
    "../input/m5-forecasting-accuracy/sell_prices.csv",
    dtype=priceDTypes,
)

# Transforming Categorical Features
# Codes are shifted by the column minimum so the smallest code is 0.
priceCatCols = [name for name, kind in priceDTypes.items() if kind == "category"]
for priceCol in priceCatCols:
    prices[priceCol] = prices[priceCol].cat.codes.astype("int16")
    prices[priceCol] = prices[priceCol] - prices[priceCol].min()

prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,0,0,11325,9.58
1,0,0,11326,9.58
2,0,0,11327,8.26
3,0,0,11328,8.26
4,0,0,11329,8.26


In [None]:
firstDay = 250 # to focus more on recent data
lastDay = 1913

# Use x sales days (columns) for training
numCols = ["d_%d" % day for day in range(firstDay, lastDay + 1)]

# Define all categorical columns
catCols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
# "id" is read as a plain string column; every other key is integer-encoded below
encodedCols = [name for name in catCols if name != "id"]

# Define the correct data types for "sales_train_validation.csv"
dtype = {
    **dict.fromkeys(numCols, "float32"),
    **dict.fromkeys(encodedCols, "category"),
}

# Read csv file
ds = pd.read_csv(
    "../input/m5-forecasting-accuracy/sales_train_validation.csv",
    usecols=catCols + numCols,
    dtype=dtype,
)

# Transform categorical features into integers (codes shifted to start at the column minimum)
for encodedCol in encodedCols:
    ds[encodedCol] = ds[encodedCol].cat.codes.astype("int16")
    ds[encodedCol] = ds[encodedCol] - ds[encodedCol].min()

# Converting wide to long format: one row per (id, day) with the value in "sales"
dayCols = [name for name in ds.columns if name.startswith("d_")]
ds = ds.melt(id_vars=catCols, value_vars=dayCols, var_name="d", value_name="sales")

# Merge sales data with "calendar" and "prices" dataframe
# (kept as two statements so the pre-merge frame is released before the second join)
ds = ds.merge(calendar, on="d", copy=False)
ds = ds.merge(prices, on=["store_id", "item_id", "wm_yr_wk"], copy=False)

ds.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_250,0.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,3.97
1,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_250,0.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,4.34
2,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_250,0.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,2.48
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_250,1.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,0.5
4,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_250,2.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,1.77


In [None]:
#Lag and Rolling Mean Features
dayLags = [7, 28]
lagSalesCols = ["lag_%d" % dayLag for dayLag in dayLags]
windows = [7, 28]

# Lagged sales: each series ("id") is shifted independently
for lagSalesCol, dayLag in zip(lagSalesCols, dayLags):
    ds[lagSalesCol] = ds.groupby("id")["sales"].shift(dayLag)

# Rolling means of every lag column, again computed per series.
# Window is the outer loop so columns are created in the order
# rmean_7_7, rmean_28_7, rmean_7_28, rmean_28_28.
for window in windows:
    for lagSalesCol, dayLag in zip(lagSalesCols, dayLags):
        ds[f"rmean_{dayLag}_{window}"] = ds.groupby("id")[lagSalesCol].transform(
            lambda series: series.rolling(window).mean()
        )

In [None]:
# Feature column name -> attribute of the pandas ".dt" accessor used to derive it
# when the column is not already present in ds.
dateFeatures = dict(
    wday="weekday",
    month="month",
    quarter="quarter",
    year="year",
    mday="day",
)

# Columns that already exist are only recast to int16;
# missing ones are derived from "date" and then cast.
for featName, featFunc in dateFeatures.items():
    ds[featName] = (
        ds[featName] if featName in ds.columns else getattr(ds["date"].dt, featFunc)
    ).astype("int16")

# Special handling for 'week': taken from the ISO calendar rather than a plain .dt attribute
ds["week"] = ds["date"].dt.isocalendar().week.astype("int16")


In [None]:
# Preview the first rows after the lag, rolling-mean and date features were added
ds.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,quarter,mday,week
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_250,0.0,2011-10-05,11136,...,3.97,,,,,,,4,5,40
1,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_250,0.0,2011-10-05,11136,...,4.34,,,,,,,4,5,40
2,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_250,0.0,2011-10-05,11136,...,2.48,,,,,,,4,5,40
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_250,1.0,2011-10-05,11136,...,0.5,,,,,,,4,5,40
4,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_250,2.0,2011-10-05,11136,...,1.77,,,,,,,4,5,40


In [None]:
# Print the index range, column names and dtypes of the feature frame
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42372682 entries, 0 to 42372681
Data columns (total 31 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   dept_id       int16         
 3   store_id      int16         
 4   cat_id        int16         
 5   state_id      int16         
 6   d             object        
 7   sales         float32       
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       int16         
 11  wday          int16         
 12  month         int16         
 13  year          int16         
 14  event_name_1  int16         
 15  event_type_1  int16         
 16  event_name_2  int16         
 17  event_type_2  int16         
 18  snap_CA       float32       
 19  snap_TX       float32       
 20  snap_WI       float32       
 21  sell_price    float32       
 22  lag_7         float32       
 23  lag_28        float32       
 

In [None]:
# Remove all rows with NaN value
# (any column counts; the lag / rolling-mean columns are NaN on the earliest rows shown in the preview)
# NOTE: the index is not reset here, so row labels keep gaps; later cells select rows by label via .loc.
ds.dropna(inplace = True)

In [None]:
# (rows, columns) remaining after the NaN rows were dropped
ds.shape

(40695732, 31)

In [None]:
# Define columns that need to be removed
# (everything listed here is excluded from the feature matrix, including the target "sales")
unusedCols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]

# Boolean mask over ds.columns keeps the original column order for the features
isFeatureCol = ~ds.columns.isin(unusedCols)
trainCols = ds.columns[isFeatureCol]

# Feature matrix and target vector
X_train = ds.loc[:, trainCols]
y_train = ds.loc[:, "sales"]

In [None]:
np.random.seed(777)

# Define categorical features
catFeats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]

# Hold out 2,000,000 randomly drawn row labels for validation; all remaining labels are used for training
validInds = np.random.choice(X_train.index.values, size=2_000_000, replace=False)
trainInds = np.setdiff1d(X_train.index.values, validInds)

# Both LightGBM datasets share the same categorical columns and keep their raw data
datasetOptions = dict(categorical_feature=catFeats, free_raw_data=False)
trainData = lgb.Dataset(X_train.loc[trainInds], label=y_train.loc[trainInds], **datasetOptions)
validData = lgb.Dataset(X_train.loc[validInds], label=y_train.loc[validInds], **datasetOptions)

Model Building: LightGBM

In [None]:
#Model Parameters
# Passed unchanged to lgb.train; option meanings follow the LightGBM parameter documentation.
params = dict(
    objective="poisson",       # Poisson regression loss
    metric="rmse",             # metric evaluated on the validation set
    force_row_wise=True,
    learning_rate=0.01,
    num_iterations=3000,       # number of boosting rounds
    sub_row=0.85,              # LightGBM alias of bagging_fraction (row subsampling)
    bagging_freq=1,            # perform bagging at every iteration
    lambda_l1=0.05,            # L1 regularization
    lambda_l2=0.2,             # L2 regularization
    num_leaves=128,
    min_data_in_leaf=100,
    verbosity=1,
)


In [None]:
# Train LightGBM model
# The hold-out set is passed as the only validation set; the number of rounds comes from params.
m_lgb = lgb.train(params=params, train_set=trainData, valid_sets=[validData])

[LightGBM] [Info] Total Bins 4596
[LightGBM] [Info] Number of data points in the train set: 38695732, number of used features: 25
[LightGBM] [Info] Start training from score 0.312338


In [None]:
# Save the model
# Writes the trained booster to "model.lgb" (path is relative to the working directory)
m_lgb.save_model("model.lgb")

<lightgbm.basic.Booster at 0x7ddcccd888d0>

In [None]:
# Last day used for training
trLast = 1913
# Maximum lag day
maxLags = 57

# Create dataset for predictions
def create_ds():
    """Build the long-format frame used for forecasting.

    Reads the sales columns for days ``trLast - maxLags`` through ``trLast``
    from sales_train_validation.csv, integer-encodes the categorical key
    columns, appends 28 all-NaN columns for the days after ``trLast``, melts
    the frame to one row per (id, d) and joins it with the module-level
    ``calendar`` and ``prices`` frames.

    Returns:
        pandas.DataFrame: the merged long-format frame; "sales" is NaN for
        the 28 appended future days.
    """
    historyDays = [f"d_{day}" for day in range(trLast - maxLags, trLast + 1)]
    futureDays = [f"d_{day}" for day in range(trLast + 1, trLast + 28 + 1)]

    keyCols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    # "id" stays a string column; the remaining keys are integer-encoded
    encodedCols = [name for name in keyCols if name != "id"]

    colTypes = dict.fromkeys(historyDays, "float32")
    colTypes.update(dict.fromkeys(encodedCols, "category"))

    frame = pd.read_csv(
        "../input/m5-forecasting-accuracy/sales_train_validation.csv",
        usecols=keyCols + historyDays,
        dtype=colTypes,
    )

    # Category codes shifted so the smallest code in each column is 0
    for name in encodedCols:
        frame[name] = frame[name].cat.codes.astype("int16")
        frame[name] -= frame[name].min()

    # Placeholder columns for the days to be forecast
    for name in futureDays:
        frame[name] = np.nan

    dayCols = [name for name in frame.columns if name.startswith("d_")]
    longFrame = frame.melt(
        id_vars=keyCols,
        value_vars=dayCols,
        var_name="d",
        value_name="sales",
    )

    longFrame = longFrame.merge(calendar, on="d", copy=False)
    longFrame = longFrame.merge(prices, on=["store_id", "item_id", "wm_yr_wk"], copy=False)

    return longFrame



In [None]:
def create_features(ds):
    """Add lag, rolling-mean and calendar features to ``ds`` in place.

    Args:
        ds: DataFrame with at least the columns "id", "sales" and a
            datetime "date" column. Any of "wday", "month", "quarter",
            "year", "mday" that already exist are only recast to int16.

    Returns:
        None. The new columns are written directly into ``ds``:
        lag_7, lag_28, rmean_<lag>_<window> for lags and windows (7, 28),
        the calendar columns above, and "week" (ISO week number).
    """
    lagDays = (7, 28)
    rollWindows = (7, 28)

    # Lagged sales, shifted independently within each series
    for lag in lagDays:
        ds[f"lag_{lag}"] = ds.groupby("id")["sales"].shift(lag)

    # Rolling mean of every lag column; window is the outer loop so the
    # column creation order is rmean_7_7, rmean_28_7, rmean_7_28, rmean_28_28
    for span in rollWindows:
        for lag in lagDays:
            ds[f"rmean_{lag}_{span}"] = ds.groupby("id")[f"lag_{lag}"].transform(
                lambda series: series.rolling(span).mean()
            )

    # (feature column, attribute of the ".dt" accessor used when the column is absent)
    calendarParts = (
        ("wday", "weekday"),
        ("month", "month"),
        ("quarter", "quarter"),
        ("year", "year"),
        ("mday", "day"),
    )
    for name, accessor in calendarParts:
        if name not in ds.columns:
            values = getattr(ds["date"].dt, accessor)
        else:
            values = ds[name]
        ds[name] = values.astype("int16")

    isoWeek = ds["date"].dt.isocalendar().week
    ds["week"] = isoWeek.astype("int16")


Model Building: XGBoost

In [None]:

# Sample ~1M rows from training indices to avoid memory overload
sample_size = min(1_000_000, len(trainInds))
xgb_sample_inds = np.random.choice(trainInds, sample_size, replace=False)

# Use float32 to reduce memory footprint
X_xgb = X_train.loc[xgb_sample_inds].astype("float32")
y_xgb = y_train.loc[xgb_sample_inds].astype("float32")

# Create a validation split for early stopping
X_xgb_train, X_xgb_val, y_xgb_train, y_xgb_val = train_test_split(
    X_xgb, y_xgb, test_size=0.1, random_state=42
)

# Define the XGBoost model with tuned parameters.
# early_stopping_rounds is configured on the estimator itself: passing it to
# fit() is deprecated since XGBoost 1.6 and was removed in 2.0, where it
# raises a TypeError.
xgb_model = XGBRegressor(
    n_estimators=300,           # Upper bound on boosting rounds
    max_depth=5,                # Controls model complexity
    learning_rate=0.03,         # Step size shrinkage
    subsample=0.8,              # Row sampling
    colsample_bytree=0.8,       # Feature sampling
    tree_method="hist",         # Memory-efficient histogram algorithm
    n_jobs=-1,                  # Use all cores
    early_stopping_rounds=25,   # Stop when validation RMSE has not improved for 25 rounds
    random_state=42             # Reproducibility
)

# Train the model; the eval_set below is what early stopping monitors
xgb_model.fit(
    X_xgb_train, y_xgb_train,
    eval_set=[(X_xgb_val, y_xgb_val)],
    verbose=50
)

# Save the trained model to disk
joblib.dump(xgb_model, "xgb_model.pkl")

[0]	validation_0-rmse:4.00766




[50]	validation_0-rmse:2.63842
[100]	validation_0-rmse:2.57204
[105]	validation_0-rmse:2.57246


['xgb_model.pkl']