In [2]:
## Project Note
# Emits a one-line description of the notebook's purpose as cell output.
print("This notebook contains the final optimized M5 forecasting pipeline used for the Retail Sales Analytics Dashboard")


This notebook contains the final optimized M5 forecasting pipeline used for the Retail Sales Analytics Dashboard


In [3]:
import os
import gc
import random
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb

warnings.filterwarnings('ignore')

class ProjectConfig:
    """Central settings for the M5 forecasting pipeline.

    All values are class attributes and are read directly from the class
    (``ProjectConfig.TRAIN_END``); no instance is created in this notebook.
    """

    # Folder that holds the M5 CSV files. Set the M5_DATA_PATH environment
    # variable to point somewhere else without editing the notebook; when it
    # is unset the original default location is used.
    DATA_PATH = os.environ.get('M5_DATA_PATH', 'D:/M5 Data')
    # Index of the last day with observed sales (the last ``d_<n>`` column).
    TRAIN_END = 1913
    # Number of future days appended after TRAIN_END; also the length of the
    # hold-out window used when splitting train/validation.
    FORECAST_HORIZON = 28
    # Seed handed to seed_everything().
    RANDOM_STATE = 42

    # Hyper-parameters kept for the LightGBM model.
    # NOTE(review): presumably consumed by a training cell outside this view.
    LGB_PARAMS = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'subsample_freq': 1,
        'learning_rate': 0.03,
        'num_leaves': 2047,
        'min_data_in_leaf': 4095,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'n_estimators': 1400,
        'boost_from_average': False,
        'verbose': -1,
        'n_jobs': -1
    }

def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, then
    stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# Seed the random number generators once, up front, with the project-wide seed.
seed_everything(ProjectConfig.RANDOM_STATE)


In [4]:
# Load the three raw M5 CSV files from ProjectConfig.DATA_PATH and report their shapes.
# NOTE(review): read_data() in a later cell reads the same three files again
# (into df_sales / df_calendar / df_prices, after downcasting). The frames
# loaded here are not downcast and stay in memory unless deleted — confirm
# whether they are still needed.

sales = pd.read_csv(f"{ProjectConfig.DATA_PATH}/sales_train_validation.csv")
calendar = pd.read_csv(f"{ProjectConfig.DATA_PATH}/calendar.csv")
prices = pd.read_csv(f"{ProjectConfig.DATA_PATH}/sell_prices.csv")

# Quick sanity check of the (rows, columns) of each raw table.
print("Sales shape:", sales.shape)
print("Calendar shape:", calendar.shape)
print("Prices shape:", prices.shape)


Sales shape: (30490, 1919)
Calendar shape: (1969, 14)
Prices shape: (6841121, 4)


In [5]:
def downcast_dtypes(df, verbose=True):
    """Shrink numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (strings, int8, unsigned, ...) is left
    as is. Integer columns are cast to the first of int8/int16/int32/int64
    whose range contains the column's min and max. Float columns are cast to
    float16 or float32 on the same range test and fall back to float64
    otherwise (this includes columns whose min/max is NaN or infinite).

    The range checks are inclusive, so a column whose extreme value sits
    exactly on a dtype limit (e.g. 127 or -128 for int8) still gets that dtype.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object, with converted columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: only the value *range* is checked; float16/float32 keep
            # fewer significant digits than float64, so values may be rounded.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Memory usage dropped to {end_mem:5.2f} Mb ({reduction:.1f}% reduction)')

    return df

In [6]:
def read_data(path):
    """Read the three M5 CSV files under ``path`` and downcast each one.

    Files are read in the order calendar, sell prices, sales; each frame is
    passed through ``downcast_dtypes`` right after loading.

    Args:
        path: Folder containing ``calendar.csv``, ``sell_prices.csv`` and
            ``sales_train_validation.csv``.

    Returns:
        Tuple ``(sales, calendar, prices)`` of downcast DataFrames.
    """
    print(f"Reading files from {path}...")

    calendar = downcast_dtypes(pd.read_csv(f'{path}/calendar.csv'))
    prices = downcast_dtypes(pd.read_csv(f'{path}/sell_prices.csv'))
    sales = downcast_dtypes(pd.read_csv(f'{path}/sales_train_validation.csv'))

    return sales, calendar, prices

In [7]:
# Load and downcast the sales, calendar and price tables from the configured data folder.
df_sales, df_calendar, df_prices = read_data(ProjectConfig.DATA_PATH)

Reading files from D:/M5 Data...
Memory usage dropped to  0.12 Mb (41.9% reduction)
Memory usage dropped to 130.48 Mb (37.5% reduction)
Memory usage dropped to 95.00 Mb (78.7% reduction)


In [8]:
def transform_and_merge(sales, calendar, prices, config):
    """Reshape wide daily sales to long format and attach calendar and price data.

    NOTE(review): a later cell defines ``transform_and_merge`` again; when the
    notebook is run top to bottom, that later definition replaces this one.

    Steps:
      1. Add one all-NaN ``d_<n>`` column per forecast day
         (``TRAIN_END + 1`` .. ``TRAIN_END + FORECAST_HORIZON``).
      2. Keep only day columns numbered ``>= max(1, TRAIN_END - 1000)``.
      3. Melt to one row per (series, day) with columns ``d`` and ``sales``.
      4. Left-join the calendar on ``d`` (without ``weekday``, ``wday``,
         ``month``, ``year``) and the prices on
         ``store_id``/``item_id``/``wm_yr_wk``.

    Args:
        sales: Wide frame with the six id columns and one ``d_<n>`` column
            per day. It is not modified.
        calendar: Frame containing ``d``, ``wm_yr_wk`` and the four columns
            dropped in step 4.
        prices: Frame keyed by ``store_id``, ``item_id``, ``wm_yr_wk``.
        config: Object exposing ``TRAIN_END`` and ``FORECAST_HORIZON``.

    Returns:
        The merged long-format DataFrame.
    """
    # Build the forecast placeholders on a new frame (assign returns a copy)
    # so the caller's `sales` table does not silently gain extra columns.
    future_cols = {
        f'd_{config.TRAIN_END + day + 1}': np.nan
        for day in range(config.FORECAST_HORIZON)
    }
    sales = sales.assign(**future_cols)

    start_idx = max(1, config.TRAIN_END - 1000)
    value_cols = [
        c for c in sales.columns
        if c.startswith('d_') and int(c.split('_')[1]) >= start_idx
    ]

    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    data = pd.melt(sales, id_vars=id_cols, value_vars=value_cols, var_name='d', value_name='sales')

    calendar = calendar.drop(columns=['weekday', 'wday', 'month', 'year'])
    data = data.merge(calendar, on='d', how='left')

    data = data.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    # Drop the local references to the intermediate frames before returning.
    del sales, calendar, prices
    gc.collect()

    return data

In [9]:
def feature_engineering_basic(df):
    """Add day-index, calendar and relative-price features.

    New columns written onto ``df`` (the input frame itself is modified):
      * ``d_num``: integer after the underscore in ``d``, as int16.
      * ``day_of_week`` / ``month``: taken from ``date`` (parsed to datetime
        in place), as int8.
      * ``is_weekend``: 1 when ``day_of_week >= 5``, else 0 (int8).
      * ``price_momentum``: ``sell_price`` divided by the mean ``sell_price``
        of the same ``id``.

    Args:
        df: Long-format frame with ``id``, ``d``, ``date`` and ``sell_price``.

    Returns:
        A new DataFrame equal to the augmented ``df`` without the ``date``
        and ``d`` columns.
    """
    day_labels = df['d']
    df['d_num'] = day_labels.apply(lambda label: label.split('_')[1]).astype(np.int16)

    df['date'] = pd.to_datetime(df['date'])
    date_parts = df['date'].dt
    df['day_of_week'] = date_parts.dayofweek.astype(np.int8)
    df['month'] = date_parts.month.astype(np.int8)

    weekend_flag = df['day_of_week'] >= 5
    df['is_weekend'] = weekend_flag.astype(np.int8)

    mean_price_per_id = df.groupby('id')['sell_price'].transform('mean')
    df['price_momentum'] = df['sell_price'] / mean_price_per_id

    return df.drop(['date', 'd'], axis=1)

In [10]:
def feature_engineering_lags(df):
    """Add per-series lag and rolling-mean features of ``sales``.

    For every ``id`` group, in the frame's existing row order:
      * ``lag_<k>`` is ``sales`` shifted by k rows, for k in 28, 35, 42, 49, 56.
      * ``rolling_mean_<w>`` is the mean of the last w values of ``lag_28``,
        for w in 7, 14, 28, 60 (NaN until a full window is available).

    Args:
        df: Frame with ``id`` and ``sales`` columns; modified in place.

    Returns:
        The same ``df`` object with the nine feature columns added.
    """
    for shift_rows in (28, 35, 42, 49, 56):
        df[f'lag_{shift_rows}'] = df.groupby('id')['sales'].shift(shift_rows)

    def _trailing_mean(window):
        # Returns a function computing the full-window rolling mean of a series.
        return lambda series: series.rolling(window).mean()

    for window in (7, 14, 28, 60):
        df[f'rolling_mean_{window}'] = df.groupby('id')['lag_28'].transform(
            _trailing_mean(window))

    return df

In [11]:
from sklearn.preprocessing import LabelEncoder

def encode_categoricals(df):
    """Replace the categorical id and event columns with integer labels.

    Missing values in the four event columns are first replaced with the
    string ``'NoEvent'``. Each listed column is then converted to strings and
    label-encoded. One ``LabelEncoder`` object is re-fit for every column, so
    the per-column mappings are not kept after the call.

    Args:
        df: Frame containing the five id columns and the four event columns;
            modified in place.

    Returns:
        The same ``df`` object with the nine columns encoded.
    """
    id_columns = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    event_columns = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

    # Days without an event carry NaN; give them an explicit label.
    for event_col in event_columns:
        df[event_col] = df[event_col].fillna('NoEvent')

    label_encoder = LabelEncoder()
    for column in id_columns + event_columns:
        # Cast to str first so mixed-type columns encode without errors.
        as_text = df[column].astype(str)
        df[column] = label_encoder.fit_transform(as_text)

    return df

In [12]:
def transform_and_merge(sales, calendar, prices, config):
    """Reshape wide daily sales to long format and attach calendar and price data.

    NOTE(review): this redefines the ``transform_and_merge`` from an earlier
    cell. That version dropped only ``weekday``/``wday``/``month``/``year``
    from the calendar; this one keeps only the calendar columns listed in
    step 4, so any other calendar column is absent from the result.

    Args:
        sales: Wide frame with the six id columns and one ``d_<n>`` column
            per day. It is not modified.
        calendar: Frame containing at least the columns selected in step 4.
        prices: Frame containing ``store_id``, ``item_id``, ``wm_yr_wk`` and
            ``sell_price``.
        config: Object exposing ``TRAIN_END`` and ``FORECAST_HORIZON``.

    Returns:
        Long-format DataFrame with one row per (series, day): the id columns,
        ``d``, ``sales`` (NaN for the appended forecast days), the selected
        calendar columns and ``sell_price``.
    """
    # --------------------------------------------------
    # 1. Add future forecast columns
    #    (on a new frame: assign returns a copy, so the caller's `sales`
    #    table does not silently gain extra columns)
    # --------------------------------------------------
    future_cols = {
        f'd_{config.TRAIN_END + day + 1}': np.nan
        for day in range(config.FORECAST_HORIZON)
    }
    sales = sales.assign(**future_cols)

    # --------------------------------------------------
    # 2. Limit history to last ~1000 days (MEMORY FIX)
    # --------------------------------------------------
    start_idx = max(1, config.TRAIN_END - 1000)

    value_cols = [
        c for c in sales.columns
        if c.startswith('d_') and int(c.split('_')[1]) >= start_idx
    ]

    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # --------------------------------------------------
    # 3. Melt after limiting columns
    # --------------------------------------------------
    data = pd.melt(
        sales,
        id_vars=id_cols,
        value_vars=value_cols,
        var_name='d',
        value_name='sales'
    )

    # --------------------------------------------------
    # 4. Reduce calendar before merge
    # --------------------------------------------------
    calendar = calendar[
        ['d', 'date', 'wm_yr_wk',
         'event_name_1', 'event_type_1',
         'event_name_2', 'event_type_2']
    ]

    data = data.merge(calendar, on='d', how='left')

    # --------------------------------------------------
    # 5. Reduce prices before merge (CRITICAL)
    # --------------------------------------------------
    prices = prices[['store_id', 'item_id', 'wm_yr_wk', 'sell_price']]

    data = data.merge(
        prices,
        on=['store_id', 'item_id', 'wm_yr_wk'],
        how='left'
    )

    # --------------------------------------------------
    # 6. Cleanup (drops the local references to the intermediate frames)
    # --------------------------------------------------
    del sales, calendar, prices
    gc.collect()

    return data


In [13]:
def perform_split(df, config):
    """Split the feature table into training and validation sets by day index.

    Rows with ``d_num <= TRAIN_END - FORECAST_HORIZON`` form the training
    set; rows with ``TRAIN_END - FORECAST_HORIZON < d_num <= TRAIN_END`` form
    the validation set. Rows after ``TRAIN_END`` are in neither.

    Args:
        df: Frame containing ``d_num``, ``sales`` and the feature columns.
        config: Object exposing ``TRAIN_END`` and ``FORECAST_HORIZON``.

    Returns:
        Tuple ``(X_tr, y_tr, X_val, y_val, feat_cols)`` where ``feat_cols``
        lists every column of ``df`` except ``id``, ``sales``, ``wm_yr_wk``
        and ``d_num``, in their original order.
    """
    # --------------------------------------------------
    # 1. Create masks (lightweight)
    # --------------------------------------------------
    valid_start = config.TRAIN_END - config.FORECAST_HORIZON
    day_index = df['d_num']
    train_mask = day_index <= valid_start
    valid_mask = (day_index > valid_start) & (day_index <= config.TRAIN_END)

    # --------------------------------------------------
    # 2. Define feature columns
    # --------------------------------------------------
    drop_cols = ['id', 'sales', 'wm_yr_wk', 'd_num']
    feat_cols = [c for c in df.columns if c not in drop_cols]

    # --------------------------------------------------
    # 3. Split
    #    Rows and columns are selected in a single .loc call directly on
    #    `df`. Materialising df[feat_cols + ['sales']] first would copy
    #    nearly the whole frame only to slice it again.
    # --------------------------------------------------
    X_tr = df.loc[train_mask, feat_cols]
    y_tr = df.loc[train_mask, 'sales']

    X_val = df.loc[valid_mask, feat_cols]
    y_val = df.loc[valid_mask, 'sales']

    return X_tr, y_tr, X_val, y_val, feat_cols


In [14]:


# Load the pre-built feature table from disk instead of recomputing it here.
# NOTE(review): no cell visible in this part of the notebook writes
# "processed_dataset.pkl" or calls the transform / feature-engineering
# functions defined above, so master_df is not reproduced on a fresh kernel —
# confirm where and how this file is produced.
# NOTE(security): unpickling can execute arbitrary code; only load pickle
# files that you created yourself.
master_df = pd.read_pickle("processed_dataset.pkl")
print("master_df loaded:", master_df.shape)


master_df loaded: (31374210, 30)


In [15]:
# Split master_df into training and validation features/targets; `feats` is the list of feature column names.
X_tr, y_tr, X_val, y_val, feats = perform_split(master_df, ProjectConfig)
