In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy.stats import randint as sp_randint

from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from typing import Optional, List, Tuple,  Callable, Dict, Union
import re
import os
import joblib
import holidays
import pickle
import fsspec

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
# --- Run configuration -------------------------------------------------------
# Brand and market the features are built for. `brand` is lower-cased into the
# storage path below; `country` is not referenced in the cells visible here.
brand = 'FACES'
country = 'KSA'
# Calendar window; used as the default range of the peak calendar built later.
data_start_date = '2023-01-01'
data_end_date = '2025-06-30'
# Source column that is renamed to the generic 'target' after loading.
target = 'units_quantity'

# Storage layout: <gcs_path>/<brand>/<experiment_name>/<file>
gcs_path = "gs://trd-sf-ntb"
experiment_name = "units"  # or something like f"exp_{datetime.now():%Y%m%d_%H%M%S}"
file_input = "pre_process.parquet"
file_output = "monthly_temporal_features.parquet"

In [4]:
brand_clean = brand.lower().replace(" ", "_")

# Input location: <bucket>/<brand>/<experiment>/<input file>
full_path = f"{gcs_path}/{brand_clean}/{experiment_name}/{file_input}"

# Load the pre-processed data, keep only the columns used for feature
# engineering, and expose the configured target column as 'target'.
df = (
    pd.read_parquet(full_path)[['key', 'date', 'business_type', target, 'ppu']]
    .rename(columns={target: 'target'})
)

In [5]:
# Display the resolved input path as a sanity check.
full_path

'gs://trd-sf-ntb/faces/units/pre_process.parquet'

In [7]:
def add_monthly_KSA_holiday_count(df: pd.DataFrame, date_col: str = 'date') -> pd.DataFrame:
    """
    Add a 'holiday_count' column holding, for each row, the number of Saudi
    Arabia ('SA') holiday dates listed by the `holidays` package that fall in
    the same calendar year and month as the row's date.

    The input frame is not modified. Existing 'year' / 'month_num' columns are
    left alone (private helper names are used for the join), and an existing
    'holiday_count' column is replaced, so the function can be re-run safely.

    Parameters:
    df (pd.DataFrame): DataFrame with a date column (e.g., '2023-03-01').
    date_col (str): Name of the date column.

    Returns:
    pd.DataFrame: New DataFrame (index reset by the merge) with the date column
        converted to datetime and an integer 'holiday_count' column appended.
    """
    year_key, month_key = '_holiday_year', '_holiday_month'

    # Work on a copy, and drop any stale result column so the merge below does
    # not produce 'holiday_count_x' / 'holiday_count_y'.
    result = df.drop(columns=['holiday_count'], errors='ignore').copy()
    result[date_col] = pd.to_datetime(result[date_col])
    result[year_key] = result[date_col].dt.year
    result[month_key] = result[date_col].dt.month

    # Plain Python ints: unparseable dates (NaT) contribute no year.
    years = sorted(int(y) for y in result[year_key].dropna().unique())
    KSA_holidays = holidays.country_holidays('SA', years=years)

    # One row per holiday date, counted per (year, month).
    holiday_dates = pd.to_datetime(pd.Series(list(KSA_holidays.keys())))
    holiday_counts = (
        pd.DataFrame({
            year_key: holiday_dates.dt.year,
            month_key: holiday_dates.dt.month,
        })
        .groupby([year_key, month_key])
        .size()
        .reset_index(name='holiday_count')
    )

    result = result.merge(holiday_counts, on=[year_key, month_key], how='left')

    # Months without any holiday have no match in the merge -> 0.
    result['holiday_count'] = result['holiday_count'].fillna(0).astype(int)

    return result.drop(columns=[year_key, month_key])

# Attach the per-month KSA holiday count and preview the result.
df = add_monthly_KSA_holiday_count(df, date_col='date')
df.head(2)

Unnamed: 0,key,date,business_type,target,ppu,holiday_count
0,3024,2023-01-01,RETAIL,1745,77.263037,0
1,3024,2023-02-01,RETAIL,1793,84.161182,1


In [8]:
def get_date_based_features(df: pd.DataFrame, date_column: str) -> pd.DataFrame:
    """
    Add calendar features derived from a date column.

    The frame is modified in place (and also returned): `date_column` is
    converted to datetime and the feature columns below are added/overwritten.

    Parameters:
    -----------
    df : pd.DataFrame
        Input DataFrame containing the date column.
    date_column : str
        Name of the date column to derive the features from.

    Returns:
    --------
    pd.DataFrame
        The same DataFrame with these additional columns:
        - 'month': Month number (1-12).
        - 'quarter': Quarter number (1-4).
        - 'year': Calendar year.
        - 'is_month_start': 1 if the date is the first day of its month, else 0.
        - 'is_month_end': 1 if the date is the last day of its month, else 0.
        - 'days_in_month': Number of days in the date's month.
        - 'fourier_year_sin' / 'fourier_year_cos': sin/cos of 2*pi*month/12.

    Raises:
    -------
    KeyError
        If `date_column` is not a column of `df`.
    """
    # Honour the requested column instead of assuming it is called 'date'.
    df[date_column] = pd.to_datetime(df[date_column])
    stamps = df[date_column].dt

    df['month'] = stamps.month
    df['quarter'] = stamps.quarter
    df['year'] = stamps.year
    df['is_month_start'] = stamps.is_month_start.astype(int)
    df['is_month_end'] = stamps.is_month_end.astype(int)
    df['days_in_month'] = stamps.days_in_month

    # Yearly seasonality encoded on the unit circle (period = 12 months).
    angle = 2 * np.pi * df['month'] / 12
    df['fourier_year_sin'] = np.sin(angle)
    df['fourier_year_cos'] = np.cos(angle)

    return df

# Add calendar / Fourier features derived from 'date' and preview.
df = get_date_based_features(df, 'date')
df.head(2)

Unnamed: 0,key,date,business_type,target,ppu,holiday_count,month,quarter,year,is_month_start,is_month_end,days_in_month,fourier_year_sin,fourier_year_cos
0,3024,2023-01-01,RETAIL,1745,77.263037,0,1,1,2023,1,0,31,0.5,0.866025
1,3024,2023-02-01,RETAIL,1793,84.161182,1,2,1,2023,1,0,28,0.866025,0.5


In [9]:
def get_lag_features(
    df: pd.DataFrame,
    lag_check: List[int],
    key: str = 'key',
    date_col: str = 'date',
    target: str = 'target'
) -> pd.DataFrame:
    """
    Add one 'Lag<n>_y' column per requested lag: the target value found `n`
    rows earlier within the same key, after ordering by key and date.

    Rows that have no value `n` rows back are filled with the mean of the
    target over the whole frame.
    NOTE(review): that mean is taken over every row, including later periods,
    so the filler is not a past-only quantity. Lags count rows, not calendar
    months, so gaps in a key's history shift the lag accordingly.

    Parameters:
    df (pd.DataFrame): The input DataFrame (left untouched; a sorted copy is returned).
    lag_check (list of int): Lag distances, in rows.
    key (str): Column identifying each series.
    date_col (str): Column used to order rows within a series.
    target (str): Column whose values are lagged.

    Returns:
    pd.DataFrame: Copy sorted by [key, date_col] with the lag columns appended.
    """
    ordered = df.sort_values(by=[key, date_col]).copy()

    # Filler for rows that lack enough history.
    fallback = ordered[target].mean()

    for periods in lag_check:
        lagged = ordered.groupby(key)[target].shift(periods)
        ordered[f'Lag{periods}_y'] = lagged.fillna(fallback)

    return ordered

# Add the 12-row (one year of monthly rows) lag of the target and preview.
df = get_lag_features(df, [12], 'key', 'date', 'target')
df.head(2)

Unnamed: 0,key,date,business_type,target,ppu,holiday_count,month,quarter,year,is_month_start,is_month_end,days_in_month,fourier_year_sin,fourier_year_cos,Lag12_y
0,3024,2023-01-01,RETAIL,1745,77.263037,0,1,1,2023,1,0,31,0.5,0.866025,2684.294461
1,3024,2023-02-01,RETAIL,1793,84.161182,1,2,1,2023,1,0,28,0.866025,0.5,2684.294461


In [10]:
def get_moving_stats_features(
    df: pd.DataFrame,
    months_back: List[int],
    key: str = 'key',
    date_col: str = 'date',
    target: str = 'target'
) -> pd.DataFrame:
    """
    Add trailing statistics of the previous-row target for every key.

    For each window `w` in `months_back` three columns are appended:
    'MA<w>_y' (rolling mean), 'STD<w>_y' (rolling standard deviation) and
    'EMA<w>_y' (exponential moving average with span `w`, adjust=False).
    All of them are computed on the target shifted by one row within the key,
    so a row's own target never enters its statistics. Rows without enough
    history are left as NaN (the first row of a key for all three columns,
    and also the second row for the standard deviation).

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing the key, date and target columns.
    - months_back (List[int]): Window sizes, in rows.
    - key (str): Column identifying each series.
    - date_col (str): Date column (converted with pd.to_datetime).
    - target (str): Column the statistics are computed on.

    Returns:
    - pd.DataFrame: Copy sorted by [key, date_col] with the new columns appended.
    """
    frame = df.copy()
    frame[date_col] = pd.to_datetime(frame[date_col])
    frame = frame.sort_values(by=[key, date_col])

    # Previous-row target per key; it does not depend on the window size,
    # so it is built once and regrouped once.
    prior = frame.groupby(key)[target].shift(1)
    prior_by_key = prior.groupby(frame[key])

    for span in months_back:
        trailing = prior_by_key.rolling(window=span, min_periods=1)

        # groupby-rolling results carry a (key, original index) MultiIndex;
        # dropping the key level lets the assignment align on the row index.
        frame[f'MA{span}_y'] = trailing.mean().reset_index(level=0, drop=True)
        frame[f'STD{span}_y'] = trailing.std().reset_index(level=0, drop=True)
        frame[f'EMA{span}_y'] = prior_by_key.transform(
            lambda series, span=span: series.ewm(span=span, adjust=False).mean()
        )

    return frame

# Add 6-row trailing mean / std / EMA of the previous-row target and preview.
df = get_moving_stats_features(df, months_back=[6])
df.head()

Unnamed: 0,key,date,business_type,target,ppu,holiday_count,month,quarter,year,is_month_start,is_month_end,days_in_month,fourier_year_sin,fourier_year_cos,Lag12_y,MA6_y,STD6_y,EMA6_y
0,3024,2023-01-01,RETAIL,1745,77.263037,0,1,1,2023,1,0,31,0.5,0.8660254,2684.294461,,,
1,3024,2023-02-01,RETAIL,1793,84.161182,1,2,1,2023,1,0,28,0.866025,0.5,2684.294461,1745.0,,1745.0
2,3024,2023-03-01,RETAIL,2421,80.03387,0,3,1,2023,1,0,31,1.0,6.123234000000001e-17,2684.294461,1769.0,33.941125,1758.714286
3,3024,2023-04-01,RETAIL,4515,77.874419,6,4,2,2023,1,0,30,0.866025,-0.5,2684.294461,1986.333333,377.196677,1947.938776
4,3024,2023-05-01,RETAIL,1902,78.426393,0,5,2,2023,1,0,31,0.5,-0.8660254,2684.294461,2618.5,1301.303321,2681.38484


In [11]:
# The moving-stat columns are NaN where a key has too little prior history
# (see the preview above); replace those gaps with 0.
# NOTE(review): 0 is also a legitimate value for these statistics, so after this
# step "no history" and "zero" can no longer be told apart — confirm intended.
for prefix in ['EMA', 'MA', 'STD']:
    # Columns produced by get_moving_stats_features are named '<PREFIX><window>_y'.
    cols = [col for col in df.columns if col.startswith(prefix) and col.endswith('_y')]
    df[cols] = df[cols].fillna(0)

In [12]:
def get_monthly_seasonality_index(df: pd.DataFrame, date_col: str = 'date', target: str = 'target') -> pd.DataFrame:
    """
    Attach a per-calendar-month 'Seasonality_Index' to every row.

    The frame is ordered by date, the target is shifted by one row within each
    calendar month (1-12), the shifted values are averaged per month, and each
    monthly average is divided by the mean of the twelve averages and rounded
    to two decimals. Every row of a given calendar month receives the same
    value; months with no average get 1.0.

    NOTE(review): the shift runs over all rows of a calendar month regardless
    of key, and the average is taken over the whole frame, so the index is not
    restricted to data preceding each row. Treat it as a global (look-ahead)
    seasonal profile unless the computation is changed.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with at least [date_col, target].
    - date_col (str): Name of the datetime column.
    - target (str): Name of the target column (e.g., sales).

    Returns:
    - pd.DataFrame: Copy sorted by date (index reset by the merge) with 'month',
      'year' and 'Seasonality_Index' columns set.
    """
    frame = df.copy()
    frame[date_col] = pd.to_datetime(frame[date_col])
    frame = frame.sort_values(by=date_col)

    stamps = frame[date_col].dt
    frame['month'] = stamps.month
    frame['year'] = stamps.year

    # Previous row's target within the same calendar month.
    frame['target_prev'] = frame.groupby('month')[target].shift(1)

    # Average per calendar month, scaled so the months average to 1.
    month_means = frame.groupby('month')['target_prev'].mean()
    month_index = (month_means / month_means.mean()).round(2).rename('Seasonality_Index')

    # The Series is indexed by 'month', so it joins on that index level.
    frame = frame.merge(month_index, on='month', how='left')

    # Neutral value where no index could be computed.
    frame['Seasonality_Index'] = frame['Seasonality_Index'].fillna(1.0)

    return frame.drop(columns=['target_prev'])

# Attach the calendar-month seasonality index; the frame comes back ordered by date.
df = get_monthly_seasonality_index(df, date_col='date', target='target')
df.head(2)

Unnamed: 0,key,date,business_type,target,ppu,holiday_count,month,quarter,year,is_month_start,is_month_end,days_in_month,fourier_year_sin,fourier_year_cos,Lag12_y,MA6_y,STD6_y,EMA6_y,Seasonality_Index
0,3024,2023-01-01,RETAIL,1745,77.263037,0,1,1,2023,1,0,31,0.5,0.866025,2684.294461,0.0,0.0,0.0,0.83
1,3144,2023-01-01,RETAIL,987,69.694022,0,1,1,2023,1,0,31,0.5,0.866025,2684.294461,0.0,0.0,0.0,0.83


In [13]:
def get_rate_of_sale_monthly(df: pd.DataFrame, dfu_columns: List[str]) -> pd.DataFrame:
    """
    Compute a yearly "rate of sale" per group from the previous-row target.

    Within each group (ordered by date) the target is shifted by one row; rows
    with no previous value are dropped. For every (group, calendar year) the
    shifted values are summed and divided by the number of distinct dates left
    in that year.

    NOTE(review): each yearly figure aggregates all remaining rows of that
    year, so it is a single value per group and year rather than a running,
    past-only rate.

    Parameters:
    df (pd.DataFrame): DataFrame with 'date', 'target', and dfu_columns.
    dfu_columns (List[str]): Columns to group by (e.g., ['key']).

    Returns:
    pd.DataFrame: One row per group and year with columns
        dfu_columns + ['year', 'rate_of_sale'].
    """
    frame = df.copy()
    frame['date'] = pd.to_datetime(frame['date'])
    frame = frame.sort_values(by=dfu_columns + ['date'])
    frame['year'] = frame['date'].dt.year

    # Previous row's target inside each group.
    frame['target_shifted'] = frame.groupby(dfu_columns)['target'].shift(1)

    # Keep only rows that actually have a previous value.
    with_history = frame[frame['target_shifted'].notna()]

    yearly = (
        with_history
        .groupby(dfu_columns + ['year'])
        .agg(
            total_sales=('target_shifted', 'sum'),
            total_months=('date', 'nunique'),
        )
        .reset_index()
    )

    # Average shifted sales per distinct month in the year.
    yearly['rate_of_sale'] = yearly['total_sales'] / yearly['total_months']

    return yearly[dfu_columns + ['year', 'rate_of_sale']]

# Compute the yearly rate of sale per key and attach it to every row of the
# matching key and year.
# NOTE(review): all months of a year share one value, including months that
# precede some of the sales it was computed from — confirm this is acceptable.
dfu_cols = ['key']
rate_of_sale_df = get_rate_of_sale_monthly(df, dfu_cols)
df = df.merge(rate_of_sale_df, on=dfu_cols + ['year'], how='left')
df.head(2)

Unnamed: 0,key,date,business_type,target,ppu,holiday_count,month,quarter,year,is_month_start,is_month_end,days_in_month,fourier_year_sin,fourier_year_cos,Lag12_y,MA6_y,STD6_y,EMA6_y,Seasonality_Index,rate_of_sale
0,3024,2023-01-01,RETAIL,1745,77.263037,0,1,1,2023,1,0,31,0.5,0.866025,2684.294461,0.0,0.0,0.0,0.83,2335.909091
1,3144,2023-01-01,RETAIL,987,69.694022,0,1,1,2023,1,0,31,0.5,0.866025,2684.294461,0.0,0.0,0.0,0.83,1664.818182


In [14]:
def create_peak_calendar(start_date=data_start_date, end_date=data_end_date):
    """
    Build a daily calendar between `start_date` and `end_date` (inclusive) with
    two 0/1 flags.

    - 'festive_peak_flag': the day lies in one of the hard-coded Ramadan ranges
      or is one of the hard-coded Eid al-Fitr / Eid al-Adha dates (2023-2026).
    - 'peak_flag': the day is festive, or falls in January, July or December,
      or is 23 September or 20-30 November of 2023-2026, or is 10-16 February
      or 15-25 March of any year.

    Parameters:
    start_date, end_date: Bounds passed to pd.date_range.

    Returns:
    pd.DataFrame: Columns 'date', 'month', 'year', 'year_month' (monthly
        Period), 'peak_flag', 'festive_peak_flag'.
    """
    calendar = pd.DataFrame({'date': pd.date_range(start=start_date, end=end_date)})
    calendar['month'] = calendar['date'].dt.month
    calendar['year'] = calendar['date'].dt.year
    calendar['year_month'] = calendar['date'].dt.to_period('M')

    day = calendar['date']
    month = calendar['month']
    day_of_month = day.dt.day

    # Ramadan and Eid dates are maintained by hand.
    ramadan_ranges = [
        ('2023-03-23', '2023-04-20'),
        ('2024-03-10', '2024-04-08'),
        ('2025-02-28', '2025-03-29'),
        ('2026-02-17', '2026-03-18'),  # Approximate Ramadan 2026
    ]
    eid_fitr = pd.to_datetime(['2023-04-21', '2024-04-10', '2025-03-30', '2026-03-19'])
    eid_adha = pd.to_datetime(['2023-06-28', '2024-06-16', '2025-06-06', '2026-05-27'])

    # Festive days: any Ramadan range or either Eid.
    festive = day.isin(eid_fitr) | day.isin(eid_adha)
    for first_day, last_day in ramadan_ranges:
        festive = festive | day.between(first_day, last_day)

    # Fixed-date events that are only flagged for these years.
    flagged_years = calendar['year'].isin([2023, 2024, 2025, 2026])
    national_day = (month == 9) & (day_of_month == 23) & flagged_years
    black_friday = (month == 11) & day_of_month.between(20, 30) & flagged_years

    # Mid-term promotions (February 10-16 and March 15-25), every year.
    mid_term = (
        ((month == 2) & day_of_month.between(10, 16)) |
        ((month == 3) & day_of_month.between(15, 25))
    )

    # Peak months
    peak_months = [1, 7, 12]
    peak = month.isin(peak_months) | festive | national_day | black_friday | mid_term

    calendar['peak_flag'] = peak.astype(int)
    calendar['festive_peak_flag'] = festive.astype(int)

    return calendar


def merge_peak_calendar_info(df, start_date=data_start_date, end_date=data_end_date):
    """
    Attach shopping-peak information from the daily peak calendar to `df`.

    Two columns are added:
    - 'ksa_shopping_peak_ratio': share of calendar days in the row's month that
      have peak_flag == 1 (peak days / calendar days of that month inside the
      start/end range).
    - 'festive_peak_flag': the calendar's festive flag for the row's exact date.
      NOTE(review): for rows dated on the first of the month this reflects only
      that single day, not the whole month — confirm that is the intent.

    The input frame is not modified. Any existing copies of the two output
    columns are replaced, so the function can be re-run on its own output
    without producing '_x' / '_y' duplicates.

    Parameters:
    df (pd.DataFrame): Frame with a 'date' column (converted with pd.to_datetime).
    start_date, end_date: Range of the calendar to build; rows dated outside it
        get NaN in both columns.

    Returns:
    pd.DataFrame: New frame (index reset by the merges) with the two columns added.
    """
    # Generate the calendar with flags
    calendar = create_peak_calendar(start_date=start_date, end_date=end_date)

    # Share of peak days per calendar month
    monthly_peak_ratio = (
        calendar.groupby('year_month')['peak_flag']
        .agg(['sum', 'count'])
        .rename(columns={'sum': 'peak_days', 'count': 'days_in_month'})
        .assign(ksa_shopping_peak_ratio=lambda d: d['peak_days'] / d['days_in_month'])
        .reset_index()
    )

    # Drop previous results of this function before merging (idempotent re-run).
    derived_cols = ['ksa_shopping_peak_ratio', 'festive_peak_flag']
    df = df.drop(columns=derived_cols, errors='ignore').copy()

    # Both join keys need a datetime 'date' column.
    df['date'] = pd.to_datetime(df['date'])
    df['year_month'] = df['date'].dt.to_period('M')

    # Month-level ratio
    df = df.merge(monthly_peak_ratio[['year_month', 'ksa_shopping_peak_ratio']], on='year_month', how='left')

    # Day-level festive flag (Ramadan and Eid only)
    df = df.merge(calendar[['date', 'festive_peak_flag']], on='date', how='left')

    # Remove the temporary join key
    return df.drop(columns=['year_month'])

# Attach the monthly peak-day ratio and the festive flag, using the configured date window.
df = merge_peak_calendar_info(df)

In [15]:
# Sanity check: distinct peak-ratio values present in the data.
df.ksa_shopping_peak_ratio.unique()

array([1.        , 0.25      , 0.5483871 , 0.7       , 0.        ,
       0.03333333, 0.36666667, 0.24137931, 0.70967742, 0.3       ,
       0.28571429, 0.96774194])

In [16]:
def create_monthly_seasonal_features(df, target_col='target', date_column='date', group_key='key'):
    """
    Add per-month and per-quarter seasonal means of the target, optionally per key.

    For each (month[, group_key]) group the target is shifted by one row in the
    frame's current row order and the shifted values are averaged; the average
    is written to every row of the group as '<target_col>_seasonal_monthly'.
    The same is done per (quarter[, group_key]) for
    '<target_col>_seasonal_quarterly'.

    NOTE(review): the mean is taken over the whole group, so rows receive a
    value that includes observations dated after them; the shift only removes
    the last row of each group from the average. The result also depends on the
    incoming row order because no sorting is done here.

    Parameters:
    -----------
    df : pd.DataFrame
        Input DataFrame with monthly time series.
    target_col : str or pd.Series
        Name of the target column (a Series is replaced by its name).
    date_column : str or pd.Series
        Name of the date column (a Series is replaced by its name).
    group_key : str or list of str, optional
        Extra grouping column(s). If None, groups are month / quarter only.

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with 'month', 'quarter' and the two seasonal columns set.

    Raises:
    -------
    KeyError
        If `date_column` is not a column of `df`.
    ValueError
        If no column name contains `target_col`.
    """
    # Series arguments stand in for their column names.
    if isinstance(target_col, pd.Series):
        target_col = target_col.name
    if isinstance(date_column, pd.Series):
        date_column = date_column.name

    df_month = df.copy()

    if date_column not in df_month.columns:
        raise KeyError(f"Date column '{date_column}' not found.")
    if not pd.api.types.is_datetime64_any_dtype(df_month[date_column]):
        df_month[date_column] = pd.to_datetime(df_month[date_column])

    stamps = df_month[date_column].dt
    df_month['month'] = stamps.month
    df_month['quarter'] = stamps.quarter

    # Fail early when nothing resembling the target column is present.
    has_sum_col = f"{target_col}_sum" in df_month.columns
    if not has_sum_col and not any(str(target_col) in str(col) for col in df_month.columns):
        raise ValueError(f"No target column containing '{target_col}' found. Available columns: {df_month.columns.tolist()}")

    # Extra grouping columns shared by both granularities.
    if group_key is None:
        extra_keys = []
    elif isinstance(group_key, str):
        extra_keys = [group_key]
    else:
        extra_keys = group_key

    granularities = (
        ('monthly', ['month'] + extra_keys),
        ('quarterly', ['quarter'] + extra_keys),
    )

    helper_cols = []
    for label, groups in granularities:
        prev_col = f'{label}_target_prev'
        helper_cols.append(prev_col)
        # Previous row's target within the group, then the group-wide mean of it.
        df_month[prev_col] = df_month.groupby(groups)[target_col].shift(1)
        df_month[f'{target_col}_seasonal_{label}'] = df_month.groupby(groups)[prev_col].transform('mean')

    return df_month.drop(columns=helper_cols)

# Add per-key monthly and quarterly seasonal means of the target and preview.
df = create_monthly_seasonal_features(df, 'target', 'date', 'key')
df.head(2)

Unnamed: 0,key,date,business_type,target,ppu,holiday_count,month,quarter,year,is_month_start,...,Lag12_y,MA6_y,STD6_y,EMA6_y,Seasonality_Index,rate_of_sale,ksa_shopping_peak_ratio,festive_peak_flag,target_seasonal_monthly,target_seasonal_quarterly
0,3024,2023-01-01,RETAIL,1745,77.263037,0,1,1,2023,1,...,2684.294461,0.0,0.0,0.0,0.83,2335.909091,1.0,0,1799.333333,2516.333333
1,3144,2023-01-01,RETAIL,987,69.694022,0,1,1,2023,1,...,2684.294461,0.0,0.0,0.0,0.83,1664.818182,1.0,0,1307.0,1840.222222


In [17]:
# Final feature table size (rows, columns).
df.shape

(1372, 24)

In [18]:
# Final list of feature columns.
df.columns

Index(['key', 'date', 'business_type', 'target', 'ppu', 'holiday_count',
       'month', 'quarter', 'year', 'is_month_start', 'is_month_end',
       'days_in_month', 'fourier_year_sin', 'fourier_year_cos', 'Lag12_y',
       'MA6_y', 'STD6_y', 'EMA6_y', 'Seasonality_Index', 'rate_of_sale',
       'ksa_shopping_peak_ratio', 'festive_peak_flag',
       'target_seasonal_monthly', 'target_seasonal_quarterly'],
      dtype='object')

In [19]:
# Column dtypes of the table that is about to be written.
df.dtypes

key                                  object
date                         datetime64[ns]
business_type                        object
target                                int64
ppu                                 float64
holiday_count                         int64
month                                 int32
quarter                               int32
year                                  int32
is_month_start                        int64
is_month_end                          int64
days_in_month                         int32
fourier_year_sin                    float64
fourier_year_cos                    float64
Lag12_y                             float64
MA6_y                               float64
STD6_y                              float64
EMA6_y                              float64
Seasonality_Index                   float64
rate_of_sale                        float64
ksa_shopping_peak_ratio             float64
festive_peak_flag                     int64
target_seasonal_monthly         

In [20]:
# Same brand slug as used for the input path (recomputed so this cell does not
# depend on the load cell's variable).
brand_clean = brand.lower().replace(" ", "_")

# Output location: <bucket>/<brand>/<experiment>/<output file>
# Note: this rebinds `full_path`, which previously held the input path.
full_path = f"{gcs_path}/{brand_clean}/{experiment_name}/{file_output}"

# Write the feature table as parquet, without the index.
df.to_parquet(full_path, index=False)

In [21]:
# Display the path the feature table was written to.
full_path

'gs://trd-sf-ntb/faces/units/monthly_temporal_features.parquet'

In [23]:
# Date coverage of the saved feature table.
df.date.min(), df.date.max()

(Timestamp('2023-01-01 00:00:00'), Timestamp('2025-06-01 00:00:00'))