In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy.stats import randint as sp_randint

from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from typing import Optional, List, Tuple,  Callable, Dict, Union
import re
import os
import joblib
import holidays
import pickle
import fsspec
import seaborn as sns
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt
import category_encoders as ce
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# --- Run configuration -------------------------------------------------------
# brand, target, experiment_name and cutoff_month are path segments of the
# output location assembled near the end of the notebook.
brand = 'LACOSTE'
# NOTE(review): `country` is not referenced by any cell in this notebook.
country = 'UAE'
# Inclusive bounds of the daily calendar that the features are generated for.
data_start_date = '2023-01-01'
data_end_date = '2026-10-31'
target="units"
cutoff_month='2025-07'
# Root of the output location (a gs:// URL).
gcs_path = "gs://trd-sf-ntb"
experiment_name = "model_pipeline"  # or something like f"exp_{datetime.now():%Y%m%d_%H%M%S}"
# file_name = "pre_process_monthly_units.parquet"
# File name of the monthly calendar-feature table written by this notebook.
file_output='calendar_features.parquet'

In [3]:
def add_ramadan_features(df, date_column):
    """Flag Ramadan days and the days immediately around Ramadan.

    Adds three 0/1 integer columns to ``df`` (the frame is modified in place
    and also returned):

    - ``is_ramadan``   : date lies within the Ramadan period (both ends inclusive)
    - ``pre_ramadan``  : date lies in the 14 days before the first Ramadan day
    - ``post_ramadan`` : date lies in the 7 days after the last Ramadan day

    The Ramadan periods are a hard-coded table of approximate Gregorian dates
    for 2023-2026; dates in any other year are never flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the dates to annotate.
    date_column : str
        Name of a datetime column in ``df`` (the ``.dt`` accessor is used).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added.
    """
    # Approximate (first day, last day) of Ramadan per Gregorian year.
    # A Hijri calendar conversion would be needed for exact dates.
    ramadan_periods = {
        2023: (pd.Timestamp('2023-03-22'), pd.Timestamp('2023-04-21')),
        2024: (pd.Timestamp('2024-03-10'), pd.Timestamp('2024-04-09')),
        2025: (pd.Timestamp('2025-02-28'), pd.Timestamp('2025-03-30')),
        2026: (pd.Timestamp('2026-02-17'), pd.Timestamp('2026-03-19')),
    }
    lead_up = timedelta(days=14)    # length of the pre-Ramadan window
    wind_down = timedelta(days=7)   # length of the post-Ramadan window

    dates = df[date_column]
    years = dates.dt.year
    row_count = len(df)

    # Positional boolean accumulators, OR-ed together across all years.
    during = np.zeros(row_count, dtype=bool)
    before = np.zeros(row_count, dtype=bool)
    after = np.zeros(row_count, dtype=bool)

    for year, (first_day, last_day) in ramadan_periods.items():
        # Every window only applies to rows of its own calendar year.
        in_year = years == year

        during |= (in_year & dates.ge(first_day) & dates.le(last_day)).to_numpy()
        before |= (
            in_year & dates.ge(first_day - lead_up) & dates.lt(first_day)
        ).to_numpy()
        after |= (
            in_year & dates.gt(last_day) & dates.le(last_day + wind_down)
        ).to_numpy()

    df['is_ramadan'] = during.astype('int64')
    df['pre_ramadan'] = before.astype('int64')
    df['post_ramadan'] = after.astype('int64')

    return df



In [4]:
def add_eid_features(df, date_column):
    """Flag Eid Al-Fitr / Eid Al-Adha days and the pre-Eid shopping week.

    Adds three 0/1 integer columns to ``df`` (the frame is modified in place
    and also returned):

    - ``is_eid_fitr``      : date is one of the listed Eid Al-Fitr days
    - ``is_eid_adha``      : date is one of the listed Eid Al-Adha days
    - ``pre_eid_shopping`` : date lies in the 7 days immediately before the
      FIRST Eid Al-Fitr day of a year (the Eid days themselves are excluded)

    The Eid dates are a hard-coded table of approximate Gregorian dates for
    2023-2026; dates in other years are never flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the dates to annotate.
    date_column : str
        Name of a datetime column in ``df``. Eid days are matched by exact
        timestamp equality, so values are expected to be midnight dates.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added.
    """
    # Eid Al-Fitr approximate dates (end of Ramadan)
    eid_fitr_dates = {
        2023: [pd.Timestamp('2023-04-21'), pd.Timestamp('2023-04-22'), pd.Timestamp('2023-04-23')],
        2024: [pd.Timestamp('2024-04-09'), pd.Timestamp('2024-04-10'), pd.Timestamp('2024-04-11')],
        2025: [pd.Timestamp('2025-03-30'), pd.Timestamp('2025-03-31'), pd.Timestamp('2025-04-01')],
        2026: [pd.Timestamp('2026-03-19'), pd.Timestamp('2026-03-20'), pd.Timestamp('2026-03-21')],
    }

    # Eid Al-Adha approximate dates
    eid_adha_dates = {
        2023: [pd.Timestamp('2023-06-28'), pd.Timestamp('2023-06-29'), pd.Timestamp('2023-06-30')],
        2024: [pd.Timestamp('2024-06-16'), pd.Timestamp('2024-06-17'), pd.Timestamp('2024-06-18')],
        2025: [pd.Timestamp('2025-06-06'), pd.Timestamp('2025-06-07'), pd.Timestamp('2025-06-08')],
        2026: [pd.Timestamp('2026-05-26'), pd.Timestamp('2026-05-27'), pd.Timestamp('2026-05-28')],
    }
    pre_eid_window = timedelta(days=7)

    dates = df[date_column]

    # The two tables are handled independently, so a year present in only one
    # of them cannot raise a KeyError.
    fitr_days = [day for days in eid_fitr_dates.values() for day in days]
    adha_days = [day for days in eid_adha_dates.values() for day in days]
    df['is_eid_fitr'] = dates.isin(fitr_days).astype('int64')
    df['is_eid_adha'] = dates.isin(adha_days).astype('int64')

    # Pre-Eid shopping period: the week before Eid starts. The window is
    # anchored on the first Eid day only; anchoring it on every Eid day would
    # stretch it to 9 days and mark Eid day 1 and day 2 as "pre-Eid".
    pre_eid = np.zeros(len(df), dtype=bool)
    for days in eid_fitr_dates.values():
        if not days:
            continue
        first_eid_day = min(days)
        pre_eid |= (
            dates.ge(first_eid_day - pre_eid_window) & dates.lt(first_eid_day)
        ).to_numpy()
    df['pre_eid_shopping'] = pre_eid.astype('int64')

    return df


In [5]:
def add_shopping_events(df, date_column):
    """Flag major shopping events and sales periods.

    Adds five 0/1 integer columns to ``df`` (the frame is modified in place
    and also returned):

    - ``dubai_shopping_festival`` : all of December plus 1-15 January
    - ``dubai_summer_surprises``  : June, July and August
    - ``end_of_season_sales``     : February, July and November
    - ``year_end_sales``          : 15-31 December
    - ``black_friday_week``       : last Friday of November +/- 3 days

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the dates to annotate.
    date_column : str
        Name of a datetime column in ``df`` (the ``.dt`` accessor is used).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the five columns added.
    """
    dates = df[date_column]
    month = dates.dt.month
    day = dates.dt.day
    years = dates.dt.year

    # Month/day based events.
    in_december = month == 12
    early_january = (month == 1) & (day <= 15)
    df['dubai_shopping_festival'] = (in_december | early_january).astype(int)
    df['dubai_summer_surprises'] = month.isin([6, 7, 8]).astype(int)
    df['end_of_season_sales'] = month.isin([2, 7, 11]).astype(int)
    df['year_end_sales'] = (in_december & (day >= 15)).astype(int)

    # Black Friday, taken here as the last Friday of November, with a
    # symmetric 3-day margin on each side (a 7-day window).
    margin = timedelta(days=3)
    in_bf_week = np.zeros(len(df), dtype=bool)
    for year in years.unique():
        november = pd.date_range(f'{year}-11-01', f'{year}-11-30', freq='D')
        fridays = november[november.dayofweek == 4]  # Monday=0 ... Friday=4
        if len(fridays) == 0:
            continue
        last_friday = fridays[-1]
        inside_window = dates.ge(last_friday - margin) & dates.le(last_friday + margin)
        in_bf_week |= ((years == year) & inside_window).to_numpy()
    df['black_friday_week'] = in_bf_week.astype('int64')

    return df



In [6]:
def add_luxury_seasons(df, date_column):
    """Flag the months treated as fashion-week season.

    Adds a 0/1 integer column ``fashion_week_season`` to ``df`` (modified in
    place and also returned). A row is flagged when its date falls in
    February or September.

    NOTE(review): the accompanying remark in the notebook describes fashion
    weeks as "typically February/March and September", but only months 2 and 9
    are flagged here - confirm whether March should be included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the dates to annotate.
    date_column : str
        Name of a datetime column in ``df`` (the ``.dt`` accessor is used).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the column added.
    """
    fashion_week_months = [2, 9]
    month_of_row = df[date_column].dt.month
    in_season = month_of_row.isin(fashion_week_months)
    df['fashion_week_season'] = in_season.astype(int)
    return df

In [7]:
def create_holiday_features(df, date_column):
    """
    Build the daily holiday / retail-calendar flags for a frame of dates.

    Works on a copy of ``df`` (the caller's frame is not modified), converts
    ``date_column`` with ``pd.to_datetime`` and appends five integer columns:

    - ``ramadan_period``            : 1 on pre-Ramadan, Ramadan or post-Ramadan days
    - ``public_holidays``           : 1 on Eid Al-Fitr / Eid Al-Adha days and on
      the fixed month-day national holidays listed below
    - ``is_high_sales_intent_flag`` : 1 on Eid Al-Adha, pre-Eid shopping,
      Dubai Summer Surprises or Black Friday week days
    - ``shopping_event_code``       : code of a shopping event active on the
      day (every event currently maps to 1), else 0
    - ``fashion_week_season``       : 1 in fashion-week months

    All intermediate helper columns are dropped before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with one row per date.
    date_column : str
        Name of the column holding the dates.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by the five flag columns.
    """
    df = df.copy()
    df[date_column] = pd.to_datetime(df[date_column])

    # ---- National holidays: fixed "MM-DD" values matched in every year ----
    uae_national_holidays = ['01-01', '06-05', '06-26', '11-30', '12-02', '12-03']
    df['month_day'] = df[date_column].dt.strftime('%m-%d')
    df['national_holiday_flag'] = df['month_day'].isin(uae_national_holidays).astype(int)
    df = df.drop(columns='month_day')

    # ---- Ramadan: collapse pre / during / post into a single flag ---------
    df = add_ramadan_features(df, date_column)
    ramadan_parts = ['pre_ramadan', 'post_ramadan', 'is_ramadan']
    df['ramadan_period'] = df[ramadan_parts].eq(1).any(axis=1).astype('int64')
    df = df.drop(columns=ramadan_parts)

    # ---- Eid days, then the combined public-holiday flag ------------------
    df = add_eid_features(df, date_column)
    on_eid = df['is_eid_fitr'].eq(1) | df['is_eid_adha'].eq(1)
    on_national_holiday = df['national_holiday_flag'].eq(1)
    df['public_holidays'] = (on_eid | on_national_holiday).astype('int64')

    # ---- Shopping events and the high-sales-intent flag -------------------
    df = add_shopping_events(df, date_column)
    # A separate "Black Friday weekend" (Sat/Sun inside the Black Friday week)
    # condition is a subset of black_friday_week, so it adds nothing to this
    # union and is not listed.
    high_intent_sources = [
        'is_eid_adha',
        'pre_eid_shopping',
        'dubai_summer_surprises',
        'black_friday_week',
    ]
    df['is_high_sales_intent_flag'] = (
        df[high_intent_sources].eq(1).any(axis=1).astype('int64')
    )

    # ---- Shopping event code ----------------------------------------------
    # Events are applied in dict order, so a later entry would overwrite an
    # earlier one on the same day; all codes are currently 1.
    event_priority = {
        'end_of_season_sales': 1,
        'dubai_summer_surprises': 1,
        'dubai_shopping_festival': 1,
        'black_friday_week': 1,
        'year_end_sales': 1,
        'pre_eid_shopping': 1
    }
    df['shopping_event_code'] = 0
    for event_column, event_code in event_priority.items():
        df.loc[df[event_column].eq(1), 'shopping_event_code'] = event_code
    df = df.drop(columns=list(event_priority))

    # ---- Fashion weeks ----------------------------------------------------
    df = add_luxury_seasons(df, date_column)

    # ---- Remove the remaining intermediate flags --------------------------
    df = df.drop(columns=['is_eid_fitr', 'is_eid_adha', 'national_holiday_flag'])

    print("✅ Holiday & UAE retail features created")
    return df

In [8]:
# Daily calendar spanning the configured date range, with month / year /
# monthly-period helper columns, then the holiday & retail flags per day.
calendar = pd.DataFrame(
    {'date': pd.date_range(start=data_start_date, end=data_end_date)}
)
calendar = calendar.assign(
    month=lambda frame: frame['date'].dt.month,
    year=lambda frame: frame['date'].dt.year,
    year_month=lambda frame: frame['date'].dt.to_period('M'),
)
df = create_holiday_features(calendar, "date")

✅ Holiday & UAE retail features created


In [9]:
# df_events["count_of_days"]=df_events["year_month"].apply(lambda x: x.days_in_month)

In [10]:
# Collapse the daily flags to one row per month by summing each daily column.
df_grp = (
    df.groupby(["year_month"])
    .agg(
        **{
            flag: (flag, "sum")
            for flag in [
                "ramadan_period",
                "public_holidays",
                "is_high_sales_intent_flag",
                "shopping_event_code",
                "fashion_week_season",
            ]
        }
    )
    .reset_index()
)

In [11]:
# Number of calendar days in each monthly period.
df_grp["count_of_days"] = df_grp["year_month"].map(lambda period: period.days_in_month)

# Names of the aggregated feature columns (everything except the period key
# and the day count).
# NOTE(review): `cols` is not read by any later cell in this notebook.
cols = [
    name
    for name in df_grp.columns.to_list()
    if name not in ("year_month", "count_of_days")
]

# Month number (1-12) of each period; input to the season feature below.
df_grp["month"] = df_grp["year_month"].map(lambda period: period.month)

In [12]:
def create_weather_season_features(df, month):
    """Add a numeric season code derived from a month-number column.

    Adds the column ``uae_seasons`` to ``df`` (modified in place and also
    returned) using this mapping:

    - 1 : Winter / cool season      (December, January, February)
    - 2 : Spring / pleasant season  (March, April, May)
    - 3 : Summer / hot season       (June, July, August, September)
    - 4 : Autumn / pleasant season  (October, November)

    Values that are not one of the month numbers 1-12 map to ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate.
    month : str
        Name of the column in ``df`` holding month numbers.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``uae_seasons`` added.
    """
    season_by_month = {
        12: 1, 1: 1, 2: 1,         # Winter (Cool season)
        3: 2, 4: 2, 5: 2,          # Spring (Pleasant season)
        6: 3, 7: 3, 8: 3, 9: 3,    # Summer (Hot season)
        10: 4, 11: 4,              # Autumn (Pleasant season)
    }

    df['uae_seasons'] = df[month].apply(lambda value: season_by_month.get(value))

    return df

# Attach the season code to the monthly table (derived from its 'month'
# column) and preview the first two rows.
df_grp = create_weather_season_features(df_grp, 'month')
df_grp.head(2)

Unnamed: 0,year_month,ramadan_period,public_holidays,is_high_sales_intent_flag,shopping_event_code,fashion_week_season,count_of_days,month,uae_seasons
0,2023-01,0,1,0,15,0,31,1,1
1,2023-02,0,0,0,28,28,28,2,1


In [13]:
# Replace the monthly period key by a timestamp column 'date' (converted with
# Period.to_timestamp) and drop the helper columns that are no longer needed.
df_grp = (
    df_grp
    .assign(date=df_grp['year_month'].dt.to_timestamp())
    .drop(columns=["year_month", "month"], errors="ignore")
)

In [14]:
# Output location: <gcs_path>/<target>/<brand>/<experiment_name>/<cutoff_month>/<file_output>.
# The bare expression on the last line displays the assembled path.
full_path=f"{gcs_path}/{target}/{brand}/{experiment_name}/{cutoff_month}/{file_output}"
full_path

'gs://trd-sf-ntb/units/LACOSTE/model_pipeline/2025-07/calendar_features.parquet'

In [15]:
# Write the monthly calendar features to the output path as Parquet, without
# the DataFrame index.
df_grp.to_parquet(full_path, index=False)

In [16]:
# NOTE: a stray bare name (`hello`) used to be the only content of this cell.
# It raised NameError and aborted "Restart Kernel -> Run All" before the EDA
# helper below could run, so it was removed together with its traceback output.

In [None]:
def eda_features(df_input: pd.DataFrame, feature_name: str) -> None:
    '''
    Print a short exploratory summary of a feature table.

    The summary contains the min / max of the ``date`` column (when one is
    available and holds datetimes), the shape, the column names and the
    per-column count of missing values.

    Parameters
    ----------
    df_input : pd.DataFrame
        Feature table to summarise.
    feature_name : str
        Free-form label used only in the printed headings
        (e.g. temporal, promotion, marketing, store, events_features).

    Returns
    -------
    None
        Everything is printed to stdout.
    '''
    print(f"------EDA on {feature_name} features------------------")
    try:
        date_values = df_input["date"]
        print(f"min & max dates {date_values.min().date()},{date_values.max().date()}",end="\n\n")
    except (KeyError, AttributeError, TypeError, ValueError) as exc:
        # Best effort: a missing, empty or non-datetime 'date' column must not
        # stop the rest of the summary, but the skip is reported, not hidden.
        print(f"min & max dates unavailable ({type(exc).__name__}: {exc})",end="\n\n")
    print(f"shape of dataset : {df_input.shape}",end="\n\n")
    print(f"{feature_name} features: {df_input.columns.values}",end="\n\n")
    print(f"missing values :\n{df_input.isnull().sum()}")
    

In [None]:
# Print the EDA summary for the monthly calendar/event feature table.
eda_features(df_grp, "events_features")