In [1]:
import fsspec
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings("ignore")

In [2]:
brand = 'LACOSTE'
country = 'UAE'
target = 'units'
cutoff_month='2025-07'
gcs_path = "gs://trd-sf-ntb"
experiment_name = "model_pipeline"  # or something like f"exp_{datetime.now():%Y%m%d_%H%M%S}"
# file_input = "pre_process_monthly_units.parquet"
file_output = "monthly_lever_features.parquet"

#inactive_stores = [3106, 3143, 3173, 4022]

In [3]:
# Define inputs
lacoste_store_ids = pd.read_pickle("active_stores.pkl").to_list()

# Format store list for SQL IN clause
lacoste_storeid_str = ', '.join(str(sid) for sid in lacoste_store_ids)
year_month='2025-07'

target_end_date="2026-10-31"
EffFromDate = '2023-03-01'
EffUpToDate = '2023-03-30'

In [4]:
from google.cloud import bigquery

# BigQuery client
client = bigquery.Client()

# Define query with f-string
query_promo = f"""
SELECT * FROM `chb-svc-tredence-d001.shared_analytics_prod.promotions`
WHERE locationId IN ({lacoste_storeid_str})
  AND DATE(eligibilityRuleEffFromDate) >= DATE('{EffFromDate}')
  AND DATE(eligibilityRuleEffUpToDate) >= DATE('{EffUpToDate}')
"""

# Run query
query_job_pro_dis = client.query(query_promo)
promo_la = query_job_pro_dis.to_dataframe()

In [5]:
promo_la.head(2)

Unnamed: 0,commonPromotionId,itemId,locationId,eligibilityRuleEffFromDate,eligibilityRuleEffUpToDate,description,name,strategy,amountOfDiscount,buyQuantity,promotionSpend,promotionCondition,maximumPromotionBuyQuantity,loc_type
0,53575992,18826730047,52003,2024-11-27,2024-12-03,BLACK FRIDAY:LACOSTE:UAE:25-40%,BLACK FRIDAY:25-40%,RELATIVE,25.0,1.0,,,,S
1,53575992,18826936331,52071,2024-11-27,2024-12-03,BLACK FRIDAY:LACOSTE:UAE:25-40%,BLACK FRIDAY:25-40%,RELATIVE,25.0,1.0,,,,S


### Select only 'RELATIVE' strategy

In [6]:
promo = promo_la[promo_la['strategy'] == 'RELATIVE']

In [7]:
promo.head(2)

Unnamed: 0,commonPromotionId,itemId,locationId,eligibilityRuleEffFromDate,eligibilityRuleEffUpToDate,description,name,strategy,amountOfDiscount,buyQuantity,promotionSpend,promotionCondition,maximumPromotionBuyQuantity,loc_type
0,53575992,18826730047,52003,2024-11-27,2024-12-03,BLACK FRIDAY:LACOSTE:UAE:25-40%,BLACK FRIDAY:25-40%,RELATIVE,25.0,1.0,,,,S
1,53575992,18826936331,52071,2024-11-27,2024-12-03,BLACK FRIDAY:LACOSTE:UAE:25-40%,BLACK FRIDAY:25-40%,RELATIVE,25.0,1.0,,,,S


In [8]:
# Ensure date columns are in datetime format
promo['eligibilityRuleEffFromDate'] = pd.to_datetime(promo['eligibilityRuleEffFromDate'])
promo['eligibilityRuleEffUpToDate'] = pd.to_datetime(promo['eligibilityRuleEffUpToDate'])

# Group by locationId and get min/max dates
promo.groupby('locationId').agg(
    min_eff_from_date=('eligibilityRuleEffFromDate', 'min'),
    max_eff_up_to_date=('eligibilityRuleEffUpToDate', 'max')
).reset_index()

Unnamed: 0,locationId,min_eff_from_date,max_eff_up_to_date
0,52003,2023-06-15,2025-08-04
1,52009,2023-06-15,2025-08-04
2,52010,2023-06-15,2025-08-04
3,52012,2023-06-15,2025-08-04
4,52020,2023-06-15,2025-08-04
5,52043,2023-06-15,2025-08-04
6,52052,2023-06-15,2025-08-04
7,52071,2023-06-15,2025-08-04
8,52074,2023-06-15,2025-08-04
9,52082,2024-08-08,2025-06-02


In [9]:
promo.shape

(728230, 14)

In [10]:
promo.eligibilityRuleEffFromDate.min()

Timestamp('2023-06-15 00:00:00')

In [11]:
promo.eligibilityRuleEffUpToDate.max()

Timestamp('2025-08-04 00:00:00')

In [12]:
%%time
def parse_promo_dates(df: pd.DataFrame) -> pd.DataFrame:
    df['eligibilityRuleEffFromDate'] = pd.to_datetime(df['eligibilityRuleEffFromDate'])
    df['eligibilityRuleEffUpToDate'] = pd.to_datetime(df['eligibilityRuleEffUpToDate'])
    df['promo_length'] = (df['eligibilityRuleEffUpToDate'] - df['eligibilityRuleEffFromDate']).dt.days + 1
    df['discount_efficiency'] = df['amountOfDiscount'].astype(float) / df['promo_length']
    return df

def expand_monthly_records(df: pd.DataFrame) -> pd.DataFrame:
    def expand_months(row):
        return pd.date_range(row['eligibilityRuleEffFromDate'], row['eligibilityRuleEffUpToDate'], freq='MS')
    df['months'] = df.apply(expand_months, axis=1)
    exploded = df.explode('months').rename(columns={'months': 'date'})
    exploded['date'] = exploded['date'].dt.to_period('M')
    return exploded


def expand_promo_days(df: pd.DataFrame) -> pd.DataFrame:
    def expand_days(row):
        return pd.date_range(row['eligibilityRuleEffFromDate'], row['eligibilityRuleEffUpToDate'])
    df['promo_days'] = df.apply(expand_days, axis=1)
    exploded = df.explode('promo_days')
    exploded['year_month'] = exploded['promo_days'].dt.to_period('M')
    return exploded

def compute_promo_days_per_month(promo_days_df: pd.DataFrame) -> pd.DataFrame:
    return (
        promo_days_df
        .drop_duplicates(['locationId', 'promo_days'])
        .groupby(['locationId', 'year_month'])
        .agg(promo_days_in_month=('promo_days', 'count'))
        .reset_index()
        .rename(columns={'year_month': 'date'})
    )

def aggregate_monthly_features(promo_monthly: pd.DataFrame) -> pd.DataFrame:
    return (
        promo_monthly.groupby(['locationId', 'date']).agg(
            unique_items_on_promo=('itemId', 'nunique'),
            distinct_discount_levels=('amountOfDiscount', 'nunique'),
            avg_discount=('amountOfDiscount', 'mean'),
            max_discount=('amountOfDiscount', 'max'),
            min_discount=('amountOfDiscount', 'min'),
            avg_promo_duration=('promo_length', 'mean')
        ).reset_index()
    )

def calculate_percentage_products_on_promo(monthly_df: pd.DataFrame, promo_la: pd.DataFrame) -> pd.DataFrame:
    total_items_available = (
        promo_la.groupby('locationId')['itemId']
        .nunique()
        .reset_index()
        .rename(columns={'itemId': 'total_items_available'})
    )
    merged = pd.merge(monthly_df, total_items_available, on='locationId', how='left')
    merged['percentage_products_on_promo'] = (
        merged['unique_items_on_promo'] / merged['total_items_available']
    ) * 100
    merged['percentage_products_on_promo'] = merged['percentage_products_on_promo'].round(2)
    return merged.drop(columns=['unique_items_on_promo', 'total_items_available'])

def monthly_aggregated_promo_features(promo_la: pd.DataFrame) -> pd.DataFrame:
    promo_la = parse_promo_dates(promo_la)
    promo_monthly = expand_monthly_records(promo_la)
    promo_days_expanded = expand_promo_days(promo_la)
    promo_day_counts = compute_promo_days_per_month(promo_days_expanded)
    monthly_agg = aggregate_monthly_features(promo_monthly)

    # Merge promo days
    monthly_agg = pd.merge(monthly_agg, promo_day_counts, on=['locationId', 'date'], how='left')
    monthly_agg['promo_days_in_month'] = monthly_agg['promo_days_in_month'].fillna(0).astype(int)

    # Calculate % of products on promo
    monthly_agg = calculate_percentage_products_on_promo(monthly_agg, promo_la)

    # Round selected numeric columns
    for col in ['avg_discount', 'max_discount', 'min_discount', 'avg_promo_duration']:
        try:
             monthly_agg[col] = monthly_agg[col].fillna(0).astype(float).round(2)
        except:
             print(monthly_agg[col].apply(lambda x: type(x)).unique())

    return monthly_agg

monthly_promo_df = monthly_aggregated_promo_features(promo)
monthly_promo_df.head(2)

CPU times: user 3min 33s, sys: 22.7 s, total: 3min 55s
Wall time: 3min 47s


Unnamed: 0,locationId,date,distinct_discount_levels,avg_discount,max_discount,min_discount,avg_promo_duration,promo_days_in_month,percentage_products_on_promo
0,52003,2023-07,5,29.54,50.0,25.0,45.64,31,31.81
1,52003,2023-08,5,29.59,50.0,25.0,29.98,14,31.81


In [13]:
monthly_promo_df.tail()

Unnamed: 0,locationId,date,distinct_discount_levels,avg_discount,max_discount,min_discount,avg_promo_duration,promo_days_in_month,percentage_products_on_promo
143,52085,2025-02,4,37.02,40.0,25.0,25.76,3,65.51
144,52085,2025-06,4,33.18,40.0,25.0,7.58,14,61.0
145,52085,2025-07,4,34.97,40.0,25.0,47.0,31,68.7
146,52085,2025-08,4,34.97,40.0,25.0,47.0,4,68.7
147,52086,2025-06,1,40.0,40.0,40.0,5.0,2,8.31


In [14]:
filtered_df = monthly_promo_df[monthly_promo_df['date'] <= pd.Period(year_month)]

In [15]:
filtered_df.date.min()

Period('2023-07', 'M')

In [16]:
def extend_promo_until_oct_2026(monthly_promo_df: pd.DataFrame,target_end_date) -> pd.DataFrame:
    """
    Extend monthly promo data until October 2026 using same-month values from the previous year.
    Dates are returned as datetime (1st of each month).
    """
    df = monthly_promo_df.copy()

    # Ensure 'date' column is datetime at start of month
    if pd.api.types.is_period_dtype(df['date']):
        df['date'] = df['date'].dt.to_timestamp(how='start')
    elif pd.api.types.is_datetime64_any_dtype(df['date']):
        df['date'] = df['date'].values.astype('datetime64[M]')

    target_end_date = pd.to_datetime(target_end_date)
    repeated_dfs = []

    for loc in df['locationId'].unique():
        loc_df = df[df['locationId'] == loc].copy()
        last_date = loc_df['date'].max()

        while last_date < target_end_date:
            future_start = last_date + pd.DateOffset(months=1)
            future_end = min(last_date + pd.DateOffset(months=15), target_end_date)

            sply_start = future_start - pd.DateOffset(years=1)
            sply_end = future_end - pd.DateOffset(years=1)

            sply_data = loc_df[(loc_df['date'] >= sply_start) & (loc_df['date'] <= sply_end)].copy()

            if sply_data.empty:
                break

            sply_data['date'] = sply_data['date'] + pd.DateOffset(months=12)
            repeated_dfs.append(sply_data)
            loc_df = pd.concat([loc_df, sply_data], ignore_index=True)
            last_date = loc_df['date'].max()
    print(last_date)
    df_extended = pd.concat([df] + repeated_dfs, ignore_index=True)
    df_extended = df_extended.drop_duplicates(subset=['locationId', 'date'])
    df_extended = df_extended.sort_values(['locationId', 'date']).reset_index(drop=True)

    return df_extended

In [17]:
monthly_df_extended = extend_promo_until_oct_2026(filtered_df,target_end_date)
monthly_df_extended.tail(2)

2026-06-01 00:00:00


Unnamed: 0,locationId,date,distinct_discount_levels,avg_discount,max_discount,min_discount,avg_promo_duration,promo_days_in_month,percentage_products_on_promo
217,52086,2025-06-01,1,40.0,40.0,40.0,5.0,2,8.31
218,52086,2026-06-01,1,40.0,40.0,40.0,5.0,2,8.31


In [18]:
monthly_df_extended["date"].max()

Timestamp('2026-10-01 00:00:00')

In [19]:
# Ensure 'date' is datetime
monthly_df_extended['date'] = pd.to_datetime(monthly_df_extended['date'])

# Extract year-month as Period (e.g., 2024-05)
monthly_df_extended['year_month'] = monthly_df_extended['date'].dt.to_period('M')

# Group and aggregate unique months per location
available_months = (
    monthly_df_extended.groupby('locationId')['year_month']
    .unique()
    .reset_index()
    .rename(columns={'year_month': 'available_months'})
)

# Optionally sort months
available_months['available_months'] = available_months['available_months'].apply(lambda x: sorted(x))

# Display
available_months.head()
#available_months.to_csv("lacoste_available_months.csv")

Unnamed: 0,locationId,available_months
0,52003,"[2023-07, 2023-08, 2023-12, 2024-01, 2024-02, ..."
1,52009,"[2023-07, 2023-08, 2023-12, 2024-01, 2024-02, ..."
2,52010,"[2023-07, 2023-08, 2023-12, 2024-01, 2024-02, ..."
3,52012,"[2023-07, 2023-08, 2023-12, 2024-01, 2024-02, ..."
4,52020,"[2023-07, 2023-08, 2023-12, 2024-01, 2024-02, ..."


In [20]:
monthly_df_extended.drop("year_month",axis=1,inplace=True)

In [21]:
monthly_df_extended.columns

Index(['locationId', 'date', 'distinct_discount_levels', 'avg_discount',
       'max_discount', 'min_discount', 'avg_promo_duration',
       'promo_days_in_month', 'percentage_products_on_promo'],
      dtype='object')

In [22]:
monthly_df_extended.date.max()

Timestamp('2026-10-01 00:00:00')

In [23]:
# Construct full dynamic path
full_path = f"{gcs_path}/{target}/{brand}/{experiment_name}/{cutoff_month}/{file_output}"

In [24]:
print(full_path)

gs://trd-sf-ntb/units/LACOSTE/model_pipeline/2025-07/monthly_lever_features.parquet


In [25]:
# Save the DataFrame
monthly_df_extended.to_parquet(full_path, index=False)

In [26]:
hello

NameError: name 'hello' is not defined

In [None]:
def eda_features(df_input: pd.DataFrame, feature_name: str):
    '''
    feature_name can only be following : temporal, promotion, marketing, store
    '''
    print(f"------EDA on {feature_name} features------------------")
    try:
        print(f"min & max dates {df_input.date.min().date()},{df_input.date.max().date()}",end="\n\n")
    except:
        pass
    print(f"shape of dataset : {df_input.shape}",end="\n\n")
    print(f"{feature_name} features: {df_input.columns.values}",end="\n\n")
    print(f"missing values :\n{df_input.isnull().sum()}")
    

In [None]:
eda_features(monthly_df_extended,"Promotion_features")