In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy.stats import randint as sp_randint

from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from typing import Optional, List, Tuple,  Callable, Dict, Union
import re
import os
import joblib
import holidays
import pickle
import fsspec

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [20]:
# 13 stores transactional data - 
# target - count of transactions monthly
# obtain monthly units_sold, transactions, sales_amount for active stores ( filtered for transaction observed in May 2025)

In [21]:
brand = 'LACOSTE'
country = 'UAE'
data_start_date = '2023-01-01'
data_end_date = '2025-07-31'
target="units"
cutoff_month='2025-07'
gcs_path = "gs://trd-sf-ntb"
experiment_name = "model_pipeline"  # or something like f"exp_{datetime.now():%Y%m%d_%H%M%S}"
file_name = "pre_process_monthly_units.parquet"

In [22]:
from google.cloud import bigquery
client = bigquery.Client()
 
#Transaction data
query_df = f"""
    SELECT *
    FROM `chb-svc-tredence-d001.shared_analytics_prod.factretailsales_v2` sales
    JOIN `chb-svc-tredence-d001.shared_analytics_prod.dim_retail_product` products
    ON sales.bk_productid = products.item
    WHERE sales.brand = '{brand}'
      AND sales.bu_country = '{country}'
      AND sales.bk_businessdate >= '{data_start_date}' 
      AND sales.bk_businessdate <= '{data_end_date}'
"""
query_job_trans = client.query(query_df)
df_full = query_job_trans.to_dataframe()

In [23]:
df_full.rename(columns={'bk_storeid': 'key', 'bk_businessdate': 'date'}, inplace=True)

In [24]:
df_full.columns

Index(['primary_key', 'bk_productid', 'key', 'date', 'crm_customer_id',
       'pos_customer_id', 'fullprice_tran_id', 'tran_seq_no', 'sales_channel',
       'sales_channel_group', 'mea_quantity', 'amountusd_beforetax',
       'discountamt_usd', 'av_cost_USD', 'return_quantity', 'tran_type',
       'brand', 'business_type', 'vertical', 'bu_country', 'netsalesUSD_onFP',
       'NetSalesUSD_onMD', 'atr_muse_id', 'item', 'barcode', 'item_style',
       'last_update_datetime', 'item_desc', 'short_desc', 'vpn', 'diff_1',
       'diff_2', 'standard_uom', 'brand_1', 'division_no', 'division',
       'group_no', 'group_name', 'dept_no', 'dept_name', 'class', 'class_name',
       'subclass', 'subclass_name', 'taxo_class', 'taxo_class_uda_value',
       'taxo_subclass', 'taxo_subclass_uda_value', 'gender', 'original_gender',
       'line', 'sub_line', 'atrb_boy_girl', 'attrb_theme', 'attrb_made_of',
       'grey_mkt_ind', 'attrb_color', 'sup_class', 'sup_subclass',
       'usage_specificity', 'p

In [25]:
# Convert 'date' column to datetime
df_full['date'] = pd.to_datetime(df_full['date'])

# Convert 'date' to the first day of the month
df_full['month'] = df_full['date'].values.astype('datetime64[M]')

# Ensure numeric columns are the right type
df_full['amountusd_beforetax'] = df_full['amountusd_beforetax'].astype(int)
df_full['mea_quantity'] = df_full['mea_quantity'].astype(int)

# Perform all aggregations in one groupby operation
df = df_full.groupby(['key','business_type', 'month']).agg(
    num_transactions=('tran_seq_no', 'nunique'),
    sales_amount=('amountusd_beforetax', 'sum'),
    units_quantity=('mea_quantity', 'sum')
).reset_index()

# Calculate price per unit
df['ppu'] = df['sales_amount'] / df['units_quantity']

# Ensure 'key' is an integer
df['key'] = df['key'].astype(int)
df['key'] = df['key'].astype(str)

In [26]:
# Rename 'month' column to 'date' to match expected function parameter
df = df.rename(columns={'month': 'date'})

# Convert 'date' to string before passing to the function
df['date'] = df['date'].dt.strftime('%Y-%m-%d')  # Or any format if you want to simulate ambiguity

In [27]:
def detect_date_format(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """
    Detect and apply the correct date format to a specified column in a pandas DataFrame.

    This function attempts to parse the dates in the specified column using a list of common date formats.
    If a format is found that can successfully parse all dates without resulting in null values, the column
    is converted to this date format.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing the date column.
    date_col (str): The name of the column containing date strings to be parsed.

    Returns:
    pd.DataFrame: DataFrame with the specified date column converted to datetime. If no format is successful,
                  the original DataFrame is returned unchanged.
    """
    formats = [
        '%m-%d-%Y', '%-m/%-d/%Y', '%-d/%-m/%Y', '%Y-%m-%d', '%m/%d/%Y', '%Y/%m/%d',
        '%d-%m-%Y', '%Y/%m/%d %H:%M:%S', '%m-%d-%Y %H:%M:%S', '%m-%d-%Y %I:%M %p',
        '%d-%b-%Y', '%Y%m%d'
    ]

    for fmt in formats:
        try:
            parsed_dates = pd.to_datetime(df[date_col], format=fmt, errors='raise')
            df_copy = df.copy()
            df_copy[date_col] = parsed_dates
            return df_copy
        except Exception:
            continue

    # If none of the formats match, return original DataFrame
    return df

df = detect_date_format(df, 'date')
df.head(2)

Unnamed: 0,key,business_type,date,num_transactions,sales_amount,units_quantity,ppu
0,52003,RETAIL,2023-01-01,2446,446061,4069,109.624232
1,52003,RETAIL,2023-02-01,1287,271612,2194,123.79763


In [28]:
df

Unnamed: 0,key,business_type,date,num_transactions,sales_amount,units_quantity,ppu
0,52003,RETAIL,2023-01-01,2446,446061,4069,109.624232
1,52003,RETAIL,2023-02-01,1287,271612,2194,123.797630
2,52003,RETAIL,2023-03-01,1584,375908,3030,124.062046
3,52003,RETAIL,2023-04-01,1500,347405,3051,113.865946
4,52003,RETAIL,2023-05-01,1759,364235,3116,116.891849
...,...,...,...,...,...,...,...
380,52086,ECOMMERCE,2025-03-01,376,44077,339,130.020649
381,52086,ECOMMERCE,2025-04-01,468,40168,376,106.829787
382,52086,ECOMMERCE,2025-05-01,594,54293,522,104.009579
383,52086,ECOMMERCE,2025-06-01,654,54380,548,99.233577


In [29]:
def remove_inactive_stores(df, store_col='store_id', date_col='date', cutoff_month="2025-05"):
    """
    Removes inactive stores from the DataFrame based on store activity in the target month.

    Parameters
    ----------
    df : pd.DataFrame
        Input transaction data with at least [store_col, date_col].
    store_col : str, default 'store_id'
        Name of the column containing store IDs.
    date_col : str, default 'date'
        Name of the column containing transaction dates.
    cutoff_month : str, default 2025-05
        Cutoff month in 'YYYY-MM' format to define active stores.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame containing only rows from active stores.
    """
    df = df.copy()
    
    # Ensure date column is datetime
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Extract month
    df['month'] = df[date_col].dt.to_period('M').astype(str)
    
    # Get active stores in target month
    active_stores = set(df[df['month'] == cutoff_month][store_col].unique())
    
    # Filter to only active stores
    filtered_df = df[df[store_col].isin(active_stores)].drop(columns='month').reset_index(drop=True)
    
    return filtered_df

In [30]:
filtered_df = remove_inactive_stores(
    df=df,
    store_col='key',
    date_col='date',
    cutoff_month=cutoff_month
)


In [31]:
filtered_df['key'].nunique()

12

In [32]:
filtered_df['key'].unique()

array(['52003', '52009', '52010', '52012', '52020', '52043', '52052',
       '52071', '52074', '52082', '52085', '52086'], dtype=object)

In [33]:
pd.Series(filtered_df['key'].unique()).to_pickle("active_stores.pkl")

In [34]:
# Construct full dynamic path
full_path = f"{gcs_path}/{target}/{brand}/{experiment_name}/{cutoff_month}/{file_name}"
full_path

'gs://trd-sf-ntb/units/LACOSTE/model_pipeline/2025-07/pre_process_monthly_units.parquet'

In [35]:
# Save the DataFrame
filtered_df.to_parquet(full_path, index=False)

In [36]:
hello

NameError: name 'hello' is not defined

In [None]:
def eda_features(df_input: pd.DataFrame, feature_name: str):
    '''
    feature_name can only be following : temporal, promotion, marketing, store
    '''
    print(f"------EDA on {feature_name} features------------------")
    try:
        print(f"min & max dates {df_input.date.min().date()},{df_input.date.max().date()}",end="\n\n")
    except:
        pass
    print(f"shape of dataset : {df_input.shape}",end="\n\n")
    print(f"{feature_name} features: {df_input.columns.values}",end="\n\n")
    print(f"missing values :\n{df_input.isnull().sum()}")
    

In [None]:
eda_features(filtered_df, "pre_process")