In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy.stats import randint as sp_randint
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from Lacoste_temporal_utils import add_monthly_UAE_holiday_count,get_lag_features, create_peak_calendar,merge_peak_calendar_info,create_monthly_seasonal_features,get_rate_of_sale_monthly,get_monthly_seasonality_index,get_moving_stats_features
import category_encoders as ce
import seaborn as sns

In [2]:
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
from typing import Optional, List, Tuple,  Callable, Dict, Union
import re
import os
import joblib
import holidays
import pickle
import fsspec

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from google.cloud import bigquery
from pandas_gbq import to_gbq

# !pip install category_encoders

In [3]:
# ---------------------------------------------------------------------------
# Run configuration: every tunable for this notebook lives in this cell.
# ---------------------------------------------------------------------------
brand = 'LACOSTE'
country = 'UAE'
data_start_date = '2023-01-01'
train_end_date = "2024-12-31"   # rows dated after this are treated as forecast rows
inference_start_date = "2025-05-01"
cut_off_date = '2025-06-01'
data_end_date = '2026-12-31'

target = 'units'
fc_horizon = 22                 # forecast horizon, in months

# Storage locations (each is assigned exactly once so the values cannot drift apart).
gcs_path = "gs://trd-sf-ntb"
experiment_name = "pipeline_test"  # or something like f"exp_{datetime.now():%Y%m%d_%H%M%S}"

file_input = "feature_store.parquet"
file_output = "model_inference.parquet"

# Forecast window derived from train_end_date:
#   fc_start_date    = first day of the train-end month + 1 month
#   horizon_end_date = first day of the train-end month + fc_horizon months
dt = datetime.strptime(train_end_date, "%Y-%m-%d")
date = dt + relativedelta(day=1) + relativedelta(months=fc_horizon)
horizon_end_date = date.strftime("%Y-%m-%d")
fc_start_date = dt + relativedelta(day=1) + relativedelta(months=1)
fc_start_date = fc_start_date.strftime("%Y-%m-%d")

# Local folders for the pickled models and the per-key forecast CSVs.
local_model_dir = "saved_models/tuned/dec"
os.makedirs(local_model_dir, exist_ok=True)

local_res_dir = "saved_results/tuned/dec"
os.makedirs(local_res_dir, exist_ok=True)






In [4]:
'gs://trd-sf-ntb/units/LACOSTE/22/pipeline_test/feature_store.parquet'

'gs://trd-sf-ntb/units/LACOSTE/22/pipeline_test/feature_store.parquet'

In [5]:
'gs://trd-sf-ntb/units/FACES/pipeline_test/18/feature_store.parquet'

'gs://trd-sf-ntb/units/FACES/pipeline_test/18/feature_store.parquet'

In [6]:
full_path = f"{gcs_path}/{target}/{brand}/{fc_horizon}/{experiment_name}/{file_input}"

df=pd.read_parquet(full_path)

In [7]:
full_path

'gs://trd-sf-ntb/units/LACOSTE/22/pipeline_test/feature_store.parquet'

In [8]:
base_vars = {"key": "key",
             "target": "target",
             "date": "date"}

### Recursive features

In [9]:
def feature_combine_pd(dataframe: pd.DataFrame,
                       dfu_columns: list[str],
                       feature_list: Union[list[str], dict[str, object]],
                       key: str,
                       ds: str,
                       target: str) -> pd.DataFrame:
    """Recompute the history-dependent features on a copy of ``dataframe``.

    The input frame is never modified. The helpers imported from
    ``Lacoste_temporal_utils`` are applied in this order: 12-period lag,
    6-month moving statistics, monthly seasonality index, then the
    rate-of-sale table, which is left-merged on ``[key, 'year']``.

    Args:
        dataframe: History rows plus the row(s) to be scored.
        dfu_columns: Unused; kept so existing callers keep working.
        feature_list: Unused; kept so existing callers keep working.
        key: Name of the series-identifier column.
        ds: Name of the date column.
        target: Name of the target column.

    Returns:
        A new DataFrame with the recomputed feature columns added.
    """
    temp_df = dataframe.copy()

    temp_df = get_lag_features(temp_df, [12], key=key, date_col=ds, target=target)
    # NOTE(review): no key/date/target names are passed here, so the helper
    # relies on its own defaults -- confirm they match `key`, `ds`, `target`.
    temp_df = get_moving_stats_features(temp_df, months_back=[6])
    # Use the caller-supplied column names instead of hard-coded 'date'/'target'
    # so this call agrees with the other helper calls in this function.
    temp_df = get_monthly_seasonality_index(temp_df, ds, target)

    # The rate-of-sale merge needs a 'year' column; derive it when absent.
    if 'year' not in temp_df.columns:
        temp_df['year'] = pd.to_datetime(temp_df[ds]).dt.year

    rate_of_sale_df = get_rate_of_sale_monthly(temp_df, [key])
    return temp_df.merge(rate_of_sale_df, on=[key, 'year'], how='left')


In [10]:
def forecast_loop_store(key,df_in,model_pred, expected_features,
                  feature_list,model_name):
    """Forecast one key's flagged rows one at a time, feeding each prediction back.

    Rows with ``Pred_Flag == 1`` are scored in ascending ``date`` order. For
    each of them the history-dependent features are recomputed with
    ``feature_combine_pd`` on (history + that row), the model predicts the row,
    and the predicted row is appended to the history so that later rows see it.

    Side effects: writes the accumulated history (including the predicted rows)
    to ``<local_res_dir>/<key>_<model_name>_forecast_.csv`` and prints that path.
    ``local_res_dir`` is read from module scope.

    Args:
        key: Identifier stored in each appended record and used in the file name.
        df_in: Rows for a single key; must contain ``Pred_Flag``, ``date``,
            ``target`` and ``key`` columns.
        model_pred: Fitted estimator exposing ``predict``.
        expected_features: List of the model's input column names, in order.
        feature_list: Columns dropped before the features are recomputed.
        model_name: Label used in the output file name.

    Returns:
        DataFrame with columns ``key``, ``date``, ``forecast`` and ``run_status``
        for the rows that were predicted.
    """
    # Split into rows to predict (flag 1) and known history (flag 0), both date-sorted.
    future_rows_to_predict=df_in[df_in["Pred_Flag"]==1].copy()
    future_rows_to_predict.sort_values("date",inplace=True)
    main_df=df_in[df_in["Pred_Flag"]==0].copy()
    main_df.sort_values("date",inplace=True)
    
    for i in range(len(future_rows_to_predict)):
        # Reset every iteration, so it only ever holds the current row's record.
        data_sv=[]
        row_to_forecast = future_rows_to_predict.iloc[[i],:].copy()
        date=row_to_forecast["date"].values[0]
        # print(date)
        pred_flag=row_to_forecast["Pred_Flag"].values[0]
        temp_df = pd.concat([main_df, row_to_forecast], ignore_index=True)
        # print(temp_df.columns)
        # Drop the stale feature columns so they are rebuilt from the updated history.
        temp_df.drop(columns=feature_list, errors='ignore', inplace=True)

        # Generate features
        feature_df = feature_combine_pd(temp_df, ['key'], feature_list, 'key', 'date', 'target')
        # Align to the model's column order; columns missing from feature_df are filled with 0.
        model_input = feature_df.reindex(columns=expected_features, fill_value=0)

        # NOTE(review): assumes feature_combine_pd keeps the appended row in last
        # position (i.e. the helpers do not reorder rows) -- confirm.
        row_pred = model_input.iloc[[-1]]  # Last row is the forecast row
        # Layout of the record appended to the history: features, target, key, date, flag.
        column_name=expected_features.copy()
        column_name.extend(["target",'key', 'date','Pred_Flag'])
        # Predict
        pred = model_pred.predict(row_pred)
        # Flat record: feature values followed by the prediction (stored as 'target').
        record=list(np.concatenate([row_pred.values.flatten(),pred.flatten()]).reshape(len(row_pred.values[0])+1))
        record.append(key)
        record.append(date)
        record.append(pred_flag)
        data_sv.append(record)
       
        # Update prediction in the future dataframe
        future_rows_to_predict.iloc[i, future_rows_to_predict.columns.get_loc('target')] = pred[0]
        # Append predicted row to df for next iteration
        row_to_forecast['target'] = pred[0]
        df_res=pd.DataFrame(data_sv,columns=column_name)
        # The history is narrowed to `column_name` here, so it must already contain
        # every expected feature plus target/key/date/Pred_Flag (KeyError otherwise).
        main_df = pd.concat([main_df[column_name], df_res], ignore_index=True)
        
    # df_res=pd.DataFrame(data_sv,columns=column_name)
    # df_res2=pd.concat([main_df[column_name],df_res],ignore_index=True)

    # `local_res_dir` comes from module scope (set in the configuration cell).
    local_key_forecast_path = os.path.join(local_res_dir, f"{key}_{model_name}_forecast_.csv")
    
    print(f"{local_key_forecast_path}")
    main_df.to_csv(local_key_forecast_path)
    
     # Prepare forecast_df with predictions
    forecast_df = future_rows_to_predict[['key', 'date', 'target']].copy()
    forecast_df.rename(columns={'target': 'forecast'}, inplace=True)
    forecast_df['run_status'] = f"{df_in['key'].iloc[0]} success"
    return forecast_df


In [11]:
df

Unnamed: 0,key,date,target,holiday_count,fourier_year_sin,fourier_year_cos,Lag12_y,MA6_y,STD6_y,Seasonality_Index,...,promo_days_in_month,percentage_products_on_promo,channel_encode,store_format_encode,key_encode,quarter_sin,quarter_cos,channel_Farfetch,channel_Retail,pre_post_rm_flg
0,52003,2023-07-01,2992.0,1.0,-5.000000e-01,-8.660254e-01,1622.203438,3058.666667,600.061219,1.10,...,31.0,31.81,1565.700010,1524.243559,2062.643202,1.224647e-16,-1.000000e+00,0,1,0
1,52003,2023-10-01,1907.0,0.0,-8.660254e-01,5.000000e-01,1622.203438,2879.166667,343.719896,0.80,...,0.0,0.00,1565.700010,1524.243559,2062.643202,-1.000000e+00,-1.836970e-16,0,1,0
2,52003,2023-11-01,2660.0,0.0,-5.000000e-01,8.660254e-01,1622.203438,2831.333333,458.834030,1.13,...,0.0,0.00,1565.700010,1524.243559,2062.643202,-1.000000e+00,-1.836970e-16,0,1,0
3,52003,2023-12-01,3669.0,3.0,-2.449294e-16,1.000000e+00,1622.203438,2769.666667,451.600118,1.39,...,22.0,35.02,1565.700010,1524.243559,2062.643202,-1.000000e+00,-1.836970e-16,0,1,0
4,52003,2024-01-01,3139.0,1.0,5.000000e-01,8.660254e-01,1622.203438,2872.666667,580.639532,1.22,...,31.0,34.21,1565.700010,1524.243559,2062.643202,0.000000e+00,1.000000e+00,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,52086,2026-06-01,,1.0,1.224647e-16,-1.000000e+00,,,,,...,2.0,8.31,933.254773,1720.452579,1312.890189,1.000000e+00,6.123234e-17,1,0,0
493,52086,2026-07-01,,0.0,-5.000000e-01,-8.660254e-01,,,,,...,0.0,0.00,933.254773,1720.452579,1312.890189,1.224647e-16,-1.000000e+00,1,0,0
494,52086,2026-08-01,,1.0,-8.660254e-01,-5.000000e-01,,,,,...,0.0,0.00,933.254773,1720.452579,1312.890189,1.224647e-16,-1.000000e+00,1,0,0
495,52086,2026-09-01,,0.0,-1.000000e+00,-1.836970e-16,,,,,...,0.0,0.00,933.254773,1720.452579,1312.890189,1.224647e-16,-1.000000e+00,1,0,0


In [13]:
# Load the persisted bundle for the chosen model: element 0 is the fitted
# estimator, element 1 the list of feature names it expects.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by this project's own training run.
model_name = "xgboost"
local_model_path = os.path.join(local_model_dir, f"{model_name}_model.pkl")
with open(local_model_path, 'rb') as f:
    model1 = pickle.load(f)
print(model1)
model, expected_features = model1[0], model1[1]

# History-dependent columns that are recomputed at every recursive forecast step.
feature_list = ['Lag12_y', 'MA6_y', 'STD6_y', 'Seasonality_Index']

(XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=0.8, colsample_bynode=None, colsample_bytree=0.8,
             device=None, early_stopping_rounds=None, enable_categorical=True,
             eval_metric=None, feature_types=None, feature_weights=None,
             gamma=0, grow_policy=None, importance_type='weight',
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=4, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=300, n_jobs=None,
             num_parallel_tree=None, ...), ['holiday_count', 'fourier_year_sin', 'fourier_year_cos', 'Lag12_y', 'MA6_y', 'STD6_y', 'Seasonality_Index', 'UAE_shopping_peak_ratio', 'festive_peak_flag', 'total_cost_usd_facebook', 'total_cost_usd_google', 'total_cost_usd_instagram', 'total_cost_usd_t

In [14]:
pd.Series(np.array(expected_features)).to_csv("lacoste_features.csv")

### Inference - Champion challenger model

In [15]:
def model_inference(df_input:pd.DataFrame,model,expected_features:list,feature_list:list,model_name:str)->pd.DataFrame:
    final = pd.DataFrame()
    dff=pd.DataFrame({})
    master = pd.DataFrame(columns=[base_vars['date'], base_vars['key'], 'forecast'])
    df_input["Pred_Flag"]=0
    df_input.loc[(pd.to_datetime(df_input.date)>train_end_date) & (pd.to_datetime(df_input.date)<=horizon_end_date),"Pred_Flag"]=1
    main_df=df_input[df_input["Pred_Flag"]==0]
    forecast_df=df_input[df_input["Pred_Flag"]==1]
    all_keys=forecast_df["key"].unique().tolist()

    for k in all_keys:
        df_key=df_input[df_input["key"]==k].copy()
        # print(df_key.head())
        df_forecast=forecast_loop_store(k,df_key,model ,expected_features,feature_list,model_name)
        dff=pd.concat([dff,df_forecast],axis=0,ignore_index=True)


    # merged_df=pd.merge(df_test[[base_vars['date'], base_vars['key'],base_vars['target']]],dff,how="left",on=[base_vars['date'], base_vars['key']])
    # apr_may_jun_df=merged_df[merged_df["date"]<"2025-07-01"].reset_index(drop=True)

    ovrll_forecast_path = os.path.join(local_res_dir, f"forecast_jun2026_{model_name}.csv")
    dff.to_csv(ovrll_forecast_path,index=False)
    dff['run_status_' + model_name] = 'success'

    dff=dff[master.columns]
    # Concatenate the current model results to the master DataFrame
    master = pd.concat([master, dff], axis=0)

    # Drop rows with NaN in the key column to clean up the results
    master.dropna(subset=[base_vars['key']], inplace=True)

    # Rename the forecast column to include the model name
    master.rename(columns={'forecast': f'forecast_{model_name}'}, inplace=True)

    # Ensure the date column is in datetime format
    master[base_vars['date']] = pd.to_datetime(master[base_vars['date']])
    # master.to_csv('inter_out.csv',index=False)


    # Merge the predicted values with the actual data
    final = pd.merge(master, forecast_df[[base_vars['date'], base_vars['key'],base_vars['target']]], on=[base_vars['key'], base_vars['date']], how='left')

    # Replace any NaN values in the final DataFrame with infinity

    return final

In [16]:
completed_fc=model_inference(df,model,expected_features,feature_list,"xgboost")

saved_results/tuned/dec/52003_xgboost_forecast_.csv
saved_results/tuned/dec/52004_xgboost_forecast_.csv
saved_results/tuned/dec/52009_xgboost_forecast_.csv
saved_results/tuned/dec/52010_xgboost_forecast_.csv
saved_results/tuned/dec/52012_xgboost_forecast_.csv
saved_results/tuned/dec/52020_xgboost_forecast_.csv
saved_results/tuned/dec/52043_xgboost_forecast_.csv
saved_results/tuned/dec/52052_xgboost_forecast_.csv
saved_results/tuned/dec/52071_xgboost_forecast_.csv
saved_results/tuned/dec/52074_xgboost_forecast_.csv
saved_results/tuned/dec/52082_xgboost_forecast_.csv
saved_results/tuned/dec/52085_xgboost_forecast_.csv
saved_results/tuned/dec/52086_xgboost_forecast_.csv


In [9]:
completed_fc=pd.read_parquet('gs://trd-sf-ntb/units/LACOSTE/22/pipeline_test/model_inference.parquet')

array([<class 'str'>], dtype=object)

In [17]:
# Slice the forecasts using the configured dates rather than repeating the
# literals (the configured values equal the dates previously hard-coded here).
inference_res = completed_fc[completed_fc["date"] >= inference_start_date]
# Evaluation window: first forecast month through the cut-off date (inclusive).
wmpe_res = completed_fc[(completed_fc["date"] >= fc_start_date) & (completed_fc["date"] <= cut_off_date)]

In [18]:
# Calculate WMAPE
def weighted_mean_absolute_percentage_error(y_true, y_pred):
    """Return WMAPE in percent: 100 * sum(|actual - forecast|) / sum(actual).

    No guard is applied for a zero total of actuals; the result then follows
    numpy's division rules.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    total_abs_error = np.abs(actual - predicted).sum()
    total_actual = actual.sum()
    return total_abs_error / total_actual * 100

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the unweighted MAPE in percent: 100 * mean(|actual - forecast| / actual).

    This definition replaces the function of the same name imported from
    ``sklearn.metrics`` at the top of the notebook. Rows whose actual is zero
    are not filtered out, so they divide by zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs(actual - predicted) / actual
    return relative_errors.mean() * 100


def results(df_res,actuals:str,forecast:str,model_name:str,key:str,date:str):
    """Print error metrics for one model and plot actuals against forecasts.

    Prints MAE, RMSE, R², the standard deviation of the actuals, the
    RMSE/std-dev ratio and WMAPE, then draws a line chart with one x tick per
    row labelled ``<key>_<date without its trailing -DD>``. Returns nothing.
    """
    frame = df_res.copy()
    observed = frame[actuals]
    predicted = frame[forecast]

    print(f"---------{model_name}------------")
    mae = mean_absolute_error(observed, predicted)
    rmse = np.sqrt(mean_squared_error(observed, predicted))
    r2 = r2_score(observed, predicted)
    std_dev = observed.std()
    rmse_std_ratio = rmse / std_dev
    wmape = weighted_mean_absolute_percentage_error(observed, predicted)

    # Metric report, one line per metric.
    report_lines = (
        "\nModel Evaluation Metrics:",
        f"MAE: {mae:.2f}",
        f"RMSE: {rmse:.2f}",
        f"R² Score: {r2:.4f}",
        f"Standard Deviation of Test Data: {std_dev:.2f}",
        f"RMSE/Std Dev Ratio: {rmse_std_ratio:.2f}",
        f"WMAPE: {wmape:.2f}%",
    )
    for line in report_lines:
        print(line)

    # Tick labels: key plus the date with its trailing "-DD" removed.
    month_part = frame[date].astype(str).str.replace(r'-\d{2}$', '', regex=True)
    tick_labels = frame[key].astype(str) + "_" + month_part
    positions = range(len(observed))

    # Actual vs predicted, plotted by row position.
    plt.figure(figsize=(12, 6))
    plt.plot(positions, observed, label='Actual', linewidth=2)
    plt.plot(positions, predicted, label='Predicted', linestyle='--', linewidth=2)
    plt.title(f'Monthly Store Units:({model_name})')
    plt.xticks(positions, tick_labels, rotation=90)
    plt.xlabel('Store_Month')
    plt.ylabel('Unit count')
    plt.legend()
    plt.tight_layout()
    plt.show()



In [19]:
def output_results(df_input:pd.DataFrame,forecast:str,target:str) ->pd.DataFrame:
    """Build the reporting table with a per-row absolute percentage error.

    The caller's frame is left untouched (the previous version wrote a
    ``row_mape`` column into it, which also hit pandas' chained-assignment
    path when the input was a slice of another frame).

    Args:
        df_input: Frame with ``date``, ``key``, ``forecast`` and ``target`` columns.
        forecast: Name of the forecast column.
        target: Name of the actuals column.

    Returns:
        New DataFrame with columns ``date, store, Brand, actual_units,
        forecasted_units, MAPE, Frequency``. ``MAPE`` is in percent; rows whose
        actual is zero divide by zero and are not filtered here. ``Brand`` is
        the fixed string "LACOSTE" and ``Frequency`` the fixed string "Monthly".
    """
    row_mape = np.abs(df_input[target] - df_input[forecast]) / df_input[target] * 100

    # assign() and rename() both return new frames, so df_input is never written to.
    report = df_input.assign(row_mape=row_mape).rename(
        columns={'key': 'store', forecast: 'forecasted_units',
                 target: 'actual_units', 'row_mape': 'MAPE'}
    )
    report["Brand"] = "LACOSTE"
    report["Frequency"] = "Monthly"

    column_order = ['date','store','Brand','actual_units','forecasted_units','MAPE','Frequency']
    return report[column_order]

In [20]:
forecast_df=output_results(wmpe_res,"forecast_xgboost","target")

In [21]:
forecast_df.sort_values("MAPE",ascending=False)

Unnamed: 0,date,store,Brand,actual_units,forecasted_units,MAPE,Frequency
27,2025-06-01,52004,LACOSTE,26.0,385.800995,1383.849980,Monthly
265,2025-02-01,52086,LACOSTE,111.0,241.396744,117.474544,Monthly
26,2025-05-01,52004,LACOSTE,195.0,360.769012,85.009750,Monthly
243,2025-02-01,52085,LACOSTE,425.0,784.882019,84.678122,Monthly
224,2025-05-01,52082,LACOSTE,264.0,461.578796,74.840453,Monthly
...,...,...,...,...,...,...,...
89,2025-02-01,52012,LACOSTE,835.0,852.789246,2.130449,Monthly
3,2025-04-01,52003,LACOSTE,2642.0,2586.734863,2.091792,Monthly
68,2025-03-01,52010,LACOSTE,1088.0,1075.546753,1.144600,Monthly
22,2025-01-01,52004,LACOSTE,392.0,388.662170,0.851487,Monthly


In [53]:
# weighted_mean_absolute_percentage_error(table["actual_units"],table["forecasted_units"])
mean_absolute_percentage_error(table["actual_units"],table["forecasted_units"])

40.548563747530345

In [57]:
key_res=forecast_df.groupby("store").agg(total_xgboost_fc=("forecasted_units","sum"),total_target=("actual_units","sum"))

In [58]:
model_name="xgboost"
key_res[f'{model_name}_row_mape'] = (
            np.abs(key_res['total_target'] - key_res['total_xgboost_fc']) / key_res['total_target']
        ) * 100
key_res.reset_index(inplace=True)

In [64]:
key_res.sort_values("xgboost_row_mape",ascending=False)


Unnamed: 0,store,total_xgboost_fc,total_target,xgboost_row_mape
1,52004,2266.745636,1416.0,60.080906
11,52085,5516.643127,4148.0,32.995254
12,52086,1727.479752,2243.0,22.983515
3,52010,6714.082642,5493.0,22.229795
5,52020,3385.915039,4234.0,20.030349
10,52082,2771.373749,2528.0,9.627126
2,52009,10182.262695,9416.0,8.137879
9,52074,5928.053589,5499.0,7.802393
0,52003,16589.490967,15522.0,6.877277
7,52052,17296.719971,18317.0,5.570126


In [22]:
new_forecast_df=forecast_df[forecast_df.store!='52004']

In [26]:
weighted_mean_absolute_percentage_error(new_forecast_df["actual_units"],new_forecast_df["forecasted_units"])

16.25662321156181

In [23]:
# weighted_mean_absolute_percentage_error(table["actual_units"],table["forecasted_units"])
mean_absolute_percentage_error(new_forecast_df["actual_units"],new_forecast_df["forecasted_units"])

21.740407714733077

In [25]:
mean_absolute_percentage_error(new_forecast_df['target'],new_forecast_df["forecasted_units"])

KeyError: 'target'

In [15]:
def eda_features(df_input: pd.DataFrame, feature_name: str):
    """Print a quick structural summary of ``df_input``.

    Prints the date range (when a datetime ``date`` column is available), the
    shape, the column names and the per-column count of missing values.

    Args:
        df_input: Frame to summarise.
        feature_name: Free-text label used only in the printed headings
            (e.g. temporal, promotion, marketing, store, output).
    """
    print(f"------EDA on {feature_name} features------------------")
    try:
        print(f"min & max dates {df_input.date.min().date()},{df_input.date.max().date()}",end="\n\n")
    except (AttributeError, TypeError, ValueError) as exc:
        # Best effort: no `date` column, or it is not datetime-like. Say so
        # rather than swallowing every possible error without a trace.
        print(f"min & max dates unavailable ({type(exc).__name__})", end="\n\n")
    print(f"shape of dataset : {df_input.shape}",end="\n\n")
    print(f"{feature_name} features: {df_input.columns.values}",end="\n\n")
    print(f"missing values :\n{df_input.isnull().sum()}")
    

In [20]:
eda_features(completed_fc,"output")

------EDA on output features------------------
min & max dates 2025-01-01,2026-10-01

shape of dataset : (286, 4)

output features: ['date' 'key' 'forecast_xgboost' 'target']

missing values :
date                  0
key                   0
forecast_xgboost      0
target              208
dtype: int64


In [80]:
full_path = f"{gcs_path}/{target}/{brand}/{fc_horizon}/{experiment_name}/{file_output}"

In [81]:
full_path

'gs://trd-sf-ntb/units/LACOSTE/22/pipeline_test/model_inference.parquet'

In [8]:
completed_fc["key"]

NameError: name 'completed_fc' is not defined

In [82]:
completed_fc.to_parquet(full_path,index=False)

In [78]:
completed_fc[~completed_fc.target.isnull()]

Unnamed: 0,date,key,forecast_xgboost,target
0,2025-01-01,52003,3010.662354,3520.0
1,2025-02-01,52003,2426.026855,1966.0
2,2025-03-01,52003,2879.098877,2473.0
3,2025-04-01,52003,2586.734863,2642.0
4,2025-05-01,52003,2559.792969,2723.0
...,...,...,...,...
265,2025-02-01,52086,241.396744,111.0
266,2025-03-01,52086,381.235229,339.0
267,2025-04-01,52086,192.869781,376.0
268,2025-05-01,52086,322.530426,522.0


In [27]:
hello

NameError: name 'hello' is not defined

In [8]:
# full_path3
file_path

'gs://trd-sf-ntb/LACOSTE/pipeline_test/model_inference.parquet'

In [None]:
hello

In [61]:
# features
print(full_path2)
print(full_path3)
promo_df=pd.read_parquet(full_path2)


mmm_df = pd.read_parquet(full_path3)

# lacoste_mmm = pd.read_parquet("gs://trd-sf-ntb/lacoste/sankalp_pipeline_daily/daily_features_mmm_lacoste.parquet")

gs://trd-sf-ntb/LACOSTE_only/pipeline_test/monthly_lever_features.parquet
gs://trd-sf-ntb/LACOSTE_only/pipeline_test/monthly_features_mmm_faces.parquet


In [62]:
table['date'] = pd.to_datetime(table['date']).dt.date
table['store']=table['store'].apply(lambda x: int(x))
promo_df['date'] = pd.to_datetime(promo_df['date']).dt.date
mmm_df['date'] = pd.to_datetime(mmm_df['date']).dt.date

In [51]:

# promo_df.columns

In [63]:
table = table.rename(columns = {'Brand':'brand','Frequency':'frequency'})
promo_df = promo_df.rename(columns = {'locationId':'store','avg_discount':'promo_depth'})
mmm_df = mmm_df.rename(columns = {'total_cost_usd':'marketing_spend'})

In [64]:
table = table.merge(promo_df[['store','date','promo_depth']], on = ['store','date'], how = 'left').fillna(0)
table = table.merge(mmm_df[['date','marketing_spend']], on = 'date', how = 'left').sort_values(by = ['date','store']).reset_index(drop= True).fillna(0)

In [65]:
table.dtypes

date                 object
store                 int64
brand                object
actual_units        float64
forecasted_units    float64
MAPE                float64
frequency            object
promo_depth         float64
marketing_spend     float64
dtype: object

In [66]:
table['forecasted_units'] = table['forecasted_units'].round().astype('Int64')

table['MAPE'] = table['MAPE'].round().astype('Int64')

In [67]:
table

Unnamed: 0,date,store,brand,actual_units,forecasted_units,MAPE,frequency,promo_depth,marketing_spend
0,2025-01-01,52003,LACOSTE,3520.0,3157,10,Monthly,37.817834,40756.8988
1,2025-01-01,52004,LACOSTE,392.0,541,38,Monthly,37.702703,40756.8988
2,2025-01-01,52009,LACOSTE,1889.0,2005,6,Monthly,37.949088,40756.8988
3,2025-01-01,52010,LACOSTE,1252.0,1223,2,Monthly,37.899584,40756.8988
4,2025-01-01,52012,LACOSTE,1263.0,1197,5,Monthly,37.928501,40756.8988
...,...,...,...,...,...,...,...,...,...
229,2026-06-01,52071,LACOSTE,0.0,4403,0,Monthly,33.796132,42443.6987
230,2026-06-01,52074,LACOSTE,0.0,1121,0,Monthly,33.705483,42443.6987
231,2026-06-01,52082,LACOSTE,0.0,731,0,Monthly,40.000000,42443.6987
232,2026-06-01,52085,LACOSTE,0.0,850,0,Monthly,33.176208,42443.6987


In [68]:
# table.to_parquet(file_path,index=False)

In [69]:
def save_to_bigquery(table,table_id):
    """Upload the forecast table to BigQuery, replacing the destination table.

    ``dataset_id`` and ``project_id`` are read from module scope; they are not
    defined in the visible cells and must exist before this is called.

    Args:
        table: DataFrame with at least ``date, store, brand, forecasted_units,
            frequency, MAPE, promo_depth, marketing_spend``. It is not
            modified; the dtype alignment is done on a copy.
        table_id: Table name without the dataset prefix.
    """
    from pandas_gbq import to_gbq

    # Align dtypes on a copy so the caller's frame keeps its own dtypes.
    upload_df = table.copy()
    upload_df['date'] = pd.to_datetime(upload_df['date']).dt.date
    upload_df = upload_df.astype({
        'store': 'str',
        'brand': str,
        'forecasted_units': 'Int64',
        'frequency': str,
        'MAPE': 'Int64',
        'promo_depth': 'float',
        'marketing_spend': 'float',
    })

    destination = f"{dataset_id}.{table_id}"
    print(destination)

    to_gbq(
        upload_df,
        destination_table=destination,
        project_id=project_id,
        if_exists='replace',
        table_schema=[
            {"name": "date", "type": "DATE"},
            {"name": "store", "type": "STRING"},
            {"name": "brand", "type": "STRING"},
            # Now declared explicitly; previously this column's type was inferred.
            {"name": "actual_units", "type": "FLOAT"},
            # Kept for tables that carry this column; per the pandas-gbq docs,
            # schema entries absent from the DataFrame are ignored.
            {"name": "forecasted_trans", "type": "INTEGER"},
            {"name": "forecasted_units", "type": "INTEGER"},
            {"name": "frequency", "type": "STRING"},
            {"name": "MAPE", "type": "INTEGER"},
            {"name": "promo_depth", "type": "FLOAT"},
            {"name": "marketing_spend", "type": "FLOAT"}
        ]
    )

    # if_exists='replace' overwrites the table, so do not report an append.
    print(f"✅ Replaced {destination} with {len(upload_df)} rows.")

In [70]:
save_to_bigquery(table,table_id)

sales_forcasting.forecast_monthly_lacoste


100%|██████████| 1/1 [00:00<00:00, 11397.57it/s]

✅ Appended 234 rows to sales_forcasting.forecast_monthly_lacoste safely.





In [None]:
hello

In [1]:
df=pd.read_csv("52043_xgboost_forecast.csv")

NameError: name 'pd' is not defined

In [71]:
# champions = pd.read_csv('monthly_champion_challenger_df_fa.csv')

# champions = champions[['key','min_error','best_model']]
# champions["brand"] = "FACES"
# champions["date"] = '2025-05-31'

# summary = (
#     champions
#     .groupby(['brand', 'best_model','date'])
#     .agg(
#         stores=('key', 'nunique'),
#         avg_mape=('min_error', 'mean')
#     )
#     .reset_index()
# )

# summary

# summary['date'] = pd.to_datetime(summary['date']).dt.date
# summary['brand'] = summary['brand'].astype(str)
# summary['best_model'] = summary['best_model'].astype(str)
# summary['stores'] = summary['stores'].astype('Int64')
# summary['avg_mape'] = summary['avg_mape'].astype('float')

# to_gbq(
#     summary,
#     destination_table=f"{dataset_id}.model_summary",
#     project_id=project_id,
#     if_exists='replace',
#     table_schema=[
#         {"name": "date", "type": "DATE"},
#         {"name": "brand", "type": "STRING"},
#         {"name": "best_model", "type": "STRING"},
#         {"name": "stores", "type": "INTEGER"},
#         {"name": "avg_mape", "type": "FLOAT"}
#     ]
# )

