# Part IV: Modelling

## Basic settings

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import pickle
import sys
import warnings
from datetime import datetime, timedelta

import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import shap
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

warnings.filterwarnings("ignore")

# Set plotting style
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("deep")

# Set random seed for reproducibility
np.random.seed(2025)

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [3]:
src_path = os.path.abspath(os.path.join("../..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
from utils.plots import plot_forecast_single
from utils.utils import flatten_prophet_predictions, weighted_absolute_percentage_error

In [5]:
import os
import pandas as pd
import numpy as np

DATA_DIR = "../../data"

# 1. LOAD DATA ĐÃ PREPROCESS VÀ FEATURE ENGINEERING
df_sales = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/sales_data_preprocessed.csv"),
    parse_dates=["date"]
)
df_weather = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/weather_preprocessed.csv"),
    parse_dates=["date"]
)
df_weather_key_store_merged = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/weather_key_store_merged.csv"),
    parse_dates=["date"]
)

# Đây là file đã có is_kaggle_test và toàn bộ features
df_features = pd.read_feather(os.path.join(DATA_DIR,'data_processed/feature_engineered_data_89_features.feather'))

print("Full feature data:", df_features.shape)
print("Kaggle test rows:", df_features['is_kaggle_test'].sum())
print("Train rows:", (df_features['is_kaggle_test'] == 0).sum())




Full feature data: (686187, 89)
Kaggle test rows: 526917
Train rows: 159270


In [6]:
df_weather_key_store_merged.columns

Index(['date', 'store_nbr', 'item_nbr', 'units', 'logunits', 'is_kaggle_test',
       'station_nbr', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb',
       'heat', 'cool', 'sunrise', 'sunset', 'snowfall', 'preciptotal',
       'stnpressure', 'sealevel', 'resultspeed', 'resultdir', 'avgspeed',
       'BCFG', 'BLDU', 'BLSN', 'BR', 'DU', 'DZ', 'FG', 'FG+', 'FU', 'FZDZ',
       'FZFG', 'FZRA', 'GR', 'GS', 'HZ', 'MIFG', 'PL', 'PRFG', 'RA', 'SG',
       'SN', 'SQ', 'TS', 'TSRA', 'TSSN', 'UP', 'VCFG', 'VCTS'],
      dtype='object')

## Load data

In [7]:
df_features['is_valid'] = 0
mask_train = df_features['is_kaggle_test'] == 0
cutoff_date = pd.Timestamp("2014-08-01")
df_features.loc[mask_train & (df_features['date'] >= cutoff_date), 'is_valid'] = 1

# 2. Tách train/valid và kaggle test
df_train = df_features[(df_features['is_kaggle_test'] == 0) & (df_features['is_valid'] == 0)].copy()
df_valid = df_features[(df_features['is_kaggle_test'] == 0) & (df_features['is_valid'] == 1)].copy()
df_kaggle_test = df_features[df_features['is_kaggle_test'] == 1].copy()

print("Final splits:")
print("  Train:", df_train.shape)
print("  Valid:", df_valid.shape)
print("  Kaggle test:", df_kaggle_test.shape)

Final splits:
  Train: (153496, 90)
  Valid: (5774, 90)
  Kaggle test: (526917, 90)


In [8]:
df_kaggle_test.head()

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,is_kaggle_test,station_nbr,tmax,depart,cool,...,logunits_ewma_14d_a05,logunits_ewma_28d_a05,logunits_ewma_7d_a075,logunits_ewma_14d_a075,logunits_ewma_28d_a075,store_sum_7d,store_mean_7d,item_sum_7d,item_mean_7d,is_valid
159270,2013-04-01,2,1,,,True,14,71.0,1.0,0.0,...,,,,,,8.670772,1.238682,7.203406,1.029058,0
159271,2013-04-01,3,1,,,True,7,68.0,6.2,0.0,...,,,,,,8.82556,1.260794,6.510258,1.085043,0
159272,2013-04-01,6,1,,,True,14,71.0,1.0,0.0,...,,,,,,12.102488,1.728927,5.817111,1.163422,0
159273,2013-04-01,7,1,,,True,6,86.0,6.0,5.0,...,,,,,,8.648221,1.23546,5.123964,1.280991,0
159274,2013-04-01,8,1,,,True,4,87.0,8.0,9.0,...,,,,,,9.57248,1.367497,3.178054,1.059351,0


In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 153496 entries, 0 to 159249
Data columns (total 90 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   date                    153496 non-null  datetime64[ns]
 1   store_nbr               153496 non-null  int64         
 2   item_nbr                153496 non-null  int64         
 3   units                   153496 non-null  float64       
 4   logunits                153496 non-null  float64       
 5   is_kaggle_test          153496 non-null  bool          
 6   station_nbr             153496 non-null  int64         
 7   tmax                    153496 non-null  float64       
 8   depart                  153496 non-null  float64       
 9   cool                    153496 non-null  float64       
 10  sunrise                 153496 non-null  float64       
 11  sunset                  153496 non-null  float64       
 12  snowfall                153496 non-

## Build xboost model

In [10]:
feature_cols = [c for c in df_train.columns if c not in ['date', 'store_nbr', 'item_nbr', 'units', 'logunits']]

# Định nghĩa dictionary aggregation
agg_dict = {
    'logunits': 'sum',  # Target chính
    'units': 'sum',     # Target phụ (để tham chiếu)
}

# Với tất cả feature khác, lấy 'first' (giả định dữ liệu duplicate đã giống nhau về feature)
for c in feature_cols:
    agg_dict[c] = 'first' 

# Groupby & Aggregation
print("Grouping data for XGBoost...")
df_train_grouped = df_train.groupby(['store_nbr', 'item_nbr', 'date'], as_index=False).agg(agg_dict)
df_valid_grouped = df_valid.groupby(['store_nbr', 'item_nbr', 'date'], as_index=False).agg(agg_dict)

print(f"Train shape before: {df_train.shape} -> after: {df_train_grouped.shape}")
print(f"Valid shape before: {df_valid.shape} -> after: {df_valid_grouped.shape}")

drop_train_cols = ['date', 'units', 'logunits', 'store_nbr', 'item_nbr', 
                   'station_nbr', 'is_kaggle_test', 'is_valid']

# Tách lại X, y từ dữ liệu đã grouped
X_train = df_train_grouped.drop(columns=drop_train_cols) 
y_train = df_train_grouped['logunits']

X_valid = df_valid_grouped.drop(columns=drop_train_cols)
y_valid = df_valid_grouped['logunits']

Grouping data for XGBoost...
Train shape before: (153496, 90) -> after: (153496, 90)
Valid shape before: (5774, 90) -> after: (5774, 90)


### Build a lightgbm model

In [11]:
def create_xgboost_model(X_train, y_train, X_test, y_test):
    """
    Create an XGBoost model using engineered features
    """
    print("\nCreating base XGBoost model...")
    
    
    # Use a time series split for validation within the training set
    # This ensures we're always validating on future data
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Basic XGBoost parameters (equivalent to LightGBM params)
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "max_depth": 6,  # Equivalent to num_leaves=31
        "learning_rate": 0.05,
        "subsample": 0.9,  # Equivalent to feature_fraction
        "n_estimators": 100,
        "verbosity": 0,  # Equivalent to verbose=-1
    }
    
    # Train the model with cross-validation on training data only
    cv_scores = []
    
    for train_idx, val_idx in tscv.split(X_train):
        X_train_cv, X_val_cv = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # Prepare DMatrix for XGBoost (more efficient)
        dtrain = xgb.DMatrix(X_train_cv, label=y_train_cv)
        dval = xgb.DMatrix(X_val_cv, label=y_val_cv)
        
        # Train the model
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=100,
            evals=[(dval, "validation")],
            # early_stopping_rounds=50,
            verbose_eval=False
        )
        
        # Make predictions
        dval_pred = xgb.DMatrix(X_val_cv)
        preds = model.predict(dval_pred)
        
        # Calculate metrics
        mae = mean_absolute_error(y_val_cv, preds)
        rmse = np.sqrt(mean_squared_error(y_val_cv, preds))
        wape = weighted_absolute_percentage_error(y_val_cv, preds)
        
        cv_scores.append((mae, rmse, wape))
    
    # Print average scores from cross-validation
    mae_avg, rmse_avg, wape_avg = np.mean(cv_scores, axis=0)
    print(
        f"Baseline Model CV - MAE: {mae_avg:.2f}, RMSE: {rmse_avg:.2f}, WAPE: {wape_avg:.2f}%"
    )
    
    # Train a final model on all training data
    dtrain_final = xgb.DMatrix(X_train, label=y_train)
    final_model = xgb.train(params, dtrain_final, num_boost_round=100, verbose_eval=False)
    
    # Evaluate on the test set (last 3 months of 2017)
    dtest = xgb.DMatrix(X_test)
    test_preds = final_model.predict(dtest)
    test_mae = mean_absolute_error(y_test, test_preds)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    test_wape = weighted_absolute_percentage_error(y_test, test_preds)
    
    print(
        f"Baseline Model Test - MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}, WAPE: {test_wape:.2f}%"
    )
    
    return final_model, (test_mae, test_rmse, test_wape)


In [12]:
# Gọi hàm train
model, metrics = create_xgboost_model(
    X_train, y_train, X_valid, y_valid
)


Creating base XGBoost model...


Baseline Model CV - MAE: 0.30, RMSE: 0.46, WAPE: 19.04%
Baseline Model Test - MAE: 0.21, RMSE: 0.36, WAPE: 15.16%


In [13]:
# Accuracy of Prophet Model
print(
    f"LightGBM Model Results:\nMAE: {metrics[0]:.2f} | RMSE: {metrics[1]:.2f} | WAPE: {metrics[2]:.2f}%"
)

LightGBM Model Results:
MAE: 0.21 | RMSE: 0.36 | WAPE: 15.16%


In [14]:
def create_xgboost_submission(df_kaggle_test, lightgbm_model, filename="submission_xgboost.csv"):
    """
    Tạo file submission từ model LightGBM đã train.
    - df_kaggle_test: full test dataframe (có cột is_kaggle_test, date, store_nbr, item_nbr, ...).
    - lightgbm_model: model đã fit trên logunits.
    - filename: tên file csv output.
    """
    # 1. Lọc đúng dữ liệu cho tập Test (từ 01/04/2013 trở đi)
    min_test_date = "2013-04-01"
    df_kaggle_test_lgbm = df_kaggle_test[
        (df_kaggle_test['is_kaggle_test'] == True) &
        (df_kaggle_test['date'] >= min_test_date)
    ].copy()

    # 2. Tách ID columns + Features (không dùng store_nbr, item_nbr cho model)
    id_cols = ['store_nbr', 'item_nbr', 'date']
    drop_cols = [
        'date', 'units', 'logunits',
        'is_kaggle_test', 'is_valid', 'station_nbr',
        'store_nbr', 'item_nbr'
    ]

    df_ids = df_kaggle_test_lgbm[id_cols].copy()
    X_kaggle = df_kaggle_test_lgbm.drop(columns=drop_cols)
    dtest = xgb.DMatrix(X_kaggle)

    # 3. Predict (trên log scale) và inverse transform
    yhat = lightgbm_model.predict(dtest)
    df_kaggle_pred = df_ids.copy()
    df_kaggle_pred['yhat'] = yhat
    df_kaggle_pred['units'] = np.expm1(df_kaggle_pred['yhat']).clip(lower=0)

    # 4. Tạo date_str, sort và ID đúng format Kaggle
    df_kaggle_pred['date_str'] = df_kaggle_pred['date'].dt.strftime('%Y-%m-%d')
    df_kaggle_pred = df_kaggle_pred.sort_values(['date_str', 'store_nbr', 'item_nbr'])

    df_kaggle_pred['id'] = (
        df_kaggle_pred['store_nbr'].astype(str) + '_' +
        df_kaggle_pred['item_nbr'].astype(str) + '_' +
        df_kaggle_pred['date_str']
    )

    # 5. Tạo submission và lưu
    submission = df_kaggle_pred[['id', 'units']].reset_index(drop=True)
    submission.to_csv(filename, index=False)

    print(f"✓ Saved {filename} ({len(submission)} rows)")
    return submission

submission = create_xgboost_submission(df_kaggle_test, model)

✓ Saved submission_xgboost.csv (526917 rows)


In [15]:
# Kiểm tra nhanh: Đếm số lượng dòng dự đoán khác 0
non_zero_preds = submission[submission['units'] > 0]
print(f"Số lượng dòng có dự đoán bán hàng: {len(non_zero_preds)}")
print("Ví dụ 5 dòng có số liệu:")
print(non_zero_preds.head())

Số lượng dòng có dự đoán bán hàng: 526868
Ví dụ 5 dòng có số liệu:
               id       units
0  2_1_2013-04-01  225.630569
1  2_2_2013-04-01  225.630569
2  2_3_2013-04-01  225.630569
3  2_4_2013-04-01  209.089218
4  2_5_2013-04-01   61.425156


### (Optional) Fine tunning using Optuna

In [16]:
def optimize_xgboost(X_train, y_train, X_valid, y_valid, n_trials=50):
    print("\nOptimizing XGBoost model with Optuna...")
    
    def objective(trial):
        # Hyperparameters search space for XGBoost
        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "verbosity": 0,
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
            "n_estimators": 2000,
        }
        
        # Prepare DMatrix
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        
        # Train with early stopping
        evals_result = {}
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dvalid, "validation")],
            evals_result=evals_result,
            early_stopping_rounds=100,
            verbose_eval=False
        )
        
        # Predict và tính metric mục tiêu (WAPE)
        dvalid_pred = xgb.DMatrix(X_valid)
        preds = model.predict(dvalid_pred)
        # wape = weighted_absolute_percentage_error(y_valid, preds)
        rmse = np.sqrt(mean_squared_error(y_valid, preds))
        return rmse  # Optimize trực tiếp WAPE
    
    # Chạy Optuna
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    
    print("\nBest params found:")
    best_params = study.best_params
    best_params.update({
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "verbosity": 0,
        "n_estimators": 2000
    })
    
    for k, v in best_params.items():
        print(f"  {k}: {v}")
    
    # Train final model với best params
    dtrain_final = xgb.DMatrix(X_train, label=y_train)
    dvalid_final = xgb.DMatrix(X_valid, label=y_valid)
    
    evals_result_final = {}
    final_model = xgb.train(
        best_params,
        dtrain_final,
        num_boost_round=2000,
        evals=[(dvalid_final, "validation")],
        evals_result=evals_result_final,
        early_stopping_rounds=100,
        verbose_eval=100
    )
    

    dvalid_pred = xgb.DMatrix(X_valid)
    valid_preds = final_model.predict(dvalid_pred)
    test_mae = mean_absolute_error(y_valid, valid_preds)
    test_rmse = np.sqrt(mean_squared_error(y_valid, valid_preds))
    test_wape = weighted_absolute_percentage_error(y_valid, valid_preds)
    
    print(f"\nOptimized XGBoost Valid Metrics - MAE: {test_mae:.3f}, RMSE: {test_rmse:.3f}, WAPE: {test_wape:.3f}")
    
    return final_model, best_params, (test_mae, test_rmse, test_wape)

In [17]:
optimized_model, best_params, optimized_metrics = optimize_xgboost(
    X_train, y_train, X_valid, y_valid, n_trials=50
)


[I 2025-11-29 04:03:20,629] A new study created in memory with name: no-name-165e36b5-6f55-4823-aaf7-cfd359274450



Optimizing XGBoost model with Optuna...


[I 2025-11-29 04:03:21,157] Trial 0 finished with value: 0.3673872115255754 and parameters: {'max_depth': 3, 'learning_rate': 0.08710911202364505, 'subsample': 0.6809416757858037, 'colsample_bytree': 0.5082648069500509, 'colsample_bylevel': 0.803585150169208, 'min_child_weight': 3, 'gamma': 0.47414271304802524, 'reg_alpha': 3.088214763098494e-05, 'reg_lambda': 0.00022755861206792943}. Best is trial 0 with value: 0.3673872115255754.
[I 2025-11-29 04:03:21,636] Trial 1 finished with value: 0.36785844360775133 and parameters: {'max_depth': 3, 'learning_rate': 0.17817279952436255, 'subsample': 0.8238931684276216, 'colsample_bytree': 0.9801888114630191, 'colsample_bylevel': 0.8263225096618712, 'min_child_weight': 1, 'gamma': 2.6532817515126395, 'reg_alpha': 9.018937787150919e-06, 'reg_lambda': 1.9379271534003437}. Best is trial 0 with value: 0.3673872115255754.
[I 2025-11-29 04:03:28,407] Trial 2 finished with value: 0.3622782384963556 and parameters: {'max_depth': 9, 'learning_rate': 0.014


Best params found:
  max_depth: 8
  learning_rate: 0.013195482764595833
  subsample: 0.9534247192829588
  colsample_bytree: 0.5165166240356645
  colsample_bylevel: 0.8807498303735782
  min_child_weight: 4
  gamma: 0.4444879045317784
  reg_alpha: 1.2364363436749473
  reg_lambda: 4.038761039573867e-06
  objective: reg:squarederror
  eval_metric: rmse
  verbosity: 0
  n_estimators: 2000
[0]	validation-rmse:1.70850
[100]	validation-rmse:0.57599
[200]	validation-rmse:0.38332
[300]	validation-rmse:0.36428
[400]	validation-rmse:0.36191
[500]	validation-rmse:0.36144
[600]	validation-rmse:0.36120
[700]	validation-rmse:0.36111
[787]	validation-rmse:0.36115

Optimized XGBoost Valid Metrics - MAE: 0.199, RMSE: 0.361, WAPE: 14.639


In [18]:
submission = create_xgboost_submission(df_kaggle_test, optimized_model, filename="submission_xgboost_optim.csv")

✓ Saved submission_xgboost_optim.csv (526917 rows)


## Evaluating model

In [19]:
def evaluate_model(model, X_test, y_test, data):
    """
    Evaluate the model performance on the test set (last 3 months of 2017)
    """
    print("\nEvaluating model performance on test set...")

    # Make predictions on the test set
    test_preds = model.predict(X_test)

    # Calculate metrics
    test_mae = mean_absolute_error(y_test, test_preds)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    test_wape = weighted_absolute_percentage_error(y_test, test_preds)

    # Print evaluation results
    print(f"Final Model Test Evaluation:")
    print(f"    MAE: {test_mae:.2f}")
    print(f"    RMSE: {test_rmse:.2f}")
    print(f"    WAPE: {test_wape:.2f}%")

    # Analyze errors by time period (month)
    test_results = data[data["is_test"]].copy()
    test_results["prediction"] = test_preds
    test_results["error"] = test_results["sales"] - test_results["prediction"]
    test_results["abs_error"] = np.abs(test_results["error"])
    test_results["month_name"] = test_results["date"].dt.strftime("%B")

    # Summarize errors by month
    monthly_errors = (
        test_results.groupby("month_name")
        .agg({"abs_error": "mean", "error": "mean", "sales": "mean"})
        .reset_index()
    )
    monthly_errors["error_pct"] = (
        100 * monthly_errors["abs_error"] / monthly_errors["sales"]
    )

    print("\nError Analysis by Month:")
    print(
        monthly_errors[["month_name", "abs_error", "error_pct"]].to_string(index=False)
    )

    # Store results for visualization
    # Include month and store information for granular analysis
    test_results["year_month"] = test_results["date"].dt.strftime("%Y-%m")

    # Plot actual vs predicted
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, test_preds, alpha=0.5)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], "r--")
    plt.title("Actual vs Predicted Sales (Test Set)")
    plt.xlabel("Actual Sales")
    plt.ylabel("Predicted Sales")
    plt.tight_layout()
    # plt.savefig('actual_vs_predicted_test.png')

    # Plot error distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(test_results["error"], kde=True)
    plt.title("Error Distribution")
    plt.xlabel("Prediction Error")
    plt.tight_layout()
    # plt.savefig('error_distribution.png')

    return test_mae, test_rmse, test_wape, test_preds, y_test, test_results

In [20]:
# Prophet Model Results:
# MAE: 9.03 | RMSE: 11.86 | WAPE: 29.13%

In [21]:
# Evaluate the lightgbm model
test_mae, test_rmse, test_smape, test_preds, y_test_values, test_results = (
    evaluate_model(lightgbm_model, X_test, y_test, df_features)
)

NameError: name 'lightgbm_model' is not defined

## Save trained models

In [None]:
def save_model(model, X_train, feature_names, output_dir="../models"):
    """
    Save the trained model and related artifacts for API use

    Args:
        model: Trained model (e.g., LightGBM model)
        feature_names: List of feature names
        output_dir: Directory to save model artifacts
    """

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Save the model
    model_path = os.path.join(output_dir, "sales_forecast_model.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    print(f"Model saved to {model_path}")

    # Create and save feature statistics
    feature_stats = {
        "model_version": "1.0.0",
        "last_trained": pd.Timestamp.now().strftime("%Y-%m-%d"),
        "required_columns": list(feature_names),
        "column_order": list(feature_names),
        "default_values": {},
        "temperature_bins": [-np.inf, 20, 25, 30, np.inf],
        "temperature_labels": ["Cold", "Cool", "Warm", "Hot"],
        "humidity_bins": [-np.inf, 60, 75, np.inf],
        "humidity_labels": ["Low", "Medium", "High"],
    }

    # Add default values for date features
    feature_stats["default_values"] = {
        "year": 2017,
        "month": 11,
        "day": 15,
        "day_of_week": 2,
        "is_weekend": 0,
        "quarter": 4,
        "is_holiday": 0,
    }

    # Save feature stats
    stats_path = os.path.join(output_dir, "feature_stats.json")
    with open(stats_path, "w") as f:
        json.dump(feature_stats, f, indent=4)
    print(f"Feature statistics saved to {stats_path}")

    print(f"All model artifacts saved successfully to {output_dir}/")

    return model_path, stats_path

In [None]:
# Save model
save_model(
    model=optimized_model,
    X_train=X_train,
    feature_names=X_train.columns,
    output_dir='../models'
)

Model saved to ../models/sales_forecast_model.pkl
Feature statistics saved to ../models/feature_stats.json
All model artifacts saved successfully to ../models/


('../models/sales_forecast_model.pkl', '../models/feature_stats.json')