Collecting mlflow
  Downloading mlflow-3.1.4-py3-none-any.whl.metadata (29 kB)
Collecting dagshub
  Downloading dagshub-0.6.2-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==3.1.4 (from mlflow)
  Downloading mlflow_skinny-3.1.4-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading databricks_sdk-0.61.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opent

In [None]:
!pip install prophet dagshub scikit-learn pandas numpy mlflow==2.2.2



In [None]:
import json
import logging
import os
import pickle
import warnings
from datetime import datetime
from getpass import getpass

import dagshub
import mlflow
import mlflow.pyfunc
import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries used below — consider narrowing it to specific categories/modules.
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator.
np.random.seed(42)

In [None]:
# Read the three raw CSV tables from the current working directory.
features_data, train_data, stores = (
    pd.read_csv(csv_path)
    for csv_path in ('features.csv', 'train.csv', 'stores.csv')
)

# Report the (rows, columns) of each table as a quick sanity check.
print(f"Features data shape: {features_data.shape}")
print(f"Train data shape: {train_data.shape}")
print(f"Stores data shape: {stores.shape}")

Features data shape: (8190, 12)
Train data shape: (421570, 5)
Stores data shape: (45, 3)


In [None]:
# Attach the per-store/per-date features to the sales rows, then the store metadata.
df = (
    train_data
    .merge(features_data, on=['Store', 'Date'], how='inner')
    .merge(stores, on=['Store'], how='inner')
)

# A column present on both sides of a merge comes back with _x/_y suffixes;
# keep the left-hand copy and restore its plain name.
if 'IsHoliday_y' in df.columns:
    df = df.drop(columns=['IsHoliday_y']).rename(columns={'IsHoliday_x': 'IsHoliday'})

print(f"Merged dataset shape: {df.shape}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")

Merged dataset shape: (421570, 16)
Date range: 2010-02-05 to 2012-10-26


In [None]:
# Parse the date strings and order rows chronologically within each Store/Dept series.
df = (
    df
    .assign(Date=pd.to_datetime(df['Date']))
    .sort_values(['Store', 'Dept', 'Date'])
)

# Report, then discard, rows with negative weekly sales (returns/adjustments).
print(f"Negative sales records: {len(df[df['Weekly_Sales'] < 0])}")
df = df.loc[df['Weekly_Sales'].ge(0)]

print(f"Final dataset shape: {df.shape}")

Negative sales records: 1285
Final dataset shape: (420285, 16)


In [None]:
# Create holiday flags
# One 0/1 indicator column per holiday, set on the listed dates.
holiday_weeks = {
    'Is_SuperBowl': ['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'],
    'Is_LaborDay': ['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'],
    'Is_Thanksgiving': ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'],
    'Is_Christmas': ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'],
}
for flag_col, date_strings in holiday_weeks.items():
    # Compare datetimes with datetimes: relying on isin() to cast strings for a
    # datetime64 column is deprecated in recent pandas releases.
    df[flag_col] = np.where(df['Date'].isin(pd.to_datetime(date_strings)), 1, 0)

# Handle missing values in external regressors
external_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
series_keys = ['Store', 'Dept']
for col in external_cols:
    if col in df.columns:
        # Forward-fill, then back-fill, strictly inside each Store/Dept series so a
        # gap at the start of one series is never filled from a neighbouring series.
        df[col] = df.groupby(series_keys)[col].ffill()
        df[col] = df.groupby(series_keys)[col].bfill()

print("Feature engineering completed")
print(f"Holiday weeks: SuperBowl={df['Is_SuperBowl'].sum()}, LaborDay={df['Is_LaborDay'].sum()}, Thanksgiving={df['Is_Thanksgiving'].sum()}, Christmas={df['Is_Christmas'].sum()}")

Feature engineering completed
Holiday weeks: SuperBowl=8874, LaborDay=8833, Thanksgiving=5946, Christmas=5910


In [None]:
# Setup MLflow and DagsHub
# Credentials come from the environment (or an interactive prompt) instead of being
# stored in the notebook.
# SECURITY: an access token used to be hard-coded in this cell; treat it as leaked
# and revoke/rotate it.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'nipkha21')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    # getpass does not echo the value, so it never ends up in the cell output.
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('MLflow tracking password / DagsHub token: ')

dagshub.init(repo_owner='TomC333', repo_name='ml-walmart-recruiting', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow')

print("MLflow setup completed")

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=1ddd9b60-d8f2-402e-ba78-370c6a3d1758&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=a8a18f190c549c095a235fe6e3ba73c6a7817edaf884c13f802b1dfa50b1ca92




MLflow setup completed


In [None]:
# Constants
MIN_DATA_POINTS = 20
TRAIN_RATIO = 0.8

def prepare_prophet_data(data, store, dept, regressors=None):
    """Build chronological train/test frames for a single Store/Dept series.

    The rows of `data` matching `store` and `dept` are ordered by 'Date' and the
    'Date'/'Weekly_Sales' columns are exposed under Prophet's 'ds'/'y' names.
    Each name in `regressors` that exists as a column is carried along; names
    that are not columns are ignored.

    Returns:
        (train, test, len(train), len(test)) where train is the first TRAIN_RATIO
        share of the ordered rows and test is the remainder, or four Nones when
        the series has fewer than MIN_DATA_POINTS rows.
    """
    in_series = (data['Store'] == store) & (data['Dept'] == dept)
    series = data[in_series].copy().sort_values('Date')

    # Not enough history for this combination.
    if len(series) < MIN_DATA_POINTS:
        return None, None, None, None

    # Prophet requires 'ds' and 'y' columns
    frame = series[['Date', 'Weekly_Sales']].rename(
        columns={'Date': 'ds', 'Weekly_Sales': 'y'}
    )

    # Copy regressors positionally; `frame` has the same row order as `series`.
    for name in regressors or []:
        if name in series.columns:
            frame[name] = series[name].values

    # Earliest rows train the model, the most recent rows are held out.
    cutoff = int(len(frame) * TRAIN_RATIO)
    train_part = frame.iloc[:cutoff].copy()
    test_part = frame.iloc[cutoff:].copy()

    return train_part, test_part, len(train_part), len(test_part)

def _informative_regressors(train_data, regressors):
    """Return the requested regressors that exist in train_data and vary within it."""
    if not regressors:
        return []
    return [
        name for name in regressors
        if name in train_data.columns and train_data[name].nunique(dropna=False) > 1
    ]


def train_prophet_model(train_data, test_data, regressors=None, holidays_df=None):
    """Fit a Prophet model on one series and score it on the held-out rows.

    Args:
        train_data: frame with 'ds', 'y' and any regressor columns.
        test_data: frame with the same columns, used only for evaluation.
        regressors: optional column names to register as extra regressors.
            Names missing from train_data are ignored, and so are columns that
            hold a single value across train_data: such a column carries no
            information for a per-series model, and Prophet does not standardize
            a single-valued regressor, so a large constant can destabilise the fit.
        holidays_df: optional holidays frame forwarded to Prophet.

    Returns:
        (model, mae, rmse, mape). The metrics are NaN when test_data is empty,
        so an unevaluated model is not mistaken for a perfect one. MAPE is in
        percent and divides by max(y, 1e-8), so rows with near-zero actuals can
        dominate it. On any exception the error is printed and four Nones are
        returned.
    """
    try:
        # Initialize Prophet
        # NOTE(review): the series built in this notebook appear to be weekly;
        # confirm weekly_seasonality is meaningful at that granularity.
        model = Prophet(
            daily_seasonality=False,
            weekly_seasonality=True,
            yearly_seasonality=True,
            holidays=holidays_df,
            changepoint_prior_scale=0.05,
            seasonality_prior_scale=10.0
        )

        # Add regressors (only those that are present and non-constant)
        for regressor in _informative_regressors(train_data, regressors):
            model.add_regressor(regressor)

        # Fit model
        model.fit(train_data)

        # Make predictions
        if len(test_data) > 0:
            forecast = model.predict(test_data)
            y_true = test_data['y'].values
            y_pred = forecast['yhat'].values

            mae = mean_absolute_error(y_true, y_pred)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            mape = np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1e-8))) * 100
        else:
            # Nothing to evaluate against: report "unknown", not a perfect score.
            mae = rmse = mape = np.nan

        return model, mae, rmse, mape

    except Exception as e:
        # Best-effort by design: one failing series must not abort the whole sweep.
        print(f"Error training Prophet model: {e}")
        return None, None, None, None

print("Prophet training functions defined")

Prophet training functions defined


In [None]:
# Create holidays dataframe for Prophet
# Dates on which each named holiday is flagged.
superbowl_dates = ['2010-02-12', '2011-02-11', '2012-02-10']
laborday_dates = ['2010-09-10', '2011-09-09', '2012-09-07']
thanksgiving_dates = ['2010-11-26', '2011-11-25', '2012-11-23']
christmas_dates = ['2010-12-31', '2011-12-30', '2012-12-28']

# One record per (holiday, date); zero-width windows confine the effect to that date.
holidays_data = [
    {'holiday': holiday_name, 'ds': pd.to_datetime(day), 'lower_window': 0, 'upper_window': 0}
    for holiday_name, days in (
        ('SuperBowl', superbowl_dates),
        ('LaborDay', laborday_dates),
        ('Thanksgiving', thanksgiving_dates),
        ('Christmas', christmas_dates),
    )
    for day in days
]

holidays_df = pd.DataFrame(holidays_data)
print(f"Created holidays dataframe with {len(holidays_df)} holiday dates")

Created holidays dataframe with 12 holiday dates


In [None]:
# Define iterative feature phases
# Each phase adds one group of inputs on top of the previous phase.
_external_regressors = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
_store_regressors = ['Type', 'Size']


def _phase(regressors, holidays, description):
    """Bundle one phase's settings under the keys the training loop reads."""
    return {"regressors": regressors, "holidays": holidays, "description": description}


phases = {
    "Phase1_Baseline": _phase(
        [], None, "Baseline Prophet with only time trends and seasonality"
    ),
    "Phase2_Holidays": _phase(
        [], holidays_df, "Prophet with holiday effects"
    ),
    "Phase3_External": _phase(
        list(_external_regressors), holidays_df,
        "Prophet with holidays and external regressors"
    ),
    "Phase4_Store_Char": _phase(
        _external_regressors + _store_regressors, holidays_df,
        "Prophet with all features"
    ),
}

print("Defined 4 phases for iterative feature testing:")
for phase, config in phases.items():
    print(f"  {phase}: {config['description']}")

Defined 4 phases for iterative feature testing:
  Phase1_Baseline: Baseline Prophet with only time trends and seasonality
  Phase2_Holidays: Prophet with holiday effects
  Phase3_External: Prophet with holidays and external regressors
  Phase4_Store_Char: Prophet with all features


In [None]:
# Get valid store-department combinations
# Count the rows available for every Store/Dept pair.
store_dept_combinations = (
    df.groupby(['Store', 'Dept'])
    .size()
    .reset_index(name='count')
)
# Keep only the pairs with enough history to train on.
has_enough_history = store_dept_combinations['count'].ge(MIN_DATA_POINTS)
valid_combinations = store_dept_combinations.loc[has_enough_history, ['Store', 'Dept']]

print(f"Total store-dept combinations: {len(store_dept_combinations)}")
print(f"Valid combinations (>= {MIN_DATA_POINTS} data points): {len(valid_combinations)}")

# Limit to first 50 combinations for testing (remove this for full run)
valid_combinations = valid_combinations.iloc[:50]
print(f"Running on first {len(valid_combinations)} combinations for testing")

Total store-dept combinations: 3323
Valid combinations (>= 20 data points): 3071
Running on first 50 combinations for testing


In [None]:
# Every Prophet fit launches a CmdStan run; keep cmdstanpy's per-fit DEBUG/INFO
# messages out of the cell output while still letting warnings and errors through.
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)

# Set experiment
mlflow.set_experiment("Sales_Forecasting_Prophet")

# Numeric codes for the categorical store type (unmapped values fall back to 1).
type_mapping = {'A': 1, 'B': 2, 'C': 3}

# Store results for comparison: phase name -> per-series metrics DataFrame
all_results = {}

# Process each phase (one MLflow run per phase)
for phase_name, phase_config in phases.items():
    print(f"\n{'='*60}")
    print(f"PHASE: {phase_name}")
    print(f"Description: {phase_config['description']}")
    print(f"Regressors: {phase_config['regressors']}")
    print(f"Holidays: {'Yes' if phase_config['holidays'] is not None else 'No'}")
    print(f"{'='*60}")

    phase_results = []
    successful_models = 0
    failed_models = 0

    with mlflow.start_run(run_name=f"Prophet_{phase_name}") as run:

        # Log phase parameters
        mlflow.log_param("phase_name", phase_name)
        mlflow.log_param("regressors", str(phase_config['regressors']))
        mlflow.log_param("use_holidays", phase_config['holidays'] is not None)
        mlflow.log_param("total_combinations", len(valid_combinations))
        mlflow.log_param("min_data_points", MIN_DATA_POINTS)
        mlflow.log_param("train_ratio", TRAIN_RATIO)

        for store, dept in valid_combinations[['Store', 'Dept']].itertuples(index=False, name=None):

            # Prepare data. The per-series frames get their own names so the raw
            # `train_data` table loaded earlier in the notebook is not overwritten.
            combo_train, combo_test, train_size, test_size = prepare_prophet_data(
                df, store, dept, phase_config['regressors']
            )

            if combo_train is None:
                failed_models += 1
                continue

            # Handle categorical regressors for Prophet
            if 'Type' in phase_config['regressors']:
                combo_train['Type'] = combo_train['Type'].map(type_mapping).fillna(1)
                if len(combo_test) > 0:
                    combo_test['Type'] = combo_test['Type'].map(type_mapping).fillna(1)

            # Train model
            model, mae, rmse, mape = train_prophet_model(
                combo_train, combo_test,
                phase_config['regressors'],
                phase_config['holidays']
            )

            if model is not None:
                phase_results.append({
                    'Store': store,
                    'Dept': dept,
                    'MAE': mae,
                    'RMSE': rmse,
                    'MAPE': mape,
                    'Train_Size': train_size,
                    'Test_Size': test_size
                })
                successful_models += 1

                if successful_models % 10 == 0:
                    print(f"Completed {successful_models} models...")
            else:
                failed_models += 1

        # Always record the counts so a phase in which every fit failed is visible
        # in MLflow rather than looking like an empty run.
        mlflow.log_metric("successful_models", successful_models)
        mlflow.log_metric("failed_models", failed_models)

        # Calculate and log metrics
        if phase_results:
            results_df = pd.DataFrame(phase_results)

            # avg_/median_/std_ of each error metric across the trained series
            for prefix, stat in (('avg', 'mean'), ('median', 'median'), ('std', 'std')):
                for metric in ('MAE', 'RMSE', 'MAPE'):
                    mlflow.log_metric(
                        f"{prefix}_{metric.lower()}",
                        getattr(results_df[metric], stat)()
                    )

            # Log summary
            print(f"\nPHASE {phase_name} RESULTS:")
            print(f"Successful models: {successful_models}")
            print(f"Failed models: {failed_models}")
            print(f"Average MAE: {results_df['MAE'].mean():.2f}")
            print(f"Average RMSE: {results_df['RMSE'].mean():.2f}")
            print(f"Average MAPE: {results_df['MAPE'].mean():.2f}%")

            # Store results for comparison
            all_results[phase_name] = results_df
        else:
            print(f"\nPHASE {phase_name}: no models were trained successfully")

        run_id = run.info.run_id
        print(f"Phase {phase_name} logged with run_id: {run_id}")

print(f"\nAll phases completed!")


PHASE: Phase1_Baseline
Description: Baseline Prophet with only time trends and seasonality
Regressors: []
Holidays: No


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/mnif3keb.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/jdz5slg6.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=82215', 'data', 'file=/tmp/tmp2onrqyg3/mnif3keb.json', 'init=/tmp/tmp2onrqyg3/jdz5slg6.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model8f0gbmvb/prophet_model-20250731175203.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:03 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:03 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/kku3w144.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/_mik51xd.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 10 models...


17:52:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/e6ljtg2a.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/8won1rpp.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=96513', 'data', 'file=/tmp/tmp2onrqyg3/e6ljtg2a.json', 'init=/tmp/tmp2onrqyg3/8won1rpp.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modeli514rkgn/prophet_model-20250731175204.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/neuzb6gv.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/3a90n0_w.json
DEBUG:cmdstanpy:idx

Completed 20 models...


17:52:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/3tw2lhit.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/8fzyyqgy.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1819', 'data', 'file=/tmp/tmp2onrqyg3/3tw2lhit.json', 'init=/tmp/tmp2onrqyg3/8fzyyqgy.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modelsu1ndh1w/prophet_model-20250731175205.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:05 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/2zy_593z.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/f8hzozj1.json
DEBUG:cmdstanpy:idx 

Completed 30 models...


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/6yjsl0vi.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/8s015gkr.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=5645', 'data', 'file=/tmp/tmp2onrqyg3/6yjsl0vi.json', 'init=/tmp/tmp2onrqyg3/8s015gkr.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modelptjd4c37/prophet_model-20250731175206.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/g9gch572.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/5u6qejhs.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/l

Completed 40 models...


17:52:07 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/hu49jhn8.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/clr0ewlc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=24373', 'data', 'file=/tmp/tmp2onrqyg3/hu49jhn8.json', 'init=/tmp/tmp2onrqyg3/clr0ewlc.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modelan12spkm/prophet_model-20250731175207.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:07 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:07 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/m0rvrn4n.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/iza3sh_f.json
DEBUG:cmdstanpy:idx

Completed 50 models...

PHASE Phase1_Baseline RESULTS:
Successful models: 50
Failed models: 0
Average MAE: 1552.25
Average RMSE: 1970.97
Average MAPE: 1563.93%
Phase Phase1_Baseline logged with run_id: 1a0a034716604285a2ba1db3ea695717

PHASE: Phase2_Holidays
Description: Prophet with holiday effects
Regressors: []
Holidays: Yes


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/nzg2u_kc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/jw7f_1z7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=12945', 'data', 'file=/tmp/tmp2onrqyg3/nzg2u_kc.json', 'init=/tmp/tmp2onrqyg3/jw7f_1z7.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modeldxy_hych/prophet_model-20250731175217.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:17 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/f95k5esc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/6m3uwyyv.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 10 models...


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/uwjidrzw.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/qgo6o9mi.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=18123', 'data', 'file=/tmp/tmp2onrqyg3/uwjidrzw.json', 'init=/tmp/tmp2onrqyg3/qgo6o9mi.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model2phwtwu9/prophet_model-20250731175218.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:18 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:18 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/ctvp4cgf.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/10v39pri.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 20 models...


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/g5ldosah.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/yln7xe6a.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=43693', 'data', 'file=/tmp/tmp2onrqyg3/g5ldosah.json', 'init=/tmp/tmp2onrqyg3/yln7xe6a.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model75nggwgc/prophet_model-20250731175219.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:19 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:19 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/uhgvgo7a.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/k0e6o9t1.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 30 models...


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/31s272ki.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/2963ht2g.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=88918', 'data', 'file=/tmp/tmp2onrqyg3/31s272ki.json', 'init=/tmp/tmp2onrqyg3/2963ht2g.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model25lz384z/prophet_model-20250731175220.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:20 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:20 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/d2lox7ix.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/3qd4l0ai.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 40 models...


17:52:22 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/_d6q5mv5.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/xhz6l4fj.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=52372', 'data', 'file=/tmp/tmp2onrqyg3/_d6q5mv5.json', 'init=/tmp/tmp2onrqyg3/xhz6l4fj.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model0is0t39i/prophet_model-20250731175222.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:22 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:22 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/xgq5zefe.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/9nrhph5z.json
DEBUG:cmdstanpy:idx

Completed 50 models...

PHASE Phase2_Holidays RESULTS:
Successful models: 50
Failed models: 0
Average MAE: 1558.87
Average RMSE: 1975.18
Average MAPE: 1689.93%
Phase Phase2_Holidays logged with run_id: 6c3a0f42a9ee49278d3093882641340f

PHASE: Phase3_External
Description: Prophet with holidays and external regressors
Regressors: ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
Holidays: Yes


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/2cujv79n.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/k67tiria.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=52933', 'data', 'file=/tmp/tmp2onrqyg3/2cujv79n.json', 'init=/tmp/tmp2onrqyg3/k67tiria.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modelei02d8hi/prophet_model-20250731175236.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:36 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:36 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/ybqd1l1_.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/mcrxr_13.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 10 models...


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/utfp3w1z.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/twyfes5v.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80051', 'data', 'file=/tmp/tmp2onrqyg3/utfp3w1z.json', 'init=/tmp/tmp2onrqyg3/twyfes5v.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model2xctr6oz/prophet_model-20250731175237.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:37 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:37 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/q75mucjm.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/4wkmbzsz.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 20 models...


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/09v1whzx.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/knqkdo6p.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=84421', 'data', 'file=/tmp/tmp2onrqyg3/09v1whzx.json', 'init=/tmp/tmp2onrqyg3/knqkdo6p.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model5_spcz02/prophet_model-20250731175239.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:39 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:39 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/4ubg6tix.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/w8djfbk4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 30 models...


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/12h4guvq.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/2_jt471n.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=55689', 'data', 'file=/tmp/tmp2onrqyg3/12h4guvq.json', 'init=/tmp/tmp2onrqyg3/2_jt471n.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modelt7o8uwfx/prophet_model-20250731175240.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:40 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:40 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/n2d6lmz4.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/5kdg0oep.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Completed 40 models...


17:52:41 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/jvdnwe0q.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/tb88bc8t.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=27197', 'data', 'file=/tmp/tmp2onrqyg3/jvdnwe0q.json', 'init=/tmp/tmp2onrqyg3/tb88bc8t.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modele96dm91b/prophet_model-20250731175242.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:42 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:42 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/21_xpzu3.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/anw29lxd.json
DEBUG:cmdstanpy:idx

Completed 50 models...

PHASE Phase3_External RESULTS:
Successful models: 50
Failed models: 0
Average MAE: 2040.74
Average RMSE: 2450.73
Average MAPE: 1566.87%
Phase Phase3_External logged with run_id: b822cb1986654b6d946316fa724a0d88

PHASE: Phase4_Store_Char
Description: Prophet with all features
Regressors: ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Type', 'Size']
Holidays: Yes


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/ch9s86w5.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/a4gfvxoc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=56546', 'data', 'file=/tmp/tmp2onrqyg3/ch9s86w5.json', 'init=/tmp/tmp2onrqyg3/a4gfvxoc.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model6453pamj/prophet_model-20250731175255.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:52:55 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:52:55 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
17:52:55 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
ERROR:cmdstanpy:Chain [1] error: error during processing Operation not permitted
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/_hb4lzkl.json
DE

Completed 10 models...


17:54:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/rz89o2bd.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/usj4ynaw.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=58619', 'data', 'file=/tmp/tmp2onrqyg3/rz89o2bd.json', 'init=/tmp/tmp2onrqyg3/usj4ynaw.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modeldjbgm5ft/prophet_model-20250731175405.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:54:05 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:54:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
17:54:05 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
ERROR:cmdstanpy:Chain [1] error: error during pro

Completed 20 models...


17:55:38 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/sqrvqtpt.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/51hdt95w.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=90343', 'data', 'file=/tmp/tmp2onrqyg3/sqrvqtpt.json', 'init=/tmp/tmp2onrqyg3/51hdt95w.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modelpyuolxtl/prophet_model-20250731175538.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:55:38 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:55:38 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
17:55:38 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
ERROR:cmdstanpy:Chain [1] error: error during pro

Completed 30 models...


17:57:20 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/xz1v6aj0.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/c93qmf3u.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57843', 'data', 'file=/tmp/tmp2onrqyg3/xz1v6aj0.json', 'init=/tmp/tmp2onrqyg3/c93qmf3u.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modelpvu9sf28/prophet_model-20250731175720.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:57:20 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:57:20 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
17:57:20 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted
ERROR:cmdstanpy:Chain [1] error: error during pro

Completed 40 models...


17:58:43 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/pnaqcipf.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/d_7x5rk8.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=93752', 'data', 'file=/tmp/tmp2onrqyg3/pnaqcipf.json', 'init=/tmp/tmp2onrqyg3/d_7x5rk8.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_modeluey6ojpq/prophet_model-20250731175844.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
17:58:44 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
17:58:44 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/3h605iw7.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/e2uev0oq.json
DEBUG:cmdstanpy:id

Completed 50 models...

PHASE Phase4_Store_Char RESULTS:
Successful models: 50
Failed models: 0
Average MAE: 2025.00
Average RMSE: 2443.08
Average MAPE: 1565.33%
Phase Phase4_Store_Char logged with run_id: c7653f167670400192c6cca76ebdde0a

All phases completed!


In [None]:
# Side-by-side summary of every phase that produced at least one fitted model.
rule = "=" * 80
print("\n" + rule)
print("PHASE COMPARISON SUMMARY")
print(rule)

comparison_data = []
for phase_name, results_df in all_results.items():
    if len(results_df) == 0:
        # A phase without any results contributes no row to the table.
        continue
    phase_mae = results_df['MAE']
    phase_summary = {
        'Phase': phase_name,
        'Models': len(results_df),
        'Avg_MAE': phase_mae.mean(),
        'Avg_RMSE': results_df['RMSE'].mean(),
        'Avg_MAPE': results_df['MAPE'].mean(),
        'Median_MAE': phase_mae.median(),
        'Std_MAE': phase_mae.std()
    }
    comparison_data.append(phase_summary)

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

# Relative MAE change of each later row versus the first row of the table.
# Positive numbers mean a lower (better) MAE than the first row; negative
# numbers mean the phase got worse.
if len(comparison_df) > 1:
    baseline_mae = comparison_df['Avg_MAE'].iloc[0]
    print(f"\nMAE Improvements over Phase 1 Baseline:")
    for idx, row in comparison_df.iloc[1:].iterrows():
        relative_gain = (baseline_mae - row['Avg_MAE']) / baseline_mae
        improvement = relative_gain * 100
        print(f"  {row['Phase']}: {improvement:.2f}% improvement")


PHASE COMPARISON SUMMARY
            Phase  Models     Avg_MAE    Avg_RMSE    Avg_MAPE  Median_MAE     Std_MAE
  Phase1_Baseline      50 1552.250143 1970.971254 1563.930442 1056.092552 1500.480558
  Phase2_Holidays      50 1558.867556 1975.184220 1689.927625 1098.684176 1507.447903
  Phase3_External      50 2040.743687 2450.730744 1566.871487 1104.470895 2247.127165
Phase4_Store_Char      50 2024.998747 2443.079061 1565.333765 1058.573810 2161.159136

MAE Improvements over Phase 1 Baseline:
  Phase2_Holidays: -0.43% improvement
  Phase3_External: -31.47% improvement
  Phase4_Store_Char: -30.46% improvement


In [None]:
# Pick the phase with the lowest mean MAE, persist the comparison table and
# record the outcome in a dedicated MLflow summary run.
best_phase, best_mae = None, float('inf')

for phase_name, results_df in all_results.items():
    if len(results_df) == 0:
        # Phases without any results cannot be the winner.
        continue
    avg_mae = results_df['MAE'].mean()
    # Strict comparison: on a tie the earlier phase is kept, and a NaN
    # average never replaces the current best.
    if avg_mae < best_mae:
        best_phase, best_mae = phase_name, avg_mae

print(f"\nBest performing phase: {best_phase} with average MAE: {best_mae:.2f}")

# Write the comparison table to disk so it can be attached as an artifact.
comparison_csv = 'prophet_phase_comparison.csv'
comparison_df.to_csv(comparison_csv, index=False)
print("Phase comparison results saved to 'prophet_phase_comparison.csv'")

# One small run holding the winner, its score and the comparison table.
with mlflow.start_run(run_name="Prophet_Final_Summary"):
    mlflow.log_param("best_phase", best_phase)
    mlflow.log_metric("best_avg_mae", best_mae)
    mlflow.log_artifact(comparison_csv)

print("\nProphet experiment completed successfully!")


Best performing phase: Phase1_Baseline with average MAE: 1552.25
Phase comparison results saved to 'prophet_phase_comparison.csv'

Prophet experiment completed successfully!


Found existing installation: mlflow 3.1.4
Uninstalling mlflow-3.1.4:
  Successfully uninstalled mlflow-3.1.4
Collecting mlflow==2.2.2
  Downloading mlflow-2.2.2-py3-none-any.whl.metadata (11 kB)
Collecting cloudpickle<3 (from mlflow==2.2.2)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting databricks-cli<1,>=0.8.7 (from mlflow==2.2.2)
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting protobuf<5,>=3.12.0 (from mlflow==2.2.2)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytz<2023 (from mlflow==2.2.2)
  Downloading pytz-2022.7.1-py2.py3-none-any.whl.metadata (21 kB)
Collecting packaging<24 (from mlflow==2.2.2)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Collecting importlib-metadata!=4.7.0,<7,>=3.7.0 (from mlflow==2.2.2)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting docker<7,>=4.0.0 (from mlflow==2.2.2)
  Downloadin

ImportError: cannot import name 'get_databricks_runtime' from 'mlflow.utils.databricks_utils' (/usr/local/lib/python3.11/dist-packages/mlflow/utils/databricks_utils.py)

In [None]:
# Re-train the winning phase's configuration on one representative
# store/department series and register it in the MLflow model registry.
#
# Everything (params, configs, metrics, model artifact) is logged inside ONE
# MLflow run. Previously the training/logging code was dedented out of the
# `with mlflow.start_run(...)` block, so the model ended up in an implicitly
# created second run that was never closed, and the cell crashed with a
# NameError when no best phase existed.
print(f"Best performing phase: {best_phase} with average MAE: {best_mae:.2f}")

if best_phase:
    best_config = phases[best_phase]

    # Single source of truth for the Prophet hyper-parameters: the same dict is
    # logged for inference and used to construct the model below, so the two
    # can no longer drift apart.
    prophet_params = {
        "daily_seasonality": False,
        "weekly_seasonality": True,
        "yearly_seasonality": True,
        "changepoint_prior_scale": 0.05,
        "seasonality_prior_scale": 10.0
    }

    # Preprocessing configuration that will be needed for inference.
    preprocessing_config = {
        "min_data_points": MIN_DATA_POINTS,
        "train_ratio": TRAIN_RATIO,
        "remove_negative_sales": True,
        "holiday_dates": {
            "superbowl": ['2010-02-12', '2011-02-11', '2012-02-10'],
            "laborday": ['2010-09-10', '2011-09-09', '2012-09-07'],
            "thanksgiving": ['2010-11-26', '2011-11-25', '2012-11-23'],
            "christmas": ['2010-12-31', '2011-12-30', '2012-12-28']
        },
        "external_cols": ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment'],
        "prophet_params": prophet_params
    }

    with mlflow.start_run(run_name=f"Prophet_Best_Model_{best_phase}") as best_run:
        # Take the id from the run opened here rather than from
        # mlflow.active_run(), so it always refers to this run.
        run_id = best_run.info.run_id

        # Log the best configuration
        mlflow.log_param("phase_name", best_phase)
        mlflow.log_param("regressors", str(best_config['regressors']))
        mlflow.log_param("use_holidays", best_config['holidays'] is not None)
        mlflow.log_param("is_best_model", True)

        # Save preprocessing config
        mlflow.log_dict(preprocessing_config, "preprocessing_config.json")

        # Save holidays dataframe if used
        if best_config['holidays'] is not None:
            mlflow.log_dict(best_config['holidays'].to_dict('records'), "holidays_config.json")

        # Log best performance metrics
        if best_phase in all_results:
            best_results = all_results[best_phase]
            mlflow.log_metric("best_avg_mae", best_results['MAE'].mean())
            mlflow.log_metric("best_median_mae", best_results['MAE'].median())
            mlflow.log_metric("best_avg_rmse", best_results['RMSE'].mean())
            mlflow.log_metric("best_median_rmse", best_results['RMSE'].median())

        # Representative series: the first valid store/department combination.
        first_combination = valid_combinations.iloc[0]
        representative_store = first_combination['Store']
        representative_dept = first_combination['Dept']

        # Use dedicated names so the notebook-level `train_data` (the raw
        # train.csv frame loaded at the top) is not overwritten by this cell.
        rep_train, rep_test, _, _ = prepare_prophet_data(
            df, representative_store, representative_dept, best_config['regressors']
        )

        if rep_train is not None:
            model = Prophet(holidays=best_config['holidays'], **prophet_params)

            # Add only the regressors that are actually present in the frame.
            for regressor in best_config['regressors'] or []:
                if regressor in rep_train.columns:
                    model.add_regressor(regressor)

            model.fit(rep_train)

            # `artifact_path` is the keyword used with the mlflow==2.2.2 pinned
            # in the install cell. NOTE(review): newer MLflow 3.x releases
            # prefer `name` -- revisit if the pin is lifted.
            mlflow.prophet.log_model(
                pr_model=model,
                artifact_path="prophet_model",
                registered_model_name="Prophet_Sales_Forecasting"
            )

            print(f"✅ Best Prophet model registered: {best_phase}")
            print(f"   Configuration: {best_config['description']}")
            print(f"   Performance: {best_mae:.2f} MAE")
            print(f"Best model logged with run_id: {run_id}")
        else:
            # prepare_prophet_data returned no training frame for this series.
            print(
                f"No training data for store {representative_store}, "
                f"dept {representative_dept}; model not registered "
                f"(run_id: {run_id})"
            )
else:
    print("No best phase available; skipping model registration.")

Best performing phase: Phase1_Baseline with average MAE: 1552.25


DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/4x1yf5ku.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2onrqyg3/3cau8b11.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=39945', 'data', 'file=/tmp/tmp2onrqyg3/4x1yf5ku.json', 'init=/tmp/tmp2onrqyg3/3cau8b11.json', 'output', 'file=/tmp/tmp2onrqyg3/prophet_model2zaj77cs/prophet_model-20250731181458.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:14:58 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
18:14:58 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
Successfully registered model 'Prophet_Sales_Forecasting'.
2025/07/31 18:15:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Prophet_Sales_Forecast

✅ Best Prophet model registered: Phase1_Baseline
   Configuration: Baseline Prophet with only time trends and seasonality
   Performance: 1552.25 MAE
Best model logged with run_id: 62f20e6cd1494a70b73cf0d91d884b19


Created version '1' of model 'Prophet_Sales_Forecasting'.
