In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import optuna
import os
import logging
import warnings
from tqdm import tqdm
import seaborn as sns
from joblib import Parallel, delayed, wrap_non_picklable_objects
import time

# Disable tqdm monitor to avoid warnings
# (a monitor interval of 0 turns the monitor off for every progress bar).
tqdm.monitor_interval = 0

# Setup output directory and logging
# Every artifact produced by this script (plots, CSV results, HTML report,
# log file) is written below this directory.
img_dir = 'classical_model_results'
os.makedirs(img_dir, exist_ok=True)

logging.basicConfig(
    filename=f'{img_dir}/classical_models_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Silence all warnings process-wide.
warnings.filterwarnings("ignore")

# Configuration
CONFIG = {
    # Number of steps to forecast; also the size of the held-out test split.
    'forecast_horizon': 12,
    # Seasonal cycle length passed to the seasonal models and decomposition.
    'seasonal_periods': 12,
    # Minimum number of rows create_features() accepts.
    'min_data_length': 24,
    'img_dir': img_dir,
    'results_file': f'{img_dir}/classical_model_results.csv',
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single worker instead of failing in min().
    'n_jobs': min(4, os.cpu_count() or 1),
    # Default threshold of detect_outliers().
    'outlier_threshold': 3,
    # Upper bound on the differencing order tried by check_stationarity().
    'max_diff': 2,
    # Lags (in rows) used for the shifted features.
    'lag_features': list(range(1, 7)),
    # Window sizes (in rows) used for rolling mean/std features.
    'rolling_windows': [3, 6, 12],
    'correlation_threshold': 0.2,
    # Number of Optuna trials per tuned model.
    'optuna_trials': 100,
}

def detect_outliers(series, method='iqr', threshold=CONFIG['outlier_threshold']):
    """Detect outliers in a series and replace them by linear interpolation.

    Args:
        series: Numeric pandas Series to clean.
        method: ``'iqr'`` flags values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``;
            ``'zscore'`` flags values whose absolute z-score exceeds
            ``threshold``.
        threshold: Z-score cut-off. Only used when ``method == 'zscore'``.

    Returns:
        A new Series in which every flagged value is replaced by the linear
        interpolation of its non-flagged neighbours (edge gaps are
        back/forward filled). Values that were not flagged are left as they
        were, including any pre-existing NaN.

    Raises:
        ValueError: If ``method`` is not ``'iqr'`` or ``'zscore'``.
    """
    if method == 'zscore':
        z_scores = np.abs((series - series.mean()) / series.std())
        outliers = z_scores > threshold
    elif method == 'iqr':
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        outliers = (series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))
    else:
        raise ValueError(f"Unknown outlier detection method: {method!r}")

    logger.info(f"Detected {outliers.sum()} outliers in series using {method}")
    if not outliers.any():
        return series.copy()

    # The flagged points must be blanked out *before* interpolating:
    # interpolate() only fills NaN, so interpolating the untouched series
    # would simply hand the outlier values back.
    filled = series.mask(outliers).interpolate(method='linear').bfill().ffill()
    # Cast to float so fractional interpolated values fit an integer series.
    return series.astype(float).where(~outliers, filled)

def validate_input_data(df, required_columns):
    """Validate an input DataFrame and fill gaps in the required columns.

    Checks are ordered so that structural problems (missing columns,
    duplicate index, non-numeric or infinite data) are reported before any
    value is interpolated.

    Args:
        df: Input DataFrame. It is not modified; a sorted and/or filled copy
            is returned when a change is needed.
        required_columns: Names of the columns that must exist and be numeric.

    Returns:
        The validated DataFrame: sorted by index if it was not monotonically
        increasing, with missing values in ``required_columns`` linearly
        interpolated and then back/forward filled.

    Raises:
        ValueError: On missing columns, duplicate index entries, non-numeric
            required columns, or infinite values.
    """
    cols = list(required_columns)
    missing = set(cols) - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")
    if df.index.duplicated().any():
        raise ValueError("Index contains duplicates!")

    # Check dtypes first: interpolating a non-numeric column would fail with
    # an unrelated error. is_numeric_dtype also understands pandas extension
    # dtypes; booleans are rejected explicitly.
    for dtype in df[cols].dtypes:
        if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
            raise ValueError("Some columns are not numeric!")

    # Reject infinities before interpolation can spread them to neighbours.
    if df[cols].isin([np.inf, -np.inf]).any().any():
        raise ValueError("Data contains infinite values!")

    if not df.index.is_monotonic_increasing:
        df = df.sort_index()
        logger.warning("Index was not monotonically increasing, sorted!")

    null_counts = df[cols].isnull().sum()
    if null_counts.any():
        logger.warning(f"Missing values in data: {null_counts.to_dict()}")
        df = df.copy()  # never write into the caller's frame
        df[cols] = df[cols].interpolate(method='linear').bfill().ffill()
    return df

def check_stationarity(series, name):
    """Pick a differencing order by running ADF and KPSS on the series.

    The series is cleaned of NaN/inf, then tested at d = 0, 1, ... up to
    CONFIG['max_diff']. The first order at which ADF rejects a unit root
    (p < 0.05) while KPSS does not reject stationarity (p > 0.1) is returned.
    If no order qualifies, or differencing leaves fewer than two points, the
    last order tested is returned.

    Args:
        series: pandas Series to test.
        name: Label used in log messages.

    Returns:
        The selected differencing order (0 when the cleaned input is too short).
    """
    max_diff = CONFIG['max_diff']
    working = series.dropna().replace([np.inf, -np.inf], np.nan).dropna()
    if len(working) < 2:
        logger.error(f"{name}: Data too short after cleaning!")
        return 0

    for d in range(max_diff + 1):
        adf_p = adfuller(working)[1]
        kpss_p = kpss(working, regression='c', nlags="auto")[1]
        logger.info(f"{name} (d={d}): ADF p-value={adf_p:.4f}, KPSS p-value={kpss_p:.4f}")

        both_tests_agree = adf_p < 0.05 and kpss_p > 0.1
        if both_tests_agree:
            logger.info(f"{name} stationary at differencing order d={d}")
            return d

        if d == max_diff:
            # Differencing budget exhausted: settle for the maximum order.
            logger.warning(f"{name} not stationary after {max_diff} differencing. Using d={d}.")
            return d

        working = working.diff().dropna()
        if len(working) < 2:
            logger.warning(f"{name}: Data too short after differencing {d+1}!")
            return d

    # Only reached when max_diff is negative and the loop body never runs.
    return 0

def create_features(data, target, lags=CONFIG['lag_features'], rolling_windows=CONFIG['rolling_windows']):
    """Create lagged, rolling, and seasonal features and select a subset with RFE.

    Args:
        data: DataFrame with a DatetimeIndex containing ``target``,
            ``oil_price`` and ``gold_price``.
        target: Name of the column to forecast.
        lags: Row offsets used for the ``<col>_lag_<n>`` features.
        rolling_windows: Window sizes used for the ``<col>_roll_mean_<n>`` and
            ``<col>_roll_std_<n>`` features.

    Returns:
        DataFrame containing ``target``, the two exogenous columns, and the
        (up to 10) features chosen by recursive feature elimination.

    Raises:
        ValueError: If the data is shorter than CONFIG['min_data_length'], the
            index is not a DatetimeIndex, a required column is missing, NaN
            values remain after filling, or no numeric feature is available.
    """
    logger.info(f"Creating features for {target}, data shape: {data.shape}")
    if len(data) < CONFIG['min_data_length']:
        raise ValueError(f"Data too short: {len(data)} rows")
    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError("DataFrame index must be DatetimeIndex")

    df = data.copy()
    exog_vars = ['oil_price', 'gold_price']
    required_cols = [target] + exog_vars
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"Missing columns: {required_cols}")

    # Handle missing values with interpolation
    df[required_cols] = df[required_cols].interpolate(method='linear').bfill().ffill()

    # Create lag and rolling features
    # NOTE(review): the rolling windows include the current row (also for the
    # target column), and the leading NaN of the lag columns are back-filled
    # from later rows further down. Together with RFE being fit on the full
    # frame, this lets future information into the features - confirm that
    # this is acceptable for how the result is used.
    for col in required_cols:
        for lag in lags:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag)
        for window in rolling_windows:
            rolling = df[col].rolling(window=window, min_periods=1)
            df[f'{col}_roll_mean_{window}'] = rolling.mean()
            df[f'{col}_roll_std_{window}'] = rolling.std()

    # Add seasonal features
    df['month'] = df.index.month
    df = pd.get_dummies(df, columns=['month'], prefix='month', dtype=float)  # Ensure numeric dummies
    period = CONFIG['seasonal_periods']
    df['month_sin'] = np.sin(2 * np.pi * df.index.month / period)
    df['month_cos'] = np.cos(2 * np.pi * df.index.month / period)
    df['quarter'] = df.index.quarter.astype(float)  # Ensure quarter is numeric

    # Drop columns with all NaN and fill remaining NaN
    df = df.dropna(axis=1, how='all')
    df = df.interpolate(method='linear').bfill().ffill()
    if df.isnull().any().any():
        raise ValueError(f"Data still contains NaN: {df.isnull().sum().to_dict()}")

    # Select only numeric columns for feature selection, excluding target
    numeric_dtypes = [np.float64, np.float32, np.int64, np.int32]
    feature_cols = [col for col in df.columns if col != target and df[col].dtype in numeric_dtypes]
    logger.info(f"Feature columns before RFE: {feature_cols}")

    # Log non-numeric columns for debugging
    non_numeric_cols = [col for col in df.columns if col not in feature_cols and col != target]
    if non_numeric_cols:
        logger.warning(f"Non-numeric columns excluded from features: {non_numeric_cols}")

    X = df[feature_cols].dropna()
    y = df[target].loc[X.index]

    # Ensure X has enough features for RFE
    n_features = min(10, len(feature_cols))
    if n_features == 0:
        raise ValueError("No valid numeric features available for RFE")

    rfe = RFE(estimator=LinearRegression(), n_features_to_select=n_features)
    rfe.fit(X, y)
    selected_features = X.columns[rfe.support_].tolist()
    logger.info(f"Selected features: {selected_features}")

    # The exogenous columns are read by name downstream (SARIMAX / Prophet
    # regressors), so they must survive even when RFE does not pick them.
    kept_exog = [col for col in exog_vars if col in df.columns]
    extra_features = [col for col in selected_features if col not in kept_exog]
    return df[[target] + kept_exog + extra_features]

def calculate_metrics(actual, predicted, naive_forecast=None):
    """Calculate evaluation metrics for forecasts.

    Points where either ``actual`` or ``predicted`` is NaN or infinite are
    dropped before any metric is computed; ``naive_forecast`` is filtered
    with the same mask so the three arrays stay aligned.

    Args:
        actual: Observed values (array-like).
        predicted: Forecast values, same length as ``actual``.
        naive_forecast: Optional baseline values, same length as ``actual``,
            used as the scale of MASE.

    Returns:
        Tuple ``(rmse, mae, mape, smape, norm_mape, directional_acc, mase)``.
        MAPE/sMAPE/directional accuracy are percentages. Entries that cannot
        be computed are NaN: MAPE (and normalised MAPE) when any actual value
        is ~0, directional accuracy with fewer than two points, MASE when no
        usable baseline is given or its scale is zero. All seven are NaN when
        no valid point remains.
    """
    actual = np.array(actual, dtype=float)
    predicted = np.array(predicted, dtype=float)
    valid_mask = np.isfinite(actual) & np.isfinite(predicted)

    naive = None
    if naive_forecast is not None:
        naive = np.asarray(naive_forecast, dtype=float)
        if naive.shape == valid_mask.shape:
            naive = naive[valid_mask]
        else:
            logger.warning("Naive forecast length does not match actuals; MASE not computed!")
            naive = None

    actual = actual[valid_mask]
    predicted = predicted[valid_mask]
    if len(actual) == 0:
        logger.warning("No valid data for metrics calculation!")
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    errors = actual - predicted
    abs_errors = np.abs(errors)
    abs_actual = np.abs(actual)

    rmse = float(np.sqrt(np.mean(errors ** 2)))
    mae = float(np.mean(abs_errors))
    # MAPE is undefined when an actual value is (numerically) zero.
    mape = float(np.mean(abs_errors / abs_actual) * 100) if np.all(abs_actual > 1e-8) else np.nan
    smape = 100 * np.mean(2 * abs_errors / (abs_actual + np.abs(predicted) + 1e-8))
    norm_mape = mape / np.mean(abs_actual) if not np.isnan(mape) else np.nan
    directional_acc = np.mean((np.diff(actual) * np.diff(predicted)) > 0) * 100 if len(actual) > 1 else np.nan

    mase = np.nan
    if naive is not None and len(actual) > 1:
        # Scale: mean absolute error of the baseline shifted by one step.
        scale = np.mean(np.abs(actual[1:] - naive[:-1]))
        if np.isfinite(scale) and scale > 0:
            mase = mae / scale
    return rmse, mae, mape, smape, norm_mape, directional_acc, mase

def plot_decomposition(series, period, filename):
    """Save a four-panel additive seasonal decomposition of ``series``.

    The panels show, top to bottom, the original series and its trend,
    seasonal and residual components. The image is written into
    CONFIG['img_dir'] under ``filename``. Any failure is logged rather than
    raised, and the current figure is closed in every case.
    """
    try:
        parts = seasonal_decompose(series, period=period, model='additive')
        fig, axes = plt.subplots(4, 1, figsize=(12, 8))
        panels = (
            ('Original', series),
            ('Trend', parts.trend),
            ('Seasonal', parts.seasonal),
            ('Residual', parts.resid),
        )
        for axis, (caption, values) in zip(axes, panels):
            axis.plot(series.index, values, label=caption)
            axis.legend(loc='upper left')
        plt.tight_layout()
        target_path = os.path.join(CONFIG['img_dir'], filename)
        plt.savefig(target_path, dpi=300)
        logger.info(f"Saved decomposition plot: {filename}")
    except Exception as err:
        logger.error(f"Error saving decomposition plot: {str(err)}")
    finally:
        plt.close()

def plot_forecast(historical, test, forecast, forecast_index, title, ylabel, filename, confidence_intervals=None):
    """Save a plot of history, held-out actuals and a single model forecast.

    When ``confidence_intervals`` is given, its first two items are drawn as
    the lower and upper edge of a shaded band labelled '95% CI'. The image is
    written into CONFIG['img_dir'] under ``filename``. Failures are logged,
    not raised, and the current figure is always closed.
    """
    try:
        plt.figure(figsize=(12, 6))
        ax = plt.gca()
        ax.plot(historical.index, historical, label='Historical', color='blue')
        ax.plot(test.index, test, label='Actual (Test)', color='green')
        ax.plot(forecast_index, forecast, label='Forecast', color='orange', linestyle='--', linewidth=2)
        if confidence_intervals:
            lower_band = confidence_intervals[0]
            upper_band = confidence_intervals[1]
            ax.fill_between(forecast_index, lower_band, upper_band, color='orange', alpha=0.2, label='95% CI')
        ax.set_title(title)
        ax.set_xlabel('Time')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True)
        # Year-month tick labels, one tick every third month.
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
        plt.xticks(rotation=45)
        plt.tight_layout()
        target_path = os.path.join(CONFIG['img_dir'], filename)
        plt.savefig(target_path, dpi=300)
        logger.info(f"Saved plot: {filename}")
    except Exception as err:
        logger.error(f"Error saving plot: {str(err)}")
    finally:
        plt.close()

def plot_comparison_forecasts(historical, test, forecasts, forecast_index, title, ylabel, filename, metrics=None):
    """Save one plot overlaying the forecasts of several models.

    ``forecasts`` maps model name to forecast values; ``metrics`` maps model
    name to a dict holding 'RMSE'. A model is drawn only when its forecast is
    not None and an RMSE is available, and the RMSE is shown in its legend
    entry. The image is written into CONFIG['img_dir'] under ``filename``.
    Failures are logged, not raised, and the current figure is always closed.
    """
    try:
        plt.figure(figsize=(14, 8))
        ax = plt.gca()
        ax.plot(historical.index, historical, label='Historical', color='blue')
        ax.plot(test.index, test, label='Actual (Test)', color='green')
        palette = sns.color_palette("husl", len(forecasts))
        for color, (model_name, forecast) in zip(palette, forecasts.items()):
            if metrics:
                rmse = metrics.get(model_name, {}).get('RMSE', np.nan)
            else:
                rmse = np.nan
            if forecast is not None and not pd.isna(rmse):
                legend_text = f'Forecast {model_name} (RMSE: {rmse:.4f})'
                ax.plot(forecast_index, forecast, label=legend_text, linestyle='--', color=color)
        ax.set_title(title)
        ax.set_xlabel('Time')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True)
        # Year-month tick labels, one tick every third month.
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
        plt.xticks(rotation=45)
        plt.tight_layout()
        target_path = os.path.join(CONFIG['img_dir'], filename)
        plt.savefig(target_path, dpi=300)
        logger.info(f"Saved comparison plot: {filename}")
    except Exception as err:
        logger.error(f"Error saving comparison plot: {str(err)}")
    finally:
        plt.close()

def plot_metrics_bar(metrics_df, filename):
    """Plot bar comparison of metrics across models.

    Draws one bar chart per metric (x axis: the 'Model' column) on a 3x3
    grid and saves the figure into CONFIG['img_dir'] under ``filename``.
    Failures are logged, not raised, and the current figure is always closed.

    Args:
        metrics_df: DataFrame with a 'Model' column and one column per metric
            ('RMSE', 'MAE', 'MAPE', 'sMAPE', 'NormMAPE', 'DirAcc', 'MASE').
        filename: Output file name.
    """
    try:
        plt.figure(figsize=(12, 8))
        metrics = ['RMSE', 'MAE', 'MAPE', 'sMAPE', 'NormMAPE', 'DirAcc', 'MASE']
        for i, metric in enumerate(metrics, 1):
            plt.subplot(3, 3, i)
            sns.barplot(x='Model', y=metric, data=metrics_df)
            plt.title(f'{metric} Comparison')
            plt.xticks(rotation=45)
        # Lay the grid out once, after every panel exists, instead of
        # recomputing the layout after each subplot.
        plt.tight_layout()
        plt.savefig(os.path.join(CONFIG['img_dir'], filename), dpi=300)
        logger.info(f"Saved metrics bar plot: {filename}")
    except Exception as e:
        logger.error(f"Error saving metrics bar plot: {str(e)}")
    finally:
        plt.close()

def plot_residual_acf(residuals, title, filename):
    """Plot ACF of residuals.

    Saves the autocorrelation plot (up to 20 lags, fewer for short inputs)
    into CONFIG['img_dir'] under ``filename``. Nothing is plotted when fewer
    than two residuals are available. Failures are logged, not raised.

    Args:
        residuals: Residual values (must support ``len``), or None.
        title: Plot title, also used in log messages.
        filename: Output file name.
    """
    if residuals is None or len(residuals) < 2:
        logger.warning(f"Skipping ACF: Insufficient residual data - {title}")
        return
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(5, 3))
        # len(residuals) >= 2 here, so at least one lag is always available.
        max_lags = min(20, len(residuals) - 1)
        # Pass the axes explicitly: without ``ax`` plot_acf opens a figure of
        # its own, leaving the one created above unused and never closed.
        plot_acf(residuals, lags=max_lags, title=title, ax=ax)
        fig.tight_layout()
        fig.savefig(os.path.join(CONFIG['img_dir'], filename), dpi=300)
        logger.info(f"Saved ACF plot: {filename}")
    except Exception as e:
        logger.error(f"Error saving ACF plot: {str(e)}")
    finally:
        if fig is not None:
            plt.close(fig)

def run_exponential_smoothing(train, test, forecast_index, seasonal_periods=CONFIG['seasonal_periods']):
    """Fit additive-trend, additive-seasonal exponential smoothing and score it.

    Forecasts CONFIG['forecast_horizon'] steps, re-labels them with
    ``forecast_index`` and builds a band of +/- 1.96 residual standard
    deviations around the forecast.

    Returns:
        ``(forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc,
        mase, (ci_lower, ci_upper))``. On any error the failure is logged and
        the forecast, residuals and interval are None with NaN metrics.
    """
    start_time = time.time()
    try:
        smoother = ExponentialSmoothing(
            train, trend='add', seasonal='add', seasonal_periods=seasonal_periods
        )
        fitted = smoother.fit(optimized=True)
        raw_forecast = fitted.forecast(CONFIG['forecast_horizon'])
        forecast = pd.Series(raw_forecast.values, index=forecast_index)
        residuals = train - fitted.fittedvalues

        half_width = 1.96 * np.std(residuals)
        interval = (forecast - half_width, forecast + half_width)

        # Baseline used for MASE inside calculate_metrics.
        naive_forecast = train.shift(1).reindex(test.index).fillna(train.iloc[-1])
        metrics = calculate_metrics(test, forecast, naive_forecast)
        rmse = metrics[0]
        logger.info(f"Exponential Smoothing: RMSE={rmse:.4f}, Time={time.time() - start_time:.2f}s")
        return (forecast, residuals, *metrics, interval)
    except Exception as err:
        logger.error(f"Error Exponential Smoothing: {str(err)}")
        return None, None, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, None

def run_arima(train, test, forecast_index):
    """Run ARIMA model with Optuna tuning using statsmodels.

    The differencing order ``d`` comes from check_stationarity(); ``p`` and
    ``q`` (0..3 each) are tuned with CONFIG['optuna_trials'] Optuna trials,
    then the best order is refit and used to forecast
    CONFIG['forecast_horizon'] steps.

    Args:
        train: Training series.
        test: Held-out series used for tuning and scoring.
        forecast_index: Index assigned to the returned forecast.

    Returns:
        ``(forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc,
        mase, None)``. On any error the failure is logged and forecast and
        residuals are None with NaN metrics.
    """
    start_time = time.time()
    try:
        d = check_stationarity(train, "ARIMA")

        # NOTE(review): the objective scores each trial against `test`, the
        # same data the final metrics below are reported on, so the reported
        # errors are optimistic - consider tuning on a validation split.
        def objective(trial):
            p = trial.suggest_int('p', 0, 3)
            q = trial.suggest_int('q', 0, 3)
            try:
                model = SARIMAX(train, order=(p, d, q), trend='c')
                results = model.fit(disp=False, maxiter=100)
                forecast = results.forecast(steps=CONFIG['forecast_horizon'])
                rmse = np.sqrt(mean_squared_error(test, forecast))
                return rmse
            except Exception:
                # A failed fit is scored as worst-possible. ``Exception``
                # (not a bare except) keeps Ctrl-C able to stop the study.
                return float('inf')

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=CONFIG['optuna_trials'])
        best_params = study.best_params
        model = SARIMAX(train, order=(best_params['p'], d, best_params['q']), trend='c')
        results = model.fit(disp=False, maxiter=100)
        forecast = results.forecast(steps=CONFIG['forecast_horizon'])
        residuals = train - results.fittedvalues
        # Re-label positionally: handing pd.Series a Series plus an index
        # re-aligns by label and yields NaN wherever the labels differ.
        forecast = pd.Series(np.asarray(forecast), index=forecast_index)
        naive_forecast = train.shift(1).reindex(test.index).fillna(train.iloc[-1])
        rmse, mae, mape, smape, norm_mape, dir_acc, mase = calculate_metrics(test, forecast, naive_forecast)
        logger.info(f"ARIMA (order=({best_params['p']}, {d}, {best_params['q']})): RMSE={rmse:.4f}, Time={time.time() - start_time:.2f}s")
        return forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc, mase, None
    except Exception as e:
        logger.error(f"Error ARIMA: {str(e)}")
        return None, None, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, None

def run_sarima(train, test, forecast_index, seasonal_periods=CONFIG['seasonal_periods']):
    """Run SARIMA model with Optuna tuning using statsmodels.

    The differencing order ``d`` comes from check_stationarity(); ``p``/``q``
    (0..3) and the seasonal ``P``/``D``/``Q`` (0..1) are tuned with
    CONFIG['optuna_trials'] Optuna trials, then the best configuration is
    refit and used to forecast CONFIG['forecast_horizon'] steps.

    Args:
        train: Training series.
        test: Held-out series used for tuning and scoring.
        forecast_index: Index assigned to the returned forecast.
        seasonal_periods: Length of the seasonal cycle.

    Returns:
        ``(forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc,
        mase, None)``. On any error the failure is logged and forecast and
        residuals are None with NaN metrics.
    """
    start_time = time.time()
    try:
        d = check_stationarity(train, "SARIMA")

        # NOTE(review): the objective scores each trial against `test`, the
        # same data the final metrics below are reported on, so the reported
        # errors are optimistic - consider tuning on a validation split.
        def objective(trial):
            p = trial.suggest_int('p', 0, 3)
            q = trial.suggest_int('q', 0, 3)
            P = trial.suggest_int('P', 0, 1)
            D = trial.suggest_int('D', 0, 1)
            Q = trial.suggest_int('Q', 0, 1)
            try:
                model = SARIMAX(train, order=(p, d, q), seasonal_order=(P, D, Q, seasonal_periods), trend='c')
                results = model.fit(disp=False, maxiter=100)
                forecast = results.forecast(steps=CONFIG['forecast_horizon'])
                rmse = np.sqrt(mean_squared_error(test, forecast))
                return rmse
            except Exception:
                # A failed fit is scored as worst-possible. ``Exception``
                # (not a bare except) keeps Ctrl-C able to stop the study.
                return float('inf')

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=CONFIG['optuna_trials'])
        best_params = study.best_params
        model = SARIMAX(train, order=(best_params['p'], d, best_params['q']),
                        seasonal_order=(best_params['P'], best_params['D'], best_params['Q'], seasonal_periods),
                        trend='c')
        results = model.fit(disp=False, maxiter=100)
        forecast = results.forecast(steps=CONFIG['forecast_horizon'])
        residuals = train - results.fittedvalues
        # Re-label positionally: handing pd.Series a Series plus an index
        # re-aligns by label and yields NaN wherever the labels differ.
        forecast = pd.Series(np.asarray(forecast), index=forecast_index)
        naive_forecast = train.shift(1).reindex(test.index).fillna(train.iloc[-1])
        rmse, mae, mape, smape, norm_mape, dir_acc, mase = calculate_metrics(test, forecast, naive_forecast)
        logger.info(f"SARIMA (order=({best_params['p']}, {d}, {best_params['q']}), seasonal_order=({best_params['P']}, {best_params['D']}, {best_params['Q']}, {seasonal_periods})): RMSE={rmse:.4f}, Time={time.time() - start_time:.2f}s")
        return forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc, mase, None
    except Exception as e:
        logger.error(f"Error SARIMA: {str(e)}")
        return None, None, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, None

def run_sarimax(train, test, forecast_index, exog_train, exog_test):
    """Run SARIMAX model with Optuna tuning using statsmodels.

    Gaps in both exogenous frames are filled first (linear interpolation,
    then back/forward fill). The differencing order ``d`` comes from
    check_stationarity(); ``p``/``q`` (0..2) and the seasonal
    ``P``/``D``/``Q`` (0..1, cycle length CONFIG['seasonal_periods']) are
    tuned with CONFIG['optuna_trials'] Optuna trials, then the best
    configuration is refit and used to forecast CONFIG['forecast_horizon']
    steps.

    Args:
        train: Training series.
        test: Held-out series used for tuning and scoring.
        forecast_index: Index assigned to the returned forecast.
        exog_train: Exogenous regressors aligned with ``train``.
        exog_test: Exogenous regressors for the forecast steps.

    Returns:
        ``(forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc,
        mase, None)``. On any error the failure is logged and forecast and
        residuals are None with NaN metrics.
    """
    start_time = time.time()
    try:
        exog_train = exog_train.interpolate(method='linear').bfill().ffill()
        exog_test = exog_test.interpolate(method='linear').bfill().ffill()
        d = check_stationarity(train, "SARIMAX")

        # NOTE(review): the objective scores each trial against `test`, the
        # same data the final metrics below are reported on, so the reported
        # errors are optimistic - consider tuning on a validation split.
        def objective(trial):
            p = trial.suggest_int('p', 0, 2)
            q = trial.suggest_int('q', 0, 2)
            P = trial.suggest_int('P', 0, 1)
            D = trial.suggest_int('D', 0, 1)
            Q = trial.suggest_int('Q', 0, 1)
            try:
                model = SARIMAX(train, exog=exog_train, order=(p, d, q),
                                seasonal_order=(P, D, Q, CONFIG['seasonal_periods']), trend='c')
                results = model.fit(disp=False, maxiter=100)
                forecast = results.forecast(steps=CONFIG['forecast_horizon'], exog=exog_test)
                rmse = np.sqrt(mean_squared_error(test, forecast))
                return rmse
            except Exception:
                # A failed fit is scored as worst-possible. ``Exception``
                # (not a bare except) keeps Ctrl-C able to stop the study.
                return float('inf')

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=CONFIG['optuna_trials'])
        best_params = study.best_params
        model = SARIMAX(train, exog=exog_train, order=(best_params['p'], d, best_params['q']),
                        seasonal_order=(best_params['P'], best_params['D'], best_params['Q'], CONFIG['seasonal_periods']),
                        trend='c')
        results = model.fit(disp=False, maxiter=100)
        forecast = results.forecast(steps=CONFIG['forecast_horizon'], exog=exog_test)
        residuals = train - results.fittedvalues
        # Re-label positionally: handing pd.Series a Series plus an index
        # re-aligns by label and yields NaN wherever the labels differ.
        forecast = pd.Series(np.asarray(forecast), index=forecast_index)
        naive_forecast = train.shift(1).reindex(test.index).fillna(train.iloc[-1])
        rmse, mae, mape, smape, norm_mape, dir_acc, mase = calculate_metrics(test, forecast, naive_forecast)
        logger.info(f"SARIMAX (order=({best_params['p']}, {d}, {best_params['q']}), seasonal_order=({best_params['P']}, {best_params['D']}, {best_params['Q']}, {CONFIG['seasonal_periods']})): RMSE={rmse:.4f}, Time={time.time() - start_time:.2f}s")
        return forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc, mase, None
    except Exception as e:
        logger.error(f"Error SARIMAX: {str(e)}")
        return None, None, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, None

def run_prophet(train, test, forecast_index, exog_train=None, exog_test=None):
    """Run Prophet model with Optuna tuning.

    ``changepoint_prior_scale`` and ``seasonality_prior_scale`` are tuned with
    CONFIG['optuna_trials'] Optuna trials; the best setting is refit and used
    to predict the dates in ``forecast_index``. Columns of ``exog_train`` /
    ``exog_test`` are added as extra regressors when given.

    Args:
        train: Training series (its index supplies the 'ds' column).
        test: Held-out series used for tuning and scoring.
        forecast_index: Dates to predict; also the index of the forecast.
        exog_train: Optional regressors aligned row-by-row with ``train``.
        exog_test: Optional regressors aligned row-by-row with
            ``forecast_index``.

    Returns:
        ``(forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc,
        mase, (ci_lower, ci_upper))``, the interval coming from Prophet's
        ``yhat_lower`` / ``yhat_upper``. On any error the failure is logged
        and forecast, residuals and interval are None with NaN metrics.
    """
    # NOTE(review): `Prophet` is not imported in the visible part of this
    # file; unless it is brought into scope elsewhere every call ends in the
    # error branch below - confirm the import exists.
    start_time = time.time()
    try:
        df_train = pd.DataFrame({'ds': train.index, 'y': train.values})
        if exog_train is not None:
            for col in exog_train.columns:
                df_train[col] = exog_train[col].values

        # NOTE(review): the objective scores each trial against `test`, the
        # same data the final metrics below are reported on, so the reported
        # errors are optimistic - consider tuning on a validation split.
        def objective(trial):
            changepoint_scale = trial.suggest_float('changepoint_prior_scale', 0.01, 0.5, log=True)
            seasonality_scale = trial.suggest_float('seasonality_prior_scale', 0.1, 10.0, log=True)
            try:
                model = Prophet(
                    yearly_seasonality=True,
                    weekly_seasonality=False,
                    daily_seasonality=False,
                    changepoint_prior_scale=changepoint_scale,
                    seasonality_prior_scale=seasonality_scale
                )
                if exog_train is not None:
                    for col in exog_train.columns:
                        model.add_regressor(col)
                model.fit(df_train)
                future = pd.DataFrame({'ds': forecast_index})
                if exog_test is not None:
                    for col in exog_test.columns:
                        future[col] = exog_test[col].values
                forecast = model.predict(future)
                rmse = np.sqrt(mean_squared_error(test, forecast['yhat']))
                return rmse
            except Exception:
                # Score a failed trial as worst-possible instead of letting
                # one bad fit abort the whole study.
                return float('inf')

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=CONFIG['optuna_trials'])
        best_params = study.best_params
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False,
            changepoint_prior_scale=best_params['changepoint_prior_scale'],
            seasonality_prior_scale=best_params['seasonality_prior_scale']
        )
        if exog_train is not None:
            for col in exog_train.columns:
                model.add_regressor(col)
        model.fit(df_train)
        future = pd.DataFrame({'ds': forecast_index})
        if exog_test is not None:
            for col in exog_test.columns:
                future[col] = exog_test[col].values
        forecast = model.predict(future)
        forecast_series = pd.Series(forecast['yhat'].values, index=forecast_index)
        # Subtract positionally: the in-sample prediction carries a fresh
        # integer index, so a label-aligned subtraction against the
        # date-indexed ``train`` would produce nothing but NaN.
        in_sample = model.predict(df_train)['yhat'].values
        residuals = pd.Series(np.asarray(train.values, dtype=float) - in_sample, index=train.index)
        ci_lower = pd.Series(forecast['yhat_lower'].values, index=forecast_index)
        ci_upper = pd.Series(forecast['yhat_upper'].values, index=forecast_index)
        naive_forecast = train.shift(1).reindex(test.index).fillna(train.iloc[-1])
        rmse, mae, mape, smape, norm_mape, dir_acc, mase = calculate_metrics(test, forecast_series, naive_forecast)
        logger.info(f"Prophet: RMSE={rmse:.4f}, Time={time.time() - start_time:.2f}s")
        return forecast_series, residuals, rmse, mae, mape, smape, norm_mape, dir_acc, mase, (ci_lower, ci_upper)
    except Exception as e:
        logger.error(f"Error Prophet: {str(e)}")
        return None, None, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, None

def generate_report(results_df, forecasts_cpi, img_dir):
    """Generate HTML report of model performance.

    Writes ``report.html`` into ``img_dir``: the results table with the best
    value of each metric highlighted (minimum for the error metrics, maximum
    for 'DirAcc'). Failures are logged, not raised.

    Args:
        results_df: DataFrame with the columns 'RMSE', 'MAE', 'MAPE', 'sMAPE',
            'NormMAPE', 'MASE' and 'DirAcc'.
        forecasts_cpi: Not used by this function; kept for the callers.
        img_dir: Directory the report is written to.
    """
    try:
        error_metrics = ['RMSE', 'MAE', 'MAPE', 'sMAPE', 'NormMAPE', 'MASE']
        report = (
            results_df.style
            .highlight_min(subset=error_metrics, color='lightgreen')
            .highlight_max(subset=['DirAcc'], color='lightgreen')
            .set_caption("Model Performance Summary")
        )
        # Fix the encoding explicitly; the platform default varies and can
        # fail on non-ASCII text.
        with open(os.path.join(img_dir, 'report.html'), 'w', encoding='utf-8') as f:
            f.write(report.to_html())
        logger.info("Generated report at report.html")
    except Exception as e:
        logger.error(f"Error generating report: {str(e)}")

def run_model_for_target(target, train, test, forecast_index, model_name, model_func, params, retries=3):
    """Run a model for a target variable with retries.

    Calls ``model_func`` up to ``retries`` times. An attempt fails when the
    call raises, returns no forecast, or returns a NaN RMSE. On the first
    successful attempt the forecast plot and the residual ACF plot are saved
    and the results are returned.

    Args:
        target: Name of the target column in ``train`` / ``test``.
        train: Training DataFrame.
        test: Held-out DataFrame.
        forecast_index: Index of the forecast steps.
        model_name: Display name; 'SARIMAX' and 'Prophet' additionally receive
            the 'oil_price' / 'gold_price' columns as exogenous inputs.
        model_func: One of the ``run_*`` model functions.
        params: Extra keyword arguments forwarded to ``model_func``.
        retries: Maximum number of attempts.

    Returns:
        Dict with the keys 'Target', 'Model', the seven metrics, 'Forecast',
        'Residuals' and 'CI', or None when every attempt failed.
    """
    logger.info(f"Running {model_name} for {target}")
    start_time = time.time()
    for attempt in range(retries):
        try:
            if model_name in ('SARIMAX', 'Prophet'):
                exog_vars = ['oil_price', 'gold_price']
                exog_train = train[exog_vars]
                # Align the regressors with the forecast steps and fill any
                # step that has no matching row in ``test``.
                exog_test = test[exog_vars].reindex(forecast_index).interpolate(method='linear').bfill().ffill()
                forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc, mase, ci = model_func(
                    train[target], test[target], forecast_index, exog_train, exog_test, **params
                )
            else:
                forecast, residuals, rmse, mae, mape, smape, norm_mape, dir_acc, mase, ci = model_func(
                    train[target], test[target], forecast_index, **params
                )
            if forecast is None or pd.isna(rmse):
                logger.warning(f"{model_name} for {target} failed to produce valid forecast or RMSE")
                continue
            plot_forecast(train[target][-36:], test[target], forecast, forecast_index,
                          f'{model_name} Forecast for {target}', target, f'{target}_{model_name}_forecast.png', ci)
            if residuals is not None:
                plot_residual_acf(residuals.dropna(), f'ACF of Residuals - {model_name} ({target})',
                                  f'{target}_{model_name}_acf.png')
            logger.info(f"Completed {model_name} for {target} in {time.time() - start_time:.2f}s")
            return {
                'Target': target,
                'Model': model_name,
                'RMSE': rmse,
                'MAE': mae,
                'MAPE': mape,
                'sMAPE': smape,
                'NormMAPE': norm_mape,
                'DirAcc': dir_acc,
                'MASE': mase,
                'Forecast': forecast,
                'Residuals': residuals,
                'CI': ci
            }
        except Exception as e:
            logger.warning(f"Attempt {attempt+1} failed for {model_name}: {str(e)}")
    # Reached whenever no attempt succeeded, whether the attempts raised or
    # merely returned an unusable forecast.
    logger.error(f"All retries failed for {model_name} on {target}")
    return None

def main():
    """Main function to run time series forecasting.

    Pipeline: load ``data/data.csv`` (columns 'date', 'cpi', 'oil_price',
    'gold_price'), validate it, clean outliers, build features, hold out the
    last CONFIG['forecast_horizon'] rows as the test split, run every model
    in parallel, then write the comparison plots, the metrics CSV, the HTML
    report and the combined forecast CSV into the output directory.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        data = pd.read_csv('data/data.csv')
        data['time'] = pd.to_datetime(data['date'])
        data.set_index('time', inplace=True)
        required_columns = ['cpi', 'oil_price', 'gold_price']
        data = validate_input_data(data, required_columns)

        for col in required_columns:
            data[col] = detect_outliers(data[col], method='iqr')

        data_features = create_features(data, 'cpi')

        train_size = len(data_features) - CONFIG['forecast_horizon']
        train, test = data_features[:train_size], data_features[train_size:]
        # NOTE(review): assumes month-start dated monthly data (freq='MS').
        forecast_index = pd.date_range(start=test.index[0], periods=CONFIG['forecast_horizon'], freq='MS')

        plot_decomposition(data['cpi'], period=CONFIG['seasonal_periods'], filename='cpi_decomposition.png')

        models = {
            'ARIMA': (run_arima, {}),
            'Exponential Smoothing': (run_exponential_smoothing, {}),
            'Prophet': (run_prophet, {}),
            'SARIMA': (run_sarima, {}),
            'SARIMAX': (run_sarimax, {})
        }

        results = []
        forecasts_cpi = {}
        metrics_cpi = {}
        logger.info("Running models for CPI")
        tasks = [delayed(wrap_non_picklable_objects(run_model_for_target))('cpi', train, test, forecast_index, model_name, model_func, params)
                 for model_name, (model_func, params) in models.items()]
        model_results = Parallel(n_jobs=CONFIG['n_jobs'], verbose=1)(tasks)

        metric_keys = ('Target', 'Model', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'NormMAPE', 'DirAcc', 'MASE')
        for result in model_results:
            if result is None:
                logger.warning("Result for a CPI model is None, skipping!")
                continue
            results.append({key: result[key] for key in metric_keys})
            forecasts_cpi[result['Model']] = result['Forecast']
            metrics_cpi[result['Model']] = {'RMSE': result['RMSE']}

        if forecasts_cpi:
            plot_comparison_forecasts(train['cpi'][-36:], test['cpi'], forecasts_cpi, forecast_index,
                                     'Comparison of Forecasts for CPI', 'CPI', 'cpi_model_comparison.png',
                                     metrics=metrics_cpi)

        results_df = pd.DataFrame(results)
        print(results_df)
        results_df.to_csv(CONFIG['results_file'], index=False)
        logger.info(f"Results saved to {CONFIG['results_file']}")

        if not results_df.empty:
            plot_metrics_bar(results_df, 'cpi_metrics_comparison.png')
            generate_report(results_df, forecasts_cpi, img_dir)

        if forecasts_cpi:
            combined_forecast = pd.DataFrame({'Date': forecast_index})
            for model_name, forecast in forecasts_cpi.items():
                # Assign raw values: the forecasts are date-indexed while this
                # frame has a default integer index, so assigning the Series
                # itself would align by label and leave the column all NaN.
                combined_forecast[f'{model_name}_cpi'] = np.asarray(forecast)
            combined_forecast.to_csv(f'{img_dir}/combined_forecast_cpi.csv', index=False)
            logger.info(f"Combined forecasts saved to {img_dir}/combined_forecast_cpi.csv")

    except Exception as e:
        logger.error(f"Main program error: {str(e)}")
        raise

# Script entry point: run the forecasting pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 5 out of 5 | elapsed:  9.1min finished


  Target                  Model      RMSE       MAE      MAPE     sMAPE  \
0    cpi                  ARIMA  0.305111  0.209585  0.208702  0.208942   
1    cpi  Exponential Smoothing  0.267943  0.228681  0.228223  0.228172   
2    cpi                 SARIMA  0.179295  0.159355  0.158901  0.158919   
3    cpi                SARIMAX  0.208344  0.173125  0.172700  0.172629   

   NormMAPE     DirAcc      MASE  
0  0.002082  63.636364  0.929611  
1  0.002277  36.363636  1.014313  
2  0.001585  27.272727  0.706817  
3  0.001723  27.272727  0.767893  
