In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

def load_data(file_path):
    """Load and prepare the time series data"""
    df = pd.read_csv(file_path)
    
    # Convert time column to datetime
    df['time'] = pd.to_datetime(df['time'])
    
    # Set time as index
    df.set_index('time', inplace=True)
    
    return df

def engineer_features(df):
    """Create additional features for modeling"""
    df_copy = df.copy()
    
    # Create lag features
    for i in range(1, 13):
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_lag_{i}'] = df_copy['cpi_mom'].shift(i)
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_lag_{i}'] = df_copy['cpi_yoy'].shift(i)
    
    # Create rolling window features
    for window in [3, 6, 12]:
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_rolling_mean_{window}'] = df_copy['cpi_mom'].rolling(window=window).mean()
            df_copy[f'cpi_mom_rolling_std_{window}'] = df_copy['cpi_mom'].rolling(window=window).std()
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_rolling_mean_{window}'] = df_copy['cpi_yoy'].rolling(window=window).mean()
            df_copy[f'cpi_yoy_rolling_std_{window}'] = df_copy['cpi_yoy'].rolling(window=window).std()
    
    # Create economic indicator lag features
    for i in range(1, 4):
        if 'oil_price' in df_copy.columns:
            df_copy[f'oil_price_lag_{i}'] = df_copy['oil_price'].shift(i)
        if 'gold_price' in df_copy.columns:
            df_copy[f'gold_price_lag_{i}'] = df_copy['gold_price'].shift(i)
        if 'interest_rate' in df_copy.columns:
            df_copy[f'interest_rate_lag_{i}'] = df_copy['interest_rate'].shift(i)
    
    # Add month and year as cyclical features
    if 'month' in df_copy.columns:
        df_copy['month_sin'] = np.sin(2 * np.pi * df_copy['month']/12)
        df_copy['month_cos'] = np.cos(2 * np.pi * df_copy['month']/12)
    
    # Create interaction features
    if all(col in df_copy.columns for col in ['oil_price', 'gold_price']):
        df_copy['oil_gold_ratio'] = df_copy['oil_price'] / df_copy['gold_price']
    
    # Drop rows with NaN values (due to lag features)
    df_clean = df_copy.dropna()
    
    return df_clean

def calculate_metrics(actual, predicted):
    """Calculate evaluation metrics"""
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    
    return rmse, mae, mape, r2

def run_linear_regression(df, target_col, test_size=0.2, model_type='linear', tune_hyperparams=True):
    """
    Train and evaluate a Linear Regression model for time series forecasting
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with engineered features
    target_col : str
        Name of the target column
    test_size : float
        Proportion of data to use for testing
    model_type : str
        Type of linear model ('linear', 'ridge', 'lasso', or 'elastic_net')
    tune_hyperparams : bool
        Whether to perform hyperparameter tuning (only for ridge, lasso, elastic_net)
        
    Returns:
    --------
    tuple
        Model, predictions, coefficients, metrics
    """
    print(f"Running {model_type.capitalize()} Regression model for {target_col}...")
    
    # Define features and target
    X = df.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df.columns], axis=1)
    y = df[target_col]
    
    # Split data into training and testing sets (time-based split)
    split_idx = int(len(df) * (1 - test_size))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")
    
    # Create pipeline with scaling
    if model_type == 'linear':
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', LinearRegression())
        ])
        
        # No hyperparameters to tune for standard linear regression
        pipeline.fit(X_train, y_train)
        
    elif model_type == 'ridge' and tune_hyperparams:
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', Ridge(random_state=42))
        ])
        
        # Define parameter grid
        param_grid = {
            'model__alpha': np.logspace(-3, 3, 20)
        }
        
        # Use TimeSeriesSplit for cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        
        # Create and fit GridSearchCV
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=tscv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        
        grid_search.fit(X_train, y_train)
        
        # Get best parameters and model
        best_params = grid_search.best_params_
        print(f"Best parameters: {best_params}")
        
        # Use the best model
        pipeline = grid_search.best_estimator_
        
    elif model_type == 'lasso' and tune_hyperparams:
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', Lasso(random_state=42))
        ])
        
        # Define parameter grid
        param_grid = {
            'model__alpha': np.logspace(-3, 3, 20)
        }
        
        # Use TimeSeriesSplit for cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        
        # Create and fit GridSearchCV
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=tscv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        
        grid_search.fit(X_train, y_train)
        
        # Get best parameters and model
        best_params = grid_search.best_params_
        print(f"Best parameters: {best_params}")
        
        # Use the best model
        pipeline = grid_search.best_estimator_
        
    elif model_type == 'elastic_net' and tune_hyperparams:
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', ElasticNet(random_state=42))
        ])
        
        # Define parameter grid
        param_grid = {
            'model__alpha': np.logspace(-3, 3, 10),
            'model__l1_ratio': np.linspace(0.1, 0.9, 9)
        }
        
        # Use TimeSeriesSplit for cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        
        # Create and fit GridSearchCV
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=tscv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        
        grid_search.fit(X_train, y_train)
        
        # Get best parameters and model
        best_params = grid_search.best_params_
        print(f"Best parameters: {best_params}")
        
        # Use the best model
        pipeline = grid_search.best_estimator_
        
    else:
        # Use default parameters if not tuning
        if model_type == 'ridge':
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('model', Ridge(alpha=1.0, random_state=42))
            ])
        elif model_type == 'lasso':
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('model', Lasso(alpha=0.1, random_state=42))
            ])
        elif model_type == 'elastic_net':
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('model', ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42))
            ])
        else:
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('model', LinearRegression())
            ])
        
        pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    
    # Calculate metrics
    train_rmse, train_mae, train_mape, train_r2 = calculate_metrics(y_train, y_pred_train)
    test_rmse, test_mae, test_mape, test_r2 = calculate_metrics(y_test, y_pred_test)
    
    print(f"Training - RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, MAPE: {train_mape:.2f}%, R²: {train_r2:.4f}")
    print(f"Testing - RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}, MAPE: {test_mape:.2f}%, R²: {test_r2:.4f}")
    
    # Get coefficients
    model = pipeline.named_steps['model']
    coefficients = pd.DataFrame({
        'Feature': X_train.columns,
        'Coefficient': model.coef_
    }).sort_values('Coefficient', key=abs, ascending=False)
    
    # Add intercept
    if hasattr(model, 'intercept_'):
        print(f"Intercept: {model.intercept_:.4f}")
    
    # Plot coefficients
    plt.figure(figsize=(12, 8))
    top_features = coefficients.head(20)
    sns.barplot(x='Coefficient', y='Feature', data=top_features)
    plt.title(f'Top 20 Coefficients for {model_type.capitalize()} Regression - {target_col}')
    plt.tight_layout()
    plt.savefig(f'plots/linear_reg_coefficients_{model_type}_{target_col}.png')
    plt.close()
    
    # Plot actual vs predicted
    plt.figure(figsize=(14, 7))
    plt.plot(df.index[-len(y_test):], y_test, label='Actual', color='blue')
    plt.plot(df.index[-len(y_test):], y_pred_test, label='Predicted', color='red', linestyle='--')
    plt.title(f'{model_type.capitalize()} Regression: Actual vs Predicted {target_col}')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/linear_reg_predictions_{model_type}_{target_col}.png')
    plt.close()
    
    # Create a DataFrame with predictions
    predictions = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred_test,
        'Error': y_test - y_pred_test
    })
    
    # Plot error distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(predictions['Error'], kde=True)
    plt.title(f'Error Distribution for {model_type.capitalize()} Regression - {target_col}')
    plt.xlabel('Error')
    plt.savefig(f'plots/linear_reg_error_distribution_{model_type}_{target_col}.png')
    plt.close()
    
    # Plot residuals vs fitted values
    plt.figure(figsize=(10, 6))
    plt.scatter(y_pred_test, predictions['Error'])
    plt.axhline(y=0, color='r', linestyle='-')
    plt.title(f'Residuals vs Fitted Values for {model_type.capitalize()} Regression - {target_col}')
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.savefig(f'plots/linear_reg_residuals_{model_type}_{target_col}.png')
    plt.close()
    
    # Return results
    metrics = {
        'train_rmse': train_rmse,
        'train_mae': train_mae,
        'train_mape': train_mape,
        'train_r2': train_r2,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_mape': test_mape,
        'test_r2': test_r2
    }
    
    return pipeline, predictions, coefficients, metrics

def recursive_forecast(model, df, target_col, forecast_horizon=12):
    """
    Generate recursive forecasts using the trained model
    
    Parameters:
    -----------
    model : Pipeline
        Trained model pipeline
    df : pd.DataFrame
        Input dataframe with features
    target_col : str
        Name of the target column
    forecast_horizon : int
        Number of periods to forecast
        
    Returns:
    --------
    pd.Series
        Forecasted values
    """
    print(f"Generating {forecast_horizon} period forecast...")
    
    # Create a copy of the dataframe
    df_forecast = df.copy()
    
    # Get the last date in the dataframe
    last_date = df_forecast.index[-1]
    
    # Create a list to store forecasts
    forecasts = []
    
    # Generate forecasts recursively
    for i in range(forecast_horizon):
        # Get the features for the next period
        if i == 0:
            # For the first forecast, use the last row of the dataframe
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        else:
            # Update the dataframe with the previous forecast
            # This requires updating all the lag features, rolling means, etc.
            # For simplicity, we'll just use the last forecast as a naive prediction
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        
        # Make prediction
        forecast = model.predict(X_next)[0]
        
        # Store forecast
        forecasts.append(forecast)
        
        # Create next date
        next_date = last_date + pd.DateOffset(months=i+1)
        
        # For a more accurate implementation, we would need to update all features
        # based on the new forecast, but this is a simplified version
    
    # Create a Series with the forecasts
    future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=forecast_horizon, freq='MS')
    forecast_series = pd.Series(forecasts, index=future_dates)
    
    # Plot historical data with forecasts
    plt.figure(figsize=(14, 7))
    plt.plot(df[target_col].index, df[target_col], label='Historical Data')
    plt.plot(forecast_series.index, forecast_series, label='Forecast', color='red', linestyle='--')
    plt.title(f'Linear Regression: {target_col} Forecast')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/linear_reg_future_forecast_{target_col}.png')
    plt.close()
    
    return forecast_series

def main():
    # Create plots directory if it doesn't exist
    import os
    if not os.path.exists('plots'):
        os.makedirs('plots')
    
    # Load data
    file_path = 'data/analyzed_time_series.csv'
    df = load_data(file_path)
    
    # Engineer features
    df_engineered = engineer_features(df)
    
    # List of models to run
    models = ['linear', 'ridge', 'lasso', 'elastic_net']
    
    # Dictionary to store results
    results = {}
    
    # Run models for CPI Year-over-Year
    target_col = 'cpi_yoy'
    for model_type in models:
        print(f"\n{'-'*50}")
        print(f"Running {model_type.capitalize()} Regression for {target_col}")
        print(f"{'-'*50}")
        
        model, predictions, coefficients, metrics = run_linear_regression(
            df_engineered, target_col, test_size=0.2, model_type=model_type, tune_hyperparams=True
        )
        
        results[f"{model_type}_yoy"] = {
            'model': model,
            'predictions': predictions,
            'coefficients': coefficients,
            'metrics': metrics
        }
        
        # Generate future forecasts
        forecast = recursive_forecast(model, df_engineered, target_col, forecast_horizon=24)
        print(f"\nFuture {target_col} forecasts:")
        print(forecast)
        
        results[f"{model_type}_yoy"]['forecast'] = forecast
    
    # Run models for CPI Month-over-Month
    target_col = 'cpi_mom'
    for model_type in models:
        print(f"\n{'-'*50}")
        print(f"Running {model_type.capitalize()} Regression for {target_col}")
        print(f"{'-'*50}")
        
        model, predictions, coefficients, metrics = run_linear_regression(
            df_engineered, target_col, test_size=0.2, model_type=model_type, tune_hyperparams=True
        )
        
        results[f"{model_type}_mom"] = {
            'model': model,
            'predictions': predictions,
            'coefficients': coefficients,
            'metrics': metrics
        }
        
        # Generate future forecasts
        forecast = recursive_forecast(model, df_engineered, target_col, forecast_horizon=24)
        print(f"\nFuture {target_col} forecasts:")
        print(forecast)
        
        results[f"{model_type}_mom"]['forecast'] = forecast
    
    # Compare models
    print("\n\nModel Comparison - CPI Year-over-Year:")
    print("-" * 80)
    print(f"{'Model':<15} {'Train RMSE':<12} {'Test RMSE':<12} {'Train R²':<12} {'Test R²':<12}")
    print("-" * 80)
    for model_type in models:
        metrics = results[f"{model_type}_yoy"]['metrics']
        print(f"{model_type.capitalize():<15} {metrics['train_rmse']:<12.4f} {metrics['test_rmse']:<12.4f} {metrics['train_r2']:<12.4f} {metrics['test_r2']:<12.4f}")
    
    print("\n\nModel Comparison - CPI Month-over-Month:")
    print("-" * 80)
    print(f"{'Model':<15} {'Train RMSE':<12} {'Test RMSE':<12} {'Train R²':<12} {'Test R²':<12}")
    print("-" * 80)
    for model_type in models:
        metrics = results[f"{model_type}_mom"]['metrics']
        print(f"{model_type.capitalize():<15} {metrics['train_rmse']:<12.4f} {metrics['test_rmse']:<12.4f} {metrics['train_r2']:<12.4f} {metrics['test_r2']:<12.4f}")
    
    # Save metrics to CSV
    metrics_data = []
    for model_type in models:
        metrics_yoy = results[f"{model_type}_yoy"]['metrics']
        metrics_mom = results[f"{model_type}_mom"]['metrics']
        
        metrics_data.append({
            'Model': f"{model_type.capitalize()} Regression",
            'Target': 'CPI YoY',
            'Train_RMSE': metrics_yoy['train_rmse'],
            'Train_MAE': metrics_yoy['train_mae'],
            'Train_MAPE': metrics_yoy['train_mape'],
            'Train_R2': metrics_yoy['train_r2'],
            'Test_RMSE': metrics_yoy['test_rmse'],
            'Test_MAE': metrics_yoy['test_mae'],
            'Test_MAPE': metrics_yoy['test_mape'],
            'Test_R2': metrics_yoy['test_r2']
        })
        
        metrics_data.append({
            'Model': f"{model_type.capitalize()} Regression",
            'Target': 'CPI MoM',
            'Train_RMSE': metrics_mom['train_rmse'],
            'Train_MAE': metrics_mom['train_mae'],
            'Train_MAPE': metrics_mom['train_mape'],
            'Train_R2': metrics_mom['train_r2'],
            'Test_RMSE': metrics_mom['test_rmse'],
            'Test_MAE': metrics_mom['test_mae'],
            'Test_MAPE': metrics_mom['test_mape'],
            'Test_R2': metrics_mom['test_r2']
        })
    
    metrics_df = pd.DataFrame(metrics_data)
    metrics_df.to_csv('plots/linear_regression_metrics.csv', index=False)
    print("\nMetrics saved to plots/linear_regression_metrics.csv")

if __name__ == "__main__":
    main()


--------------------------------------------------
Running Linear Regression for cpi_yoy
--------------------------------------------------
Running Linear Regression model for cpi_yoy...
Training data: (269, 63), Test data: (68, 63)
Training - RMSE: 0.0000, MAE: 0.0000, MAPE: 0.00%, R²: 1.0000
Testing - RMSE: 0.0000, MAE: 0.0000, MAPE: 0.00%, R²: 1.0000
Intercept: 106.3346
Generating 24 period forecast...

Future cpi_yoy forecasts:
2025-01-01    102.96
2025-02-01    102.96
2025-03-01    102.96
2025-04-01    102.96
2025-05-01    102.96
2025-06-01    102.96
2025-07-01    102.96
2025-08-01    102.96
2025-09-01    102.96
2025-10-01    102.96
2025-11-01    102.96
2025-12-01    102.96
2026-01-01    102.96
2026-02-01    102.96
2026-03-01    102.96
2026-04-01    102.96
2026-05-01    102.96
2026-06-01    102.96
2026-07-01    102.96
2026-08-01    102.96
2026-09-01    102.96
2026-10-01    102.96
2026-11-01    102.96
2026-12-01    102.96
Freq: MS, dtype: float64

---------------------------------

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def load_data(file_path):
    """Load and prepare the time series data"""
    df = pd.read_csv(file_path)
    
    # Convert time column to datetime
    df['time'] = pd.to_datetime(df['time'])
    
    # Set time as index
    df.set_index('time', inplace=True)
    
    return df

def engineer_features(df):
    """Create additional features for modeling"""
    df_copy = df.copy()
    
    # Create lag features
    for i in range(1, 13):
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_lag_{i}'] = df_copy['cpi_mom'].shift(i)
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_lag_{i}'] = df_copy['cpi_yoy'].shift(i)
    
    # Create rolling window features
    for window in [3, 6, 12]:
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_rolling_mean_{window}'] = df_copy['cpi_mom'].rolling(window=window).mean()
            df_copy[f'cpi_mom_rolling_std_{window}'] = df_copy['cpi_mom'].rolling(window=window).std()
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_rolling_mean_{window}'] = df_copy['cpi_yoy'].rolling(window=window).mean()
            df_copy[f'cpi_yoy_rolling_std_{window}'] = df_copy['cpi_yoy'].rolling(window=window).std()
    
    # Create economic indicator lag features
    for i in range(1, 4):
        if 'oil_price' in df_copy.columns:
            df_copy[f'oil_price_lag_{i}'] = df_copy['oil_price'].shift(i)
        if 'gold_price' in df_copy.columns:
            df_copy[f'gold_price_lag_{i}'] = df_copy['gold_price'].shift(i)
        if 'interest_rate' in df_copy.columns:
            df_copy[f'interest_rate_lag_{i}'] = df_copy['interest_rate'].shift(i)
    
    # Add month and year as cyclical features
    if 'month' in df_copy.columns:
        df_copy['month_sin'] = np.sin(2 * np.pi * df_copy['month']/12)
        df_copy['month_cos'] = np.cos(2 * np.pi * df_copy['month']/12)
    
    # Create interaction features
    if all(col in df_copy.columns for col in ['oil_price', 'gold_price']):
        df_copy['oil_gold_ratio'] = df_copy['oil_price'] / df_copy['gold_price']
    
    # Drop rows with NaN values (due to lag features)
    df_clean = df_copy.dropna()
    
    return df_clean

def run_rf(df, target_col, test_size=0.2, tune_hyperparams=True):
    """
    Train and evaluate a Random Forest model for time series forecasting
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with engineered features
    target_col : str
        Name of the target column
    test_size : float
        Proportion of data to use for testing
    tune_hyperparams : bool
        Whether to perform hyperparameter tuning
        
    Returns:
    --------
    tuple
        Model, predictions, feature importances, and metrics
    """
    print(f"Running Random Forest model for {target_col}...")
    
    # Define features and target
    X = df.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df.columns], axis=1)
    y = df[target_col]
    
    # Split data into training and testing sets (time-based split)
    split_idx = int(len(df) * (1 - test_size))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")
    
    if tune_hyperparams:
        print("Performing hyperparameter tuning...")
        # Define parameter grid
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        
        # Use TimeSeriesSplit for cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        
        # Create and fit GridSearchCV
        rf_grid = GridSearchCV(
            estimator=RandomForestRegressor(random_state=42),
            param_grid=param_grid,
            cv=tscv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        
        rf_grid.fit(X_train, y_train)
        
        # Get best parameters and model
        best_params = rf_grid.best_params_
        print(f"Best parameters: {best_params}")
        
        # Train model with best parameters
        model = RandomForestRegressor(random_state=42, **best_params)
    else:
        # Use default parameters
        model = RandomForestRegressor(
            n_estimators=100,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42
        )
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate metrics
    train_mse = mean_squared_error(y_train, y_pred_train)
    train_rmse = np.sqrt(train_mse)
    train_mae = mean_absolute_error(y_train, y_pred_train)
    train_mape = np.mean(np.abs((y_train - y_pred_train) / y_train)) * 100
    train_r2 = r2_score(y_train, y_pred_train)
    
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mape = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100
    test_r2 = r2_score(y_test, y_pred_test)
    
    print(f"Training - RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, R²: {train_r2:.4f}")
    print(f"Testing - RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}, R²: {test_r2:.4f}")
    
    # Get feature importances
    feature_importances = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    # Plot feature importances
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importances.head(20))
    plt.title(f'Top 20 Feature Importances for {target_col}')
    plt.tight_layout()
    plt.savefig(f'plots/rf_feature_importance_{target_col}.png')
    plt.close()
    
    # Plot actual vs predicted
    plt.figure(figsize=(14, 7))
    plt.plot(df.index[-len(y_test):], y_test, label='Actual', color='blue')
    plt.plot(df.index[-len(y_test):], y_pred_test, label='Predicted', color='red', linestyle='--')
    plt.title(f'Random Forest: Actual vs Predicted {target_col}')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/rf_predictions_{target_col}.png')
    plt.close()
    
    # Create a DataFrame with predictions
    predictions = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred_test,
        'Error': y_test - y_pred_test
    })
    
    # Plot error distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(predictions['Error'], kde=True)
    plt.title(f'Error Distribution for {target_col}')
    plt.xlabel('Error')
    plt.savefig(f'plots/rf_error_distribution_{target_col}.png')
    plt.close()
    
    # Return results
    metrics = {
        'train_rmse': train_rmse,
        'train_mae': train_mae,
        'train_mape': train_mape,
        'train_r2': train_r2,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_mape': test_mape,
        'test_r2': test_r2
    }
    
    return model, predictions, feature_importances, metrics

def recursive_forecast(model, df, target_col, forecast_horizon=12):
    """
    Generate recursive forecasts using the trained model
    
    Parameters:
    -----------
    model : RandomForestRegressor
        Trained model
    df : pd.DataFrame
        Input dataframe with features
    target_col : str
        Name of the target column
    forecast_horizon : int
        Number of periods to forecast
        
    Returns:
    --------
    pd.Series
        Forecasted values
    """
    print(f"Generating {forecast_horizon} period forecast...")
    
    # Create a copy of the dataframe
    df_forecast = df.copy()
    
    # Get the last date in the dataframe
    last_date = df_forecast.index[-1]
    
    # Create a list to store forecasts
    forecasts = []
    
    # Generate forecasts recursively
    for i in range(forecast_horizon):
        # Get the features for the next period
        if i == 0:
            # For the first forecast, use the last row of the dataframe
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        else:
            # Update the dataframe with the previous forecast
            # This requires updating all the lag features, rolling means, etc.
            # For simplicity, we'll just use the last forecast as a naive prediction
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        
        # Make prediction
        forecast = model.predict(X_next)[0]
        
        # Store forecast
        forecasts.append(forecast)
        
        # Create next date
        next_date = last_date + pd.DateOffset(months=i+1)
        
        # For a more accurate implementation, we would need to update all features
        # based on the new forecast, but this is a simplified version
    
    # Create a Series with the forecasts
    future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=forecast_horizon, freq='MS')
    forecast_series = pd.Series(forecasts, index=future_dates)
    
    # Plot historical data with forecasts
    plt.figure(figsize=(14, 7))
    plt.plot(df[target_col].index, df[target_col], label='Historical Data')
    plt.plot(forecast_series.index, forecast_series, label='Forecast', color='red', linestyle='--')
    plt.title(f'Random Forest: {target_col} Forecast')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/rf_future_forecast_{target_col}.png')
    plt.close()
    
    return forecast_series

def main():
    # Create plots directory if it doesn't exist
    import os
    if not os.path.exists('plots'):
        os.makedirs('plots')
    
    # Load data
    file_path = 'data/analyzed_time_series.csv'
    df = load_data(file_path)
    
    # Engineer features
    df_engineered = engineer_features(df)
    
    # Run Random Forest for CPI Year-over-Year
    target_col = 'cpi_yoy'
    model_yoy, predictions_yoy, feature_importances_yoy, metrics_yoy = run_rf(
        df_engineered, target_col, test_size=0.2, tune_hyperparams=True
    )
    
    # Generate future forecasts for YoY
    forecast_yoy = recursive_forecast(model_yoy, df_engineered, target_col, forecast_horizon=24)
    print(f"\nFuture {target_col} forecasts:")
    print(forecast_yoy)
    
    # Run Random Forest for CPI Month-over-Month
    target_col = 'cpi_mom'
    model_mom, predictions_mom, feature_importances_mom, metrics_mom = run_rf(
        df_engineered, target_col, test_size=0.2, tune_hyperparams=True
    )
    
    # Generate future forecasts for MoM
    forecast_mom = recursive_forecast(model_mom, df_engineered, target_col, forecast_horizon=24)
    print(f"\nFuture {target_col} forecasts:")
    print(forecast_mom)

if __name__ == "__main__":
    main()

Running Random Forest model for cpi_yoy...
Training data: (269, 63), Test data: (68, 63)
Performing hyperparameter tuning...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Training - RMSE: 0.1181, MAE: 0.0510, R²: 0.9996
Testing - RMSE: 0.0550, MAE: 0.0405, R²: 0.9980
Generating 24 period forecast...

Future cpi_yoy forecasts:
2025-01-01    102.985725
2025-02-01    102.985725
2025-03-01    102.985725
2025-04-01    102.985725
2025-05-01    102.985725
2025-06-01    102.985725
2025-07-01    102.985725
2025-08-01    102.985725
2025-09-01    102.985725
2025-10-01    102.985725
2025-11-01    102.985725
2025-12-01    102.985725
2026-01-01    102.985725
2026-02-01    102.985725
2026-03-01    102.985725
2026-04-01    102.985725
2026-05-01    102.985725
2026-06-01    102.985725
2026-07-01    102.985725
2026-08-01    102.985725
2026-09-01    102.985725
2026-10-01    102.985725
20

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import shap
import warnings
warnings.filterwarnings('ignore')

def load_data(file_path):
    """Load and prepare the time series data"""
    df = pd.read_csv(file_path)
    
    # Convert time column to datetime
    df['time'] = pd.to_datetime(df['time'])
    
    # Set time as index
    df.set_index('time', inplace=True)
    
    return df

def engineer_features(df):
    """Create additional features for modeling"""
    df_copy = df.copy()
    
    # Create lag features
    for i in range(1, 13):
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_lag_{i}'] = df_copy['cpi_mom'].shift(i)
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_lag_{i}'] = df_copy['cpi_yoy'].shift(i)
    
    # Create rolling window features
    for window in [3, 6, 12]:
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_rolling_mean_{window}'] = df_copy['cpi_mom'].rolling(window=window).mean()
            df_copy[f'cpi_mom_rolling_std_{window}'] = df_copy['cpi_mom'].rolling(window=window).std()
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_rolling_mean_{window}'] = df_copy['cpi_yoy'].rolling(window=window).mean()
            df_copy[f'cpi_yoy_rolling_std_{window}'] = df_copy['cpi_yoy'].rolling(window=window).std()
    
    # Create economic indicator lag features
    for i in range(1, 4):
        if 'oil_price' in df_copy.columns:
            df_copy[f'oil_price_lag_{i}'] = df_copy['oil_price'].shift(i)
        if 'gold_price' in df_copy.columns:
            df_copy[f'gold_price_lag_{i}'] = df_copy['gold_price'].shift(i)
        if 'interest_rate' in df_copy.columns:
            df_copy[f'interest_rate_lag_{i}'] = df_copy['interest_rate'].shift(i)
    
    # Add month and year as cyclical features
    if 'month' in df_copy.columns:
        df_copy['month_sin'] = np.sin(2 * np.pi * df_copy['month']/12)
        df_copy['month_cos'] = np.cos(2 * np.pi * df_copy['month']/12)
    
    # Create interaction features
    if all(col in df_copy.columns for col in ['oil_price', 'gold_price']):
        df_copy['oil_gold_ratio'] = df_copy['oil_price'] / df_copy['gold_price']
    
    # Create momentum features
    if 'cpi_yoy' in df_copy.columns:
        df_copy['cpi_yoy_momentum'] = df_copy['cpi_yoy'].diff()
    if 'cpi_mom' in df_copy.columns:
        df_copy['cpi_mom_momentum'] = df_copy['cpi_mom'].diff()
    
    # Drop rows with NaN values (due to lag features)
    df_clean = df_copy.dropna()
    
    return df_clean

def calculate_metrics(actual, predicted):
    """Calculate evaluation metrics"""
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    
    return rmse, mae, mape, r2

def run_xgboost(df, target_col, test_size=0.2, tune_hyperparams=True):
    """
    Train and evaluate an XGBoost model for time series forecasting
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with engineered features
    target_col : str
        Name of the target column
    test_size : float
        Proportion of data to use for testing
    tune_hyperparams : bool
        Whether to perform hyperparameter tuning
        
    Returns:
    --------
    tuple
        Model, predictions, feature importances, metrics, and SHAP values
    """
    print(f"Running XGBoost model for {target_col}...")
    
    # Define features and target
    X = df.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df.columns], axis=1)
    y = df[target_col]
    
    # Split data into training and testing sets (time-based split)
    split_idx = int(len(df) * (1 - test_size))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")
    
    if tune_hyperparams:
        print("Performing hyperparameter tuning...")
        # Define parameter grid
        # param_grid = {
        #     'n_estimators': [100, 200, 300],
        #     'max_depth': [3, 5, 7, 9],
        #     'learning_rate': [0.01, 0.05, 0.1, 0.2],
        #     'subsample': [0.7, 0.8, 0.9, 1.0],
        #     'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        #     'gamma': [0, 0.1, 0.2],
        #     'min_child_weight': [1, 3, 5]
        # }
        
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'gamma': [0],
            'min_child_weight': [1, 3]
        }
        # Use TimeSeriesSplit for cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        
        # Create and fit GridSearchCV
        # xgb_grid = GridSearchCV(
        #     estimator=XGBRegressor(objective='reg:squarederror', random_state=42),
        #     param_grid=param_grid,
        #     cv=tscv,
        #     scoring='neg_mean_squared_error',
        #     n_jobs=-1,
        #     verbose=1
        # )
        xgb_random = RandomizedSearchCV(
            estimator=XGBRegressor(objective='reg:squarederror', random_state=42),
            param_distributions=param_grid,
            n_iter=10,  # ít hơn nhiều so với grid search
            cv=TimeSeriesSplit(n_splits=3),  # ít split hơn
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        xgb_random.fit(X_train, y_train)
        
        # Get best parameters and model
        best_params = xgb_random.best_params_
        print(f"Best parameters: {best_params}")
        
        # Train model with best parameters
        model = XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    else:
        # Use default parameters
        model = XGBRegressor(
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            gamma=0,
            min_child_weight=1,
            objective='reg:squarederror',
            random_state=42
        )
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate metrics
    train_rmse, train_mae, train_mape, train_r2 = calculate_metrics(y_train, y_pred_train)
    test_rmse, test_mae, test_mape, test_r2 = calculate_metrics(y_test, y_pred_test)
    
    print(f"Training - RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, MAPE: {train_mape:.2f}%, R²: {train_r2:.4f}")
    print(f"Testing - RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}, MAPE: {test_mape:.2f}%, R²: {test_r2:.4f}")
    
    # Get feature importances
    feature_importances = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    # Plot feature importances
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importances.head(20))
    plt.title(f'Top 20 Feature Importances for {target_col}')
    plt.tight_layout()
    plt.savefig(f'plots/xgb_feature_importance_{target_col}.png')
    plt.close()
    
    # Plot actual vs predicted
    plt.figure(figsize=(14, 7))
    plt.plot(df.index[-len(y_test):], y_test, label='Actual', color='blue')
    plt.plot(df.index[-len(y_test):], y_pred_test, label='Predicted', color='red', linestyle='--')
    plt.title(f'XGBoost: Actual vs Predicted {target_col}')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/xgb_predictions_{target_col}.png')
    plt.close()
    
    # Create a DataFrame with predictions
    predictions = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred_test,
        'Error': y_test - y_pred_test
    })
    
    # Plot error distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(predictions['Error'], kde=True)
    plt.title(f'Error Distribution for {target_col}')
    plt.xlabel('Error')
    plt.savefig(f'plots/xgb_error_distribution_{target_col}.png')
    plt.close()
    
    # Calculate SHAP values
    try:
        explainer = shap.Explainer(model)
        shap_values = explainer(X_test)
        
        # Plot SHAP summary
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_test, show=False)
        plt.title(f'SHAP Feature Importance for {target_col}')
        plt.tight_layout()
        plt.savefig(f'plots/xgb_shap_summary_{target_col}.png')
        plt.close()
        
        # Plot SHAP dependence plots for top features
        top_features = feature_importances['Feature'].head(3).tolist()
        for feature in top_features:
            plt.figure(figsize=(10, 6))
            shap.dependence_plot(feature, shap_values.values, X_test, show=False)
            plt.title(f'SHAP Dependence Plot for {feature}')
            plt.tight_layout()
            plt.savefig(f'plots/xgb_shap_dependence_{feature}_{target_col}.png')
            plt.close()
    except Exception as e:
        print(f"Error calculating SHAP values: {e}")
        shap_values = None
    
    # Return results
    metrics = {
        'train_rmse': train_rmse,
        'train_mae': train_mae,
        'train_mape': train_mape,
        'train_r2': train_r2,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_mape': test_mape,
        'test_r2': test_r2
    }
    
    return model, predictions, feature_importances, metrics, shap_values

def recursive_forecast(model, df, target_col, forecast_horizon=12):
    """
    Generate recursive forecasts using the trained model
    
    Parameters:
    -----------
    model : XGBRegressor
        Trained model
    df : pd.DataFrame
        Input dataframe with features
    target_col : str
        Name of the target column
    forecast_horizon : int
        Number of periods to forecast
        
    Returns:
    --------
    pd.Series
        Forecasted values
    """
    print(f"Generating {forecast_horizon} period forecast...")
    
    # Create a copy of the dataframe
    df_forecast = df.copy()
    
    # Get the last date in the dataframe
    last_date = df_forecast.index[-1]
    
    # Create a list to store forecasts
    forecasts = []
    
    # Generate forecasts recursively
    for i in range(forecast_horizon):
        # Get the features for the next period
        if i == 0:
            # For the first forecast, use the last row of the dataframe
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        else:
            # Update the dataframe with the previous forecast
            # This requires updating all the lag features, rolling means, etc.
            # For simplicity, we'll just use the last forecast as a naive prediction
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        
        # Make prediction
        forecast = model.predict(X_next)[0]
        
        # Store forecast
        forecasts.append(forecast)
        
        # Create next date
        next_date = last_date + pd.DateOffset(months=i+1)
        
        # For a more accurate implementation, we would need to update all features
        # based on the new forecast, but this is a simplified version
    
    # Create a Series with the forecasts
    future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=forecast_horizon, freq='MS')
    forecast_series = pd.Series(forecasts, index=future_dates)
    
    # Plot historical data with forecasts
    plt.figure(figsize=(14, 7))
    plt.plot(df[target_col].index, df[target_col], label='Historical Data')
    plt.plot(forecast_series.index, forecast_series, label='Forecast', color='red', linestyle='--')
    plt.title(f'XGBoost: {target_col} Forecast')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/xgb_future_forecast_{target_col}.png')
    plt.close()
    
    return forecast_series

def main():
    # Create plots directory if it doesn't exist
    import os
    if not os.path.exists('plots'):
        os.makedirs('plots')
    
    # Load data
    file_path = 'data/analyzed_time_series.csv'
    df = load_data(file_path)
    
    # Engineer features
    df_engineered = engineer_features(df)
    
    # Run XGBoost for CPI Year-over-Year
    target_col = 'cpi_yoy'
    model_yoy, predictions_yoy, feature_importances_yoy, metrics_yoy, shap_values_yoy = run_xgboost(
        df_engineered, target_col, test_size=0.2, tune_hyperparams=True
    )
    
    # Generate future forecasts for YoY
    forecast_yoy = recursive_forecast(model_yoy, df_engineered, target_col, forecast_horizon=24)
    print(f"\nFuture {target_col} forecasts:")
    print(forecast_yoy)
    
    # Run XGBoost for CPI Month-over-Month
    target_col = 'cpi_mom'
    model_mom, predictions_mom, feature_importances_mom, metrics_mom, shap_values_mom = run_xgboost(
        df_engineered, target_col, test_size=0.2, tune_hyperparams=True
    )
    
    # Generate future forecasts for MoM
    forecast_mom = recursive_forecast(model_mom, df_engineered, target_col, forecast_horizon=24)
    print(f"\nFuture {target_col} forecasts:")
    print(forecast_mom)
    
    # Save metrics to CSV
    metrics_data = [
        {
            'Model': 'XGBoost', 'Target': 'CPI MoM',
            'Train_RMSE': metrics_mom['train_rmse'], 'Train_MAE': metrics_mom['train_mae'], 
            'Train_MAPE': metrics_mom['train_mape'], 'Train_R2': metrics_mom['train_r2'],
            'Test_RMSE': metrics_mom['test_rmse'], 'Test_MAE': metrics_mom['test_mae'], 
            'Test_MAPE': metrics_mom['test_mape'], 'Test_R2': metrics_mom['test_r2']
        },
        {
            'Model': 'XGBoost', 'Target': 'CPI YoY',
            'Train_RMSE': metrics_yoy['train_rmse'], 'Train_MAE': metrics_yoy['train_mae'], 
            'Train_MAPE': metrics_yoy['train_mape'], 'Train_R2': metrics_yoy['train_r2'],
            'Test_RMSE': metrics_yoy['test_rmse'], 'Test_MAE': metrics_yoy['test_mae'], 
            'Test_MAPE': metrics_yoy['test_mape'], 'Test_R2': metrics_yoy['test_r2']
        }
    ]
    
    metrics_df = pd.DataFrame(metrics_data)
    metrics_df.to_csv('plots/xgboost_metrics.csv', index=False)
    print("\nMetrics saved to plots/xgboost_metrics.csv")

if __name__ == "__main__":
    main()

Running XGBoost model for cpi_yoy...
Training data: (269, 65), Test data: (68, 65)
Performing hyperparameter tuning...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters: {'subsample': 1.0, 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Training - RMSE: 0.0362, MAE: 0.0228, MAPE: 0.02%, R²: 1.0000
Testing - RMSE: 0.0989, MAE: 0.0631, MAPE: 0.06%, R²: 0.9934
Generating 24 period forecast...

Future cpi_yoy forecasts:
2025-01-01    102.901375
2025-02-01    102.901375
2025-03-01    102.901375
2025-04-01    102.901375
2025-05-01    102.901375
2025-06-01    102.901375
2025-07-01    102.901375
2025-08-01    102.901375
2025-09-01    102.901375
2025-10-01    102.901375
2025-11-01    102.901375
2025-12-01    102.901375
2026-01-01    102.901375
2026-02-01    102.901375
2026-03-01    102.901375
2026-04-01    102.901375
2026-05-01    102.901375
2026-06-01    102.901375
2026-07-01    102.901375
2026-08

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import shap
import warnings
warnings.filterwarnings('ignore')

def load_data(file_path):
    """Load and prepare the time series data"""
    df = pd.read_csv(file_path)
    
    # Convert time column to datetime
    df['time'] = pd.to_datetime(df['time'])
    
    # Set time as index
    df.set_index('time', inplace=True)
    
    return df

def engineer_features(df):
    """Create additional features for modeling"""
    df_copy = df.copy()
    
    # Create lag features
    for i in range(1, 13):
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_lag_{i}'] = df_copy['cpi_mom'].shift(i)
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_lag_{i}'] = df_copy['cpi_yoy'].shift(i)
    
    # Create rolling window features
    for window in [3, 6, 12]:
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_rolling_mean_{window}'] = df_copy['cpi_mom'].rolling(window=window).mean()
            df_copy[f'cpi_mom_rolling_std_{window}'] = df_copy['cpi_mom'].rolling(window=window).std()
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_rolling_mean_{window}'] = df_copy['cpi_yoy'].rolling(window=window).mean()
            df_copy[f'cpi_yoy_rolling_std_{window}'] = df_copy['cpi_yoy'].rolling(window=window).std()
    
    # Create economic indicator lag features
    for i in range(1, 4):
        if 'oil_price' in df_copy.columns:
            df_copy[f'oil_price_lag_{i}'] = df_copy['oil_price'].shift(i)
        if 'gold_price' in df_copy.columns:
            df_copy[f'gold_price_lag_{i}'] = df_copy['gold_price'].shift(i)
        if 'interest_rate' in df_copy.columns:
            df_copy[f'interest_rate_lag_{i}'] = df_copy['interest_rate'].shift(i)
    
    # Add month and year as cyclical features
    if 'month' in df_copy.columns:
        df_copy['month_sin'] = np.sin(2 * np.pi * df_copy['month']/12)
        df_copy['month_cos'] = np.cos(2 * np.pi * df_copy['month']/12)
    
    # Create interaction features
    if all(col in df_copy.columns for col in ['oil_price', 'gold_price']):
        df_copy['oil_gold_ratio'] = df_copy['oil_price'] / df_copy['gold_price']
    
    # Create momentum features
    if 'cpi_yoy' in df_copy.columns:
        df_copy['cpi_yoy_momentum'] = df_copy['cpi_yoy'].diff()
    if 'cpi_mom' in df_copy.columns:
        df_copy['cpi_mom_momentum'] = df_copy['cpi_mom'].diff()
    
    # Drop rows with NaN values (due to lag features)
    df_clean = df_copy.dropna()
    
    return df_clean

def calculate_metrics(actual, predicted):
    """Calculate evaluation metrics"""
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    
    return rmse, mae, mape, r2

def run_lightgbm(df, target_col, test_size=0.2, tune_hyperparams=True):
    """
    Train and evaluate a LightGBM model for time series forecasting
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with engineered features
    target_col : str
        Name of the target column
    test_size : float
        Proportion of data to use for testing
    tune_hyperparams : bool
        Whether to perform hyperparameter tuning
        
    Returns:
    --------
    tuple
        Model, predictions, feature importances, metrics, and SHAP values
    """
    print(f"Running LightGBM model for {target_col}...")
    
    # Define features and target
    X = df.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df.columns], axis=1)
    y = df[target_col]
    
    # Split data into training and testing sets (time-based split)
    split_idx = int(len(df) * (1 - test_size))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")
    
    if tune_hyperparams:
        print("Performing hyperparameter tuning...")
        # Define parameter grid
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7, 9],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'num_leaves': [31, 63, 127],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
            'reg_alpha': [0, 0.1, 0.5],
            'reg_lambda': [0, 0.1, 0.5]
        }
        
        # Use TimeSeriesSplit for cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        
        # Create and fit GridSearchCV
        # lgbm_grid = GridSearchCV(
        #     estimator=LGBMRegressor(objective='regression', random_state=42),
        #     param_grid=param_grid,
        #     cv=tscv,
        #     scoring='neg_mean_squared_error',
        #     n_jobs=-1,
        #     verbose=1
        # )

        # Replace GridSearchCV with RandomizedSearchCV
        lgbm_search = RandomizedSearchCV(
            estimator=LGBMRegressor(objective='regression', random_state=42),
            param_distributions=param_grid,
            n_iter=30,             # Giảm số tổ hợp kiểm tra
            cv=tscv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1,
            random_state=42
        )

        lgbm_search.fit(X_train, y_train)
        
        # Get best parameters and model
        best_params = lgbm_search.best_params_
        print(f"Best parameters: {best_params}")
        
        # Train model with best parameters
        model = LGBMRegressor(objective='regression', random_state=42, **best_params)
    else:
        # Use default parameters
        model = LGBMRegressor(
            n_estimators=200,
            max_depth=7,
            learning_rate=0.1,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            objective='regression',
            random_state=42
        )
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate metrics
    train_rmse, train_mae, train_mape, train_r2 = calculate_metrics(y_train, y_pred_train)
    test_rmse, test_mae, test_mape, test_r2 = calculate_metrics(y_test, y_pred_test)
    
    print(f"Training - RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, MAPE: {train_mape:.2f}%, R²: {train_r2:.4f}")
    print(f"Testing - RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}, MAPE: {test_mape:.2f}%, R²: {test_r2:.4f}")
    
    # Get feature importances
    feature_importances = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    # Plot feature importances
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importances.head(20))
    plt.title(f'Top 20 Feature Importances for {target_col}')
    plt.tight_layout()
    plt.savefig(f'plots/lgbm_feature_importance_{target_col}.png')
    plt.close()
    
    # Plot actual vs predicted
    plt.figure(figsize=(14, 7))
    plt.plot(df.index[-len(y_test):], y_test, label='Actual', color='blue')
    plt.plot(df.index[-len(y_test):], y_pred_test, label='Predicted', color='red', linestyle='--')
    plt.title(f'LightGBM: Actual vs Predicted {target_col}')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/lgbm_predictions_{target_col}.png')
    plt.close()
    
    # Create a DataFrame with predictions
    predictions = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred_test,
        'Error': y_test - y_pred_test
    })
    
    # Plot error distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(predictions['Error'], kde=True)
    plt.title(f'Error Distribution for {target_col}')
    plt.xlabel('Error')
    plt.savefig(f'plots/lgbm_error_distribution_{target_col}.png')
    plt.close()
    
    # Calculate SHAP values
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        
        # Plot SHAP summary
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_test, show=False)
        plt.title(f'SHAP Feature Importance for {target_col}')
        plt.tight_layout()
        plt.savefig(f'plots/lgbm_shap_summary_{target_col}.png')
        plt.close()
        
        # Plot SHAP dependence plots for top features
        top_features = feature_importances['Feature'].head(3).tolist()
        for feature in top_features:
            plt.figure(figsize=(10, 6))
            shap.dependence_plot(feature, shap_values, X_test, show=False)
            plt.title(f'SHAP Dependence Plot for {feature}')
            plt.tight_layout()
            plt.savefig(f'plots/lgbm_shap_dependence_{feature}_{target_col}.png')
            plt.close()
    except Exception as e:
        print(f"Error calculating SHAP values: {e}")
        shap_values = None
    
    # Return results
    metrics = {
        'train_rmse': train_rmse,
        'train_mae': train_mae,
        'train_mape': train_mape,
        'train_r2': train_r2,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_mape': test_mape,
        'test_r2': test_r2
    }
    
    return model, predictions, feature_importances, metrics, shap_values

def recursive_forecast(model, df, target_col, forecast_horizon=12):
    """
    Generate recursive forecasts using the trained model
    
    Parameters:
    -----------
    model : LGBMRegressor
        Trained model
    df : pd.DataFrame
        Input dataframe with features
    target_col : str
        Name of the target column
    forecast_horizon : int
        Number of periods to forecast
        
    Returns:
    --------
    pd.Series
        Forecasted values
    """
    print(f"Generating {forecast_horizon} period forecast...")
    
    # Create a copy of the dataframe
    df_forecast = df.copy()
    
    # Get the last date in the dataframe
    last_date = df_forecast.index[-1]
    
    # Create a list to store forecasts
    forecasts = []
    
    # Generate forecasts recursively
    for i in range(forecast_horizon):
        # Get the features for the next period
        if i == 0:
            # For the first forecast, use the last row of the dataframe
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        else:
            # Update the dataframe with the previous forecast
            # This requires updating all the lag features, rolling means, etc.
            # For simplicity, we'll just use the last forecast as a naive prediction
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        
        # Make prediction
        forecast = model.predict(X_next)[0]
        
        # Store forecast
        forecasts.append(forecast)
        
        # Create next date
        next_date = last_date + pd.DateOffset(months=i+1)
        
        # For a more accurate implementation, we would need to update all features
        # based on the new forecast, but this is a simplified version
    
    # Create a Series with the forecasts
    future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=forecast_horizon, freq='MS')
    forecast_series = pd.Series(forecasts, index=future_dates)
    
    # Plot historical data with forecasts
    plt.figure(figsize=(14, 7))
    plt.plot(df[target_col].index, df[target_col], label='Historical Data')
    plt.plot(forecast_series.index, forecast_series, label='Forecast', color='red', linestyle='--')
    plt.title(f'LightGBM: {target_col} Forecast')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/lgbm_future_forecast_{target_col}.png')
    plt.close()
    
    return forecast_series

def main():
    # Create plots directory if it doesn't exist
    import os
    if not os.path.exists('plots'):
        os.makedirs('plots')
    
    # Load data
    file_path = 'data/analyzed_time_series.csv'
    df = load_data(file_path)
    
    # Engineer features
    df_engineered = engineer_features(df)
    
    # Run LightGBM for CPI Year-over-Year
    target_col = 'cpi_yoy'
    model_yoy, predictions_yoy, feature_importances_yoy, metrics_yoy, shap_values_yoy = run_lightgbm(
        df_engineered, target_col, test_size=0.2, tune_hyperparams=True
    )
    
    # Generate future forecasts for YoY
    forecast_yoy = recursive_forecast(model_yoy, df_engineered, target_col, forecast_horizon=24)
    print(f"\nFuture {target_col} forecasts:")
    print(forecast_yoy)
    
    # Run LightGBM for CPI Month-over-Month
    target_col = 'cpi_mom'
    model_mom, predictions_mom, feature_importances_mom, metrics_mom, shap_values_mom = run_lightgbm(
        df_engineered, target_col, test_size=0.2, tune_hyperparams=True
    )
    
    # Generate future forecasts for MoM
    forecast_mom = recursive_forecast(model_mom, df_engineered, target_col, forecast_horizon=24)
    print(f"\nFuture {target_col} forecasts:")
    print(forecast_mom)
    
    # Save metrics to CSV
    metrics_data = [
        {
            'Model': 'LightGBM', 'Target': 'CPI MoM',
            'Train_RMSE': metrics_mom['train_rmse'], 'Train_MAE': metrics_mom['train_mae'], 
            'Train_MAPE': metrics_mom['train_mape'], 'Train_R2': metrics_mom['train_r2'],
            'Test_RMSE': metrics_mom['test_rmse'], 'Test_MAE': metrics_mom['test_mae'], 
            'Test_MAPE': metrics_mom['test_mape'], 'Test_R2': metrics_mom['test_r2']
        },
        {
            'Model': 'LightGBM', 'Target': 'CPI YoY',
            'Train_RMSE': metrics_yoy['train_rmse'], 'Train_MAE': metrics_yoy['train_mae'], 
            'Train_MAPE': metrics_yoy['train_mape'], 'Train_R2': metrics_yoy['train_r2'],
            'Test_RMSE': metrics_yoy['test_rmse'], 'Test_MAE': metrics_yoy['test_mae'], 
            'Test_MAPE': metrics_yoy['test_mape'], 'Test_R2': metrics_yoy['test_r2']
        }
    ]
    
    metrics_df = pd.DataFrame(metrics_data)
    metrics_df.to_csv('plots/lightgbm_metrics.csv', index=False)
    print("\nMetrics saved to plots/lightgbm_metrics.csv")

if __name__ == "__main__":
    main()

Running LightGBM model for cpi_yoy...
Training data: (269, 65), Test data: (68, 65)
Performing hyperparameter tuning...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4914
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 65
[LightGBM] [Info] Start training from score 106.334610
Best parameters: {'subsample': 1.0, 'reg_lambda': 0.5, 'reg_alpha': 0, 'num_leaves': 127, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.2, 'colsample_bytree': 0.7}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4914
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 65
[LightGBM] [Inf

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

def load_data(file_path):
    """Load and prepare the time series data"""
    df = pd.read_csv(file_path)
    
    # Convert time column to datetime
    df['time'] = pd.to_datetime(df['time'])
    
    # Set time as index
    df.set_index('time', inplace=True)
    
    return df

def engineer_features(df):
    """Create additional features for modeling"""
    df_copy = df.copy()
    
    # Create lag features
    for i in range(1, 13):
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_lag_{i}'] = df_copy['cpi_mom'].shift(i)
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_lag_{i}'] = df_copy['cpi_yoy'].shift(i)
    
    # Create rolling window features
    for window in [3, 6, 12]:
        if 'cpi_mom' in df_copy.columns:
            df_copy[f'cpi_mom_rolling_mean_{window}'] = df_copy['cpi_mom'].rolling(window=window).mean()
            df_copy[f'cpi_mom_rolling_std_{window}'] = df_copy['cpi_mom'].rolling(window=window).std()
        if 'cpi_yoy' in df_copy.columns:
            df_copy[f'cpi_yoy_rolling_mean_{window}'] = df_copy['cpi_yoy'].rolling(window=window).mean()
            df_copy[f'cpi_yoy_rolling_std_{window}'] = df_copy['cpi_yoy'].rolling(window=window).std()
    
    # Create economic indicator lag features
    for i in range(1, 4):
        if 'oil_price' in df_copy.columns:
            df_copy[f'oil_price_lag_{i}'] = df_copy['oil_price'].shift(i)
        if 'gold_price' in df_copy.columns:
            df_copy[f'gold_price_lag_{i}'] = df_copy['gold_price'].shift(i)
        if 'interest_rate' in df_copy.columns:
            df_copy[f'interest_rate_lag_{i}'] = df_copy['interest_rate'].shift(i)
    
    # Add month and year as cyclical features
    if 'month' in df_copy.columns:
        df_copy['month_sin'] = np.sin(2 * np.pi * df_copy['month']/12)
        df_copy['month_cos'] = np.cos(2 * np.pi * df_copy['month']/12)
    
    # Create interaction features
    if all(col in df_copy.columns for col in ['oil_price', 'gold_price']):
        df_copy['oil_gold_ratio'] = df_copy['oil_price'] / df_copy['gold_price']
    
    # Drop rows with NaN values (due to lag features)
    df_clean = df_copy.dropna()
    
    return df_clean

def calculate_metrics(actual, predicted):
    """Calculate evaluation metrics"""
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    
    return rmse, mae, mape, r2

def run_svr(df, target_col, test_size=0.2, kernel='rbf', tune_hyperparams=True):
    """
    Train and evaluate a Support Vector Regression model for time series forecasting
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with engineered features
    target_col : str
        Name of the target column
    test_size : float
        Proportion of data to use for testing
    kernel : str
        Kernel type to be used in the algorithm ('linear', 'poly', 'rbf', 'sigmoid')
    tune_hyperparams : bool
        Whether to perform hyperparameter tuning
        
    Returns:
    --------
    tuple
        Model, predictions, metrics
    """
    print(f"Running SVR model with {kernel} kernel for {target_col}...")
    
    # Define features and target
    X = df.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df.columns], axis=1)
    y = df[target_col]
    
    # Split data into training and testing sets (time-based split)
    split_idx = int(len(df) * (1 - test_size))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")
    
    # Create pipeline with scaling (important for SVR)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svr', SVR(kernel=kernel))
    ])
    
    if tune_hyperparams:
        print("Performing hyperparameter tuning...")
        # Define parameter grid based on kernel
        if kernel == 'linear':
            param_grid = {
                'svr__C': [0.1, 1, 10, 100],
                'svr__epsilon': [0.01, 0.1, 0.2, 0.5]
            }
        elif kernel == 'poly':
            param_grid = {
                'svr__C': [0.1, 1, 10, 100],
                'svr__epsilon': [0.01, 0.1, 0.2, 0.5],
                'svr__degree': [2, 3, 4],
                'svr__gamma': ['scale', 'auto', 0.1, 0.01]
            }
        else:  # rbf or sigmoid
            param_grid = {
                'svr__C': [0.1, 1, 10, 100],
                'svr__epsilon': [0.01, 0.1, 0.2, 0.5],
                'svr__gamma': ['scale', 'auto', 0.1, 0.01]
            }
        
        # Use TimeSeriesSplit for cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        
        # Create and fit GridSearchCV
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=tscv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        
        grid_search.fit(X_train, y_train)
        
        # Get best parameters and model
        best_params = grid_search.best_params_
        print(f"Best parameters: {best_params}")
        
        # Use the best model
        pipeline = grid_search.best_estimator_
    else:
        # Use default parameters
        pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    
    # Calculate metrics
    train_rmse, train_mae, train_mape, train_r2 = calculate_metrics(y_train, y_pred_train)
    test_rmse, test_mae, test_mape, test_r2 = calculate_metrics(y_test, y_pred_test)
    
    print(f"Training - RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, MAPE: {train_mape:.2f}%, R²: {train_r2:.4f}")
    print(f"Testing - RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}, MAPE: {test_mape:.2f}%, R²: {test_r2:.4f}")
    
    # Plot actual vs predicted
    plt.figure(figsize=(14, 7))
    plt.plot(df.index[-len(y_test):], y_test, label='Actual', color='blue')
    plt.plot(df.index[-len(y_test):], y_pred_test, label='Predicted', color='red', linestyle='--')
    plt.title(f'SVR ({kernel}): Actual vs Predicted {target_col}')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/svr_{kernel}_predictions_{target_col}.png')
    plt.close()
    
    # Create a DataFrame with predictions
    predictions = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred_test,
        'Error': y_test - y_pred_test
    })
    
    # Plot error distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(predictions['Error'], kde=True)
    plt.title(f'Error Distribution for SVR ({kernel}) - {target_col}')
    plt.xlabel('Error')
    plt.savefig(f'plots/svr_{kernel}_error_distribution_{target_col}.png')
    plt.close()
    
    # Plot residuals vs fitted values
    plt.figure(figsize=(10, 6))
    plt.scatter(y_pred_test, predictions['Error'])
    plt.axhline(y=0, color='r', linestyle='-')
    plt.title(f'Residuals vs Fitted Values for SVR ({kernel}) - {target_col}')
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.savefig(f'plots/svr_{kernel}_residuals_{target_col}.png')
    plt.close()
    
    # Return results
    metrics = {
        'train_rmse': train_rmse,
        'train_mae': train_mae,
        'train_mape': train_mape,
        'train_r2': train_r2,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_mape': test_mape,
        'test_r2': test_r2
    }
    
    return pipeline, predictions, metrics

def recursive_forecast(model, df, target_col, forecast_horizon=12):
    """
    Generate recursive forecasts using the trained model
    
    Parameters:
    -----------
    model : Pipeline
        Trained model pipeline
    df : pd.DataFrame
        Input dataframe with features
    target_col : str
        Name of the target column
    forecast_horizon : int
        Number of periods to forecast
        
    Returns:
    --------
    pd.Series
        Forecasted values
    """
    print(f"Generating {forecast_horizon} period forecast...")
    
    # Create a copy of the dataframe
    df_forecast = df.copy()
    
    # Get the last date in the dataframe
    last_date = df_forecast.index[-1]
    
    # Create a list to store forecasts
    forecasts = []
    
    # Generate forecasts recursively
    for i in range(forecast_horizon):
        # Get the features for the next period
        if i == 0:
            # For the first forecast, use the last row of the dataframe
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        else:
            # Update the dataframe with the previous forecast
            # This requires updating all the lag features, rolling means, etc.
            # For simplicity, we'll just use the last forecast as a naive prediction
            X_next = df_forecast.drop([col for col in ['cpi_mom', 'cpi_yoy', target_col] if col in df_forecast.columns], axis=1).iloc[-1:].values
        
        # Make prediction
        forecast = model.predict(X_next)[0]
        
        # Store forecast
        forecasts.append(forecast)
        
        # Create next date
        next_date = last_date + pd.DateOffset(months=i+1)
        
        # For a more accurate implementation, we would need to update all features
        # based on the new forecast, but this is a simplified version
    
    # Create a Series with the forecasts
    future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=forecast_horizon, freq='MS')
    forecast_series = pd.Series(forecasts, index=future_dates)
    
    # Plot historical data with forecasts
    plt.figure(figsize=(14, 7))
    plt.plot(df[target_col].index, df[target_col], label='Historical Data')
    plt.plot(forecast_series.index, forecast_series, label='Forecast', color='red', linestyle='--')
    plt.title(f'SVR: {target_col} Forecast')
    plt.xlabel('Date')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'plots/svr_future_forecast_{target_col}.png')
    plt.close()
    
    return forecast_series

def main():
    # Create plots directory if it doesn't exist
    import os
    if not os.path.exists('plots'):
        os.makedirs('plots')
    
    # Load data
    file_path = 'data/analyzed_time_series.csv'
    df = load_data(file_path)
    
    # Engineer features
    df_engineered = engineer_features(df)
    
    # List of kernels to try
    kernels = ['linear', 'rbf', 'poly']
    
    # Dictionary to store results
    results = {}
    
    # Run SVR with different kernels for CPI Year-over-Year
    target_col = 'cpi_yoy'
    for kernel in kernels:
        print(f"\n{'-'*50}")
        print(f"Running SVR with {kernel} kernel for {target_col}")
        print(f"{'-'*50}")
        
        model, predictions, metrics = run_svr(
            df_engineered, target_col, test_size=0.2, kernel=kernel, tune_hyperparams=True
        )
        
        results[f"{kernel}_yoy"] = {
            'model': model,
            'predictions': predictions,
            'metrics': metrics
        }
    
    # Find the best kernel for YoY based on test RMSE
    best_kernel_yoy = min(kernels, key=lambda k: results[f"{k}_yoy"]['metrics']['test_rmse'])
    print(f"\nBest kernel for {target_col}: {best_kernel_yoy}")
    
    # Generate future forecasts using the best model for YoY
    forecast_yoy = recursive_forecast(results[f"{best_kernel_yoy}_yoy"]['model'], df_engineered, target_col, forecast_horizon=24)
    print(f"\nFuture {target_col} forecasts:")
    print(forecast_yoy)
    
    # Run SVR with different kernels for CPI Month-over-Month
    target_col = 'cpi_mom'
    for kernel in kernels:
        print(f"\n{'-'*50}")
        print(f"Running SVR with {kernel} kernel for {target_col}")
        print(f"{'-'*50}")
        
        model, predictions, metrics = run_svr(
            df_engineered, target_col, test_size=0.2, kernel=kernel, tune_hyperparams=True
        )
        
        results[f"{kernel}_mom"] = {
            'model': model,
            'predictions': predictions,
            'metrics': metrics
        }
    
    # Find the best kernel for MoM based on test RMSE
    best_kernel_mom = min(kernels, key=lambda k: results[f"{k}_mom"]['metrics']['test_rmse'])
    print(f"\nBest kernel for {target_col}: {best_kernel_mom}")
    
    # Generate future forecasts using the best model for MoM
    forecast_mom = recursive_forecast(results[f"{best_kernel_mom}_mom"]['model'], df_engineered, target_col, forecast_horizon=24)
    print(f"\nFuture {target_col} forecasts:")
    print(forecast_mom)
    
    # Compare models
    print("\n\nModel Comparison - CPI Year-over-Year:")
    print("-" * 80)
    print(f"{'Kernel':<10} {'Train RMSE':<12} {'Test RMSE':<12} {'Train R²':<12} {'Test R²':<12}")
    print("-" * 80)
    for kernel in kernels:
        metrics = results[f"{kernel}_yoy"]['metrics']
        print(f"{kernel:<10} {metrics['train_rmse']:<12.4f} {metrics['test_rmse']:<12.4f} {metrics['train_r2']:<12.4f} {metrics['test_r2']:<12.4f}")
    
    print("\n\nModel Comparison - CPI Month-over-Month:")
    print("-" * 80)
    print(f"{'Kernel':<10} {'Train RMSE':<12} {'Test RMSE':<12} {'Train R²':<12} {'Test R²':<12}")
    print("-" * 80)
    for kernel in kernels:
        metrics = results[f"{kernel}_mom"]['metrics']
        print(f"{kernel:<10} {metrics['train_rmse']:<12.4f} {metrics['test_rmse']:<12.4f} {metrics['train_r2']:<12.4f} {metrics['test_r2']:<12.4f}")
    
    # Save metrics to CSV
    metrics_data = []
    for kernel in kernels:
        metrics_yoy = results[f"{kernel}_yoy"]['metrics']
        metrics_mom = results[f"{kernel}_mom"]['metrics']
        
        metrics_data.append({
            'Model': f"SVR ({kernel})",
            'Target': 'CPI YoY',
            'Train_RMSE': metrics_yoy['train_rmse'],
            'Train_MAE': metrics_yoy['train_mae'],
            'Train_MAPE': metrics_yoy['train_mape'],
            'Train_R2': metrics_yoy['train_r2'],
            'Test_RMSE': metrics_yoy['test_rmse'],
            'Test_MAE': metrics_yoy['test_mae'],
            'Test_MAPE': metrics_yoy['test_mape'],
            'Test_R2': metrics_yoy['test_r2']
        })
        
        metrics_data.append({
            'Model': f"SVR ({kernel})",
            'Target': 'CPI MoM',
            'Train_RMSE': metrics_mom['train_rmse'],
            'Train_MAE': metrics_mom['train_mae'],
            'Train_MAPE': metrics_mom['train_mape'],
            'Train_R2': metrics_mom['train_r2'],
            'Test_RMSE': metrics_mom['test_rmse'],
            'Test_MAE': metrics_mom['test_mae'],
            'Test_MAPE': metrics_mom['test_mape'],
            'Test_R2': metrics_mom['test_r2']
        })
    
    metrics_df = pd.DataFrame(metrics_data)
    metrics_df.to_csv('plots/svr_metrics.csv', index=False)
    print("\nMetrics saved to plots/svr_metrics.csv")

if __name__ == "__main__":
    main()


--------------------------------------------------
Running SVR with linear kernel for cpi_yoy
--------------------------------------------------
Running SVR model with linear kernel for cpi_yoy...
Training data: (269, 63), Test data: (68, 63)
Performing hyperparameter tuning...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'svr__C': 10, 'svr__epsilon': 0.1}
Training - RMSE: 0.0659, MAE: 0.0573, MAPE: 0.05%, R²: 0.9999
Testing - RMSE: 0.1576, MAE: 0.1353, MAPE: 0.13%, R²: 0.9833

--------------------------------------------------
Running SVR with rbf kernel for cpi_yoy
--------------------------------------------------
Running SVR model with rbf kernel for cpi_yoy...
Training data: (269, 63), Test data: (68, 63)
Performing hyperparameter tuning...
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters: {'svr__C': 100, 'svr__epsilon': 0.01, 'svr__gamma': 0.01}
Training - RMSE: 0.0098, MAE: 0.0097, MAPE: 0.01%, R²: 1.0000
Testing - 