<a href="https://colab.research.google.com/github/TalaQattan/tech-interview-handbook/blob/main/datathone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
print("📦 Installing packages...")
# Install every dependency with a single pip call. Only numpy and pandas are
# pinned to exact versions here; the other packages are left unpinned.
# NOTE: the leading "!" is notebook shell syntax, so this line is not plain Python.
!pip install -q numpy==1.26.4 pmdarima xgboost lightgbm openpyxl pyarrow pandas==2.2.2

print("✅ Installation complete!")

In [1]:
# 2. Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import pmdarima as pmd
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Linear models + scaling
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

print("Setup complete.\n")

# ============================================
# CONFIG: Initial settings (will auto-adjust)
# ============================================
# TRAIN_WINDOW and MAX_LAG are starting values only: the
# "AUTO-ADJUST CV PARAMETERS" section further down reassigns both based on
# how many months of data are available. FH is not reassigned in the code shown.
TRAIN_WINDOW = 12   # Target: 12 months training window
FH = 1              # Forecast horizon: 1 month ahead
MAX_LAG = 6         # Lag depth for ML models

# Files
EXCEL_RAW = "Claims.xlsx"  # <-- Change this if needed
CLEAN_PARQUET = "claims_clean.parquet"  # Loaded instead of EXCEL_RAW when this file exists

print(f"Looking for: {EXCEL_RAW}\n")

# ============================================
# LOAD & CLEAN DATA
# ============================================

def parse_yyyymm(series):
    """Parse YYYYMM period values into month-start timestamps.

    Every value is stringified, the first run of six consecutive digits is
    taken as the period, and that text is parsed with the ``%Y%m`` format.
    Values without six consecutive digits, or whose digits are not a valid
    year/month, become ``NaT``.
    """
    as_text = series.astype(str)
    period_digits = as_text.str.extract(r"(\d{6})", expand=False)
    parsed = pd.to_datetime(period_digits, format="%Y%m", errors="coerce")
    return parsed

# Load the claims table: read the Excel export unless a cleaned parquet copy exists.
print("Loading data...")
if not Path(CLEAN_PARQUET).exists():
    df = pd.read_excel(EXCEL_RAW, engine="openpyxl")
    print(f"Loaded {len(df):,} rows from Excel")
else:
    df = pd.read_parquet(CLEAN_PARQUET)
    print("Loaded from parquet")

# Normalise headers (trimmed, upper-case) so the column lookups below are uniform
df.columns = [str(name).strip().upper() for name in df.columns]

# ----------------------------
# Enforce TREATMENT <= BATCH
# ----------------------------
def to_dt(col):
    """Parse a YYYYMM period column into month-start timestamps.

    Delegates to ``parse_yyyymm`` so that both helpers apply one parsing rule
    (first six consecutive digits, ``%Y%m`` format, unparseable -> ``NaT``)
    instead of keeping two copies of the same code.

    Args:
        col: pandas Series holding the period values.

    Returns:
        Series of timestamps, ``NaT`` where the value could not be parsed.
    """
    return parse_yyyymm(col)

# Parse each period column only if it is present
treat_dt = to_dt(df["TREATMENT_PERIOD"]) if "TREATMENT_PERIOD" in df.columns else None
batch_dt = to_dt(df["BATCH_PERIOD"])     if "BATCH_PERIOD"     in df.columns else None

if (treat_dt is not None) and (batch_dt is not None):
    # Keep rows where both periods parsed and the treatment month is not after the batch month
    mask_valid = treat_dt.notna() & batch_dt.notna() & (treat_dt <= batch_dt)
    kept = int(mask_valid.sum())
    dropped = int((~mask_valid).sum())
    print(f"Order filter: kept {kept:,}, dropped {dropped:,} (invalid or TREATMENT > BATCH)")
    df = df.loc[mask_valid].copy()
    df["_TREAT_DT"] = treat_dt.loc[mask_valid]
    df["_BATCH_DT"] = batch_dt.loc[mask_valid]
else:
    print("Skipped order filter (one of the columns missing).")

# Filter to approved claims
if "STATUS" in df.columns:
    # Show the raw codes (before upper-casing) so unexpected values are visible
    print("\nAvailable status codes:")
    print(df["STATUS"].value_counts().head(10))

    df["STATUS"] = df["STATUS"].astype(str).str.upper()
    approved_codes = ["APPROVED", "AC", "PAID"]
    # .copy() so the CLAIMS_AMOUNT assignment below writes to an independent frame
    df = df[df["STATUS"].isin(approved_codes)].copy()
    print(f"\nFiltered to {len(df):,} approved claims")
else:
    print("No STATUS column found - using all rows")

# Ensure numeric positive amounts
if "CLAIMS_AMOUNT" not in df.columns:
    # Fail here with a clear message rather than silently producing an empty dataset
    raise ValueError("Need CLAIMS_AMOUNT column to compute monthly costs")
df["CLAIMS_AMOUNT"] = pd.to_numeric(df["CLAIMS_AMOUNT"], errors="coerce")
df = df[df["CLAIMS_AMOUNT"] > 0]
print(f"{len(df):,} claims with positive amounts\n")

# Monthly aggregation
# Pick the month source in priority order: already-parsed columns first,
# then the raw period columns (which still need parsing).
for month_source, needs_parsing in (
    ("_TREAT_DT", False),
    ("_BATCH_DT", False),
    ("TREATMENT_PERIOD", True),
    ("BATCH_PERIOD", True),
):
    if month_source in df.columns:
        month_idx = parse_yyyymm(df[month_source]) if needs_parsing else df[month_source]
        break
else:
    raise ValueError("Need TREATMENT_PERIOD or BATCH_PERIOD column (YYYYMM format)")

# Drop rows without a usable month, then snap every date to its month start
df = df.loc[month_idx.notna()].copy()
df["_MONTH"] = month_idx.dt.to_period("M").dt.to_timestamp()

# Sum amounts per month; months with no claims are inserted with a total of 0.0
monthly_totals = df.groupby("_MONTH", as_index=True)["CLAIMS_AMOUNT"].sum()
monthly = (
    monthly_totals
    .sort_index()
    .asfreq("MS", fill_value=0.0)
    .to_frame(name="TOTAL_COST")
)

print("=" * 60)
print("Monthly Data Summary")
print("=" * 60)
print(f"Date range: {monthly.index.min().date()} to {monthly.index.max().date()}")
print(f"Total months: {len(monthly)}")
print(f"Total claims amount: ${monthly['TOTAL_COST'].sum():,.0f}")
print(f"Average monthly cost: ${monthly['TOTAL_COST'].mean():,.0f}")
print("\nLast 6 months:")
print(monthly.tail(6))

# ============================================
# AUTO-ADJUST CV PARAMETERS BASED ON DATA
# ============================================
total_months = len(monthly)
y = monthly["TOTAL_COST"].copy()
X_exog = None  # No exogenous features

print("\n" + "=" * 60)
print("AUTO-ADJUSTING CV PARAMETERS")
print("=" * 60)

# Training window: 12 months with 18+ months of data, 10 with 12-17,
# otherwise as much as possible while leaving room for CV (never below 6)
if total_months >= 18:
    TRAIN_WINDOW = 12
    print(f"Sufficient data: Using TRAIN_WINDOW = {TRAIN_WINDOW} months")
elif total_months >= 12:
    TRAIN_WINDOW = 10
    print(f"Moderate data: Adjusting TRAIN_WINDOW to {TRAIN_WINDOW} months")
else:
    TRAIN_WINDOW = max(6, total_months - 3)
    print(f"Limited data: Adjusting TRAIN_WINDOW to {TRAIN_WINDOW} months")

# Lag depth is capped at half the training window (and at 6)
MAX_LAG = min(6, TRAIN_WINDOW // 2)
print(f"MAX_LAG adjusted to {MAX_LAG} (<= {TRAIN_WINDOW}//2)")

# One fold needs a full training window plus the forecast horizon
min_months_needed = TRAIN_WINDOW + FH
max_possible_folds = total_months - min_months_needed + 1

if total_months < min_months_needed:
    raise ValueError(
        f"ERROR: Need at least {min_months_needed} months for CV\n"
        f"   You have: {total_months} months\n"
        f"   Options: 1) Get more data, 2) Reduce TRAIN_WINDOW, or 3) Skip CV"
    )

# Fold count: as many as the data allows, between 1 and 5
N_SPLITS = max(1, min(5, max_possible_folds))
if max_possible_folds >= 5:
    print(f"Using N_SPLITS = {N_SPLITS} folds (optimal)")
elif max_possible_folds >= 3:
    print(f"Using N_SPLITS = {N_SPLITS} folds (maximum possible)")
else:
    print(f"Using N_SPLITS = {N_SPLITS} fold (very limited data)")

print("\nFinal CV Configuration:")
print(f"   - Training window: {TRAIN_WINDOW} months")
print(f"   - Forecast horizon: {FH} month")
print(f"   - CV folds: {N_SPLITS}")
print(f"   - Max lag features: {MAX_LAG}")
print(f"   - Total evaluations: {N_SPLITS} per model")

# ============================================
# HELPER FUNCTIONS
# ============================================

def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error (in %).

    Args:
        y_true: actual values (array-like or scalar).
        y_pred: predicted values (array-like or scalar); must be
            broadcastable against ``y_true``.

    Returns:
        Mean of ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)`` times 100,
        taken over the positions where the denominator is non-zero.
        Returns 0.0 when every denominator is zero.
    """
    # Coerce up front so plain lists work and boolean masking is always valid
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    mask = denom != 0
    if not mask.any():
        return 0.0
    # Mask the broadcast error so inputs of different (broadcastable) shapes line up
    abs_err = np.abs(y_true - y_pred)
    return float(np.mean(abs_err[mask] / denom[mask]) * 100.0)

def build_supervised_from_series(y_series, extra_X=None, max_lag=MAX_LAG):
    """Turn a series into a (features, target) pair for supervised models.

    Features, all computed from values before the current row: ``lag_1`` ..
    ``lag_{max_lag}``, rolling mean and std of the previous values over windows
    of 3 and ``min(6, max_lag)``, plus the ``month`` and ``quarter`` of the
    index. ``extra_X``, when given, is left-joined on the index.

    Rows with NaN features are kept; callers drop them where needed.

    Returns:
        Tuple ``(X, y)``: the feature frame without the target column, and the
        target as a Series on the same index.
    """
    features = pd.DataFrame(index=y_series.index)
    features["y"] = y_series.values

    # Lagged copies of the target
    for lag in range(1, max_lag + 1):
        features[f"lag_{lag}"] = y_series.shift(lag)

    # Rolling statistics over strictly earlier values (shifted by one step)
    previous_values = y_series.shift(1)
    for span in (3, min(6, max_lag)):
        features[f"roll_mean_{span}"] = previous_values.rolling(span).mean()
        features[f"roll_std_{span}"] = previous_values.rolling(span).std()

    # Calendar position of each row
    features["month"] = features.index.month
    features["quarter"] = features.index.quarter

    if extra_X is not None:
        features = features.join(extra_X, how="left")

    target = features["y"]
    return features.drop(columns=["y"]), target

def rolling_window_splits(y_index, window=TRAIN_WINDOW, fh=FH, n_splits=N_SPLITS):
    """Yield positional (train, test) index arrays for fixed-length rolling CV.

    Each fold trains on ``window`` consecutive positions and tests on the
    ``fh`` positions that follow. With one split the fold ends at the last
    observation; otherwise fold end positions are spread with ``np.linspace``
    (truncated to integers) from the earliest possible end to the last one.

    Raises:
        ValueError: when the index has fewer than ``window + fh`` entries
            (raised when iteration starts, since this is a generator).
    """
    total = len(y_index)
    required = window + fh

    if total < required:
        raise ValueError(f"Need at least {required} months, have {total}")

    if n_splits == 1:
        # Single fold: anchor it at the very end of the series
        fold_ends = [total]
    else:
        fold_ends = np.linspace(required, total, n_splits, dtype=int)

    for fold_end in fold_ends:
        split_at = fold_end - fh  # first test position == one past the last train position
        train_idx = np.arange(split_at - window, split_at)
        test_idx = np.arange(split_at, fold_end)
        yield train_idx, test_idx

# ============================================
# MODEL EVALUATION
# ============================================

def _cv_result_row(model_name, fold, actual, predicted):
    """Score one single-point forecast as a [Model, Fold, RMSE, MAE, sMAPE] row."""
    actual = float(actual)
    predicted = float(predicted)
    # With one observation, RMSE and MAE are both the absolute error
    abs_err = abs(actual - predicted)
    return [model_name, fold, abs_err, abs_err,
            float(smape(np.array([actual]), np.array([predicted])))]


def _cv_failed_row(model_name, fold):
    """Return the all-NaN row recorded when a model could not be scored on a fold."""
    return [model_name, fold, np.nan, np.nan, np.nan]


def _ml_model_factories():
    """Map each lag-feature model name to a constructor for a fresh, unfitted model."""
    return {
        "Ridge": lambda: Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("model", Ridge(alpha=1.0)),
        ]),
        "ElasticNet": lambda: Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("model", ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42, max_iter=10000)),
        ]),
        "XGBoost": lambda: XGBRegressor(n_estimators=300, max_depth=3, learning_rate=0.05,
                                        subsample=0.9, colsample_bytree=0.9, random_state=42,
                                        verbosity=0),
        "LightGBM": lambda: LGBMRegressor(n_estimators=300, num_leaves=15, learning_rate=0.05,
                                          subsample=0.9, colsample_bytree=0.9, random_state=42,
                                          verbose=-1),
    }


def evaluate_models_auto_cv(y_series, X_exog=None, window=TRAIN_WINDOW, fh=FH, n_splits=N_SPLITS):
    """Evaluate all models with rolling-window cross-validation.

    For every fold produced by ``rolling_window_splits`` this scores a
    moving-average baseline, auto-ARIMA, Ridge, ElasticNet, XGBoost and
    LightGBM. Only the first forecast step of each fold is scored. A model
    that raises, or that has too few complete training rows, gets NaN metrics
    for that fold.

    Args:
        y_series: target series; its index entries must support ``.date()``.
        X_exog: optional extra features, passed to ``build_supervised_from_series``.
        window: number of training observations per fold.
        fh: forecast horizon handed to the splitter and to ARIMA.
        n_splits: number of folds requested.

    Returns:
        Tuple ``(res_df, leaderboard)``. ``res_df`` has one row per model and
        fold with columns Model, Fold, RMSE, MAE, sMAPE. ``leaderboard`` is
        indexed by model, sorted by RMSE, with columns RMSE, MAE_SAR, sMAPE,
        Accuracy_% and Valid_Folds.
    """
    results = []
    idx = y_series.index

    print("\n" + "="*60)
    print("CROSS-VALIDATION")
    print("="*60)
    print(f"Configuration: {n_splits} folds, train={window} months, forecast={fh} month\n")

    # Build the feature table once for the whole series; NaN rows are dropped per fold
    X_full, y_full = build_supervised_from_series(y_series, extra_X=X_exog, max_lag=MAX_LAG)
    ml_factories = _ml_model_factories()
    min_ml_rows = max(5, MAX_LAG)

    print("Models to evaluate:")
    print("  - Baseline (12-month moving average)")
    print("  - ARIMA")
    print("  - Ridge Regression")
    print("  - ElasticNet")
    print("  - XGBoost")
    print("  - LightGBM\n")

    for fold, (tr, te) in enumerate(rolling_window_splits(idx, window=window, fh=fh, n_splits=n_splits), start=1):
        train_dates = f"{idx[tr[0]].date()} to {idx[tr[-1]].date()}"
        test_date = idx[te[0]].date()
        print(f"Fold {fold}/{n_splits}: Train [{train_dates}] -> Test [{test_date}]")

        y_tr = y_series.iloc[tr]
        actual = y_series.iloc[te].values[0]  # only the first forecast step is scored

        # -----------------------
        # BASELINE: mean of the last 12 training months (or the whole window if shorter)
        # -----------------------
        baseline_window = min(12, len(y_tr))
        baseline_pred = y_tr.iloc[-baseline_window:].mean()
        results.append(_cv_result_row("Baseline_MA", fold, actual, baseline_pred))

        # -----------------------
        # ARIMA
        # -----------------------
        try:
            arima = pmd.auto_arima(y_tr, seasonal=False, stepwise=True,
                                   suppress_warnings=True, error_action="ignore",
                                   max_p=3, max_q=3)  # Limit complexity for small data
            # np.asarray: take the first forecast by position even if a
            # date-indexed Series comes back
            y_pred = np.asarray(arima.predict(n_periods=fh))[0]
            results.append(_cv_result_row("ARIMA", fold, actual, y_pred))
        except Exception as exc:
            # Best-effort: record NaN for this fold, but say why
            print(f"  ARIMA failed on fold {fold}: {exc}")
            results.append(_cv_failed_row("ARIMA", fold))

        # -----------------------
        # ML MODELS (with lag features)
        # -----------------------
        X_tr = X_full.iloc[tr].dropna()                 # Drop NaNs for training
        y_tr_full = y_full.iloc[tr].loc[X_tr.index]     # Align target with the surviving rows
        X_te = X_full.iloc[te]                          # Keep NaNs in test for prediction

        can_fit_ml = len(X_te) >= 1 and len(X_tr) >= min_ml_rows

        for name, make_model in ml_factories.items():
            if not can_fit_ml:
                # Not enough rows for ML
                results.append(_cv_failed_row(name, fold))
                continue
            try:
                model = make_model()
                model.fit(X_tr, y_tr_full)
                y_pred = model.predict(X_te)[0]
                results.append(_cv_result_row(name, fold, actual, y_pred))
            except Exception as exc:
                print(f"  {name} failed on fold {fold}: {exc}")
                results.append(_cv_failed_row(name, fold))

    res_df = pd.DataFrame(results, columns=["Model","Fold","RMSE","MAE","sMAPE"])

    # Aggregate per model (NaN folds are skipped by the means).
    # RMSE is the square root of the MEAN SQUARED fold error; averaging the
    # per-fold values directly would just reproduce the MAE column.
    leaderboard = (res_df.assign(_SQ_ERR=res_df["RMSE"] ** 2)
                         .groupby("Model", as_index=True)
                         .agg(RMSE=("_SQ_ERR", "mean"),
                              MAE=("MAE", "mean"),
                              sMAPE=("sMAPE", "mean"),
                              Valid_Folds=("RMSE", lambda x: x.notna().sum())))
    leaderboard["RMSE"] = np.sqrt(leaderboard["RMSE"])
    leaderboard = leaderboard.sort_values("RMSE")

    # ============================================
    # Accuracy %: 100 * (1 - RMSE / range of the target), clipped to [-100, 100]
    # ============================================
    target_range = float(y_series.max() - y_series.min())
    if target_range > 0 and np.isfinite(target_range):
        leaderboard["Accuracy_%"] = (1.0 - (leaderboard["RMSE"] / target_range)) * 100.0
        leaderboard["Accuracy_%"] = leaderboard["Accuracy_%"].clip(lower=-100, upper=100).round(2)
    else:
        leaderboard["Accuracy_%"] = np.nan
        print("Warning: Accuracy_% not computed because target_range is zero or invalid.")

    leaderboard["MAE_SAR"] = leaderboard["MAE"].round(0)

    # Reorder columns for readability (the unrounded MAE column is dropped here)
    cols_order = ["RMSE", "MAE_SAR", "sMAPE", "Accuracy_%", "Valid_Folds"]
    leaderboard = leaderboard.reindex(columns=[c for c in cols_order if c in leaderboard.columns])

    return res_df, leaderboard

# ============================================
# RUN EVALUATION
# ============================================

# Run the cross-validation with the auto-adjusted settings
cv_results, leaderboard = evaluate_models_auto_cv(
    y,
    X_exog=X_exog,
    window=TRAIN_WINDOW,
    fh=FH,
    n_splits=N_SPLITS,
)

print("\n" + "=" * 60)
print("FINAL LEADERBOARD (Lower is Better for RMSE/MAE/sMAPE; Higher is Better for Accuracy_%)")
print("=" * 60)
print(leaderboard.to_string())
print("\n")

# Report the models that produced a valid score on every fold
valid_models = leaderboard[leaderboard['Valid_Folds'] == N_SPLITS]
if len(valid_models) == 0:
    print(f"No models completed all {N_SPLITS} folds successfully")
else:
    print(f"Models with all {N_SPLITS} successful folds:")
    print(f"   {', '.join(valid_models.index.tolist())}")

# ============================================
# FEATURE IMPORTANCE (if winner is ML model)
# ============================================

# The leaderboard is sorted by RMSE, so its first row is the winning model
best_model = leaderboard.index[0]
tree_models = ("XGBoost", "LightGBM")
linear_models = ("Ridge", "ElasticNet")

if best_model in tree_models + linear_models:
    print("\n" + "=" * 60)
    print(f"FEATURE IMPORTANCE FOR {best_model}")
    print("=" * 60)

    # Rebuild the full feature table; NaN rows are only dropped for training
    X_full, y_full = build_supervised_from_series(y, extra_X=X_exog, max_lag=MAX_LAG)

    # Fit on the most recent training window
    last_window_idx = y.index[-TRAIN_WINDOW:]
    X_tr = X_full.loc[last_window_idx].dropna()
    y_tr = y_full.loc[X_tr.index]  # keep the target aligned with the surviving rows

    if best_model == "Ridge":
        model = Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("model", Ridge(alpha=1.0))
        ])
    elif best_model == "ElasticNet":
        model = Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("model", ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42, max_iter=10000))
        ])
    elif best_model == "XGBoost":
        model = XGBRegressor(n_estimators=300, max_depth=3, learning_rate=0.05,
                             subsample=0.9, colsample_bytree=0.9, random_state=42, verbosity=0)
    else:  # LightGBM
        model = LGBMRegressor(n_estimators=300, num_leaves=15, learning_rate=0.05,
                              subsample=0.9, colsample_bytree=0.9, random_state=42, verbose=-1)

    model.fit(X_tr, y_tr)

    # Tree models expose importances directly; linear models are ranked by |coefficient|
    if best_model in tree_models:
        importance_values = model.feature_importances_
    else:
        importance_values = np.abs(model.named_steps['model'].coef_)

    feat_imp = pd.DataFrame({
        'Feature': X_tr.columns,
        'Importance': importance_values
    }).sort_values('Importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feat_imp.head(10).to_string(index=False))

# ============================================
# FINAL PREDICTION
# ============================================

print("\n" + "="*60)
print(f"FINAL FORECAST USING: {best_model}")
print("="*60)

# Refit the winning model on the most recent training window and forecast
# the month that follows the last observation.
last_window_idx = y.index[-TRAIN_WINDOW:]
y_train_last = y.loc[last_window_idx]
next_month = (y.index[-1] + pd.offsets.MonthBegin(1))

if best_model == "Baseline_MA":
    # Average over the training window, as in cross-validation, so the forecast
    # matches the evaluated baseline and the "Training_Window" reported below.
    baseline_window = min(12, len(y_train_last))
    yhat = float(y_train_last.iloc[-baseline_window:].mean())

elif best_model == "ARIMA":
    mdl = pmd.auto_arima(y_train_last, seasonal=False, stepwise=True,
                         suppress_warnings=True, error_action="ignore",
                         max_p=3, max_q=3)
    # np.asarray: take the first forecast by position even if a
    # date-indexed Series comes back
    yhat = float(np.asarray(mdl.predict(n_periods=FH))[0])

else:  # ML models
    # Build features for the entire series (including the forecast month as NaN)
    y_tmp = y.copy()
    y_tmp.loc[next_month] = np.nan
    X_full_for_pred, _ = build_supervised_from_series(y_tmp, extra_X=X_exog, max_lag=MAX_LAG)

    # Select training data (dropna for training)
    X_tr = X_full_for_pred.loc[last_window_idx].dropna()
    y_tr_full = y.loc[X_tr.index]  # y is the original series without the NaN forecast month

    # Select the features for the next month (do NOT drop NaNs here)
    X_next = X_full_for_pred.loc[[next_month]]

    if best_model == "Ridge":
        model = Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("model", Ridge(alpha=1.0))
        ])
    elif best_model == "ElasticNet":
        model = Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("model", ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42, max_iter=10000))
        ])
    elif best_model == "XGBoost":
        model = XGBRegressor(n_estimators=300, max_depth=3, learning_rate=0.05,
                             subsample=0.9, colsample_bytree=0.9, random_state=42, verbosity=0)
    else:  # LightGBM
        model = LGBMRegressor(n_estimators=300, num_leaves=15, learning_rate=0.05,
                              subsample=0.9, colsample_bytree=0.9, random_state=42, verbose=-1)

    model.fit(X_tr, y_tr_full)
    yhat = float(model.predict(X_next)[0])

# Display version of the forecast (formatted strings)
forecast = pd.DataFrame({
    "Month": [next_month.strftime('%Y-%m')],
    "Predicted_Cost": [f"${yhat:,.2f}"],
    "Model_Used": [best_model],
    "Training_Window": [f"{y_train_last.index[0].strftime('%Y-%m')} to {y_train_last.index[-1].strftime('%Y-%m')}"]
})

print(f"\nForecast for {next_month.strftime('%B %Y')}:")
print(forecast.to_string(index=False))

# ============================================
# SAVE OUTPUTS
# ============================================

# Monthly totals (the index is written as the first column)
monthly.to_csv("monthly_approved_cost.csv")

# Machine-readable forecast: raw timestamp and numeric value
forecast_save = pd.DataFrame({
    "Month": [next_month],
    "Predicted_Cost": [yhat],
    "Model_Used": [best_model],
    "CV_Folds": [N_SPLITS],
    "Training_Window_Months": [TRAIN_WINDOW]
})
forecast_save.to_csv("next_month_forecast.csv", index=False)

# Per-fold CV metrics and the aggregated leaderboard
cv_results.to_csv("cv_results_detailed.csv", index=False)
leaderboard.to_csv("model_leaderboard.csv")

print("\n" + "=" * 60)
print("SAVED FILES:")
for saved_name in (
    "monthly_approved_cost.csv",
    "next_month_forecast.csv",
    "cv_results_detailed.csv",
    "model_leaderboard.csv",
):
    print(f"  {saved_name}")
print("=" * 60)
print("\nAnalysis complete.")


Setup complete.

Looking for: Claims.xlsx

Loading data...
Loaded 900,000 rows from Excel
Order filter: kept 899,510, dropped 490 (invalid or TREATMENT > BATCH)

Available status codes:
STATUS
AC    667473
RJ    231888
SU       149
Name: count, dtype: int64

Filtered to 667,473 approved claims
667,473 claims with positive amounts

Monthly Data Summary
Date range: 2023-01-01 to 2023-12-01
Total months: 12
Total claims amount: $206,628,283
Average monthly cost: $17,219,024

Last 6 months:
             TOTAL_COST
_MONTH                 
2023-07-01  16205185.07
2023-08-01  16696400.64
2023-09-01  16917400.96
2023-10-01  18769256.95
2023-11-01  17215728.24
2023-12-01  18341687.91

AUTO-ADJUSTING CV PARAMETERS
Moderate data: Adjusting TRAIN_WINDOW to 10 months
MAX_LAG adjusted to 5 (<= 10//2)
Using N_SPLITS = 2 fold (very limited data)

Final CV Configuration:
   - Training window: 10 months
   - Forecast horizon: 1 month
   - CV folds: 2
   - Max lag features: 5
   - Total evaluations: 2 pe