# Food Price Inflation — OOP Pipeline (Enhanced, Canada)

**Term Project Topic:** *Forecasting Food Price Inflation for Policy Response*  
**New Data Source Added:** *World Bank — Food Prices for Nutrition (Canada subset, 2017–2024)*  
**Legacy Source:** *World Bank CPI / Food Price Inflation (Canada)*

> **Data paths (edit if needed):**  
> - `../data/API_FP.CPI.TOTL.ZG_DS2_en_csv_v2_23195.csv` *(World Bank indicator `FP.CPI.TOTL.ZG` — consumer-price inflation, annual %; previously used)*  
> - `../data/2abd45d3-18a3-4ed6-8799-524647bb719a_Data.csv` *(Nutrition main data)*  
> - `../data/2abd45d3-18a3-4ed6-8799-524647bb719a_Series - Metadata.csv` *(Nutrition metadata)*


## 1‑Minute Explanation (What, Why, How)
We added the **Food Prices for Nutrition** dataset (Canada, 2017–2024), which measures the *cost and affordability* of healthy and nutrient‑adequate diets (PPP/day and ratios). This complements CPI and global indices by capturing **affordability stress** on households. Our OOP notebook loads, cleans, and merges these sources; applies **Missing‑values ratio**, **Low‑variance**, **High‑correlation filter**, **PCA**, **Random Forest importance**, and **Sequential Feature Selection**; then performs **STL decomposition** and **ARIMA/SARIMAX** forecasting (with optional exogenous features). Finally, we run **ADF** and **Granger** tests to probe stationarity and predictive relationships. The output is a compact feature set and a short‑horizon forecast suitable for **early‑warning dashboards** and **policy decisions**.


## Hypothesis — Previous vs Revised (Unit 3 framing)

**Previous (from prior submission):**  
- **H₀:** No significant monotonic trend in Canada’s annual food inflation (1960–2024).  
- **H₁:** A significant trend exists.  
- **Result:** *p* < 0.05, trend decreasing → **Reject H₀**.

**Revised (with new dataset & predictive framing):**  
- **H₀ (Predictive Null):** *Affordability/Cost‑of‑Diet* indicators (nutrition dataset) **do not** add statistically significant predictive power for Canada’s monthly/annual food inflation beyond CPI’s own history.  
- **H₁ (Predictive Alt):** *Affordability/Cost‑of‑Diet* indicators **do** add significant predictive power (e.g., Granger‑cause CPI or improve forecast accuracy in ARIMAX/SARIMAX models).

**Testing Approach:** ADF for stationarity; STL for structure; Granger causality (*x*→CPI); PCA and model‑based importance for dimensionality; ARIMA/SARIMAX for forecasting with exogenous features.


In [None]:
# Imports
import os
import pandas as pd
import numpy as np
from typing import Optional, Dict, List, Tuple

# Stats & TS
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.statespace.sarimax import SARIMAX

# ML
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Viz: per your class rules — matplotlib only, one plot per figure, no explicit colors
import matplotlib.pyplot as plt

# Widen pandas' console rendering so wide merged frames are not wrapped or truncated.
pd.options.display.width = 140
pd.options.display.max_columns = 200


In [None]:
class DataPaths:
    """Bundle of the three input CSV locations used throughout the notebook."""

    def __init__(self,
                 cpi_path: str = '../data/API_FP.CPI.TOTL.ZG_DS2_en_csv_v2_23195.csv',
                 nutrition_data_path: str = '../data/2abd45d3-18a3-4ed6-8799-524647bb719a_Data.csv',
                 nutrition_meta_path: str = '../data/2abd45d3-18a3-4ed6-8799-524647bb719a_Series - Metadata.csv'):
        self.cpi_path, self.nutrition_data_path, self.nutrition_meta_path = (
            cpi_path, nutrition_data_path, nutrition_meta_path)

    def as_dict(self) -> Dict[str, str]:
        """Return the configured paths keyed by attribute name."""
        field_names = ('cpi_path', 'nutrition_data_path', 'nutrition_meta_path')
        return {field: getattr(self, field) for field in field_names}


# Default locations; the trailing expression echoes them as the cell output.
paths = DataPaths()
paths.as_dict()

In [None]:
class CPILoader:
    """Read the wide CPI CSV and expose Canada's values in long (Year, CPI_Value) form."""

    def __init__(self, path: str):
        self.path = path
        self.df: Optional[pd.DataFrame] = None

    def load(self) -> pd.DataFrame:
        """Read the CSV at ``self.path`` (skipping its first four lines) and cache it."""
        self.df = pd.read_csv(self.path, skiprows=4)
        return self.df

    def canada_long_since(self, year_min: int = 2017) -> pd.DataFrame:
        """Return Canada's rows melted to one row per year, keeping ``Year >= year_min``.

        Loads the file first if that has not happened yet. The result has the
        columns ``Country Name``, ``Indicator Name``, ``Year`` (int) and
        ``CPI_Value``.
        """
        if self.df is None:
            self.load()
        frame = self.df
        # Year columns are the ones whose header is purely digits.
        year_cols = [col for col in frame.columns if col.isdigit()]
        is_canada = frame['Country Name'] == 'Canada'
        canada_wide = frame.loc[is_canada, ['Indicator Name', 'Country Name', *year_cols]]
        tidy = pd.melt(canada_wide,
                       id_vars=['Country Name', 'Indicator Name'],
                       var_name='Year',
                       value_name='CPI_Value')
        tidy['Year'] = tidy['Year'].astype(int)
        return tidy[tidy['Year'] >= year_min]

class NutritionLoader:
    """Read the Food Prices for Nutrition export (and optional series metadata)."""

    def __init__(self, data_path: str, meta_path: Optional[str] = None):
        self.data_path = data_path
        self.meta_path = meta_path
        self.df: Optional[pd.DataFrame] = None
        self.meta: Optional[pd.DataFrame] = None

    def load(self) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """Read the data CSV, plus the metadata CSV when its path exists.

        Returns:
            ``(data, meta)``; ``meta`` is ``None`` when no metadata file is
            configured or the file is missing.
        """
        # World Bank DataBank exports write ".." for missing observations.
        # Without na_values those cells turn the whole indicator column into
        # strings, and the numeric-only filters downstream would silently skip it.
        self.df = pd.read_csv(self.data_path, na_values=['..'])
        if self.meta_path and os.path.exists(self.meta_path):
            self.meta = pd.read_csv(self.meta_path)
        return self.df, self.meta

    def canada_since(self, year_min: int = 2017) -> pd.DataFrame:
        """Return Canada's rows with ``Year >= year_min``.

        ``Country Name`` and ``Time`` are renamed to ``Country`` and ``Year``
        in the returned frame; the cached ``self.df`` is left untouched.
        Loads the file first if that has not happened yet.
        """
        if self.df is None:
            self.load()
        df = self.df.rename(columns={'Country Name': 'Country', 'Time': 'Year'})
        # Filter to Canada before casting: other rows may have a non-numeric Time.
        df = df[df['Country'] == 'Canada'].copy()
        df['Year'] = df['Year'].astype(int)
        df = df[df['Year'] >= year_min]
        return df

In [None]:
class DataMerger:
    """Namespace for joining the CPI and nutrition frames."""

    @staticmethod
    def merge_on_year(cpi_long: pd.DataFrame, nutrition_can: pd.DataFrame) -> pd.DataFrame:
        """Inner-join the two frames on ``Year``; years missing from either side are dropped."""
        return cpi_long.merge(nutrition_can, how='inner', on='Year')

class FeatureEngineer:
    """Chainable column filters applied to a private copy of a DataFrame.

    Each ``drop_*`` method mutates ``self.df`` and returns ``self`` so calls
    can be chained; ``get()`` returns the filtered frame.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()

    def drop_high_missing(self, threshold: float = 0.4) -> 'FeatureEngineer':
        """Keep only columns whose share of missing values is below ``threshold``."""
        mask = self.df.isna().mean() < threshold
        self.df = self.df.loc[:, mask]
        return self

    def drop_low_variance(self, min_std: float = 0.01) -> 'FeatureEngineer':
        """Drop numeric columns whose standard deviation is not above ``min_std``.

        Non-numeric columns are never dropped. Columns keep their original
        order (the filter removes columns in place instead of moving the
        numeric ones to the front).
        """
        num = self.df.select_dtypes(include=[np.number])
        # A NaN std (e.g. a single row) compares False, so such columns are dropped too.
        too_flat = ~(num.std() > min_std)
        self.df = self.df.drop(columns=num.columns[too_flat.to_numpy()])
        return self

    def drop_high_correlation(self, corr_thresh: float = 0.85,
                              protect: Optional[List[str]] = None) -> 'FeatureEngineer':
        """Drop the later column of every numeric pair with ``|corr| > corr_thresh``.

        Args:
            corr_thresh: absolute Pearson correlation above which a pair is
                considered redundant.
            protect: column names that must never be dropped.

        A column that has already been marked for removal is not used to
        eliminate further columns, so a chain A~B~C with A and C weakly
        related loses only B.
        """
        protected = set(protect or [])
        num = self.df.select_dtypes(include=[np.number])
        if num.shape[1] == 0:
            return self
        # NOTE(review): a protected column still acts as the reference side of a
        # pair, so features strongly correlated with a protected target are
        # dropped — confirm this is intended for the CPI_Value use case.
        strength = num.corr().abs().to_numpy()
        cols = list(num.columns)
        to_drop = set()
        for i, kept in enumerate(cols):
            if kept in to_drop:
                continue  # already removed: must not eliminate others
            for j in range(i + 1, len(cols)):
                candidate = cols[j]
                if candidate in protected or candidate in to_drop:
                    continue
                # NaN correlations compare False and are therefore kept.
                if strength[i, j] > corr_thresh:
                    to_drop.add(candidate)
        self.df = self.df.drop(columns=list(to_drop), errors='ignore')
        return self

    def get(self) -> pd.DataFrame:
        """Return the filtered frame."""
        return self.df

    @staticmethod
    def show_corr(df: pd.DataFrame, title: str = 'Correlation Matrix'):
        """Plot the correlation matrix of ``df``'s numeric columns as a heatmap."""
        num = df.select_dtypes(include=[np.number])
        if num.shape[1] == 0:
            print('No numeric columns to plot.')
            return
        corr = num.corr()
        plt.figure(figsize=(8,6))
        plt.imshow(corr.values, aspect='auto')
        plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
        plt.yticks(range(len(corr.index)), corr.index)
        plt.title(title)
        plt.colorbar()
        plt.tight_layout()
        plt.show()

In [None]:
class DimensionalityReducer:
    """Standardise the numeric features of a frame and project them with PCA."""

    def __init__(self, df: pd.DataFrame, target_col: str = 'CPI_Value'):
        self.df = df.copy()
        self.target_col = target_col
        self.scaler: Optional[StandardScaler] = None
        self.pca: Optional[PCA] = None

    def prepare(self, exclude: Optional[List[str]] = None):
        """Return ``(X, y)``: numeric columns minus ``exclude`` and ``Year``, and the target.

        ``y`` is ``None`` when the target column is absent. The target stays
        in ``X`` unless it is listed in ``exclude``.
        """
        if exclude is None: exclude = []
        X = self.df.select_dtypes(include=[np.number]).drop(columns=list(set(exclude + ['Year'])), errors='ignore')
        y = self.df[self.target_col] if self.target_col in self.df.columns else None
        return X, y

    def fit_pca(self, n_components: int = 2):
        """Fit a scaler and PCA on the non-target numeric features.

        Args:
            n_components: requested number of components. An integer request
                larger than the number of rows or features is reduced to the
                largest value the data allows.

        Returns:
            ``(transformed, explained_variance_ratio)`` where the ratio is a list.

        Raises:
            ValueError: if no rows or no numeric feature columns are available.
        """
        X, _ = self.prepare(exclude=[self.target_col])
        if X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError('fit_pca needs at least one row and one numeric feature column '
                             f'(got shape {X.shape}).')
        if isinstance(n_components, int):
            # After aggressive filtering fewer features than requested may remain.
            n_components = min(n_components, X.shape[0], X.shape[1])
        self.scaler = StandardScaler()
        Xs = self.scaler.fit_transform(X)
        self.pca = PCA(n_components=n_components)
        Xp = self.pca.fit_transform(Xs)
        return Xp, list(self.pca.explained_variance_ratio_)

    def plot_variance(self):
        """Bar-plot the percentage of variance explained by each fitted component."""
        if self.pca is None:
            print('Run fit_pca first.')
            return
        evr = self.pca.explained_variance_ratio_
        plt.figure()
        plt.bar(range(1, len(evr)+1), np.array(evr)*100)
        plt.xlabel('Principal Component')
        plt.ylabel('Variance Explained (%)')
        plt.title('PCA — Variance Explained')
        plt.tight_layout()
        plt.show()

class ModelSelector:
    """Model-based feature ranking and selection against a target column."""

    def __init__(self, df: pd.DataFrame, target_col: str = 'CPI_Value'):
        self.df = df.copy()
        self.target_col = target_col

    def _features_and_target(self, exclude: Optional[List[str]]):
        """Return ``(X, y)``: numeric columns minus ``exclude`` and the target, and the target."""
        unwanted = set(exclude or []) | {self.target_col}
        X = self.df.select_dtypes(include=[np.number]).drop(columns=list(unwanted), errors='ignore')
        y = self.df[self.target_col]
        return X, y

    def rf_importance(self, exclude: Optional[List[str]] = None) -> pd.DataFrame:
        """Fit a Random Forest and return features sorted by descending importance.

        Returns:
            Frame with columns ``Feature`` and ``Importance``.
        """
        X, y = self._features_and_target(exclude)
        rf = RandomForestRegressor(random_state=42)
        rf.fit(X, y)
        imp = pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
        return imp.sort_values('Importance', ascending=False, ignore_index=True)

    def sequential(self, k: int = 3, direction: str = 'forward', exclude: Optional[List[str]] = None) -> List[str]:
        """Select up to ``k`` features with sequential selection on a linear model.

        Args:
            k: number of features to keep.
            direction: ``'forward'`` or ``'backward'``.
            exclude: numeric columns to leave out of the candidate set.

        Returns:
            Names of the selected features, in their original column order.
            When ``k`` is at least the number of candidates there is nothing
            to choose between, so all candidates are returned.
        """
        X, y = self._features_and_target(exclude)
        # SequentialFeatureSelector requires n_features_to_select < n_features,
        # so asking for "all of them" has to be answered without fitting it.
        if k >= X.shape[1]:
            return X.columns.tolist()
        sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=k, direction=direction)
        sfs.fit(X, y)
        return X.columns[sfs.get_support()].tolist()

In [None]:
class HypothesisTester:
    """Thin wrappers around the statsmodels ADF and Granger-causality tests."""

    @staticmethod
    def adf(series: pd.Series, maxlag: Optional[int] = None):
        """Run the Augmented Dickey-Fuller test (AIC lag selection) on the non-null values.

        Returns:
            Dict with ``adf_stat``, ``p_value``, ``n_lags`` and ``n_obs``.
        """
        observed = series.dropna()
        adf_stat, p_value, n_lags, n_obs, *_ = adfuller(observed, maxlag=maxlag, autolag='AIC')
        return {'adf_stat': adf_stat, 'p_value': p_value, 'n_lags': n_lags, 'n_obs': n_obs}

    @staticmethod
    def granger(df: pd.DataFrame, x_col: str, y_col: str, maxlag: int = 2):
        """Test whether ``x_col`` Granger-causes ``y_col`` for lags ``1..maxlag``.

        Rows with a missing value in either column are dropped first.

        Returns:
            Dict mapping each lag to the p-value of the SSR-based F test.
        """
        # statsmodels tests whether the second column helps predict the first.
        pair = df[[y_col, x_col]].dropna().copy()
        outcomes = grangercausalitytests(pair[[y_col, x_col]], maxlag=maxlag, verbose=False)
        p_values = {}
        for lag in outcomes:
            test_table = outcomes[lag][0]
            p_values[lag] = test_table['ssr_ftest'][1]
        return p_values

## Pipeline (Load → Merge → Clean → Reduce → Test)

In [None]:
# Load both sources and restrict them to Canada, 2017 onwards
cpi = CPILoader(paths.cpi_path)
cpi.load()
cpi_long = cpi.canada_long_since(2017)

nut = NutritionLoader(paths.nutrition_data_path, paths.nutrition_meta_path)
nut.load()
nut_can = nut.canada_since(2017)

for _label, _frame in (('Rows (CPI long):', cpi_long), ('Rows (Nutrition CAN):', nut_can)):
    print(_label, len(_frame))

# Merge on Year (inner join); the last expression previews the result
merged = DataMerger.merge_on_year(cpi_long, nut_can)
print('Merged shape:', merged.shape)
merged.head()

### Cleanup & Filters (Missing Values, Low Variance, High Correlation)

In [None]:
# Apply the three filters in sequence; each call updates `fe` in place.
fe = FeatureEngineer(merged)
fe.drop_high_missing(0.4)
fe.drop_low_variance(0.01)
fe.drop_high_correlation(0.85, protect=['CPI_Value'])  # never drop the target
clean_df = fe.get()

FeatureEngineer.show_corr(clean_df, title='Correlation Matrix — Post Filters')
clean_df.head()

### PCA

In [None]:
# Two-component PCA on the filtered features (target and Year excluded by the reducer).
dr = DimensionalityReducer(clean_df, target_col='CPI_Value')
evr = dr.fit_pca(n_components=2)[1]  # only the explained-variance ratios are needed here
print('Explained variance ratio:', evr)
dr.plot_variance()

### Random Forest Importance & Sequential Feature Selection

In [None]:
# Rank the filtered numeric features by Random Forest importance against CPI_Value.
# 'Year' is excluded so the time index is not scored as a feature.
ms = ModelSelector(clean_df, target_col='CPI_Value')
imp = ms.rf_importance(exclude=['Year'])
# Bare expression: shows the importance table as the cell output.
imp


In [None]:
# Forward sequential selection of up to 3 features; `selected` is reused later
# as the exogenous regressors of the SARIMAX cell.
selected = ms.sequential(k=3, direction='forward', exclude=['Year'])
print('Selected features:', selected)

## STL Decomposition (Target: CPI_Value)
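> With annual data (2017–2024) there is no within-year seasonality to recover: the `period` used below is only a short-cycle window, so read the “Seasonal” panel as a short oscillation rather than a true seasonal pattern. Monthly data is recommended for genuine seasonal insights.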

In [None]:
# One CPI observation per year, indexed by Year.
_cpi_by_year = clean_df[['Year','CPI_Value']].dropna().drop_duplicates().sort_values('Year')
ts = _cpi_by_year.set_index('Year')['CPI_Value']

# Use a 12-step period only when at least 24 points exist; otherwise fall back
# to half the series length (never below 2).
if len(ts) >= 24:
    period = 12
else:
    period = max(2, len(ts)//2)
print('Series length:', len(ts), '| STL period:', period)

stl = STL(ts, period=period, robust=True)
res = stl.fit()

# One figure per component, in the order original → trend → seasonal → residual.
for _title, _component in (
    ('CPI_Value — Original', ts),
    ('CPI_Value — Trend', res.trend),
    ('CPI_Value — Seasonal', res.seasonal),
    ('CPI_Value — Residual', res.resid),
):
    plt.figure()
    plt.plot(_component.index, _component.values)
    plt.title(_title)
    plt.tight_layout()
    plt.show()

## ARIMA/SARIMAX Forecast
We fit a small-order ARIMA and, when available, include exogenous features from `selected` as SARIMAX exog. We then forecast the next 2 periods with naive exog extension.

In [None]:
# Chronological frame used for modelling (also reused by the ADF/Granger cell).
# NOTE(review): assumes clean_df holds one row per Year — TODO confirm; repeated
# years would be passed to SARIMAX as consecutive observations.
series_df = clean_df.sort_values('Year').reset_index(drop=True)
endog = series_df['CPI_Value'].astype(float)

# Choose exogenous columns if available. Only a missing `selected` (the
# feature-selection cell was not run) falls back to a plain ARIMA; any other
# error is a real problem and is allowed to surface.
try:
    exog_cols = [c for c in selected if c in series_df.columns]
except NameError:
    exog_cols = []
exog = series_df[exog_cols].astype(float) if exog_cols else None
print('Exogenous features used:', exog_cols)

order = (1,1,0)
seasonal_order = (0,0,0,0)

model = SARIMAX(endog, exog=exog, order=order, seasonal_order=seasonal_order,
                enforce_stationarity=False, enforce_invertibility=False)
res = model.fit(disp=False)
print(res.summary())

fitted = res.fittedvalues

# Forecast: future exogenous values are a naive copy of the last observed row.
n_forecast = 2
future_exog = None
if exog is not None and len(exog_cols) > 0:
    future_exog = pd.concat([exog.iloc[-1:]]*n_forecast, ignore_index=True)

pred = res.get_forecast(steps=n_forecast, exog=future_exog)
mean_fc = pred.predicted_mean
ci = pred.conf_int()

last_year = int(series_df['Year'].iloc[-1])
fc_index = list(range(last_year+1, last_year+1+n_forecast))

# The first `d` in-sample predictions of a differenced model come from the
# uninformed initial state, not from data, so they are left out of the fitted line.
burn_in = order[1]

plt.figure()
plt.plot(series_df['Year'], endog.values, label='Actual')
plt.plot(series_df['Year'].iloc[burn_in:], fitted.values[burn_in:], label='Fitted')
plt.title('ARIMA/SARIMAX — Actual vs Fitted')
plt.legend()
plt.tight_layout()
plt.show()

plt.figure()
plt.plot(series_df['Year'], endog.values, label='Actual')
plt.plot(fc_index, mean_fc.values, label='Forecast')
plt.fill_between(fc_index, ci.iloc[:,0].values, ci.iloc[:,1].values, alpha=0.3)
plt.title('ARIMA/SARIMAX — Forecast (Next 2)')
plt.legend()
plt.tight_layout()
plt.show()

## Hypothesis Testing Cells (ADF & Granger)
- **ADF:** Stationarity of `CPI_Value` and top affordability feature.  
- **Granger:** Does affordability feature **Granger‑cause** CPI?

In [None]:
# Stationarity of the target series.
adf_cpi = HypothesisTester.adf(series_df['CPI_Value'])
print('ADF — CPI_Value:', adf_cpi)

# Heuristic: pick a nutrition feature that looks like a cost/PPP measure
cand = []
for _col in clean_df.columns:
    _lowered = _col.lower()
    _healthy_ppp = 'healthy' in _lowered and 'ppp' in _lowered
    _cost_of_diet = 'cost of' in _lowered and 'diet' in _lowered
    if _healthy_ppp or _cost_of_diet:
        cand.append(_col)

if not cand:
    print('No cost/PPP nutrition feature auto-detected. Manually set xcol to test Granger causality.')
else:
    # The first match is tested for stationarity and as a Granger cause of CPI.
    xcol = cand[0]
    print('Using feature for tests:', xcol)
    adf_x = HypothesisTester.adf(series_df[xcol])
    print('ADF —', xcol, ':', adf_x)
    gr = HypothesisTester.granger(series_df[[xcol, 'CPI_Value', 'Year']], x_col=xcol, y_col='CPI_Value', maxlag=2)
    print('Granger p-values (x → CPI):', gr)

## 3–5 Minute Presentation Outline (Hard Stop at 5)
1. **Context (≤20s):** Food inflation harms low‑income households; need early warning.
2. **Data (≤40s):** CPI + **Nutrition affordability** (PPP/day, ratios). Why this is new & relevant.
3. **Pipeline (≤90s):** OOP loaders → merge → missing/variance/correlation filters → PCA → RF/Selection → STL → ARIMA.
4. **Findings (≤60s):** Key features, PCA variance %, ADF/Granger hints, 2‑step forecast.
5. **Policy Hook (≤30s):** Signals of affordability stress can prompt targeted subsidies/imports.
6. **Close (≤20s):** Next: monthly granularity, proper exog forecasting, backtesting.


## Quick TOC
- [1-Minute Explanation](#one-minute)
- [100-Word Summary](#hundred)
- [Hypothesis Testing — What the Tests Mean](#hypo)
- [Loaders & Paths (OOP)](#oop)
- [Data Dictionary (from Metadata)](#dict)
- [Pipeline & Cleaning](#pipe)
- [PCA & Feature Selection](#pca)
- [Time Series: STL & ARIMA](#ts)
- [Hypothesis Tests: ADF & Granger](#tests)
- [Presentation Outline (3–5 min)](#pres)


<a id="hundred"></a>

## 100-Word Summary
We integrate the World Bank Food Prices for Nutrition (Canada, 2017–2024) with our CPI series to forecast domestic food price inflation with richer socioeconomic context. Nutrition indicators measure the daily PPP cost and affordability of healthy and nutrient‑adequate diets. Using an OOP pipeline, we load, merge, and prepare data, apply missing‑values, low‑variance, and high‑correlation filters, then reduce dimensionality via PCA, Random Forest importance, and forward/backward feature selection. Finally, we perform STL decomposition, ARIMA/SARIMAX forecasting with exogenous features, and ADF/Granger tests. Our goal is an early‑warning signal for policymakers, linking affordability stress to inflation risks and informing targeted interventions. This concise update reflects the revised hypothesis focus and modeling scope.


<a id="one-minute"></a>

## 1-Minute Explanation (verbal)
- **What:** Add *Food Prices for Nutrition* (Canada) to CPI to capture **affordability** of healthy diets (PPP/day, ratios).
- **Why:** Inflation hurts low‑income households; affordability signals **policy urgency** beyond CPI levels.
- **How:** OOP notebook → loaders → merge → filters (missing/variance/correlation) → PCA → RF importance → forward/backward selection → STL → ARIMA/SARIMAX → ADF/Granger.
- **Outcome:** Compact, interpretable features; short‑horizon forecast; early‑warning indicators for targeted subsidies/imports.


<a id="hypo"></a>

## Hypothesis Testing — What the tests mean
- **ADF (Augmented Dickey–Fuller):** Checks stationarity; non‑stationary series often need differencing before ARIMA. We test `CPI_Value` and key affordability features.
- **Granger Causality:** If past values of X improve prediction of Y (beyond Y’s own past), we say X *Granger‑causes* Y. Here: affordability → CPI.
- **Model‑based Evidence:** PCA (variance captured), Random Forest importance (nonlinear ranking), and sequential selection (parsimonious linear subset) support feature relevance beyond pure causality tests.


<a id="ts"></a>

## Time Series — What we analyze
- **STL Decomposition:** Separates trend, seasonal, and residual components in `CPI_Value`. Annual data carries almost no seasonal information, so the decomposition is far more meaningful on monthly CPI.
- **ARIMA/SARIMAX:** ARIMA models autocorrelation; SARIMAX adds **exogenous** drivers (affordability features). We provide a simple baseline and recommend monthly data + order search for production.


<a id="dict"></a>

## Data Dictionary (auto-extracted from metadata if available)
The cell below reads the series metadata file to display indicator names, units, and definitions for fast reference during your 1‑minute explanation.


In [None]:
# Data dictionary preview (if metadata exists)
try:
    _meta_path = paths.nutrition_meta_path
    if not os.path.exists(_meta_path):
        print('Metadata file not found at:', _meta_path)
    else:
        _meta = pd.read_csv(_meta_path)
        # Keep only the descriptive columns (matched on these name fragments).
        _keywords = ['name','indicator','unit','definition','topic','series']
        cols = [c for c in _meta.columns if any(k in c.lower() for k in _keywords)]
        display(_meta[cols].head(20))
except Exception as e:
    # Best-effort preview: report the problem and let the notebook continue.
    print('Metadata read error:', e)

<a id="pca"></a>

## Feature Selection — Backward (in addition to Forward)
We add backward elimination to complement forward selection, as requested in the rubric.


In [None]:
# Backward selection (Linear Regression baseline)
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

ms_back = ModelSelector(clean_df, target_col='CPI_Value')
X_back = clean_df.select_dtypes(include=[np.number]).drop(columns=['Year','CPI_Value'], errors='ignore')
y_back = clean_df['CPI_Value']

n_keep = 3
if X_back.shape[1] <= n_keep:
    # SequentialFeatureSelector requires n_features_to_select < n_features;
    # with this few candidates there is nothing to eliminate, so keep them all.
    backward_selector = None
    back_selected = X_back.columns.tolist()
else:
    backward_selector = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n_keep, direction='backward')
    backward_selector.fit(X_back, y_back)
    back_selected = X_back.columns[backward_selector.get_support()].tolist()
print('Backward-selected features:', back_selected)

<a id="oop"></a>

*(OOP loaders and paths defined above.)*

<a id="pipe"></a>

*(Pipeline and cleaning steps executed above.)*

<a id="ts-ref"></a>

*(STL & ARIMA sections included above.)*

<a id="tests"></a>

*(ADF & Granger testing cells included above.)*

<a id="pres"></a>

*(Presentation outline included above.)*

## Monthly CPI Pipeline (Optional — Recommended for Stronger STL/ARIMA)

This section reuses the same OOP approach but with **monthly CPI** (more data points ⇒ better seasonality & ARIMA stability).  
**Expected input file:** `../data/canada_cpi_monthly.csv` with columns:
- `Date` — first day of month (e.g., `2010-01-01`), or `YYYY-MM` parseable by pandas
- `CPI_Value` — monthly CPI (target). You can substitute *Food* CPI or a food inflation rate if available (rename to `CPI_Value`).

**Nutrition (annual) → monthly join:** We merge by year and forward-fill to monthly frequency so affordability features are available each month. Replace with truly monthly exogenous data when you have it.


In [None]:
class MonthlyCPILoader:
    """Load a monthly CPI CSV and put it on a regular month-start grid."""

    def __init__(self, path: str = '../data/canada_cpi_monthly.csv'):
        self.path = path
        self.df: Optional[pd.DataFrame] = None

    def load(self) -> pd.DataFrame:
        """Read the CSV, parse ``Date``, make sure a ``CPI_Value`` column exists, sort by date.

        If ``CPI_Value`` is absent, the first column whose name starts with
        ``cpi`` or contains ``value`` (case-insensitive) is renamed to it.

        Raises:
            ValueError: if there is no ``Date`` column or no usable target column.
        """
        df = pd.read_csv(self.path)
        # Parse Date column flexibly
        if 'Date' not in df.columns:
            raise ValueError('Expected a Date column in monthly CPI file.')
        df['Date'] = pd.to_datetime(df['Date'])
        if 'CPI_Value' not in df.columns:
            # try to find a likely target column
            candidates = [c for c in df.columns if c.lower().startswith('cpi') or 'value' in c.lower()]
            if not candidates:
                raise ValueError('Expected a CPI_Value column; add/rename your target to CPI_Value.')
            df = df.rename(columns={candidates[0]: 'CPI_Value'})
        df = df.sort_values('Date').reset_index(drop=True)
        self.df = df
        return df

    def ensure_monthly(self) -> pd.DataFrame:
        """Reindex the data to a gap-free month-start (``MS``) frequency.

        Dates are first snapped to the first day of their month, so files
        stamped mid-month or at month end keep their values instead of being
        reindexed to all-NaN rows. Months absent from the file appear as NaN rows.

        Raises:
            ValueError: if two rows fall in the same month.
        """
        if self.df is None:
            self.load()
        df = self.df.copy()
        df['Date'] = df['Date'].dt.to_period('M').dt.to_timestamp()
        if df['Date'].duplicated().any():
            raise ValueError('Monthly CPI file has more than one row for the same month.')
        df = df.set_index('Date').asfreq('MS')  # month start
        self.df = df.reset_index()
        return self.df

### Run Monthly Pipeline (Loads file if present; prints guidance otherwise)

In [None]:
# Optional end-to-end run on monthly CPI: load → join annual nutrition by Year →
# filter → PCA → feature selection → STL → SARIMAX forecast → ADF.
# Everything sits in one try so a missing file or any failure prints a message
# instead of stopping the notebook.
try:
    mloader = MonthlyCPILoader('../data/canada_cpi_monthly.csv')
    monthly = mloader.load()
    monthly = mloader.ensure_monthly()
    monthly['Year'] = monthly['Date'].dt.year
    print('Monthly CPI rows:', len(monthly))
    display(monthly.head())

    # Bring in annual nutrition → monthly via Year join
    # Reuse previously loaded `nut_can` (annual) if available; otherwise load
    try:
        _nut = nut_can.copy()
    except NameError:
        nut = NutritionLoader(paths.nutrition_data_path, paths.nutrition_meta_path); nut.load()
        _nut = nut.canada_since(2017)

    # Left join: every month keeps its row and receives that year's nutrition values.
    mmerge = pd.merge(monthly, _nut, on='Year', how='left')
    # Forward-fill any remaining missing values (e.g., at start of year)
    # NOTE(review): ffill() runs on every column, so gaps in CPI_Value introduced
    # by asfreq('MS') are filled too; months before the first nutrition year have
    # nothing to fill from and stay NaN — confirm both are acceptable.
    mmerge = mmerge.sort_values('Date').ffill()
    print('Monthly merged shape:', mmerge.shape)
    display(mmerge.head())

    # Apply the same cleaning/filters using FeatureEngineer
    # (thresholds differ from the annual run: min_std 0.001, correlation 0.9)
    mfe = FeatureEngineer(mmerge)
    mfe.drop_high_missing(0.4).drop_low_variance(0.001).drop_high_correlation(0.9, protect=['CPI_Value'])
    mclean = mfe.get()
    FeatureEngineer.show_corr(mclean, title='Monthly — Correlation Matrix (Post Filters)')

    # PCA on monthly data
    mdr = DimensionalityReducer(mclean, target_col='CPI_Value')
    _, mevr = mdr.fit_pca(n_components=3)
    print('Monthly PCA explained variance ratio:', mevr)
    mdr.plot_variance()

    # Feature importance & selection on monthly
    msel = ModelSelector(mclean, target_col='CPI_Value')
    mimp = msel.rf_importance(exclude=['Year'])
    display(mimp.head(10))
    mselected_fwd = msel.sequential(k=5, direction='forward', exclude=['Year'])
    print('Monthly selected (forward):', mselected_fwd)

    # STL with period=12
    mts = mclean[['Date','CPI_Value']].dropna().sort_values('Date').set_index('Date')['CPI_Value']
    from statsmodels.tsa.seasonal import STL
    stl_m = STL(mts, period=12, robust=True).fit()

    # One figure per STL component.
    import matplotlib.pyplot as plt
    plt.figure(); plt.plot(mts.index, mts.values); plt.title('Monthly CPI — Original'); plt.tight_layout(); plt.show()
    plt.figure(); plt.plot(stl_m.trend.index, stl_m.trend.values); plt.title('Monthly CPI — Trend'); plt.tight_layout(); plt.show()
    plt.figure(); plt.plot(stl_m.seasonal.index, stl_m.seasonal.values); plt.title('Monthly CPI — Seasonal'); plt.tight_layout(); plt.show()
    plt.figure(); plt.plot(stl_m.resid.index, stl_m.resid.values); plt.title('Monthly CPI — Residual'); plt.tight_layout(); plt.show()

    # SARIMAX with exogenous (use forward-selected if available)
    series_m = mclean.sort_values('Date').reset_index(drop=True)
    endog_m = series_m['CPI_Value'].astype(float)

    exog_cols_m = [c for c in mselected_fwd if c in series_m.columns]
    exog_m = series_m[exog_cols_m].astype(float) if exog_cols_m else None
    print('Monthly exogenous features used:', exog_cols_m)

    from statsmodels.tsa.statespace.sarimax import SARIMAX
    # NOTE: these assignments rebind the notebook-level `order` / `seasonal_order`
    # set by the annual ARIMA cell.
    order = (1,1,1)
    seasonal_order = (0,1,1,12)
    mod_m = SARIMAX(endog_m, exog=exog_m, order=order, seasonal_order=seasonal_order,
                    enforce_stationarity=False, enforce_invertibility=False)
    res_m = mod_m.fit(disp=False)
    print(res_m.summary())

    fitted_m = res_m.fittedvalues

    # 6-month forecast
    steps = 6
    future_exog_m = None
    if exog_m is not None and len(exog_cols_m) > 0:
        # Naive extension: repeat the last observed exogenous row for every step.
        last_row = exog_m.iloc[-1:]
        future_exog_m = pd.concat([last_row]*steps, ignore_index=True)

    pred_m = res_m.get_forecast(steps=steps, exog=future_exog_m)
    mean_fc_m = pred_m.predicted_mean
    ci_m = pred_m.conf_int()

    # Build forecast index
    last_date = series_m['Date'].iloc[-1]
    fc_index_m = pd.date_range(last_date + pd.offsets.MonthBegin(1), periods=steps, freq='MS')

    # Figure 1: actual vs in-sample fit. Figure 2: actual plus forecast with its confidence band.
    plt.figure(); plt.plot(series_m['Date'], endog_m.values, label='Actual'); plt.plot(series_m['Date'], fitted_m.values, label='Fitted'); plt.title('Monthly ARIMA/SARIMAX — Actual vs Fitted'); plt.legend(); plt.tight_layout(); plt.show()
    plt.figure(); plt.plot(series_m['Date'], endog_m.values, label='Actual'); plt.plot(fc_index_m, mean_fc_m.values, label='Forecast'); plt.fill_between(fc_index_m, ci_m.iloc[:,0].values, ci_m.iloc[:,1].values, alpha=0.3); plt.title('Monthly ARIMA/SARIMAX — 6-Month Forecast'); plt.legend(); plt.tight_layout(); plt.show()

    # ADF on monthly
    from statsmodels.tsa.stattools import adfuller
    print('ADF (Monthly CPI):', {'p_value': adfuller(mts.dropna())[1]})

except FileNotFoundError:
    print('Monthly CPI file not found at ../data/canada_cpi_monthly.csv. Add a CSV with columns [Date, CPI_Value].')
except Exception as e:
    # Any other failure is reported as a one-line message; the traceback is not shown.
    print('Monthly pipeline error:', e)