# 02 - Model Prototyping

In this notebook we prototype the core model functions for PredictiFlow: lightweight wrappers for Prophet and ARIMA (with safe fallbacks), evaluation utilities, and a small `select_best_model` routine.

Notes:
- The code uses optional imports (Prophet, statsmodels). If these are not installed, the notebook falls back to simple baselines so you can iterate quickly.
- Later, tested functions can be moved into `backend/app/core/forecasting.py` for production use.

In [None]:
# Imports and safe optional libraries
import pandas as pd
import numpy as np
from typing import Tuple, Dict
import warnings
warnings.filterwarnings('ignore')

# Optional heavy libs
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except Exception:
    PROPHET_AVAILABLE = False

try:
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    STATSMODELS_AVAILABLE = True
except Exception:
    STATSMODELS_AVAILABLE = False

try:
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    SKL_AVAILABLE = True
except Exception:
    SKL_AVAILABLE = False

print('Prophet available:', PROPHET_AVAILABLE)
print('Statsmodels available:', STATSMODELS_AVAILABLE)
print('sklearn available:', SKL_AVAILABLE)

In [None]:
def prepare_df(path_or_df):
    """Load a CSV or accept a DataFrame and return standardized df with columns ['ds','y'].

    The first two columns of the input are taken, in order, as the timestamp
    and the value; any further columns are dropped.

    Args:
        path_or_df: A ``pandas.DataFrame`` (copied, never mutated) or anything
            ``pandas.read_csv`` accepts (path string, ``pathlib.Path``,
            open file object, ...).

    Returns:
        A new DataFrame sorted by ``ds`` with exactly two columns:
        - ds: datetime
        - y: float; non-numeric entries become NaN, gaps are linearly
          interpolated, and remaining NaNs at the edges are back-/forward-filled.

    Raises:
        ValueError: If the input has fewer than two columns.
    """
    if isinstance(path_or_df, pd.DataFrame):
        df = path_or_df.copy()
    else:
        # Not only ``str``: let pandas deal with Path objects and file handles too.
        df = pd.read_csv(path_or_df)
    if df.shape[1] < 2:
        raise ValueError(
            "prepare_df needs at least two columns (timestamp, value); "
            f"got {df.shape[1]}"
        )
    df = df.iloc[:, :2].copy()
    df.columns = ['ds', 'y']
    df['ds'] = pd.to_datetime(df['ds'])
    df = df.sort_values('ds').reset_index(drop=True)
    # astype(float) keeps the documented dtype even when the column is all integers.
    df['y'] = pd.to_numeric(df['y'], errors='coerce').astype(float)
    # ``fillna(method=...)`` is deprecated in pandas 2.x; ``bfill()`` / ``ffill()``
    # are the supported spelling. interpolate() does not fill leading NaNs,
    # hence the back-fill; the forward-fill is a final safety net.
    df['y'] = df['y'].interpolate().bfill().ffill()
    return df

In [None]:
# Prophet wrapper with fallback baseline
def train_prophet(df, periods=30):
    """Forecast ``periods`` steps ahead with Prophet, or a flat baseline if Prophet is missing.

    Args:
        df: DataFrame with a datetime-convertible ``ds`` column and a numeric
            ``y`` column, ordered by time.
        periods: Number of future steps to forecast.

    Returns:
        Tuple ``(name, forecast)`` where ``forecast`` has columns ['ds', 'yhat'].
        - 'prophet': Prophet's prediction over the frame built by
          ``make_future_dataframe`` (which by default includes the history
          rows followed by the ``periods`` future rows).
        - 'baseline_prophet': only the ``periods`` future rows, each equal to
          the last observed ``y`` (0.0 for an empty input).
    """
    df2 = df[['ds', 'y']]
    ds = pd.to_datetime(df2['ds'])

    # Spacing between observations. The median gap is robust to a single
    # irregular interval; fall back to one day when it cannot be determined
    # (fewer than two rows) or is not positive (duplicate timestamps).
    step = ds.diff().median() if len(ds) > 1 else pd.NaT
    if pd.isna(step) or step <= pd.Timedelta(0):
        step = pd.Timedelta(days=1)

    if PROPHET_AVAILABLE:
        m = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
        m.fit(df2)
        # Pass the inferred spacing: the default freq is daily, which would
        # mislabel future rows for weekly/hourly/... series.
        future = m.make_future_dataframe(periods=periods, freq=step)
        fcst = m.predict(future)[['ds', 'yhat']]
        return 'prophet', fcst

    # fallback: repeat last value
    if len(df2) > 0:
        last = float(df2['y'].iloc[-1])
        last_date = ds.iloc[-1]
    else:
        last = 0.0
        last_date = pd.Timestamp.today()
    future_dates = [last_date + k * step for k in range(1, periods + 1)]
    return 'baseline_prophet', pd.DataFrame({'ds': future_dates, 'yhat': [last] * periods})

In [None]:
# ARIMA wrapper with fallback linear extrapolation
def train_arima(df, periods=30):
    """Forecast ``periods`` steps ahead with ARIMA(1,1,1), or a linear baseline.

    Args:
        df: DataFrame with a datetime-convertible ``ds`` column and a numeric
            ``y`` column, ordered by time.
        periods: Number of future steps to forecast.

    Returns:
        Tuple ``(name, forecast)`` where ``forecast`` has columns ['ds', 'yhat']
        and contains only the ``periods`` future rows.
        - 'arima': SARIMAX(order=(1,1,1), no seasonal part) forecast mean.
        - 'baseline_arima': straight line through the last two observations
          (or a constant when fewer than two rows are available).
    """
    df2 = df[['ds', 'y']]
    ds = pd.to_datetime(df2['ds'])

    # Spacing between observations, shared by both paths so the model branch
    # no longer assumes daily data. Median gap; one day if it cannot be
    # determined or is not positive.
    step = ds.diff().median() if len(ds) > 1 else pd.NaT
    if pd.isna(step) or step <= pd.Timedelta(0):
        step = pd.Timedelta(days=1)
    last_date = ds.iloc[-1] if len(ds) > 0 else pd.Timestamp.today()
    future_dates = [last_date + k * step for k in range(1, periods + 1)]

    if STATSMODELS_AVAILABLE:
        model = SARIMAX(df2['y'], order=(1, 1, 1), seasonal_order=(0, 0, 0, 0))
        res = model.fit(disp=False)
        pred = res.get_forecast(steps=periods)
        return 'arima', pd.DataFrame({'ds': future_dates, 'yhat': pred.predicted_mean.values})

    # fallback: linear extrapolation using last two points
    if len(df2) >= 2:
        y = df2['y'].to_numpy(dtype=float)
        # The line through two points is just last value + k * last difference;
        # computing it directly avoids an ill-conditioned polyfit on large x.
        slope = y[-1] - y[-2]
        preds = y[-1] + slope * np.arange(1, periods + 1)
    else:
        preds = np.array([float(df2['y'].iloc[-1]) if len(df2) > 0 else 0.0] * periods)
    return 'baseline_arima', pd.DataFrame({'ds': future_dates, 'yhat': preds.tolist()})

In [None]:
# Simple evaluation utilities
def eval_metrics(y_true, y_pred):
    """Compute MAE, RMSE and MAPE between actual and predicted values.

    Both inputs are flattened to 1-D first, so a column vector of shape
    ``(n, 1)`` is compared element-wise with a flat ``(n,)`` array instead of
    being silently broadcast into an ``n x n`` matrix.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same number of elements as
            ``y_true`` (a single value is broadcast by numpy).

    Returns:
        Dict with float entries ``'mae'``, ``'rmse'`` and ``'mape'`` (percent).
        All three are NaN when there is nothing to compare.

    Raises:
        ValueError: Raised by numpy when the two inputs have incompatible lengths.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()
    err = actual - forecast
    if err.size == 0:
        # Explicit NaN result rather than relying on np.mean([]) and its warning.
        nan = float('nan')
        return {'mae': nan, 'rmse': nan, 'mape': nan}
    mae = float(np.mean(np.abs(err)))
    rmse = float(np.sqrt(np.mean(err ** 2)))
    with np.errstate(divide='ignore', invalid='ignore'):
        # Near-zero actuals are replaced by 1e-8 so the ratio never divides by zero.
        denom = np.where(np.abs(actual) < 1e-8, 1e-8, actual)
        mape = float(np.mean(np.abs(err / denom)) * 100)
    return {'mae': mae, 'rmse': rmse, 'mape': mape}

def select_best_model(candidates):
    """Pick the candidate with the lowest RMSE.

    Args:
        candidates: Iterable of tuples ``(name, forecast_df, metrics)`` where
            ``metrics`` is a dict containing an ``'rmse'`` entry.

    Returns:
        The winning ``(name, forecast_df, metrics)`` tuple, or ``None`` when
        ``candidates`` is empty. Ties go to the earliest candidate. A NaN RMSE
        ranks worse than any finite RMSE.
    """
    best = None
    best_rmse = float('inf')
    for name, fcst, metrics in candidates:
        rmse = metrics['rmse']
        # Every comparison with NaN is False, so a NaN leader could never be
        # displaced; rank NaN as +inf instead.
        score = float('inf') if np.isnan(rmse) else rmse
        if best is None or score < best_rmse:
            best = (name, fcst, metrics)
            best_rmse = score
    return best

In [None]:
# Quick run on sample data (data/sample_sales.csv)
csv_path = '../data/sample_sales.csv'
df = prepare_df(csv_path)

# Forecasts of the next 30 periods, fitted on the full history.
prophet_name, prophet_fcst = train_prophet(df, periods=30)
arima_name, arima_fcst = train_arima(df, periods=30)

# Hold-out evaluation: keep the last h points aside, refit on the rest and
# score the h-step forecast against them. (Scoring the full-history forecast
# against the tail of the history would compare different time periods.)
# h is capped at half the series so there is always data left to train on.
h = min(30, len(df) // 2)
if h == 0:
    raise ValueError('Need at least 2 rows of data to run a hold-out evaluation.')
train_df = df.iloc[:-h]
y_true = df['y'].values[-h:]
prophet_holdout = train_prophet(train_df, periods=h)[1]
arima_holdout = train_arima(train_df, periods=h)[1]
# The out-of-sample rows are the last h rows, whether a wrapper returns
# history + future or the future rows only.
prophet_pred = prophet_holdout['yhat'].values[-h:]
arima_pred = arima_holdout['yhat'].values[-h:]
prophet_metrics = eval_metrics(y_true, prophet_pred)
arima_metrics = eval_metrics(y_true, arima_pred)

# Rank by hold-out error, but carry the full-history forecast forward.
best = select_best_model([ (prophet_name, prophet_fcst, prophet_metrics), (arima_name, arima_fcst, arima_metrics) ])
print('Best model:', best[0])
print('Prophet metrics:', prophet_metrics)
print('ARIMA metrics:', arima_metrics)

## Next steps
- Move tested functions (prepare_df, train_prophet, train_arima, eval_metrics, select_best_model) into `backend/app/core/forecasting.py`.
- Add hyperparameter grid search and time-series cross-validation in a separate notebook when ready.

In [None]:
# Additional: small hyperparameter tuning example (span tuning for exp smoothing)
# This cell provides lightweight implementations for train_exp_smoothing,
# rolling_origin_cv and cv_score so the example can run inside the notebook

# Reload the sample series so this cell also runs right after a kernel restart
df = prepare_df('../data/sample_sales.csv')
# Forecast one week per fold; train on at least 14 points or half the series
horizon = 7
initial_window = max(14, len(df) // 2)

# Minimal train_exp_smoothing implementation (local to notebook)
def train_exp_smoothing(d, periods=30, span=10):
    """Extrapolate ``periods`` steps using an EWM-based trend estimate.

    With at least two observations, the trend is the last observed ``y``
    minus the second-to-last value of the exponentially weighted mean
    (``span`` controls the smoothing); forecasts continue linearly from the
    last observation. With fewer observations, the forecast is constant
    (the single value, or 0.0 for an empty frame).

    Future timestamps continue from the last ``ds`` using the gap between the
    first two rows (one day when there are fewer than two rows).

    Returns:
        Tuple ``('exp_smoothing', forecast)`` with columns ['ds', 'yhat'].
    """
    observed = d['y'].values
    n_obs = len(observed)
    horizon_steps = range(1, periods + 1)

    if n_obs >= 2:
        smoothed = pd.Series(observed).ewm(span=span).mean()
        level = float(observed[-1])
        trend = level - float(smoothed.iloc[-2])
        preds = [level + k * trend for k in horizon_steps]
    else:
        constant = float(observed[-1]) if n_obs > 0 else 0.0
        preds = [constant] * periods

    if len(d) > 0:
        anchor = pd.to_datetime(d['ds'].iloc[-1])
    else:
        anchor = pd.Timestamp.today()

    if len(d) > 1:
        gap = pd.to_datetime(d['ds'].iloc[1]) - pd.to_datetime(d['ds'].iloc[0])
    else:
        gap = pd.Timedelta(days=1)

    future = [anchor + k * gap for k in horizon_steps]
    return 'exp_smoothing', pd.DataFrame({'ds': future, 'yhat': preds})

# Minimal rolling-origin CV: returns list of RMSEs for each fold
def rolling_origin_cv(df, model_fn, initial_window, horizon, step):
    """Rolling-origin cross-validation; returns one RMSE per fold.

    Each fold trains on ``df.iloc[:start]`` and is scored on the following
    ``horizon`` rows; ``start`` begins at ``initial_window`` and advances by
    ``step`` while a full test window still fits.

    Args:
        df: DataFrame with a ``y`` column, ordered by time.
        model_fn: Callable ``model_fn(train_df, periods=horizon)`` returning
            ``(name, forecast_df)`` where ``forecast_df`` has a ``yhat`` column
            whose last ``horizon`` rows are the out-of-sample predictions.
        initial_window: Number of rows in the first training window (>= 0).
        horizon: Number of rows forecast and scored per fold (> 0).
        step: Number of rows the origin moves between folds (> 0).

    Returns:
        List of float RMSE values, one per fold (empty if no fold fits).

    Raises:
        ValueError: If ``horizon`` or ``step`` is not positive, if
            ``initial_window`` is negative, or if ``model_fn`` returns fewer
            than ``horizon`` forecast rows.
    """
    if horizon <= 0:
        raise ValueError('horizon must be a positive integer')
    if step <= 0:
        # A non-positive step would never advance the origin (endless loop).
        raise ValueError('step must be a positive integer')
    if initial_window < 0:
        raise ValueError('initial_window must not be negative')

    n = len(df)
    results = []
    for start in range(initial_window, n - horizon + 1, step):
        train = df.iloc[:start].copy()
        test = df.iloc[start:start + horizon].copy()
        _name, fcst = model_fn(train, periods=horizon)
        if len(fcst) < horizon:
            raise ValueError(
                f'model_fn returned {len(fcst)} forecast rows, expected at least {horizon}'
            )
        # Take the tail: it is the future part both for future-only frames and
        # for frames that also contain the fitted history.
        preds = fcst['yhat'].values[-horizon:]
        rmse = float(((test['y'].values - preds) ** 2).mean() ** 0.5)
        results.append(rmse)
    return results

def cv_score(rmses):
    """Collapse per-fold RMSEs into one score: their mean, or +inf if there are no folds."""
    if len(rmses) == 0:
        # No folds were evaluated: rank this configuration last.
        return float('inf')
    import numpy as _np
    return float(_np.mean(rmses))

# Tiny illustrative grid, kept small so the search finishes quickly
params = {'spans': [5, 7, 10, 14]}


def _span_model(span_value):
    """Build a model_fn for rolling_origin_cv with the smoothing span fixed."""
    def model_fn(d, periods):
        return train_exp_smoothing(d, periods=periods, span=span_value)
    return model_fn


# (span, mean CV RMSE) of the best setting seen so far
best = (None, float('inf'))
for span in params['spans']:
    model_fn = _span_model(span)
    res = rolling_origin_cv(df, model_fn, initial_window=initial_window, horizon=horizon, step=horizon)
    score = cv_score(res)
    print(f'span={span} -> cv_rmse={score:.4f}')
    # Strict comparison: on ties the earlier span is kept
    best = (span, score) if score < best[1] else best

print('Best (example) span:', best)