In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import tracemalloc
import time
import json
try:
    import psutil
    psutil_available = True
    proc = psutil.Process()
except Exception:
    psutil_available = False
    import resource

def infer_file(data_dir: Path, file_arg: str = None) -> Path:
    """Pick the CSV file to work on.

    Parameters
    ----------
    data_dir : Path
        Directory searched for ``*.csv`` files when no explicit file is given.
    file_arg : str, optional
        Explicit path to use. Any truthy value takes precedence over the
        directory search.

    Returns
    -------
    Path
        ``file_arg`` as a ``Path`` when given, otherwise the most recently
        modified ``*.csv`` directly inside ``data_dir``.

    Raises
    ------
    FileNotFoundError
        If ``file_arg`` does not exist, or ``data_dir`` holds no CSV files.
    """
    if not file_arg:
        candidates = list(data_dir.glob("*.csv"))
        if not candidates:
            raise FileNotFoundError(f"No CSV files in {data_dir}")
        # Newest modification time wins.
        return max(candidates, key=lambda p: p.stat().st_mtime)

    explicit = Path(file_arg)
    if explicit.exists():
        return explicit
    raise FileNotFoundError(explicit)

def load_series(path: Path, y_col: str = "y"):
    """Read a CSV file and return its target column as a series.

    Parameters
    ----------
    path : Path
        CSV file to read.
    y_col : str, default "y"
        Name of the column holding the observations.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        A copy of ``df[y_col]`` and the full frame as read. The series is
        indexed by a ``DatetimeIndex`` built from column ``t`` when that column
        exists, is non-numeric and parses as dates; otherwise by a
        ``RangeIndex`` starting at 0.

    Raises
    ------
    ValueError
        If ``y_col`` is not a column of the file.
    """
    df = pd.read_csv(path)
    if y_col not in df.columns:
        raise ValueError(f"Column '{y_col}' not found in {path}")
    series = df[y_col].copy()

    index = pd.RangeIndex(start=0, stop=len(series), step=1)

    # pd.to_datetime happily converts plain numbers (read as nanoseconds since
    # the epoch), so a numeric step/year column would turn into meaningless
    # 1970 timestamps. Only attempt the conversion for non-numeric columns.
    if "t" in df.columns and not pd.api.types.is_numeric_dtype(df["t"]):
        try:
            index = pd.DatetimeIndex(pd.to_datetime(df["t"]))
        except (ValueError, TypeError, OverflowError):
            # 't' is not date-like after all: keep the positional index.
            pass

    series.index = index
    return series, df

def make_future_index(index, h):
    """Build an index for the ``h`` steps that follow the end of ``index``.

    Parameters
    ----------
    index : pandas.Index
        Index of the observed data.
    h : int
        Number of future steps.

    Returns
    -------
    pandas.DatetimeIndex or numpy.ndarray
        For a ``DatetimeIndex``: ``h`` timestamps continuing after the last
        one, spaced by the index's own frequency, else its inferred frequency,
        else one day. For any other index: the positions
        ``len(index) .. len(index) + h - 1``.
    """
    if not isinstance(index, pd.DatetimeIndex):
        return np.arange(len(index), len(index) + h)

    freq = index.freq
    if freq is None:
        try:
            freq = pd.infer_freq(index)
        except ValueError:
            # infer_freq refuses indexes with fewer than three timestamps;
            # treat that like "could not infer" instead of crashing.
            freq = None
    if freq is None:
        # fallback to daily
        freq = "D"

    step = pd.tseries.frequencies.to_offset(freq)
    return pd.date_range(start=index[-1] + step, periods=h, freq=step)


In [5]:
# Directories
# The project root is taken to be the parent of the current working directory.
# Input CSVs are looked up in <root>/data; forecasts, metrics and plots are
# written to <root>/output/python. Both directories are created if missing.
project_root = Path.cwd().resolve().parent
data_dir = project_root / "data"
out_dir = project_root / "output" / "python"
data_dir.mkdir(parents=True, exist_ok=True)
out_dir.mkdir(parents=True, exist_ok=True)

# Selection pattern (glob) to filter files in data/
# Example: only linear files with two numbers: linear_*_*.csv
name_glob = "season_USAccDeaths.csv"  # change to e.g., "linear_*_*.csv" or "season_*_*.csv"

# Config (set once for all files)
seasonal_periods = None   # e.g., 12 for monthly seasonality; None falls back to 12 in process_one when seasonal is set
trend = "add"             # "add", "mul", or "none" ("none" is mapped to None in process_one)
seasonal = "add"           # "add", "mul", "none", or None ("none" is mapped to None in process_one)
ycol = "y"                # name of the CSV column holding the observations
train_ratio = 0.7         # fraction of rows used for training (70/30 split)

def calc_meme(train, test, tr, sn, sp, meme):
    """Fit a Holt-Winters model on ``train`` and forecast ``len(test)`` steps.

    Parameters
    ----------
    train : array-like
        Training observations passed to ``ExponentialSmoothing``.
    test : sized
        Hold-out observations; only its length is used (forecast horizon).
    tr, sn, sp
        Forwarded as ``trend``, ``seasonal`` and ``seasonal_periods``.
    meme : bool
        When true, measure peak Python memory of each phase with
        ``tracemalloc``; when false both memory figures are 0.

    Returns
    -------
    tuple
        ``(fitted, fcast, train_time_s, predict_time_s,
        mem_used_fit_bytes, mem_used_pred_bytes)``. Each memory figure is the
        peak traced size reached during that phase minus the traced size at
        the start of the phase, so memory still held from fitting is not
        counted again for the forecast.
    """
    # Only start tracing if nobody else is already tracing, and only stop what
    # we started, so a surrounding measurement is not torn down by this call.
    started_here = False
    if meme and not tracemalloc.is_tracing():
        tracemalloc.start()
        started_here = True

    try:
        # Train
        if meme:
            tracemalloc.reset_peak()
            baseline, _ = tracemalloc.get_traced_memory()

        t0 = time.perf_counter()
        model = ExponentialSmoothing(train, trend=tr, seasonal=sn, seasonal_periods=sp, initialization_method="estimated")
        fitted = model.fit(optimized=True)
        train_time_s = time.perf_counter() - t0

        if meme:
            _, peak = tracemalloc.get_traced_memory()
            mem_used_fit_bytes = peak - baseline
        else:
            mem_used_fit_bytes = 0

        # Forecast
        if meme:
            # reset_peak() sets the peak to the *current* traced size, which
            # still contains everything kept alive from fitting; remember that
            # level so it can be subtracted below.
            tracemalloc.reset_peak()
            baseline, _ = tracemalloc.get_traced_memory()

        t0 = time.perf_counter()
        fcast = fitted.forecast(len(test))
        predict_time_s = time.perf_counter() - t0

        if meme:
            _, peak = tracemalloc.get_traced_memory()
            mem_used_pred_bytes = peak - baseline
        else:
            mem_used_pred_bytes = 0
    finally:
        # Runs on failure too, so a fit/forecast error cannot leave tracing on.
        if started_here:
            tracemalloc.stop()

    return fitted, fcast, train_time_s, predict_time_s, mem_used_fit_bytes, mem_used_pred_bytes

def process_one(csv_path: Path):
    """Run the Holt-Winters pipeline for one CSV file and write its artefacts.

    Steps: load column ``ycol`` via ``load_series``, split the series in order
    at ``train_ratio``, fit and forecast the hold-out length via ``calc_meme``
    (with memory measurement on), then write to ``out_dir``:

    * ``<stem>_hw_h<h>_s<sp>.csv``  - forecast as ``index,value`` rows
    * ``<stem>_hw_metrics.json``    - sizes, timings, memory, RMSE/MAE/MAPE
    * ``<stem>_hw_plot_h<h>.png``   - train / test / fitted / forecast plot

    and print a one-line summary.

    Reads the module-level settings ``ycol``, ``trend``, ``seasonal``,
    ``seasonal_periods``, ``train_ratio`` and ``out_dir``.

    Parameters
    ----------
    csv_path : Path
        Input CSV file.
    """
    series, _ = load_series(csv_path, ycol)
    tr = None if trend == "none" else trend
    sn = None if seasonal == "none" else seasonal
    sp = seasonal_periods if sn else None
    if sn and sp is None:
        sp = 12

    # Split (always keep at least one training row)
    split = max(1, int(len(series) * train_ratio))
    train = series.iloc[:split]
    test = series.iloc[split:]

    fitted, fcast, train_time_s, predict_time_s, mem_used_fit_bytes, mem_used_pred_bytes = calc_meme(train, test, tr, sn, sp, True)

    # Index alignment: numeric positions within the source series. The
    # forecast covers the hold-out window, so it starts right after the last
    # training row, whatever kind of index the series carries.
    idx_vals = np.arange(len(train), len(train) + len(fcast))

    # Save forecast (index, value)
    out_name = f"{csv_path.stem}_hw_h{len(fcast)}_s{sp if sp else 0}.csv"
    out_path = out_dir / out_name
    pd.DataFrame({"index": idx_vals, "value": fcast.values}).to_csv(out_path, index=False)

    # Metrics JSON (matching requested schema)
    y_true = test.values
    y_pred = fcast.values
    err = y_true - y_pred
    if len(test):
        rmse = float(np.sqrt(np.mean(err ** 2)))
        mae = float(np.mean(np.abs(err)))
        with np.errstate(divide='ignore', invalid='ignore'):
            # Zero actuals are excluded from MAPE (turned into NaN, then skipped).
            mape_arr = np.abs(err / np.where(y_true == 0, np.nan, y_true))
            mape = float(np.nanmean(mape_arr) * 100)
    else:
        rmse = mae = mape = float("nan")

    metrics = {
        "file": str(csv_path),
        "n_total": int(len(series)),
        "n_train": int(len(train)),
        "n_test": int(len(test)),
        "train_time_s": float(train_time_s),
        "predict_time_s": float(predict_time_s),
        "mem_used_fit_bytes": int(mem_used_fit_bytes),
        "mem_used_pred_bytes": int(mem_used_pred_bytes),
        "rmse": rmse,
        "mae": mae,
        "mape_pct": mape,
        "forecast_csv": str(out_path),
    }
    metrics_path = out_dir / f"{csv_path.stem}_hw_metrics.json"
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    # Plot
    plot_path = out_dir / f"{csv_path.stem}_hw_plot_h{len(fcast)}.png"
    fig = plt.figure(figsize=(10, 5))
    try:
        plt.plot(train.index, train.values, label="train", marker="o")
        plt.plot(test.index, test.values, label="test", marker="o")
        plt.plot(train.index, fitted.fittedvalues, label="fitted (train)", alpha=0.7)
        # The forecast has exactly one value per test row, so draw it on the
        # test index; the forecast's own index can be a bare integer range
        # when the date index carries no frequency.
        plt.plot(test.index, fcast.values, label="forecast", marker="o")
        plt.legend()
        plt.title(f"Holt-Winters forecast ({csv_path.name})")
        plt.tight_layout()
        plt.savefig(plot_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

    print(f"Done: {csv_path.name} → {out_path.name}, RMSE={rmse:.4f}, MAPE={mape:.2f}%")

In [6]:
# Run the pipeline on every CSV in data/ whose name matches the glob pattern.
# A failure on one file is reported and the batch carries on with the next.
matching_files = sorted(data_dir.glob(name_glob))
for csv_path in matching_files:
    try:
        process_one(csv_path)
    except Exception as err:
        print(f"Failed {csv_path.name}: {err}")

Done: season_USAccDeaths.csv → season_USAccDeaths_hw_h22_s12.csv, RMSE=408.2164, MAPE=3.45%
