In [26]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import tracemalloc
import time
import json
try:
    import psutil
    psutil_available = True
    proc = psutil.Process()
except Exception:
    psutil_available = False
    import resource

def infer_file(data_dir: Path, file_arg: str = None) -> Path:
    """Pick the CSV file to work on.

    Args:
        data_dir: Directory searched for ``*.csv`` files when no explicit
            file is given.
        file_arg: Optional explicit path. When truthy it is used as-is and
            ``data_dir`` is not consulted.

    Returns:
        ``Path(file_arg)`` if it was given and exists; otherwise the CSV in
        ``data_dir`` with the most recent modification time.

    Raises:
        FileNotFoundError: If the explicit file does not exist, or if
            ``data_dir`` contains no CSV files.
    """
    if not file_arg:
        # No explicit choice: newest CSV (by mtime) wins.
        candidates = list(data_dir.glob("*.csv"))
        candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True)
        if candidates:
            return candidates[0]
        raise FileNotFoundError(f"No CSV files in {data_dir}")

    explicit = Path(file_arg)
    if explicit.exists():
        return explicit
    raise FileNotFoundError(explicit)

def load_series(path: Path, y_col: str = "y"):
    """Read a CSV and return the target column as a Series plus the raw frame.

    The returned series is indexed by the parsed ``t`` column when that column
    exists and holds non-numeric, datetime-parseable values. In every other
    case (no ``t`` column, numeric ``t``, unparseable ``t``) it is indexed
    positionally with a ``RangeIndex`` starting at 0.

    Args:
        path: CSV file to read.
        y_col: Name of the column holding the observations.

    Returns:
        A ``(series, df)`` tuple: a copy of ``df[y_col]`` with the index
        described above, and the unmodified DataFrame read from ``path``.

    Raises:
        ValueError: If ``y_col`` is not a column of the CSV.
    """
    df = pd.read_csv(path)
    if y_col not in df.columns:
        raise ValueError(f"Column '{y_col}' not found in {path}")

    series = df[y_col].copy()
    # Default: positional index. Replaced below only for a real datetime column.
    series.index = pd.RangeIndex(start=0, stop=len(series), step=1)

    # pd.to_datetime does NOT fail on numbers: it reads them as nanoseconds
    # since the epoch, so a plain step counter (0, 1, 2, ...) would silently
    # become a meaningless DatetimeIndex. Only parse non-numeric columns.
    if "t" in df.columns and not pd.api.types.is_numeric_dtype(df["t"]):
        try:
            series.index = pd.to_datetime(df["t"])
        except (ValueError, TypeError, OverflowError):
            # Best effort: an unparseable 't' keeps the positional index.
            pass
    return series, df

def make_future_index(index, h):
    """Build an index for the ``h`` steps that follow ``index``.

    Args:
        index: Index of the observed data. A ``pd.DatetimeIndex`` yields
            future timestamps; anything else yields integer positions.
        h: Number of future steps.

    Returns:
        For a ``DatetimeIndex``: a ``DatetimeIndex`` of ``h`` timestamps
        starting one step after ``index[-1]``, using the index's own
        frequency, else an inferred one, else daily. Otherwise a NumPy array
        ``len(index) .. len(index) + h - 1``.

    Raises:
        IndexError: If ``index`` is an empty ``DatetimeIndex`` (there is no
            last timestamp to continue from).
    """
    if not isinstance(index, pd.DatetimeIndex):
        return np.arange(len(index), len(index) + h)

    freq = index.freq
    if freq is None:
        try:
            freq = pd.infer_freq(index)
        except ValueError:
            # infer_freq refuses indexes with fewer than 3 timestamps; treat
            # that like "could not infer" so the daily fallback applies.
            freq = None
    if freq is None:
        # fallback to daily
        freq = "D"

    step = pd.tseries.frequencies.to_offset(freq)
    return pd.date_range(start=index[-1] + step, periods=h, freq=freq)


In [None]:
# Directories: inputs are read from <parent of cwd>/data and results are
# written to <parent of cwd>/output/python. Both are created if missing.
project_root = Path.cwd().resolve().parent
data_dir = project_root / "data"
out_dir = project_root / "output" / "python"
data_dir.mkdir(parents=True, exist_ok=True)
out_dir.mkdir(parents=True, exist_ok=True)

# Selection pattern (glob) used to choose which files in data/ are processed.
# Example: only linear files with two numbers: linear_*_*.csv
name_glob = "*.csv"  # change to e.g., "linear_*_*.csv" or "season_*_*.csv"

# Model/evaluation config (set once, read by process_one for every file)
seasonal_periods = None   # e.g., 12 for monthly seasonality; process_one uses 12 if seasonal is set and this is None
trend = "add"             # "add", "mul", or "none" ("none" is mapped to None)
seasonal = None           # "add", "mul", or None (the string "none" is also mapped to None)
ycol = "y"                # name of the CSV column holding the observations
train_ratio = 0.7         # fraction of rows used for training (70/30 split)

def _rss_bytes() -> int:
    """Return a resident-memory reading for this process, in bytes.

    With psutil this is the current RSS. Without it, the reading comes from
    ``resource.getrusage`` and is the *peak* RSS so far, so differences
    between two readings only show growth of the peak.
    """
    if psutil_available:
        return proc.memory_info().rss
    import sys

    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux.
    return peak if sys.platform == "darwin" else peak * 1024


def process_one(csv_path: Path):
    """Fit Holt-Winters on one CSV, forecast the hold-out part, save results.

    Reads the module-level settings ``ycol``, ``trend``, ``seasonal``,
    ``seasonal_periods``, ``train_ratio`` and ``out_dir``. The series is split
    chronologically (first ``train_ratio`` share for training, the rest for
    testing), an ``ExponentialSmoothing`` model is fitted on the training part
    and used to forecast as many steps as there are test rows.

    Written to ``out_dir``:
        * ``<stem>_hw_h<H>_s<S>.csv`` - forecast as ``index,value`` rows, where
          ``index`` is the position of each forecast step in the source data.
        * ``<stem>_hw_metrics.json`` - sizes, timings, memory deltas and
          RMSE / MAE / MAPE against the test rows.
        * ``<stem>_hw_plot_h<H>.png`` - train, test, fitted and forecast lines.

    Args:
        csv_path: CSV file to process; loaded with ``load_series``.

    Returns:
        None. A one-line summary is printed on success.

    Raises:
        Whatever ``load_series``, model fitting, forecasting or file writing
        raise; nothing is caught here except the plotting fallback.
    """
    series, _ = load_series(csv_path, ycol)
    tr = None if trend == "none" else trend
    sn = None if seasonal == "none" else seasonal
    sp = seasonal_periods if sn else None
    if sn and sp is None:
        # Seasonal model requested without a period: default to 12.
        sp = 12

    # Split (always keep at least one training row)
    split = max(1, int(len(series) * train_ratio))
    train = series.iloc[:split]
    test = series.iloc[split:]

    # Train
    mem_before = _rss_bytes()
    t0 = time.perf_counter()
    model = ExponentialSmoothing(
        train,
        trend=tr,
        seasonal=sn,
        seasonal_periods=sp,
        initialization_method="estimated",
    )
    fitted = model.fit(optimized=True)
    train_time_s = time.perf_counter() - t0
    mem_after_fit = _rss_bytes()
    # Clamp at 0: RSS can shrink between readings.
    mem_used_fit_bytes = max(0, mem_after_fit - mem_before)

    # Forecast
    t0 = time.perf_counter()
    fcast = fitted.forecast(len(test))
    predict_time_s = time.perf_counter() - t0
    mem_after_pred = _rss_bytes()
    mem_used_pred_bytes = max(0, mem_after_pred - mem_after_fit)

    # Index alignment: numeric index like source. The forecast covers the
    # test window, so its first step sits at position len(train) -- not at
    # len(series), which would label the values as lying beyond the data.
    positional_idx = np.arange(len(train), len(train) + len(fcast))
    if isinstance(series.index, pd.DatetimeIndex):
        idx_vals = positional_idx
    else:
        try:
            idx_vals = fcast.index.astype(int)
        except (TypeError, ValueError):
            idx_vals = positional_idx

    # Save forecast (index, value)
    out_name = f"{csv_path.stem}_hw_h{len(fcast)}_s{sp if sp else 0}.csv"
    out_path = out_dir / out_name
    pd.DataFrame({"index": idx_vals, "value": fcast.values}).to_csv(out_path, index=False)

    # Metrics JSON (matching requested schema)
    y_true = test.values
    y_pred = fcast.values
    err = y_true - y_pred
    rmse = float(np.sqrt(np.mean(err ** 2))) if len(test) else float("nan")
    mae = float(np.mean(np.abs(err))) if len(test) else float("nan")
    with np.errstate(divide='ignore', invalid='ignore'):
        # Zero targets are masked out so they do not produce infinite ratios.
        mape_arr = np.abs(err / np.where(y_true == 0, np.nan, y_true)) if len(test) else np.array([np.nan])
        mape = float(np.nanmean(mape_arr) * 100)

    metrics = {
        "file": str(csv_path),
        "n_total": int(len(series)),
        "n_train": int(len(train)),
        "n_test": int(len(test)),
        "train_time_s": float(train_time_s),
        "predict_time_s": float(predict_time_s),
        "mem_used_fit_bytes": int(mem_used_fit_bytes),
        "mem_used_pred_bytes": int(mem_used_pred_bytes),
        "rmse": rmse,
        "mae": mae,
        "mape_pct": mape,
        "forecast_csv": str(out_path),
    }
    metrics_path = out_dir / f"{csv_path.stem}_hw_metrics.json"
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    # Plot
    plot_path = out_dir / f"{csv_path.stem}_hw_plot_h{len(fcast)}.png"
    fig = plt.figure(figsize=(10, 5))
    try:
        plt.plot(train.index, train.values, label="train", marker="o")
        plt.plot(test.index, test.values, label="test", marker="o")
        plt.plot(train.index, fitted.fittedvalues, label="fitted (train)", alpha=0.7)
        try:
            plt.plot(fcast.index, fcast.values, label="forecast", marker="o")
        except Exception:
            # Forecast index not plottable on this axis: use positions instead.
            plt.plot(idx_vals, fcast.values, label="forecast", marker="o")
        plt.legend()
        plt.title(f"Holt-Winters forecast ({csv_path.name})")
        plt.tight_layout()
        plt.savefig(plot_path)
    finally:
        # Always release the figure, even if plotting or saving failed;
        # otherwise figures pile up across the per-file loop.
        plt.close(fig)

    print(f"Done: {csv_path.name} → {out_path.name}, RMSE={rmse:.4f}, MAPE={mape:.2f}%")

In [None]:
# Run the pipeline over every file in data/ that matches name_glob, in
# name-sorted order. A failure on one file is reported and the loop moves on.
matched_files = sorted(data_dir.glob(name_glob))
for csv in matched_files:
    try:
        process_one(csv)
    except Exception as e:
        print(f"Failed {csv.name}: {e}")

AR_10000_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: AR_10000_0.csv → AR_10000_0_hw_h3900_s0.csv, RMSE=1.8530, MAPE=0.02%
AR_10000_01.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: AR_10000_01.csv → AR_10000_01_hw_h3900_s0.csv, RMSE=3.0224, MAPE=0.03%
AR_10000_02.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: AR_10000_02.csv → AR_10000_02_hw_h3900_s0.csv, RMSE=3.0405, MAPE=0.03%
AR_10000_03.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: AR_10000_03.csv → AR_10000_03_hw_h3900_s0.csv, RMSE=3.0999, MAPE=0.03%
AR_1000_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: AR_1000_0.csv → AR_1000_0_hw_h391_s0.csv, RMSE=1.6222, MAPE=0.17%
AR_100_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: AR_100_0.csv → AR_100_0_hw_h39_s0.csv, RMSE=1.5440, MAPE=1.51%
AR_500_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: AR_500_0.csv → AR_500_0_hw_h196_s0.csv, RMSE=3.0961, MAPE=0.72%
AR_50_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: AR_50_0.csv → AR_50_0_hw_h20_s0.csv, RMSE=0.8629, MAPE=1.71%
linear_10000_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: linear_10000_0.csv → linear_10000_0_hw_h3900_s0.csv, RMSE=1.0021, MAPE=0.01%
linear_10000_01.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: linear_10000_01.csv → linear_10000_01_hw_h3900_s0.csv, RMSE=1.8504, MAPE=0.01%
linear_10000_02.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: linear_10000_02.csv → linear_10000_02_hw_h3900_s0.csv, RMSE=2.4730, MAPE=0.02%
linear_10000_03.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: linear_10000_03.csv → linear_10000_03_hw_h3900_s0.csv, RMSE=2.9914, MAPE=0.02%
linear_1000_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: linear_1000_0.csv → linear_1000_0_hw_h391_s0.csv, RMSE=0.9961, MAPE=0.10%
linear_100_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: linear_100_0.csv → linear_100_0_hw_h39_s0.csv, RMSE=0.8909, MAPE=0.86%
linear_500_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: linear_500_0.csv → linear_500_0_hw_h196_s0.csv, RMSE=1.0261, MAPE=0.21%
linear_50_0.csv: trend=add, seasonal=None, seasonal_periods=None


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: linear_50_0.csv → linear_50_0_hw_h20_s0.csv, RMSE=0.8340, MAPE=1.33%
season_10000_0.csv: trend=add, seasonal=add, seasonal_periods=24


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: season_10000_0.csv → season_10000_0_hw_h3900_s24.csv, RMSE=1.0026, MAPE=0.01%
season_10000_01.csv: trend=add, seasonal=add, seasonal_periods=24


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: season_10000_01.csv → season_10000_01_hw_h3900_s24.csv, RMSE=1.8558, MAPE=0.01%
season_10000_02.csv: trend=add, seasonal=add, seasonal_periods=24


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: season_10000_02.csv → season_10000_02_hw_h3900_s24.csv, RMSE=2.4828, MAPE=0.02%
season_10000_03.csv: trend=add, seasonal=add, seasonal_periods=24


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: season_10000_03.csv → season_10000_03_hw_h3900_s24.csv, RMSE=2.9952, MAPE=0.02%
season_1000_0.csv: trend=add, seasonal=add, seasonal_periods=24


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: season_1000_0.csv → season_1000_0_hw_h391_s24.csv, RMSE=1.0130, MAPE=0.10%
season_100_0.csv: trend=add, seasonal=add, seasonal_periods=24


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: season_100_0.csv → season_100_0_hw_h39_s24.csv, RMSE=0.9653, MAPE=1.03%
season_500_0.csv: trend=add, seasonal=add, seasonal_periods=24


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Done: season_500_0.csv → season_500_0_hw_h196_s24.csv, RMSE=1.0501, MAPE=0.22%
season_50_0.csv: trend=add, seasonal=add, seasonal_periods=24
Failed season_50_0.csv: Cannot compute initial seasonals using heuristic method with less than two full seasonal cycles in the data.


  self._init_dates(dates, freq)
