## Load outputs

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_recall_fscore_support

pred = pd.read_parquet("../data/anomaly_outputs.parquet").sort_values("date").reset_index(drop=True)
pred.head()

Unnamed: 0,date,y_true,y_pred_naive,y_pred_rf,y_pred_gbr,is_anomaly,resid_rf,abs_resid_rf,resid_gbr,abs_resid_gbr,z_resid_rf,anom_z,anom_iso
0,2022-08-02,136.91788,128.837856,128.657569,130.447388,0,8.260311,8.260311,6.470492,6.470492,,0,0
1,2022-08-03,128.704878,136.91788,129.77475,131.850201,0,-1.069872,1.069872,-3.145323,3.145323,,0,0
2,2022-08-04,130.759613,128.704878,126.452089,126.814752,0,4.307523,4.307523,3.94486,3.94486,,0,0
3,2022-08-05,120.303069,130.759613,115.783941,116.297529,0,4.519128,4.519128,4.00554,4.00554,,0,0
4,2022-08-06,106.00191,120.303069,109.138298,108.144682,0,-3.136388,3.136388,-2.142772,2.142772,,0,0


## Forecasting metrics table

In [3]:
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

def mae(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

y = pred["y_true"].values

rows = []
for name, col in [
    ("Naive", "y_pred_naive"),
    ("RandomForest", "y_pred_rf"),
    ("HistGradientBoosting", "y_pred_gbr"),
]:
    rows.append({
        "model": name,
        "rmse": rmse(y, pred[col].values),
        "mae": mae(y, pred[col].values),
    })

forecast_metrics = pd.DataFrame(rows).sort_values("rmse")
forecast_metrics

Unnamed: 0,model,rmse,mae
1,RandomForest,99.412329,7.690055
2,HistGradientBoosting,123.404215,8.975648
0,Naive,146.023259,9.409141


## Anomaly metrics table (if labels exist)

In [4]:
if "is_anomaly" in pred.columns and pred["is_anomaly"].notna().any():
    y_true = pred["is_anomaly"].fillna(0).astype(int).values
    rows = []
    for name, col in [("Z-score(resid)", "anom_z"), ("IsolationForest(resid)", "anom_iso")]:
        y_pred = pred[col].astype(int).values
        p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
        rows.append({"method": name, "precision": p, "recall": r, "f1": f1})
    anom_metrics = pd.DataFrame(rows).sort_values("f1", ascending=False)
    anom_metrics
else:
    print("No anomaly labels available. Compare methods by stability, rate, and qualitative review.")

## Operational comparison: anomaly rate + stability

In [5]:
rate_z = pred["anom_z"].mean()
rate_iso = pred["anom_iso"].mean()

summary = pd.DataFrame([
    {"method": "Z-score(resid)", "anomaly_rate": rate_z},
    {"method": "IsolationForest(resid)", "anomaly_rate": rate_iso},
]).sort_values("anomaly_rate")

summary

Unnamed: 0,method,anomaly_rate
0,Z-score(resid),0.013445
1,IsolationForest(resid),0.020168


## Export results (for experiments/results.md)

In [6]:
forecast_metrics.to_csv("../experiments/forecast_metrics.csv", index=False)

if "is_anomaly" in pred.columns and pred["is_anomaly"].notna().any():
    anom_metrics.to_csv("../experiments/anomaly_metrics.csv", index=False)

print("Saved experiments metrics to ../experiments/")

Saved experiments metrics to ../experiments/
