# Baseline validation

Validates historical Amber pulls (usage + prices) and computes an energy-only bill baseline.

Requirements:
- Run `scripts/pull_historical.py` first to generate Parquet files.
- Ensure `AMBER_TOKEN` and `AMBER_SITE_ID` are set (e.g., `set -a; source config/.env; set +a`).


In [None]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Allow import from project
# Jupyter kernels do not define __file__, so referencing it directly raises
# NameError. Use it when available (e.g. the notebook exported to a script),
# otherwise fall back to the working directory, which is the same base the
# relative data paths in this notebook are resolved against.
import sys
try:
    _notebook_dir = Path(__file__).resolve().parent
except NameError:
    _notebook_dir = Path.cwd()
# The project root sits two directory levels above the notebook.
_project_root = str(_notebook_dir.parent.parent)
# Skip the insert on re-runs so sys.path does not accumulate duplicates.
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)
from analysis.src import baseline


: 

In [None]:
# Input locations
# The date range below must match the filenames written by
# scripts/pull_historical.py; edit both values if you pulled a different range.
start, end = "2025-01-01", "2025-01-07"
base_dir = Path("../../data_processed")
usage_path = base_dir.joinpath(f"usage_{start}_{end}.parquet")
prices_path = base_dir.joinpath(f"prices_{start}_{end}.parquet")

# Show the resolved paths as the cell output
(usage_path, prices_path)


: 

In [None]:
# Read the raw Parquet pulls into DataFrames
usage_raw = pd.read_parquet(path=usage_path)
prices_raw = pd.read_parquet(path=prices_path)

# Cell output: a preview of each frame followed by their row counts
(
    usage_raw.head(),
    prices_raw.head(),
    len(usage_raw),
    len(prices_raw),
)


In [None]:
# Normalise
# Pass the raw usage and price frames through the project's normalisers
# (the resulting schema is defined in analysis.src.baseline, not shown here).
usage = baseline.normalise_usage(usage_raw)
prices = baseline.normalise_prices(prices_raw)

# Align and compute costs
# Each step feeds the next: aligned intervals -> per-interval cost -> summary.
# Later cells read "interval_start", "usage_kwh", "interval_cost_dollars" and
# "price_c_per_kwh" from with_cost.
joined = baseline.align_intervals(usage, prices)
with_cost = baseline.compute_energy_only_cost(joined)
summary = baseline.summarise(with_cost)

# Display the summary as the cell output
summary


In [None]:
# Sanity checks
# Spacing between consecutive interval starts (chronological order), in seconds
interval_diffs = (
    with_cost.sort_values("interval_start")["interval_start"]
    .diff()
    .dt.total_seconds()
    .dropna()
)

# The most common spacing is taken as the data's interval length
if interval_diffs.empty:
    mode_interval_minutes = None
else:
    mode_interval_minutes = interval_diffs.mode().iloc[0] / 60

# The missing-data flag columns are optional; report zero when they are absent
if "missing_usage" in with_cost:
    missing_usage = int(with_cost["missing_usage"].sum())
else:
    missing_usage = 0

if "missing_price" in with_cost:
    missing_price = int(with_cost["missing_price"].sum())
else:
    missing_price = 0

print("Detected interval (minutes):", mode_interval_minutes)
print("Missing usage intervals:", missing_usage)
print("Missing price intervals:", missing_price)
print("Expected intervals (rough):", len(with_cost))


In [None]:
# Daily aggregates
# Index by interval start so resample() can bucket rows by calendar day.
# Guarded so the cell can be re-run: once "interval_start" has become the
# index it is no longer a column, and a second set_index would raise KeyError.
if "interval_start" in with_cost.columns:
    with_cost = with_cost.set_index("interval_start")

# Per-day totals for energy and cost, and the mean price across the day
by_day = with_cost.resample("D").agg({
    "usage_kwh": "sum",
    "interval_cost_dollars": "sum",
    "price_c_per_kwh": "mean",
})

# Daily usage and cost stacked on a shared date axis
fig, axes = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
by_day["usage_kwh"].plot(ax=axes[0], title="Daily total kWh", color="tab:blue")
by_day["interval_cost_dollars"].plot(ax=axes[1], title="Daily energy-only cost ($)", color="tab:orange")
plt.tight_layout()
plt.show()

# Price series at the original interval resolution (not aggregated)
fig, ax = plt.subplots(figsize=(10, 3))
with_cost["price_c_per_kwh"].plot(ax=ax, title="Price (c/kWh)", color="tab:green")
plt.tight_layout()
plt.show()


## How to run
1. Pull data (example):
```bash
set -a; source config/.env; set +a
python scripts/pull_historical.py --start 2025-01-01 --end 2025-01-07 --outdir data_processed
```
2. Update the `start`/`end` variables above to match your files.
3. Run all cells to validate interval coverage and daily costs.

Checks:
- "Detected interval" should be 5 or 30 minutes.
- Missing intervals should be near zero; investigate gaps if large.
- Plots should show reasonable daily totals and costs.
