# EDA + Feature Engineering
Load the operations data, explore it, and save the engineered feature dataset.

In [11]:
from __future__ import annotations
import pandas as pd
from pathlib import Path
from typing import Optional

DATA_DIR = Path("Data")

# Expected columns in operations_daily: date, site_id, units_produced, power_kwh, downtime_minutes (optional)
# Expected columns in site_meta: site_id, region, capacity, ...

def read_operations(path: Optional[str] = None, data_dir: Path = DATA_DIR) -> pd.DataFrame:
    """Load the daily operations CSV, sorted by ``site_id`` then ``date``.

    The file is resolved in this order:

    1. ``path``, when it is given (and non-empty).
    2. The ``operations_daily_*.csv`` file in ``data_dir`` whose last
       underscore-separated token is ``<N>d`` with the largest ``N``
       (e.g. ``operations_daily_120d.csv`` beats ``operations_daily_30d.csv``).
       Files without such a token count as 0 days; ties go to the first
       name in sorted order.
    3. ``data_dir / "operations_daily.csv"``.

    Args:
        path: Explicit CSV location; skips discovery when provided.
        data_dir: Directory searched when ``path`` is not provided.

    Returns:
        The operations table with ``date`` parsed as datetimes and a fresh
        0..n-1 index.

    Raises:
        ValueError: The chosen file lacks one of the required columns.
        FileNotFoundError: No candidate file exists in ``data_dir``.
    """
    required_cols = {"date", "site_id", "units_produced", "power_kwh"}
    data_dir = Path(data_dir)

    def _load(p: Path) -> pd.DataFrame:
        # Validate against the header row before the real read: asking
        # read_csv to parse a "date" column that does not exist raises its
        # own error, which would bypass the required-columns message.
        header = pd.read_csv(p, nrows=0)
        missing = required_cols.difference(header.columns)
        if missing:
            # Sorted so the message is stable from run to run.
            raise ValueError(f"{p.name} missing required columns: {sorted(missing)}")
        df = pd.read_csv(p, parse_dates=["date"])  # type: ignore[arg-type]
        return df.sort_values(["site_id", "date"]).reset_index(drop=True)

    def _days(p: Path) -> int:
        # "operations_daily_90d" -> 90; anything else -> 0.
        token = p.stem.split("_")[-1]
        digits = token[:-1]
        return int(digits) if token.endswith("d") and digits.isdigit() else 0

    if path:
        return _load(Path(path))

    candidates = sorted(data_dir.glob("operations_daily_*.csv"))
    if candidates:
        return _load(max(candidates, key=_days))

    fallback = data_dir / "operations_daily.csv"
    if fallback.exists():
        return _load(fallback)

    # Report the directory that was actually searched, not a fixed name.
    raise FileNotFoundError(f"No operations_daily CSV found in {data_dir}/")

def read_site_meta(path: Optional[str] = None) -> pd.DataFrame:
    """Read the site metadata CSV into a DataFrame.

    Args:
        path: Location of the CSV. When omitted (or empty) the file
            ``site_meta.csv`` inside ``DATA_DIR`` is used.

    Returns:
        The metadata table exactly as parsed by ``pd.read_csv``.

    Raises:
        FileNotFoundError: The resolved file does not exist.
    """
    if path:
        meta_path = Path(path)
    else:
        meta_path = DATA_DIR / "site_meta.csv"

    if meta_path.exists():
        return pd.read_csv(meta_path)
    raise FileNotFoundError(f"site_meta not found at {meta_path}")

In [12]:
from __future__ import annotations
import pandas as pd
import numpy as np

# Feature engineering utilities
# - Calendar features
# - Rolling means as baselines
# - Site metadata join

def add_calendar_features(df: pd.DataFrame, date_col: str = "date") -> pd.DataFrame:
    """Return a copy of ``df`` with calendar columns derived from ``date_col``.

    ``date_col`` is converted with ``pd.to_datetime`` and these columns are
    added (or overwritten): ``dow`` (0=Mon), ``dom``, ``month``, ``week``
    (ISO week number) and ``is_weekend`` (1 when ``dow`` is 5 or 6, else 0).
    The input frame is left untouched.
    """
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col])

    # Column name -> how to pull that part out of the datetime series.
    extractors = {
        "dow": lambda stamps: stamps.dt.dayofweek,  # 0=Mon
        "dom": lambda stamps: stamps.dt.day,
        "month": lambda stamps: stamps.dt.month,
        "week": lambda stamps: stamps.dt.isocalendar().week.astype(int),
    }
    for column, extract in extractors.items():
        out[column] = extract(out[date_col])

    out["is_weekend"] = out["dow"].ge(5).astype(int)
    return out


def add_rolling_features(df: pd.DataFrame,
                         by: list[str] | None = None,
                         date_col: str = "date",
                         targets: list[str] | None = None,
                         windows: list[int] | None = None) -> pd.DataFrame:
    """Add per-group rolling mean and standard deviation columns.

    The frame is copied and sorted by ``by + [date_col]``; the original index
    labels are kept, so the returned rows may be in a different order than
    the input. For every target ``t`` present in the frame and every window
    ``w``, the columns ``{t}_rollmean_{w}`` and ``{t}_rollstd_{w}`` are added.
    Each window ends at (and includes) the current row and needs at least
    ``max(1, w // 2)`` observations, otherwise the value is NaN.

    Args:
        df: Input table; not modified.
        by: Grouping columns. Defaults to ``["site_id"]``.
        date_col: Column that orders rows inside each group.
        targets: Columns to smooth. Defaults to
            ``["units_produced", "power_kwh"]``. Missing columns are skipped.
        windows: Window lengths in rows. Defaults to ``[3, 7, 14, 28]``.

    Returns:
        The sorted copy with the rolling columns appended.
    """
    # None sentinels instead of list literals in the signature, so no list
    # object is shared between calls; list() also accepts tuples and the like.
    by = ["site_id"] if by is None else list(by)
    targets = ["units_produced", "power_kwh"] if targets is None else list(targets)
    windows = [3, 7, 14, 28] if windows is None else list(windows)

    d = df.copy()
    d = d.sort_values(by + [date_col])
    for tgt in targets:
        if tgt not in d.columns:
            continue
        # The grouping is the same for every window, so build it once.
        grouped = d.groupby(by)[tgt]
        for w in windows:
            min_periods = max(1, w // 2)
            d[f"{tgt}_rollmean_{w}"] = grouped.transform(
                lambda s: s.rolling(w, min_periods=min_periods).mean()
            )
            d[f"{tgt}_rollstd_{w}"] = grouped.transform(
                lambda s: s.rolling(w, min_periods=min_periods).std()
            )
    return d


def join_site_meta(ops: pd.DataFrame, site_meta: pd.DataFrame) -> pd.DataFrame:
    """Left-join site metadata onto the operations table via ``site_id``.

    Both ``site_id`` columns are cast to ``str`` so the join does not depend
    on how each file happened to be parsed. Every other object-dtype column
    in the metadata is replaced by its integer category code (missing values
    become -1). Neither input frame is modified.

    Args:
        ops: Operations rows; must contain ``site_id``.
        site_meta: One row per site; must contain ``site_id``.

    Returns:
        A new frame with all rows of ``ops`` plus the metadata columns;
        rows whose site has no metadata get NaN in those columns.
    """
    # assign() returns new frames, so the caller's ops and site_meta keep
    # their original site_id dtype.
    left = ops.assign(site_id=ops["site_id"].astype(str))
    meta = site_meta.assign(site_id=site_meta["site_id"].astype(str))

    # Encode categoricals except join key
    for c in meta.select_dtypes(include=["object"]).columns:
        if c == "site_id":
            continue
        try:
            meta[c] = meta[c].astype("category").cat.codes
        except (TypeError, ValueError):
            # Best effort: a column that cannot be turned into categories
            # (e.g. unhashable cell values) is carried through unencoded.
            continue

    return left.merge(meta, on="site_id", how="left")


def prepare_features(ops: pd.DataFrame, site_meta: pd.DataFrame | None = None) -> pd.DataFrame:
    """Build the feature table from raw operations rows.

    Calendar features are added first, then the rolling statistics. When
    ``site_meta`` is supplied, its columns are joined on afterwards;
    otherwise the result is returned without metadata columns.
    """
    enriched = add_rolling_features(add_calendar_features(ops))
    if site_meta is None:
        return enriched
    return join_site_meta(enriched, site_meta)


In [13]:
import pandas as pd
from pathlib import Path

# Load both inputs from their default locations under DATA_DIR.
ops = read_operations()
meta = read_site_meta()
# Calendar features, then rolling statistics, then the site metadata join.
features = prepare_features(ops, meta)
# Last expression of the cell: show the first rows of the feature table.
features.head()

Unnamed: 0,date,site_id,units_produced,downtime_minutes,power_kwh,rework_units,defects_ppm,staff_count,material_cost_per_unit,price_per_unit,...,power_kwh_rollstd_3,power_kwh_rollmean_7,power_kwh_rollstd_7,power_kwh_rollmean_14,power_kwh_rollstd_14,power_kwh_rollmean_28,power_kwh_rollstd_28,region,commissioned_year,shift_hours_per_day
0,2025-01-01,S1,1280,34,4211,27,453,56,71.94,96.04,...,,,,,,,,2,2018,20
1,2025-01-02,S1,1249,193,5471,40,480,55,71.87,95.9,...,890.954544,,,,,,,2,2018,20
2,2025-01-03,S1,1163,43,4178,33,454,53,72.04,96.11,...,737.1723,4620.0,737.1723,,,,,2,2018,20
3,2025-01-04,S1,1348,32,4554,35,387,54,72.26,95.99,...,665.095732,4603.5,602.802621,,,,,2,2018,20
4,2025-01-05,S1,0,0,0,0,0,0,0.0,0.0,...,2527.712273,3682.8,2123.904588,,,,,2,2018,20
