### Aggregate Historical Demand per Component (from CSV)

In [None]:
import os
import pandas as pd

# ---- Paths ----
# Candidate locations of the production-orders CSV.
RAW_PO = "raw_data/production_orders.csv"
CLEAN_PO = "clean_data/production_orders_clean.csv"

# Make sure the output folder exists before anything is written to it.
os.makedirs("clean_data", exist_ok=True)

In [None]:
# ---- Load production orders ----
# Read the cleaned file when it is present on disk, otherwise the raw export.
if os.path.exists(CLEAN_PO):
    po_path = CLEAN_PO
else:
    po_path = RAW_PO

# The "date" column is parsed to datetimes while reading.
po = pd.read_csv(po_path, parse_dates=["date"])

In [None]:
# Sanity: required columns
# Stop early with an explicit message when the loaded file lacks a column
# that the aggregations below depend on.
req_cols = {"date", "component_id", "units_required", "units_issued"}
missing = {name for name in req_cols if name not in po.columns}
if missing:
    raise ValueError(f"Missing required columns in {po_path}: {missing}")

In [None]:
# ---- DAILY aggregation ----
# One row per (component, date): summed units required and summed units issued.
_daily_totals = {
    "actual_units": ("units_required", "sum"),
    "actual_issued": ("units_issued", "sum"),
}
daily = po.groupby(["component_id", "date"], as_index=False).agg(**_daily_totals)
daily = daily.sort_values(["component_id", "date"])

# Persist the daily table (without the index column).
daily.to_csv("clean_data/historical_demand_daily.csv", index=False)

In [None]:
# ---- MONTHLY aggregation with complete month range per component ----
# Tag each daily row with its calendar month as a monthly Period.
daily["month"] = daily["date"].dt.to_period("M")

# Roll the daily totals up to one row per (component, month).
_monthly_totals = {
    "actual_units": ("actual_units", "sum"),
    "actual_issued": ("actual_issued", "sum"),
}
monthly = daily.groupby(["component_id", "month"], as_index=False).agg(**_monthly_totals)
monthly = monthly.sort_values(["component_id", "month"])

def _complete_months(df_comp: pd.DataFrame) -> pd.DataFrame:
    """Return one component's monthly rows with every missing month added.

    The frame is re-indexed onto the continuous monthly range between its
    first and last "month". Months that had no row get the component's id
    and 0 for both "actual_units" and "actual_issued".

    Args:
        df_comp: Rows of a single component with the columns "component_id",
            "month" (monthly periods), "actual_units" and "actual_issued".

    Returns:
        A frame with "month" as the first column and a fresh integer index.
    """
    component = df_comp["component_id"].iloc[0]

    # Continuous month range spanning this component's observed history.
    first_month = df_comp["month"].min()
    last_month = df_comp["month"].max()
    span = pd.period_range(first_month, last_month, freq="M")

    filled = df_comp.set_index("month").reindex(span)
    filled.index.name = "month"

    # Rows created by the reindex are all-NaN: restore the id, zero the totals.
    filled["component_id"] = component
    for total_col in ("actual_units", "actual_issued"):
        filled[total_col] = filled[total_col].fillna(0)

    return filled.reset_index()

# Fill calendar gaps one component at a time.
# An explicit loop over the groups is used instead of DataFrameGroupBy.apply:
# _complete_months reads the "component_id" column of each group, and recent
# pandas releases deprecate handing the grouping column to apply. Iterating
# the groups always yields the full rows, on every pandas version.
_completed = [
    _complete_months(_grp)
    for _, _grp in monthly.groupby("component_id")
]

# With no input rows there is nothing to concatenate; keep an empty frame.
monthly_full = (
    pd.concat(_completed) if _completed else monthly.copy()
).sort_values(["component_id", "month"])

In [None]:
# Convert Period to 'YYYY-MM' string for portability
monthly_full["month"] = monthly_full["month"].astype(str)

# Write the gap-filled monthly table (without the index column).
monthly_full.to_csv("clean_data/historical_demand_monthly.csv", index=False)

# Report both output files of this section.
print("Saved:")
for _written in (
    "clean_data/historical_demand_daily.csv",
    "clean_data/historical_demand_monthly.csv",
):
    print(f" - {_written}")

### Rolling Averages & Seasonal Decomposition (STL) per Component

In [None]:
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL

# Create the output folder if needed; exist_ok avoids an error when it is already there.
os.makedirs("clean_data", exist_ok=True)

In [None]:
# --- Load monthly demand ---
# `path` is kept in a variable because the column check that follows
# includes it in its error message.
path = "clean_data/historical_demand_monthly.csv"
df = pd.read_csv(path)

In [None]:
# Ensure required columns
# Stop early with an explicit message when the monthly file lacks a column
# that the feature computation depends on.
required = {"component_id", "month", "actual_units", "actual_issued"}
missing = {name for name in required if name not in df.columns}
if missing:
    raise ValueError(f"Missing required columns in {path}: {missing}")

In [None]:
# Normalize month to Timestamp (month start), then sort by component and
# month so the rolling windows and the decomposition see rows in time order.
df = (
    df.assign(month=pd.to_datetime(df["month"], format="%Y-%m"))
      .sort_values(["component_id", "month"])
      .reset_index(drop=True)
)

# Container for results: one feature frame per component is appended here.
out_parts = []

In [None]:
# Process each component separately
# Start from an empty list here so that re-running this cell does not append
# a second copy of every component to results collected by an earlier run.
out_parts = []
stl_cols = ["stl_trend", "stl_seasonal", "stl_resid"]

for comp_id, g in df.groupby("component_id", sort=False):
    g = g.sort_values("month").copy()
    g.set_index("month", inplace=True)

    # Rolling features (centered to reduce phase lag, with min periods)
    g["ra_3m"]  = g["actual_units"].rolling(window=3, min_periods=2, center=True).mean()
    g["ra_6m"]  = g["actual_units"].rolling(window=6, min_periods=3, center=True).mean()
    g["rstd_6m"] = g["actual_units"].rolling(window=6, min_periods=3, center=True).std()

    # STL columns default to NaN; they are only filled in when the
    # decomposition below is attempted and succeeds.
    for _col in stl_cols:
        g[_col] = np.nan

    # STL decomposition (monthly seasonality = 12); needs at least 12
    # rows and at least 12 non-missing demand values.
    n = len(g)
    if n >= 12 and g["actual_units"].notna().sum() >= 12:
        # Fill small gaps for STL stability
        series = g["actual_units"].astype(float).interpolate(limit_direction="both")
        try:
            stl = STL(series, period=12, robust=True)
            res = stl.fit()
        except Exception as exc:
            # Best effort: keep the NaN columns, but report which component
            # failed and why instead of hiding the error.
            print(f" STL skipped for component {comp_id}: {exc}")
        else:
            g["stl_trend"]    = res.trend
            g["stl_seasonal"] = res.seasonal
            g["stl_resid"]    = res.resid

    g["component_id"] = comp_id
    out_parts.append(g.reset_index())

# Concatenate and save
# Stack the per-component frames into one table with a fresh index.
features = pd.concat(out_parts, axis=0, ignore_index=True)

# Order and tidy columns: identifiers and raw totals, rolling stats, STL parts.
cols = (
    ["component_id", "month", "actual_units", "actual_issued"]
    + ["ra_3m", "ra_6m", "rstd_6m"]
    + ["stl_trend", "stl_seasonal", "stl_resid"]
)
features = features.loc[:, cols]
features = features.sort_values(["component_id", "month"]).copy()

# Save
out_path = "clean_data/demand_rolling_stl_features.csv"
features.to_csv(out_path, index=False)

# Confirm the output location and show the first rows.
print(f" Rolling & STL features saved to: {out_path}")
print(features.head(8))


### Repair supply inputs & recompute risk flags


In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
#  Paths & helpers

# Folder holding cleaned inputs: "cleaned_data" when it exists, else "clean_data".
if Path("cleaned_data").exists():
    base_clean = Path("cleaned_data")
else:
    base_clean = Path("clean_data")

# Folder holding the raw exports.
base_raw = Path("raw_data")

def load_csv_safe(path_primary: Path, path_fallback: Path = None, parse_cols=None):
    """Read the first of two candidate CSV files that exists.

    Args:
        path_primary: Preferred file; used when it exists.
        path_fallback: Optional second choice, tried when the primary file
            is not given or does not exist.
        parse_cols: Column names to parse as dates; None means no columns.

    Returns:
        The DataFrame read from the first existing candidate.

    Raises:
        FileNotFoundError: If neither candidate exists.
    """
    date_cols = parse_cols or []
    for candidate in (path_primary, path_fallback):
        # A candidate may be None (e.g. no fallback supplied); skip it.
        if candidate and candidate.exists():
            return pd.read_csv(candidate, parse_dates=date_cols)
    raise FileNotFoundError(f"Neither {path_primary} nor {path_fallback} found.")

In [None]:
# 1) Forecast frame
# Reuse `forecast_df` when an earlier cell already defined it; otherwise
# rebuild it from the previously saved risk table.
try:
    _ = forecast_df  # if already in memory
except NameError:
    # Load prior output just to get forecasts
    fc_path = base_clean / "forecast_next_month_risk.csv"
    _temp = pd.read_csv(fc_path)

    # Keep only forecast inputs; "method" is carried along when present.
    needed_cols = ["component_id", "forecast_month", "forecast_units"]
    extra = ["method"] if "method" in _temp.columns else []
    forecast_df = _temp.loc[:, needed_cols + extra].copy()

# Ensure types: forecasts as floats, with negative values raised to 0.
_forecast_as_float = forecast_df["forecast_units"].astype(float)
forecast_df["forecast_units"] = _forecast_as_float.clip(lower=0)

In [None]:
# 2) Load inventory & deliveries
# Each table is read from the cleaned folder when available, else from raw.

# Inventory levels ("date" parsed as datetime)
_inv_clean = base_clean / "inventory_levels_clean.csv"
_inv_raw = base_raw / "inventory_levels.csv"
inv = load_csv_safe(_inv_clean, _inv_raw, parse_cols=["date"])

# Deliveries (the three date columns parsed as datetime)
_delivery_dates = ["order_date", "expected_delivery_date", "actual_delivery_date"]
_dl_clean = base_clean / "delivery_logs_clean.csv"
_dl_raw = base_raw / "delivery_logs.csv"
dl = load_csv_safe(_dl_clean, _dl_raw, parse_cols=_delivery_dates)

# Sanity columns: accept "closing_stock" in any letter case, else fail.
if "closing_stock" not in inv.columns:
    alt = [c for c in inv.columns if c.lower() == "closing_stock"]
    if not alt:
        raise KeyError("inventory_levels is missing 'closing_stock' column.")
    # Normalise the first case-insensitive match to the expected name.
    inv.rename(columns={alt[0]: "closing_stock"}, inplace=True)

In [None]:
# 3) Latest non-zero stock snapshot
# Reference date is today at midnight; the window used when searching for a
# non-zero stock value starts 90 days before it.
today = pd.Timestamp.today().normalize()
lookback_start = today - pd.Timedelta(days=90)

# Keep only inventory rows from the 120 days leading up to the newest
# inventory date on file. Note this cutoff is anchored to the data, while
# `lookback_start` above is anchored to today.
_newest_inv_date = inv["date"].max()
_is_recent = inv["date"] >= _newest_inv_date - pd.Timedelta(days=120)
inv_recent = inv.loc[_is_recent].copy()

def last_nonzero_stock(g: pd.DataFrame, since=None) -> int:
    """Return the most recent usable closing stock for one component.

    Rows whose "closing_stock" is missing are ignored. Among the remaining
    rows, the latest strictly positive value dated on or after ``since`` is
    preferred. If there is none, the latest known value overall is returned
    (which may be zero).

    Args:
        g: Inventory rows of a single component with the columns "date"
            and "closing_stock".
        since: Start of the window searched for a positive value. When
            omitted, the module-level ``lookback_start`` is used.

    Returns:
        The selected stock level as an int, or 0 when ``g`` holds no
        non-missing "closing_stock" value at all.
    """
    if since is None:
        since = lookback_start

    # Missing readings cannot be converted to int, so drop them up front.
    known = g.dropna(subset=["closing_stock"]).sort_values("date")
    if known.empty:
        return 0

    # Preferred: last strictly positive value inside the look-back window.
    in_window = known["date"] >= since
    is_positive = known["closing_stock"] > 0
    positive = known.loc[in_window & is_positive, "closing_stock"]
    if not positive.empty:
        return int(positive.iloc[-1])

    # Fallback: latest known value overall (may be zero).
    return int(known["closing_stock"].iloc[-1])

# One row per component: its latest usable closing stock.
# The table is built from explicit records instead of DataFrameGroupBy.apply
# so it always has the "component_id" and "latest_closing_stock" columns,
# including when `inv_recent` has no rows, and does not depend on how a given
# pandas version assembles the output of apply.
latest_stock = pd.DataFrame(
    [
        {"component_id": _comp, "latest_closing_stock": last_nonzero_stock(_grp)}
        for _comp, _grp in inv_recent.groupby("component_id")
    ],
    columns=["component_id", "latest_closing_stock"],
)

In [None]:
# 4) Inbound in next 30 days from TODAY
window_end = today + pd.Timedelta(days=30)

# prefer quantity_received; fallback to quantity_ordered
# NOTE(review): the rows selected below are deliveries that have not arrived
# yet; presumably their received quantity is still empty or zero, in which
# case the ordered quantity would be the better measure -- confirm against
# the delivery log data.
if "quantity_received" in dl.columns:
    qty_col = "quantity_received"
else:
    qty_col = "quantity_ordered"
if qty_col not in dl.columns:
    raise KeyError("delivery_logs missing 'quantity_received' and 'quantity_ordered' columns.")

# Expected between today and the end of the window (both ends inclusive) ...
_due_in_window = dl["expected_delivery_date"].between(today, window_end)
# ... and not delivered yet: no actual date, or an actual date after today.
_not_arrived = dl["actual_delivery_date"].isna() | (dl["actual_delivery_date"] > today)
inbound_mask = _due_in_window & _not_arrived

# Total open quantity per component.
_open_deliveries = dl.loc[inbound_mask]
inbound_30 = _open_deliveries.groupby("component_id", as_index=False)[qty_col].sum()
inbound_30 = inbound_30.rename(columns={qty_col: "inbound_next_30"})

In [None]:
# 5) Merge & recompute coverage/risk
# Left-join stock and inbound quantities onto the forecast rows, so every
# forecast row is kept even without a match.
risk = forecast_df
for _supply_table in (latest_stock, inbound_30):
    risk = risk.merge(_supply_table, on="component_id", how="left")

# Components without a match count as zero; stock is also floored at zero.
_stock = risk["latest_closing_stock"].fillna(0).clip(lower=0)
risk["latest_closing_stock"] = _stock.astype(int)
risk["inbound_next_30"] = risk["inbound_next_30"].fillna(0).astype(int)

# Coverage = (stock + inbound) / forecast; rows whose forecast is not
# positive get an infinite ratio.
_available = risk["latest_closing_stock"] + risk["inbound_next_30"]
_has_demand = risk["forecast_units"] > 0
risk["coverage_ratio"] = np.where(_has_demand, _available / risk["forecast_units"], np.inf)

# Ratio up to 0.8 -> High, above 0.8 up to 1.0 -> Medium, above 1.0 -> Low.
_risk_bins = [-np.inf, 0.8, 1.0, np.inf]
_risk_labels = ["High", "Medium", "Low"]
risk["risk_flag"] = pd.cut(risk["coverage_ratio"], bins=_risk_bins, labels=_risk_labels)


# 6) Save
# Write next to the cleaned inputs when that folder exists, else to "clean_data".
if base_clean.exists():
    out_dir = base_clean
else:
    out_dir = Path("clean_data")
out_dir.mkdir(exist_ok=True, parents=True)
out_path = out_dir / "forecast_next_month_risk.csv"

# Rows are written ordered by risk flag, then coverage ratio, then component.
_ranked = risk.sort_values(["risk_flag", "coverage_ratio", "component_id"])
_ranked.to_csv(out_path, index=False)

# Confirm the output location and show the first rows (in unsorted order).
print(f" Recomputed risk table saved to: {out_path}")
print(risk.head(10))
