In [22]:
# supplier_performance_pipeline.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# ---------------------------
# 1. Feature Engineering
# ---------------------------
def engineer_order_features(df):
    """Add per-order delivery, cost and quality features to ``df``.

    The frame is modified in place and also returned, so both
    ``df = engineer_order_features(df)`` and a bare call work.

    Columns read: ``Order_Date``, ``Delivery_Date``, ``Unit_Price``,
    ``Negotiated_Price``, ``Defective_Units``, ``Quantity``.

    Columns written:
        Order_Date, Delivery_Date: converted with ``pd.to_datetime``.
        late_delivery: 1 if delivered more than 7 days after the order date.
        cost_overrun: 1 if ``Unit_Price`` exceeds ``Negotiated_Price``.
        defect_rate: ``Defective_Units / Quantity``; 0 when the ratio is
            undefined (missing inputs or a zero quantity).
    """
    df["Order_Date"] = pd.to_datetime(df["Order_Date"])
    df["Delivery_Date"] = pd.to_datetime(df["Delivery_Date"])

    # Late delivery flag. A missing Delivery_Date compares as False, so
    # such rows are flagged 0 (not late).
    deadline = df["Order_Date"] + pd.Timedelta(days=7)
    df["late_delivery"] = (df["Delivery_Date"] > deadline).astype(int)

    # Cost overrun flag
    df["cost_overrun"] = (df["Unit_Price"] > df["Negotiated_Price"]).astype(int)

    # Defect rate per order. Zero quantities are masked to NaN first:
    # otherwise x / 0 yields +/-inf, which fillna(0) does not catch and which
    # would propagate into the per-supplier mean and the risk score.
    quantity = df["Quantity"].where(df["Quantity"] != 0)
    df["defect_rate"] = (df["Defective_Units"] / quantity).fillna(0)

    return df

# ---------------------------
# 2. Aggregate Supplier KPIs
# ---------------------------
def aggregate_supplier_kpis(df):
    """Collapse order-level features into one KPI row per supplier.

    Groups ``df`` by ``Supplier`` and takes the mean of each input column.
    Returns a frame with ``Supplier`` as a regular column followed by
    ``late_rate``, ``cost_overrun_rate``, ``defect_rate`` and
    ``avg_unit_price``.
    """
    # output column -> (source column, aggregation)
    kpi_spec = {
        "late_rate": ("late_delivery", "mean"),
        "cost_overrun_rate": ("cost_overrun", "mean"),
        "defect_rate": ("defect_rate", "mean"),
        "avg_unit_price": ("Unit_Price", "mean"),
    }
    per_supplier = df.groupby("Supplier").agg(**kpi_spec)
    return per_supplier.reset_index()

# ---------------------------
# 3. Risk Scoring
# ---------------------------
def classify_risk(score):
    """Map a numeric risk score to a label.

    Returns "High" for scores of 70 or more, "Moderate" for scores of 50 or
    more, and "Low" for anything else (including values that fail both
    comparisons, such as NaN).
    """
    # Thresholds are checked from highest to lowest; first match wins.
    for floor, label in ((70, "High"), (50, "Moderate")):
        if score >= floor:
            return label
    return "Low"

def compute_risk_score(feats):
    """Attach a weighted risk score and risk label to each supplier row.

    The score is ``late_rate * 40 + cost_overrun_rate * 30 +
    defect_rate * 40 + percentile_rank(avg_unit_price) * 20``, clipped to
    the range 0-100. The weights sum to 130, so the upper clip can bind.

    ``feats`` is modified in place (columns ``risk_score`` and
    ``risk_class`` are added) and returned.
    """
    # Percentile rank of each supplier's average price within this frame.
    price_pct = feats["avg_unit_price"].rank(pct=True)

    # Accumulate the weighted components one at a time.
    raw = feats["late_rate"] * 40
    raw = raw + feats["cost_overrun_rate"] * 30
    raw = raw + feats["defect_rate"] * 40
    raw = raw + price_pct * 20

    feats["risk_score"] = raw.clip(lower=0, upper=100)
    feats["risk_class"] = feats["risk_score"].apply(classify_risk)
    return feats

# ---------------------------
# 4. ML Clustering (optional)
# ---------------------------
def ml_cluster_risk(feats, n_clusters=3):
    """Assign each supplier to a KMeans cluster based on its KPIs.

    The four KPI columns (``late_rate``, ``cost_overrun_rate``,
    ``defect_rate``, ``avg_unit_price``) are standardised and clustered.
    ``feats`` is modified in place (column ``cluster`` is added) and
    returned.

    Args:
        feats: Per-supplier KPI frame.
        n_clusters: Desired number of clusters (default 3). It is capped at
            the number of rows, because KMeans cannot fit more clusters
            than there are samples.

    Returns:
        ``feats`` with an integer ``cluster`` column. An empty frame gets an
        empty ``cluster`` column instead of raising.
    """
    # Nothing to scale or cluster: the scaler and KMeans both reject
    # zero-row input, so short-circuit with an empty integer column.
    if len(feats) == 0:
        feats["cluster"] = pd.Series(dtype=int, index=feats.index)
        return feats

    X = feats[["late_rate", "cost_overrun_rate", "defect_rate", "avg_unit_price"]]
    X_scaled = StandardScaler().fit_transform(X)

    # Cap k so that datasets with fewer suppliers than n_clusters still work.
    k = min(n_clusters, len(feats))
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    feats["cluster"] = kmeans.fit_predict(X_scaled)

    return feats

# ---------------------------
# 5. Forecasting Late Deliveries
# ---------------------------
def forecast_late_rate(df):
    """Forecast the monthly late-delivery rate six months ahead.

    Orders are bucketed by the month end of ``Order_Date`` and the mean of
    ``late_delivery`` is taken per month. When at least three months
    actually contain orders, an additive-trend exponential smoothing model
    is fitted and a 6-step forecast is produced.

    Returns:
        A frame indexed by month end. With enough data it has columns
        ``late_rate`` (history) and ``forecast`` (future months); otherwise
        only ``late_rate``.
    """
    # Use a MonthEnd offset object rather than the "M" string alias, which
    # newer pandas releases deprecate in favour of "ME"; the object form
    # behaves the same on old and new versions.
    monthly = df.groupby(
        pd.Grouper(key="Order_Date", freq=pd.offsets.MonthEnd())
    )["late_delivery"].mean()

    # Count months that really have orders BEFORE filling gaps; counting
    # afterwards would treat padded empty months as observations.
    observed_months = int(monthly.notna().sum())

    # NOTE(review): months with no orders are recorded as a 0% late rate,
    # which the smoothing model cannot tell apart from a genuinely perfect
    # month. Kept as in the original; confirm this is intended.
    monthly = monthly.fillna(0)

    if observed_months < 3:
        print("⚠️ Not enough data for forecasting")
        return monthly.to_frame(name="late_rate")

    model = ExponentialSmoothing(monthly, trend="add", seasonal=None)
    forecast = model.fit().forecast(6)

    # Outer-aligned on the index: history rows have NaN forecast and
    # forecast rows have NaN late_rate.
    return pd.DataFrame({"late_rate": monthly, "forecast": forecast})

# ---------------------------
# 6. Alerts
# ---------------------------
def generate_alerts(feats):
    """Return the suppliers whose ``risk_class`` is "High".

    The result keeps only the ``Supplier``, ``risk_score`` and
    ``risk_class`` columns and preserves the original row index.
    """
    is_high_risk = feats["risk_class"] == "High"
    alert_columns = ["Supplier", "risk_score", "risk_class"]
    return feats.loc[is_high_risk, alert_columns]

# ---------------------------
# 7. Main Pipeline
# ---------------------------
def main(csv_path="Procurement KPI Analysis Dataset.csv", output_dir="artifacts"):
    """Run the supplier performance pipeline end to end.

    Reads the procurement CSV, engineers order features, aggregates
    per-supplier KPIs, scores and clusters suppliers, forecasts the late
    delivery rate, prints a summary and writes three CSV files.

    Args:
        csv_path: Path to the input procurement CSV.
        output_dir: Directory for the output CSVs (default "artifacts").
            It is created if it does not exist.
    """
    import os  # local import: only needed for output path handling

    df = pd.read_csv(csv_path)

    # Step 1: Feature engineering
    df = engineer_order_features(df)

    # Step 2: Supplier KPIs
    feats = aggregate_supplier_kpis(df)

    # Step 3: Risk scores
    feats = compute_risk_score(feats)

    # Step 4: ML clustering
    feats = ml_cluster_risk(feats)

    # Step 5: Forecasting
    forecast = forecast_late_rate(df)

    # Step 6: Alerts
    alerts = generate_alerts(feats)

    # ---------------------------
    # Outputs
    # ---------------------------
    print("\n=== Supplier Risk Scores ===")
    print(feats[["Supplier", "risk_score", "risk_class", "cluster"]])

    print("\n=== Forecast (Late Delivery Rate) ===")
    print(forecast.tail(10))

    print("\n=== Alerts ===")
    if alerts.empty:
        print("No high-risk suppliers")
    else:
        print(alerts)

    # Save outputs. to_csv does not create missing parent directories, so
    # make sure the target directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    feats.to_csv(os.path.join(output_dir, "supplier_risk_scores.csv"), index=False)
    # The forecast keeps its index (the month-end dates) in the CSV.
    forecast.to_csv(os.path.join(output_dir, "late_rate_forecast.csv"))
    alerts.to_csv(os.path.join(output_dir, "alerts.csv"), index=False)

# ---------------------------
# Run the pipeline with main()'s default arguments when this file is
# executed as a script; importing the module does not trigger a run.
if __name__ == "__main__":
    main()



=== Supplier Risk Scores ===
          Supplier  risk_score risk_class  cluster
0        Alpha_Inc   65.734943   Moderate        1
1    Beta_Supplies   65.667905   Moderate        0
2  Delta_Logistics   62.206279   Moderate        2
3    Epsilon_Group   75.379857       High        0
4         Gamma_Co   70.736078       High        1

=== Forecast (Late Delivery Rate) ===
            late_rate  forecast
2023-10-31   0.551724       NaN
2023-11-30   0.522727       NaN
2023-12-31   0.538462       NaN
2024-01-31   0.000000       NaN
2024-02-29        NaN  0.476075
2024-03-31        NaN  0.469059
2024-04-30        NaN  0.462043
2024-05-31        NaN  0.455028
2024-06-30        NaN  0.448012
2024-07-31        NaN  0.440997

=== Alerts ===
        Supplier  risk_score risk_class
3  Epsilon_Group   75.379857       High
4       Gamma_Co   70.736078       High
