In [None]:
# Robust loader for delivery logs: find, standardize, and load
from pathlib import Path
import shutil
import pandas as pd

In [None]:
# 1) Search candidates
# Directories are probed in this order; ones that do not exist are skipped.
roots = [Path("clean_data"), Path("cleaned_data"), Path("raw_data"), Path(".")]
patterns = [
    "*delivery_logs*.csv", "*delivery_log*.csv",
    "*Delivery_Logs*.csv", "*Delivery_Log*.csv"
]

# The patterns overlap (e.g. "delivery_logs.csv" matches both the plural and the
# singular pattern), so one file can be matched several times. A dict keeps the
# first occurrence of every path in discovery order; each glob result is sorted
# because glob does not guarantee any particular order.
found = {}
for root in roots:
    if not root.exists():
        continue
    for pat in patterns:
        for match in sorted(root.glob(pat)):
            found.setdefault(match, None)

candidates = list(found)

In [None]:
# 2) Pick the best candidate (prefer cleaned over raw, plural over singular)
def rank(p: Path) -> tuple:
    """Sort key for candidate files; a smaller tuple means a better candidate.

    The score adds 3 when the parent directory name contains "clean", 1 when it
    contains "raw", and 2 each when the file name contains "logs" or "clean"
    (names are lower-cased first). The key is ``(-score, len(file name))``, so
    higher scores sort first and ties go to the shorter file name.
    """
    filename = p.name.lower()
    folder = p.parent.name.lower()
    weighted_hits = (
        (3, "clean" in folder),
        (1, "raw" in folder),
        (2, "logs" in filename),
        (2, "clean" in filename),
    )
    total = sum(weight for weight, hit in weighted_hits if hit)
    return (-total, len(filename))

if candidates:
    # min() returns the first element with the smallest key, which is the same
    # element a stable sort on that key would put at position 0.
    best = min(candidates, key=rank)
    print(f" Found delivery logs file: {best}")
else:
    # Nothing matched in any search root: stop with guidance on where to put the file.
    raise FileNotFoundError(
        "Could not find a delivery logs CSV. "
        "Expected something like 'delivery_logs.csv' in clean_data/, cleaned_data/, or raw_data/."
    )

In [None]:
# 3) Standardize path: copy to clean_data/delivery_logs.csv
clean_dir = Path("clean_data")
clean_dir.mkdir(parents=True, exist_ok=True)
canonical = clean_dir.joinpath("delivery_logs.csv")

# Compare resolved paths so the chosen file is never copied onto itself.
already_canonical = best.resolve() == canonical.resolve()
if already_canonical:
    print(f" Already at canonical path: {canonical}")
else:
    shutil.copy2(best, canonical)
    print(f" Copied to canonical path: {canonical}")

In [None]:
# 4) Load with correct date parsing
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (format inference is the default there) and only triggers a warning.
delivery_logs = pd.read_csv(
    canonical,
    parse_dates=["order_date", "expected_delivery_date", "actual_delivery_date"],
)

# 5) Quick sanity peek
print(f"Rows: {len(delivery_logs):,}")
print("Columns:", delivery_logs.columns.tolist())
print(delivery_logs.head(3))


### Pareto of Delay Causes (from delivery_logs)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# 1) Build delay buckets
# Work on a copy so the loaded frame stays untouched.
dl = delivery_logs.copy()

# Whole days between actual and expected delivery (positive = delivered after
# the expected date). A missing date on either side yields NaN, which already
# marks the row as "unknown delay", so no fill step is needed.
dl["delay_days"] = (
    pd.to_datetime(dl["actual_delivery_date"]) - pd.to_datetime(dl["expected_delivery_date"])
).dt.days

def bucket_delay(row):
    """Classify one delivery row into a delay bucket label.

    Reads "actual_delivery_date", "quantity_received", "quantity_ordered" and
    "delay_days" from `row` (any mapping-like object indexed by those keys).
    Checks run in this order and the first match wins:

    1. missing actual delivery date  -> "Undelivered (open)"
    2. received less than ordered    -> "Partial fulfillment"
    3. delay missing or <= 0 days    -> "On-time"
    4. delay >= 14 days              -> "Severe delay (14d+)"
    5. anything else                 -> "Moderate delay (1–13d)"
    """
    if pd.isna(row["actual_delivery_date"]):
        return "Undelivered (open)"
    if row["quantity_received"] < row["quantity_ordered"]:
        return "Partial fulfillment"

    delay = row["delay_days"]
    # pd.isna() rather than `is np.nan`: an identity test only matches the
    # np.nan singleton, so a NaN held as another float object would skip this
    # branch, fail both numeric comparisons and be labelled a moderate delay.
    if pd.isna(delay) or delay <= 0:
        return "On-time"
    if delay >= 14:
        return "Severe delay (14d+)"
    return "Moderate delay (1–13d)"

# Label every delivery; axis="columns" passes bucket_delay one row at a time.
dl["delay_bucket"] = dl.apply(bucket_delay, axis="columns")

In [None]:
# 2) Aggregate counts and cumulative %
# One row per bucket with its number of deliveries, largest bucket first.
bucket_sizes = dl["delay_bucket"].value_counts()
bucket_sizes = bucket_sizes.rename_axis("delay_bucket")
counts = bucket_sizes.reset_index(name="count")
counts = counts.sort_values("count", ascending=False)
counts = counts.reset_index(drop=True)

# Running total and its share of all deliveries (percent) for the Pareto line.
running_total = counts["count"].cumsum()
counts["cum_count"] = running_total
counts["cum_pct"] = counts["cum_count"] / counts["count"].sum() * 100

In [None]:
# 3) Plot (matplotlib only)
# Create the figure and axes explicitly and pass the axes to pandas. Called
# without `ax=`, DataFrame.plot opens a figure of its own, which would leave a
# separately created plt.figure() empty and unused (and its figsize ignored).
fig, ax = plt.subplots(figsize=(10, 6))
counts.plot(x="delay_bucket", y="count", kind="bar", legend=False, rot=20, ax=ax)
ax.set_xlabel("Delay Cause (Bucket)")
ax.set_ylabel("Deliveries (count)")
ax.set_title("Pareto of Delivery Delay Causes (Overall)")

# Second y-axis for the cumulative percentage line; bar i sits at x position i.
ax2 = ax.twinx()
ax2.plot(range(len(counts)), counts["cum_pct"], marker="o")
ax2.set_ylabel("Cumulative %")
ax2.set_ylim(0, 110)

# annotate last point
last_x = len(counts) - 1
ax2.annotate(f"{counts['cum_pct'].iloc[-1]:.1f}%",
             xy=(last_x, counts["cum_pct"].iloc[-1]),
             xytext=(last_x, min(100, counts['cum_pct'].iloc[-1] + 5)))

fig.tight_layout()

fig_dir = Path("clean_data") / "figures"
fig_dir.mkdir(parents=True, exist_ok=True)
out_path = fig_dir / "pareto_delay_buckets.png"
fig.savefig(out_path, dpi=200)
plt.close(fig)

# The message no longer starts with "\ ", which is an invalid escape sequence.
print(f" Saved Pareto chart: {out_path}")
print(counts)
 

### Monthly Delay Trend

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Load if not already in memory
try:
    dl = delivery_logs.copy()
except NameError:
    dl = pd.read_csv(
        Path("clean_data")/"delivery_logs.csv",
        parse_dates=["order_date","expected_delivery_date","actual_delivery_date"]
    )

# Derive flags
dl["is_late"] = dl["actual_delivery_date"].notna() & (dl["actual_delivery_date"] > dl["expected_delivery_date"])
dl["is_undelivered"] = dl["actual_delivery_date"].isna()
# A row cannot be both late (needs an actual date) and undelivered (has none),
# so summing the combined flag equals late count + undelivered count.
dl["late_or_open"] = dl["is_late"] | dl["is_undelivered"]

# Month on expected date timeline
dl["month"] = dl["expected_delivery_date"].dt.to_period("M").astype(str)

# Aggregate: delay_or_open rate per month.
# Both counts come from one groupby, so nothing depends on two separately
# grouped results happening to line up row by row.
monthly = (
    dl.groupby("month", as_index=False)
      .agg(total_deliveries=("delivery_id","size"),
           late_or_open=("late_or_open","sum"))
)
monthly["delay_rate_pct"] = (monthly["late_or_open"] / monthly["total_deliveries"] * 100).round(2)

In [None]:
# Plot: one line of monthly delay rate, drawn through the explicit Axes API.
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(monthly["month"], monthly["delay_rate_pct"], marker="o")
ax.set_title("Monthly Delay Trend (% late or undelivered)")
ax.set_xlabel("Month")
ax.set_ylabel("Delay Rate (%)")
# Slant the month labels so neighbouring ticks do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.grid(True)
fig.tight_layout()

# Save + show
fig_dir = Path("clean_data").joinpath("figures")
fig_dir.mkdir(parents=True, exist_ok=True)
out_path = fig_dir.joinpath("monthly_delay_rate.png")
fig.savefig(out_path, dpi=200)
plt.show()

print(f" Saved monthly delay trend: {out_path}")
print(monthly.tail(6))


### Top Suppliers by Overall Delay Rate

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Data root and the figure output folder beneath it (created if missing).
BASE = Path("clean_data")
FIG_DIR = BASE.joinpath("figures")
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load deliveries if not already in memory: reuse the frame from an earlier
# cell when it exists, otherwise read the canonical CSV.
try:
    dl = delivery_logs.copy()
except NameError:
    date_columns = ["order_date","expected_delivery_date","actual_delivery_date"]
    dl = pd.read_csv(BASE.joinpath("delivery_logs.csv"), parse_dates=date_columns)

# Optional supplier names: the first of these files that exists is loaded;
# `suppliers` stays None when neither is present.
supplier_files = (BASE / file_name for file_name in ("suppliers.csv", "suppliers_clean.csv"))
first_existing = next((path for path in supplier_files if path.exists()), None)
suppliers = pd.read_csv(first_existing) if first_existing is not None else None

# Flags
dl["is_late"] = dl["actual_delivery_date"].notna() & (dl["actual_delivery_date"] > dl["expected_delivery_date"])
dl["is_undelivered"] = dl["actual_delivery_date"].isna()
dl["late_or_open"] = dl["is_late"] | dl["is_undelivered"]

# Aggregate per supplier
sup = (
    dl.groupby("supplier_id", as_index=False)
      .agg(total_deliveries=("delivery_id","size"),
           late_or_open=("late_or_open","sum"))
)
sup["delay_rate_pct"] = (sup["late_or_open"] / sup["total_deliveries"] * 100).round(2)

# Attach names if available
label_col = "supplier_id"
if suppliers is not None and {"supplier_id","supplier_name"}.issubset(suppliers.columns):
    # Keep one name per supplier_id: a repeated id in the lookup would otherwise
    # duplicate that supplier's row in `sup` (and possibly in the top 10).
    supplier_names = suppliers[["supplier_id","supplier_name"]].drop_duplicates("supplier_id")
    sup = sup.merge(supplier_names, on="supplier_id", how="left")
    # how="left" leaves NaN for ids missing from the lookup; fall back to the
    # id itself so every supplier still has a text label for the chart.
    sup["supplier_name"] = sup["supplier_name"].fillna(sup["supplier_id"].astype(str))
    label_col = "supplier_name"

# Filter to avoid tiny samples (best practice)
min_deliveries = 100  # suppliers below this volume are excluded from the ranking
sup_filt = sup[sup["total_deliveries"] >= min_deliveries].copy()

# Top 10 worst by delay rate
top10 = sup_filt.sort_values(["delay_rate_pct","total_deliveries"], ascending=[False, False]).head(10)
top10 = top10.sort_values("delay_rate_pct")  # for nicer horizontal plot ordering

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
# Labels are cast to str so that numeric supplier ids are drawn as evenly
# spaced categories instead of being used as numeric y coordinates.
bars = ax.barh(top10[label_col].astype(str), top10["delay_rate_pct"])
ax.set_xlabel("Delay Rate (%)")
ax.set_ylabel("Supplier")
ax.set_title("Top Suppliers by Overall Delay Rate (late or undelivered)")
# Anchor each value label to the bar it describes (vertical centre of the bar)
# rather than to a running index, which only matches when bars sit at 0, 1, 2, ...
for bar, v in zip(bars, top10["delay_rate_pct"]):
    ax.text(v + 0.5, bar.get_y() + bar.get_height() / 2, f"{v:.1f}%", va="center")

fig.tight_layout()
out_path = FIG_DIR / "top_suppliers_delay_rate.png"
fig.savefig(out_path, dpi=200)
plt.show()

print(f" Saved: {out_path}")
print(top10[[label_col, "total_deliveries", "delay_rate_pct"]])


### Overall Delay Rate by Country

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Input folder and the figures folder inside it; the latter is created on demand.
BASE = Path("clean_data")
FIG_DIR = BASE.joinpath("figures")
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# 1) Load deliveries from the canonical CSV, parsing the three date columns.
delivery_path = BASE.joinpath("delivery_logs.csv")
date_columns = ["order_date","expected_delivery_date","actual_delivery_date"]
dl = pd.read_csv(delivery_path, parse_dates=date_columns)

In [None]:
# 2) Find suppliers file with required columns
req_cols = {"supplier_id","supplier_name","country","tier_level","on_time_rating"}
sup_path = None
# sorted(): glob yields files in no guaranteed order, so sorting makes the
# choice reproducible when more than one file has the required columns.
for p in sorted(BASE.glob("suppliers*.csv")):
    try:
        # Reading a single row is enough to inspect the header.
        test = pd.read_csv(p, nrows=1)
    except Exception as exc:
        # Best effort: an unreadable candidate is skipped, but reported so the
        # reason for a later "not found" error is visible.
        print(f" Skipping unreadable suppliers file {p}: {exc}")
        continue
    if req_cols.issubset(set(test.columns)):
        sup_path = p
        break

if sup_path is None:
    raise FileNotFoundError(
        "Could not find a suppliers CSV in clean_data/ containing "
        f"columns {sorted(req_cols)}. Please place it there (e.g., suppliers_master.csv)."
    )

suppliers = pd.read_csv(sup_path)

In [None]:
# 3) Merge country to deliveries
# One row per supplier_id in the lookup, so a repeated id cannot duplicate
# delivery rows and inflate the per-country counts.
country_lookup = suppliers[["supplier_id","country"]].drop_duplicates("supplier_id")
# Drop a "country" column left by an earlier run of this cell; merging again
# would otherwise produce country_x / country_y and break the groupby below.
dl = dl.drop(columns=["country"], errors="ignore").merge(country_lookup, on="supplier_id", how="left")

In [None]:
# 4) Late or undelivered flag
actual = dl["actual_delivery_date"]
expected = dl["expected_delivery_date"]
# Late: delivered, and after the expected date. Undelivered: no actual date.
dl["is_late"] = actual.notna() & (actual > expected)
dl["is_undelivered"] = actual.isna()
dl["late_or_open"] = dl["is_late"] | dl["is_undelivered"]

In [None]:
# 5) Aggregate by country: delivery count, late-or-open count and their ratio
# in percent, sorted with the highest delay rate first.
by_country = dl.groupby("country", as_index=False)
country_perf = by_country.agg(
    total_deliveries=("delivery_id","size"),
    late_or_open=("late_or_open","sum"),
)
late_share = country_perf["late_or_open"] / country_perf["total_deliveries"]
country_perf["delay_rate_pct"] = (late_share * 100).round(2)
country_perf = country_perf.sort_values("delay_rate_pct", ascending=False)

In [None]:
# 6) Plot and save
fig, ax = plt.subplots(figsize=(10,6))
bars = ax.barh(country_perf["country"], country_perf["delay_rate_pct"])
ax.set_xlabel("Delay Rate (%)")
ax.set_ylabel("Country")
ax.set_title("Overall Delay Rate by Country (late or undelivered)")
# Write each percentage just right of its bar, centred on the bar vertically.
for bar, v in zip(bars, country_perf["delay_rate_pct"]):
    bar_middle = bar.get_y() + bar.get_height()/2
    ax.text(v + 0.5, bar_middle, f"{v:.1f}%", va="center")

# The first row of country_perf (highest rate) ends up at the top of the chart.
ax.invert_yaxis()
fig.tight_layout()
out_path = FIG_DIR / "overall_delay_rate_by_country.png"
fig.savefig(out_path, dpi=200)
plt.show()

print(f" Saved: {out_path}")
print(country_perf)


### Monthly Delay Rate by Country (trend lines)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Where the inputs live and where charts are saved (folder made if absent).
BASE = Path("clean_data")
FIG_DIR = BASE.joinpath("figures")
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# --- Load deliveries ---
# Read the canonical CSV with its three date columns parsed as datetimes.
date_columns = ["order_date","expected_delivery_date","actual_delivery_date"]
dl = pd.read_csv(BASE.joinpath("delivery_logs.csv"), parse_dates=date_columns)

In [None]:
# --- Load suppliers (auto-detect a file with country) ---
sup = None
# sorted(): glob yields files in no guaranteed order, so sorting makes the
# choice reproducible when several files qualify.
for p in sorted(BASE.glob("suppliers*.csv")):
    try:
        # A single row is enough to inspect the header before the full read.
        t = pd.read_csv(p, nrows=1)
        if {"supplier_id","country"}.issubset(t.columns):
            sup = pd.read_csv(p)
            break
    except Exception as exc:
        # Best effort: keep looking, but report the skipped file instead of
        # hiding the problem.
        print(f" Skipping unreadable suppliers file {p}: {exc}")
if sup is None:
    raise FileNotFoundError("No suppliers*.csv in clean_data/ with columns ['supplier_id','country'].")

In [None]:
# --- Merge country onto deliveries ---
# One row per supplier_id in the lookup, so a repeated id cannot duplicate
# delivery rows and inflate the monthly counts.
country_lookup = sup[["supplier_id","country"]].drop_duplicates("supplier_id")
# Drop a "country" column left by an earlier run of this cell; merging again
# would otherwise produce country_x / country_y and break the later groupby.
dl = dl.drop(columns=["country"], errors="ignore").merge(country_lookup, on="supplier_id", how="left")

In [None]:
# --- Flags & monthly key (expected timeline) ---
actual = dl["actual_delivery_date"]
expected = dl["expected_delivery_date"]
# Late: delivered, and after the expected date. Undelivered: no actual date.
dl["is_late"] = actual.notna() & (actual > expected)
dl["is_undelivered"] = actual.isna()
dl["late_or_open"] = dl["is_late"] | dl["is_undelivered"]
# Calendar month of the expected date, rendered as text by astype(str).
dl["month"] = expected.dt.to_period("M").astype(str)

In [None]:
# --- Aggregate monthly delay rate by country ---
# One row per (country, month): delivery count, late-or-open count, rate in %.
country_month = dl.groupby(["country","month"], as_index=False)
grp = country_month.agg(
    total_deliveries=("delivery_id","size"),
    late_or_open=("late_or_open","sum"),
)
late_share = grp["late_or_open"] / grp["total_deliveries"]
grp["delay_rate_pct"] = (late_share * 100).round(2)

In [None]:
# --- Keep only countries with enough volume overall (avoid noisy lines) ---
# Total deliveries per country across all months.
vol = grp.groupby("country")["total_deliveries"].sum()
has_volume = vol.ge(400)  # adjust threshold if needed
keep_countries = vol.loc[has_volume].index.tolist()
plot_df = grp.loc[grp["country"].isin(keep_countries)].copy()

In [None]:
# --- Pivot for plotting (months on X, countries as series) ---
month_by_country = plot_df.pivot(index="month", columns="country", values="delay_rate_pct")
pivot = month_by_country.sort_index()

# --- Plot (matplotlib only; no custom colors) ---
fig, ax = plt.subplots(figsize=(12,7))
# One line per country column of the pivot.
for col in pivot.columns:
    ax.plot(pivot.index, pivot[col], marker="o", label=col)

ax.set_title("Monthly Delay Rate by Country (late or undelivered)")
ax.set_xlabel("Month")
ax.set_ylabel("Delay Rate (%)")
# Slant the month labels so neighbouring ticks do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.grid(True, axis="y")
ax.legend(title="Country", ncol=2, frameon=False)
fig.tight_layout()

out_path = FIG_DIR / "monthly_delay_rate_by_country.png"
fig.savefig(out_path, dpi=200)
plt.show()

print(f" Saved: {out_path}")
print("Series plotted:", list(pivot.columns))


### Supplier × Month Delay Rate Heatmap

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Base data folder plus its "figures" subfolder, which is created when missing.
BASE = Path("clean_data")
FIG_DIR = BASE.joinpath("figures")
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# --- Load deliveries ---
# Read the canonical CSV with its three date columns parsed as datetimes.
date_columns = ["order_date","expected_delivery_date","actual_delivery_date"]
dl = pd.read_csv(BASE.joinpath("delivery_logs.csv"), parse_dates=date_columns)

In [None]:
# --- Optional: load supplier names if available ---
label_col = "supplier_id"
sup = None
# sorted(): glob yields files in no guaranteed order, so sorting makes the
# choice reproducible when several files qualify.
for p in sorted(BASE.glob("suppliers*.csv")):
    try:
        # A single row is enough to inspect the header before the full read.
        t = pd.read_csv(p, nrows=1)
        if {"supplier_id","supplier_name"}.issubset(t.columns):
            sup = pd.read_csv(p)
            break
    except Exception as exc:
        # Names are optional, so keep going, but report the skipped file.
        print(f" Skipping unreadable suppliers file {p}: {exc}")
if sup is not None:
    # One name per supplier_id, so a repeated id cannot duplicate delivery rows.
    supplier_names = sup[["supplier_id","supplier_name"]].drop_duplicates("supplier_id")
    # Drop a "supplier_name" column left by an earlier run of this cell;
    # merging again would otherwise produce supplier_name_x / supplier_name_y.
    dl = dl.drop(columns=["supplier_name"], errors="ignore").merge(supplier_names, on="supplier_id", how="left")
    # Ids missing from the lookup get NaN names, and groupby drops NaN keys;
    # fall back to the id so those deliveries stay in the analysis.
    dl["supplier_name"] = dl["supplier_name"].fillna(dl["supplier_id"].astype(str))
    label_col = "supplier_name"

In [None]:
# --- Late/Open flags and month key ---
# assign() evaluates the keyword columns in order, so late_or_open can build on
# the two flags defined just before it.
dl = dl.assign(
    is_late=lambda d: d["actual_delivery_date"].notna() & (d["actual_delivery_date"] > d["expected_delivery_date"]),
    is_undelivered=lambda d: d["actual_delivery_date"].isna(),
    late_or_open=lambda d: d["is_late"] | d["is_undelivered"],
    month=lambda d: d["expected_delivery_date"].dt.to_period("M").astype(str),
)

In [None]:
# --- Aggregate supplier × month delay rate ---
# One row per (supplier label, month): delivery count, late-or-open count, rate in %.
supplier_month = dl.groupby([label_col,"month"], as_index=False)
agg = supplier_month.agg(
    total_deliveries=("delivery_id","size"),
    late_or_open=("late_or_open","sum"),
)
late_share = agg["late_or_open"] / agg["total_deliveries"]
agg["delay_rate_pct"] = (late_share * 100).round(2)

In [None]:
# --- Keep suppliers with enough volume; rank by average delay and take top 20 ---
vol = agg.groupby(label_col)["total_deliveries"].sum()
enough = vol[vol >= 200].index
agg = agg[agg[label_col].isin(enough)]
# Mean of the monthly rates per supplier, worst first. Stored as
# `avg_delay_rank` rather than `rank` so the rank() sort-key function defined
# in the loader section is not overwritten (re-running that cell would fail
# once `rank` is a Series).
avg_delay_rank = agg.groupby(label_col)["delay_rate_pct"].mean().sort_values(ascending=False)
top_suppliers = avg_delay_rank.head(20).index

agg = agg[agg[label_col].isin(top_suppliers)].copy()

In [None]:
# --- Build complete month axis (chronological) ---
# Every month between the earliest and latest expected date, as strings.
first_expected = dl["expected_delivery_date"].min()
last_expected = dl["expected_delivery_date"].max()
all_months = pd.period_range(start=first_expected, end=last_expected, freq="M").astype(str)

In [None]:

# --- Pivot to matrix (fill missing with NaN) ---
# Rows follow the top-supplier ranking and columns follow the month axis;
# combinations without data stay NaN after the reindex.
pivot = agg.pivot(index=label_col, columns="month", values="delay_rate_pct")
pivot = pivot.reindex(index=top_suppliers, columns=all_months)

# --- Plot heatmap (matplotlib only) ---
# Figure size grows with the number of months (width) and suppliers (height).
fig_width = max(10, len(all_months)*0.6)
fig_height = 0.5*len(top_suppliers)+3
fig, ax = plt.subplots(figsize=(fig_width, fig_height))
im = ax.imshow(pivot.values, aspect="auto", interpolation="nearest")  # default colormap

# Axes & labels
ax.set_title("Supplier × Month Delay Rate (%) — Top 20 by Avg Delay")
ax.set_xlabel("Month")
ax.set_ylabel("Supplier")

# Ticks: one per matrix column / row, labelled with the month / supplier.
ax.set_xticks(np.arange(len(all_months)))
ax.set_xticklabels(all_months, rotation=45, ha="right")
ax.set_yticks(np.arange(len(top_suppliers)))
ax.set_yticklabels(list(top_suppliers))

# Colorbar
cbar = fig.colorbar(im)
cbar.set_label("Delay Rate (%)")

fig.tight_layout()
out_path = FIG_DIR / "supplier_delay_heatmap.png"
fig.savefig(out_path, dpi=200)
plt.show()

print(f" Saved heatmap: {out_path}")
print("Suppliers shown (top 20 by avg delay, ≥200 deliveries):")
print(list(top_suppliers))
