In [5]:
# baseline_metrics.py
import pandas as pd
import numpy as np
from pathlib import Path

# ---- Config ----
# Both paths are relative, so they resolve against the current working
# directory of the process / notebook kernel, not against this file.
INPUT_CSV = Path("../telco_data/telco_clean.csv")   # adjust if needed
OUT_DIR = Path("../docs")                              # output folder for CSVs
# Create the output folder (and any missing parents) up front;
# exist_ok=True makes re-running the script safe.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- Load & clean ----
# Read the whole input CSV into memory; raises if the file is missing.
df = pd.read_csv(INPUT_CSV)

# Map each lower-cased header to the header exactly as it appears in the
# CSV, so columns can be located regardless of capitalisation.
cols = {c.lower(): c for c in df.columns}


def col(name):
    """Return the real column label matching *name* case-insensitively.

    Returns None when the loaded frame has no such column.
    """
    key = name.lower()
    return cols.get(key)


# Resolve every column the script relies on. When a column is absent the
# conventional dataset spelling is kept as a placeholder name; the customer
# id instead falls back to whatever the first column of the file is.
cid_col = col("customerid") or col("customer_id") or df.columns.tolist()[0]
churn_col = cols.get("churn", "Churn")
contract_col = cols.get("contract", "Contract")
tenure_col = cols.get("tenure", "tenure")
monthly_col = cols.get("monthlycharges", "MonthlyCharges")
total_col = cols.get("totalcharges", "TotalCharges")
payment_col = cols.get("paymentmethod", "PaymentMethod")

# Ensure numeric types where appropriate.
# Each measure follows the same rule: if the column is absent it is created
# under its conventional name with a filler value; otherwise the existing
# values are parsed as numbers and anything unparseable becomes NaN.

# MonthlyCharges
if monthly_col not in df.columns:
    monthly_col = "MonthlyCharges"
    df[monthly_col] = np.nan
else:
    df[monthly_col] = pd.to_numeric(df[monthly_col], errors="coerce")

# TotalCharges
if total_col not in df.columns:
    total_col = "TotalCharges"
    df[total_col] = np.nan
else:
    df[total_col] = pd.to_numeric(df[total_col], errors="coerce")

# Tenure: unparseable or missing values count as 0 months, stored as ints
if tenure_col not in df.columns:
    tenure_col = "tenure"
    df[tenure_col] = 0
else:
    df[tenure_col] = (
        pd.to_numeric(df[tenure_col], errors="coerce")
        .fillna(0)
        .astype(int)
    )

# CustomerID: if the resolved id column is not in the frame, prepend a
# synthetic one numbered by row position.
if cid_col not in df.columns:
    cid_col = "customerID"
    df.insert(0, cid_col, [f"cust_{i}" for i in range(df.shape[0])])

# Churn flag (binary): 1 for the recognised "yes" spellings, 0 for anything
# else (including blanks); all zeros when the churn column is absent.
if churn_col not in df.columns:
    df["ChurnFlag"] = 0
else:
    df["ChurnFlag"] = (
        df[churn_col]
        .astype(str)
        .str.strip()
        .str.lower()
        .map({"yes": 1, "y": 1, "true": 1, "1": 1})
        .fillna(0)
        .astype(int)
    )

# Tenure buckets (months). right=False makes every interval closed on the
# left and open on the right, so a tenure of exactly 6 lands in '6-12'.
# The last edge is just a very large cap for the open-ended '48+' bucket.
labels = ['0-6','6-12','12-24','24-48', '48+']
bins = [0,6,12,24,48,1_000_000]
df["tenure_bucket"] = pd.cut(
    x=df[tenure_col].fillna(0),
    bins=bins,
    right=False,
    labels=labels,
)

# ---- Baseline metrics (single-row summary) ----
# Headline KPIs for the whole customer base, written out as a one-row CSV.
total_customers = df.shape[0]

# Guard the mean so an empty frame reports 0 instead of NaN.
if total_customers > 0:
    overall_churn_rate = df["ChurnFlag"].mean()
else:
    overall_churn_rate = 0

# Zero tenures are blanked out before averaging, so the figure only reflects
# customers with at least one month of tenure.
if tenure_col in df.columns:
    avg_tenure = df[tenure_col].replace({0: np.nan}).dropna().mean()
else:
    avg_tenure = 0

total_mrr = df[monthly_col].sum()
avg_monthly_charges = df[monthly_col].mean()

# Simple CLV (very simple): average monthly charge times average tenure,
# treating an undefined (NaN) tenure as 0 months.
simple_clv = avg_monthly_charges * (0 if np.isnan(avg_tenure) else avg_tenure)

baseline_summary = dict(
    total_customers=total_customers,
    overall_churn_rate=overall_churn_rate,
    avg_tenure_months=avg_tenure,
    avg_monthly_charges=avg_monthly_charges,
    total_mrr=total_mrr,
    simple_clv_estimate=simple_clv,
)

# Save single-row summary
pd.DataFrame([baseline_summary]).to_csv(
    path_or_buf=OUT_DIR / "baseline_summary_metrics.csv",
    index=False,
)

# ---- Churn by contract ----
# One row per contract type: head count, churned count, churn percentage
# (rounded to 2 decimals) and mean monthly charge, worst churn first.
# Without a contract column, an empty table with the same headers is written.
if contract_col not in df.columns:
    churn_by_contract = pd.DataFrame(
        columns=[contract_col, "customers", "churned", "churn_pct", "avg_monthly_charges"]
    )
else:
    churn_by_contract = df.groupby(contract_col).agg(
        customers=("ChurnFlag", "count"),
        churned=("ChurnFlag", "sum"),
        # max(1, ...) keeps the division safe for a group with no rows
        churn_pct=(
            "ChurnFlag",
            lambda flags: round(100.0 * flags.sum() / max(1, flags.count()), 2),
        ),
        avg_monthly_charges=(monthly_col, "mean"),
    )
    churn_by_contract = churn_by_contract.sort_values("churn_pct", ascending=False)
    churn_by_contract = churn_by_contract.reset_index()
churn_by_contract.to_csv(OUT_DIR / "churn_by_contract.csv", index=False)

# ---- Churn by tenure bucket ----
# "tenure_bucket" is a categorical column. pandas emits a FutureWarning when
# a categorical is grouped without stating `observed`, because the default is
# changing. observed=False pins the behaviour this script has always had:
# every bucket gets a row in the output, including buckets with no customers.
churn_by_tenure = df.groupby("tenure_bucket", observed=False).agg(
    customers=("ChurnFlag","count"),
    churned=("ChurnFlag","sum"),
    # churn percentage rounded to 2 decimals; max(1, ...) avoids dividing by 0
    churn_pct=("ChurnFlag", lambda x: round(100.0 * x.sum() / max(1, x.count()),2))
).reset_index().sort_values("tenure_bucket")
churn_by_tenure.to_csv(OUT_DIR / "churn_by_tenure.csv", index=False)

# ---- Churn by monthly charge quartile ----
def _monthly_quartiles(values, labels):
    """Split *values* into ``len(labels)`` quantile bins, tolerating ties.

    The plain ``pd.qcut`` call is tried first so ordinary data is binned
    exactly as before. ``pd.qcut`` raises ValueError when quantile edges
    coincide (e.g. a column that is entirely 0 after filling missing
    values); in that case the values are ranked with ties broken by row
    order, which yields distinct edges. If even that is impossible (such as
    a single row), an all-missing ordered categorical with the same
    categories is returned so downstream grouping still works.
    """
    n_bins = len(labels)
    try:
        return pd.qcut(values, q=n_bins, labels=labels)
    except ValueError:
        pass
    try:
        return pd.qcut(values.rank(method="first"), q=n_bins, labels=labels)
    except ValueError:
        unassigned = pd.Categorical(
            [None] * len(values), categories=labels, ordered=True
        )
        return pd.Series(unassigned, index=values.index)


# Missing charges are treated as 0 before binning.
df["monthly_quartile"] = _monthly_quartiles(
    df[monthly_col].fillna(0), ["Q1", "Q2", "Q3", "Q4"]
)
# observed=False is stated explicitly: grouping a categorical without it
# triggers a pandas FutureWarning, and False keeps a row for every quartile.
churn_by_monthly = df.groupby("monthly_quartile", observed=False).agg(
    customers=("ChurnFlag","count"),
    churned=("ChurnFlag","sum"),
    # churn percentage rounded to 2 decimals; max(1, ...) avoids dividing by 0
    churn_pct=("ChurnFlag", lambda x: round(100.0 * x.sum() / max(1, x.count()),2)),
    avg_monthly=(monthly_col, "mean")
).reset_index().sort_values("monthly_quartile")
churn_by_monthly.to_csv(OUT_DIR / "churn_by_monthly_quartile.csv", index=False)

# ---- Churn by payment method (if exists) ----
# One row per payment method, worst churn first. When the column is absent
# an empty table is still written, but with its headers, so the CSV stays
# readable and matches the shape of the populated output.
if payment_col in df.columns:
    churn_by_payment = df.groupby(payment_col).agg(
        customers=("ChurnFlag","count"),
        churned=("ChurnFlag","sum"),
        # churn percentage rounded to 2 decimals; max(1, ...) avoids dividing by 0
        churn_pct=("ChurnFlag", lambda x: round(100.0 * x.sum() / max(1, x.count()),2))
    ).reset_index().sort_values("churn_pct", ascending=False)
else:
    churn_by_payment = pd.DataFrame(
        columns=[payment_col, "customers", "churned", "churn_pct"]
    )
churn_by_payment.to_csv(OUT_DIR / "churn_by_payment_method.csv", index=False)

# ---- MRR and simple CLV CSV ----
# Re-export the revenue-related KPIs computed earlier as their own one-row file.
mrr_clv = dict(
    avg_mrr_per_customer=avg_monthly_charges,
    total_mrr=total_mrr,
    avg_tenure_months=avg_tenure,
    simple_clv_estimate=simple_clv,
)
pd.DataFrame([mrr_clv]).to_csv(
    path_or_buf=OUT_DIR / "mrr_and_simple_clv.csv",
    index=False,
)

# ---- Baseline customers snapshot for BI/debugging ----
# Candidate columns in output order. Entries may be None or name a column
# that does not exist; both are filtered out below.
snapshot_cols = [
    cid_col if cid_col in df.columns else "customerID",
    tenure_col,
    monthly_col,
    total_col,
    contract_col if contract_col in df.columns else "contract",
    payment_col if payment_col in df.columns else None,
    "ChurnFlag",
    "tenure_bucket",
    "monthly_quartile"
]
# Drop None, drop names missing from the frame, and drop repeats while
# keeping first-seen order (dict.fromkeys preserves insertion order). Repeats
# can occur when the id column resolves to a column that is also listed
# further down; selecting it twice would duplicate it in the export.
snapshot_cols = [c for c in dict.fromkeys(snapshot_cols) if c and c in df.columns]
baseline_snapshot = df[snapshot_cols].copy()
baseline_snapshot.to_csv(OUT_DIR / "baseline_customers_snapshot.csv", index=False)

# ---- Print summary to console ----
# Show where the outputs went, echo the headline metrics, then list every
# expected file and flag any that is not present on disk.
print("Baseline metrics saved to:", OUT_DIR.resolve())
print(pd.DataFrame([baseline_summary]).to_string(index=False))
print("Saved files:")
expected_outputs = (
    "baseline_summary_metrics.csv",
    "churn_by_contract.csv",
    "churn_by_tenure.csv",
    "churn_by_monthly_quartile.csv",
    "churn_by_payment_method.csv",
    "mrr_and_simple_clv.csv",
    "baseline_customers_snapshot.csv",
)
for f in expected_outputs:
    path = OUT_DIR / f
    line_parts = [" -", path.name]
    if not path.exists():
        line_parts.append("(NOT CREATED)")
    print(*line_parts)

print("\nDone.")


Baseline metrics saved to: C:\Users\ThinkPAD\Documents\Telco Customer Churn\docs
 total_customers  overall_churn_rate  avg_tenure_months  avg_monthly_charges  total_mrr  simple_clv_estimate
            7032            0.265785          32.421786            64.798208   455661.0          2100.873647
Saved files:
 - baseline_summary_metrics.csv
 - churn_by_contract.csv
 - churn_by_tenure.csv
 - churn_by_monthly_quartile.csv
 - churn_by_payment_method.csv
 - mrr_and_simple_clv.csv
 - baseline_customers_snapshot.csv

Done.


  churn_by_tenure = df.groupby("tenure_bucket").agg(
  churn_by_monthly = df.groupby("monthly_quartile").agg(
