
# Customer Segmentation & AI Insights — Churn Analysis (Notebook)

This notebook is a **self-contained** case study that:
- generates **synthetic customer + VoC** data,
- performs quick **EDA** with charts,
- trains a **logistic regression** churn model with a preprocessing pipeline,
- exports **Tableau-ready** churn scores,
- runs simple **segmentation** with K-Means,
- (optional) adds **SHAP** explanations if available.

> Run cells top-to-bottom. Artifacts are saved under `data/`, `out/`, `models/`, and `reports/`.


## Setup

In [None]:

%matplotlib inline

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# SHAP is optional: record whether it imported so the explainability cell
# can skip itself instead of failing.
try:
    import shap
except Exception:
    # Any exception raised by the import (not only ImportError) marks SHAP
    # as unavailable.
    HAS_SHAP = False
else:
    HAS_SHAP = True

# One seed for NumPy's legacy global RNG; later cells reuse SEED as well.
SEED = 42
np.random.seed(SEED)

# Create every artifact directory up front so later writes cannot fail on a
# missing folder.
for artifact_dir in ("data", "out", "models", "reports"):
    Path(artifact_dir).mkdir(parents=True, exist_ok=True)

def save_fig(path):
    """Save the current matplotlib figure to ``path``, display it, then close it.

    Parameters
    ----------
    path : str or pathlib.Path
        Destination image file. Missing parent directories are created.

    Notes
    -----
    The figure is written at 144 dpi after ``tight_layout()`` is applied.
    """
    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Capture the figure once so the same object is laid out, saved and closed.
    fig = plt.gcf()
    fig.tight_layout()
    fig.savefig(out_path, dpi=144)
    plt.show()

    # Close explicitly so figures created in a loop are released even if
    # show() leaves them open; closing an already-closed figure is a no-op.
    plt.close(fig)


## 1) Generate Synthetic Data

In [None]:

from numpy.random import default_rng

# Dedicated Generator seeded from SEED so the synthetic data is reproducible.
rng = default_rng(SEED)

N = 5000
plan_levels = ["Basic", "Plus", "Pro"]
region_codes = ["ON", "BC", "AB", "QC", "MB", "SK", "NS", "NB", "NL", "PE"]

# Columns are drawn one at a time, in a fixed order: every draw advances the
# shared generator, so reordering these lines would change the data.
df = pd.DataFrame({"customer_id": np.arange(1, N + 1)})
df["tenure_days"] = rng.integers(10, 1825, size=N)
df["last_nps"] = rng.integers(-100, 101, size=N)
df["tickets_30d"] = rng.poisson(lam=0.6, size=N)
df["usage_delta"] = rng.normal(0.0, 1.0, size=N)
df["sentiment_score"] = np.clip(rng.normal(0.1, 0.6, size=N), -1, 1)
df["plan"] = rng.choice(plan_levels, size=N, p=[0.45, 0.35, 0.20])
df["region"] = rng.choice(region_codes, size=N)
df["billing_change_30d"] = rng.choice([0, 1], size=N, p=[0.8, 0.2])

# Ground-truth churn log-odds, assembled from named per-driver terms.
# usage_delta is not part of the formula.
tenure_term = 0.0006 * (365 - df["tenure_days"])
tickets_term = 0.025 * df["tickets_30d"]
nps_term = 0.015 * df["last_nps"]
sentiment_term = 0.20 * df["sentiment_score"]
billing_term = 0.35 * df["billing_change_30d"]
basic_term = np.where(df["plan"] == "Basic", 0.25, 0.0)
pro_term = np.where(df["plan"] == "Pro", -0.2, 0.0)

logit = (
    -2.0
    + tenure_term
    + tickets_term
    - nps_term
    - sentiment_term
    + billing_term
    + basic_term
    + pro_term
)

# Logistic link, then one uniform draw per customer to realise the label.
proba = 1 / (1 + np.exp(-logit))
churn_draw = rng.random(N)
df["churned_next_60d"] = (churn_draw < proba).astype(int)

# Voice-of-customer theme, picked uniformly by index.
themes = np.array(["billing issue","poor app performance","network outage","great support","pricing concern"])
theme_idx = rng.integers(0, len(themes), N)
df["voc_theme"] = themes[theme_idx]

df.to_csv("data/synthetic_customers.csv", index=False)
df.head()


## 2) Quick EDA

In [None]:

churn_col = "churned_next_60d"
print("Churn rate:", df[churn_col].mean().round(3))

# One histogram per numeric column, each saved under reports/.
numeric_cols = ["tenure_days","last_nps","tickets_30d","usage_delta","sentiment_score"]
for col in numeric_cols:
    fig, ax = plt.subplots()
    ax.hist(df[col], bins=40)
    ax.set_title(f"Distribution: {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("count")
    save_fig(f"reports/dist_{col}.png")

# Mean churn per plan, with bars in a fixed Basic / Plus / Pro order.
plan_rates = df.groupby("plan")[churn_col].mean().reindex(["Basic","Plus","Pro"])
fig, ax = plt.subplots()
plan_rates.plot(kind="bar", ax=ax)
ax.set_title("Churn rate by plan")
ax.set_ylabel("rate")
save_fig("reports/churn_by_plan.png")

# Mean churn split by the billing-change flag, with readable bar labels.
billing_rates = (
    df.groupby("billing_change_30d")[churn_col]
    .mean()
    .rename({0:"No recent change", 1:"Recent change"})
)
fig, ax = plt.subplots()
billing_rates.plot(kind="bar", ax=ax)
ax.set_title("Churn rate by recent billing change")
ax.set_ylabel("rate")
save_fig("reports/churn_by_billing_change.png")


## 3) Train Logistic Regression Churn Model

In [None]:

features = ["tenure_days","last_nps","tickets_30d","usage_delta","sentiment_score","plan","region","billing_change_30d"]
target = "churned_next_60d"
X = df[features]
y = df[target]

num_cols = ["tenure_days","last_nps","tickets_30d","usage_delta","sentiment_score"]
cat_cols = ["plan","region","billing_change_30d"]

# Scale the numeric columns and one-hot encode the categorical ones; categories
# not seen during fit are ignored at transform time.
pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
clf = LogisticRegression(max_iter=300, class_weight="balanced")
pipe = Pipeline([("pre", pre), ("clf", clf)])

# --- Hold-out evaluation -----------------------------------------------------
# Scoring the rows the model was trained on overstates performance, so the
# reported metrics come from a stratified 25% split the model never saw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=SEED
)
pipe.fit(X_train, y_train)
holdout_proba = pipe.predict_proba(X_test)[:, 1]
holdout_pred = (holdout_proba > 0.5).astype(int)
holdout_auc = roc_auc_score(y_test, holdout_proba)
print("Hold-out AUC:", round(holdout_auc, 3))
print(classification_report(y_test, holdout_pred, digits=3))

# --- Final model -------------------------------------------------------------
# Refit on every customer so the saved pipeline and the per-customer scores
# use all available data. pred_proba / y_pred / auc therefore describe the
# training rows and are optimistic; cite the hold-out numbers above instead.
pipe.fit(X, y)
pred_proba = pipe.predict_proba(X)[:,1]
y_pred = (pred_proba > 0.5).astype(int)
auc = roc_auc_score(y, pred_proba)
print("In-sample AUC (optimistic):", round(auc, 3))

joblib.dump(pipe, "models/churn_logit_pipeline.joblib")


### ROC Curve & Confusion Matrix

In [None]:

# ROC curve for the scores in pred_proba against the labels in y, with a
# dashed diagonal as the no-skill reference.
fpr, tpr, _ = roc_curve(y, pred_proba)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"LogReg (AUC={auc:.3f})")
ax.plot([0,1],[0,1],'--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.set_title("ROC Curve")
save_fig("reports/roc_curve.png")

# Confusion matrix at the 0.5 threshold already applied in y_pred.
cm = confusion_matrix(y, y_pred)
fig, ax = plt.subplots()
ax.imshow(cm, cmap="Blues")
ax.set_title("Confusion Matrix (0/1)")

# Put ticks on the class indices only; the default ticks for an image include
# fractional positions such as -0.5 and 0.5, which mean nothing here.
ax.set_xticks(np.arange(cm.shape[1]))
ax.set_yticks(np.arange(cm.shape[0]))

# Write each count in its cell, switching to white text on the darker cells
# so the number stays readable.
dark_cutoff = cm.max() / 2
for (i, j), v in np.ndenumerate(cm):
    ax.text(j, i, str(v), ha="center", va="center",
            color="white" if v > dark_cutoff else "black")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
save_fig("reports/confusion_matrix.png")


## 4) (Optional) SHAP Explainability

In [None]:

if not HAS_SHAP:
    print("SHAP not installed; skipping (pip install shap).")
else:
    # Work on at most 800 rows to keep the kernel explainer's runtime down.
    sample = X.sample(n=min(800, len(X)), random_state=SEED)

    def f(x):
        """Return the churn probability for rows given as a plain array.

        The rows are re-wrapped in a DataFrame with X's column names before
        being passed to the fitted pipeline.
        """
        frame = pd.DataFrame(x, columns=X.columns)
        return pipe.predict_proba(frame)[:,1]

    # Two draws of up to 200 rows with different seeds: one is the explainer's
    # background data, the other is the set of rows explained and plotted.
    background = shap.sample(sample, 200, random_state=SEED)
    explained = shap.sample(sample, 200, random_state=SEED+1)

    explainer = shap.KernelExplainer(f, background)
    shap_values = explainer.shap_values(explained)

    shap.summary_plot(shap_values, explained,
                      feature_names=X.columns, show=True)


## 5) Segmentation (K-Means)

In [None]:

seg_features = ["tenure_days","tickets_30d","usage_delta","sentiment_score","last_nps"]

# Z-score each segmentation feature (population std, ddof=0) so no single
# scale dominates the distance computation.
seg_values = df[seg_features]
Z = (seg_values - seg_values.mean()) / seg_values.std(ddof=0)

km = KMeans(n_clusters=5, random_state=SEED, n_init="auto")
df["segment"] = km.fit_predict(Z)

# Per-segment mean and median of every feature plus the churn label, with the
# two-level column index flattened to "<column>_<stat>" names.
profile_cols = seg_features + ["churned_next_60d"]
profiles = df.groupby("segment")[profile_cols].agg(["mean","median"])
profiles.columns = ["_".join(pair) for pair in profiles.columns]
profiles = profiles.reset_index()
profiles.to_csv("out/segment_profiles.csv", index=False)
profiles.head()


In [None]:

# Number of customers in each segment, ordered by segment id.
segment_sizes = df["segment"].value_counts().sort_index()
fig, ax = plt.subplots()
segment_sizes.plot(kind="bar", ax=ax)
ax.set_title("Segment sizes")
ax.set_xlabel("segment")
ax.set_ylabel("count")
save_fig("reports/segment_sizes.png")

# Observed churn rate within each segment.
segment_churn = df.groupby("segment")["churned_next_60d"].mean()
fig, ax = plt.subplots()
segment_churn.plot(kind="bar", ax=ax)
ax.set_title("Churn rate by segment")
ax.set_xlabel("segment")
ax.set_ylabel("rate")
save_fig("reports/churn_by_segment.png")


## 6) Export Tableau-ready Scores

In [None]:

scores_path = "out/churn_scores.csv"
segments_path = "out/customer_scored_segments.csv"

# Slim file: identifiers plus the model's churn probability.
export = df[["customer_id","plan","region"]].assign(churn_prob=pred_proba)
export.to_csv(scores_path, index=False)

# Enriched file: adds the segment and the observed churn label.
enriched = (
    df[["customer_id","plan","region","segment","churned_next_60d"]]
    .assign(churn_prob=pred_proba)
)
enriched.to_csv(segments_path, index=False)

print("Saved:", scores_path, "and", segments_path)
