# Uplift Modeling for Churn Prediction

Short notebook: same workflow as the full notebook, using **utils** for all logic.
Run cells in order. Data paths: `train/` and `test/` under project root.

## 1. Setup


In [None]:
# Setup may take 30–60s: utils imports sentence_transformers, causalml, lightgbm, xgboost.
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold
# Qini is optional: if this import fails for any reason, `qini_auc_score` is set to
# None and the CV cell records NaN for Qini instead of aborting.
# NOTE(review): confirm that causalml.metrics actually exports `qini_auc_score`. The
# positional call used in the CV cell, qini_auc_score(y, uplift, treatment), matches
# scikit-uplift's sklift.metrics.qini_auc_score; if the name does not exist in
# causalml, this fallback silently disables Qini for the whole notebook.
try:
    from causalml.metrics import qini_auc_score
except Exception:
    qini_auc_score = None

from utils import (
    approx_auuc, assign_segments, build_claims_labels, build_feature_matrix, build_recency_tenure,
    count_events_before_signup, DOW_NAMES, embed_wellco_brief, EMBED_MODEL_NAME, feat_distribution_summary,
    feature_diagnostics, FOCUS_ICD_CODES, load_wellco_brief, missingness_and_member_coverage,
    missingness_mechanism_analysis, plot_balance, plot_correlation_diagnostics, plot_feature_histograms,
    print_focus_icd_stats, print_table_overview, RANDOM_STATE, ref_date_from_tables,
    run_relevance_filter_sanity_check, SIMILARITY_THRESHOLD, time_bin, uplift_at_k, uplift_by_groups,
    uplift_curve, _build_model,
)

# Notebook-wide configuration: show up to 200 columns when displaying frames, and
# resolve the train/test data directories relative to the current working directory.
pd.set_option("display.max_columns", 200)
BASE_DIR = Path(".").resolve()
TRAIN_DIR, TEST_DIR = (BASE_DIR / sub_dir for sub_dir in ("train", "test"))


## 2. Load data
Train/test CSVs; train events are restricted to the observation window (July 1–14, 2025 — events on or after the July 15 outreach date are excluded).


In [None]:
def _read_table(directory, filename, date_col):
    """Read one CSV from `directory`, parsing `date_col` as a datetime column."""
    return pd.read_csv(directory / filename, parse_dates=[date_col])


# Training tables
churn_labels = _read_table(TRAIN_DIR, "churn_labels.csv", "signup_date")
app_usage = _read_table(TRAIN_DIR, "app_usage.csv", "timestamp")
web_visits = _read_table(TRAIN_DIR, "web_visits.csv", "timestamp")
claims = _read_table(TRAIN_DIR, "claims.csv", "diagnosis_date")

# Test tables
test_members = _read_table(TEST_DIR, "test_members.csv", "signup_date")
test_app_usage = _read_table(TEST_DIR, "test_app_usage.csv", "timestamp")
test_web_visits = _read_table(TEST_DIR, "test_web_visits.csv", "timestamp")
test_claims = _read_table(TEST_DIR, "test_claims.csv", "diagnosis_date")

# Pre-outreach observation window. Outreach happens on July 15 and churn is measured
# afterwards, so the upper bound is exclusive. Only the TRAIN event tables are
# restricted; the test tables are left unfiltered.
OBS_START = pd.Timestamp("2025-07-01")
OBS_END = pd.Timestamp("2025-07-15")


def _in_obs_window(frame, date_col):
    """Keep rows of `frame` with OBS_START <= frame[date_col] < OBS_END."""
    stamps = frame[date_col]
    return frame[(stamps >= OBS_START) & (stamps < OBS_END)]


web_visits = _in_obs_window(web_visits, "timestamp")
app_usage = _in_obs_window(app_usage, "timestamp")
claims = _in_obs_window(claims, "diagnosis_date")

# Quick sanity check: one "name: shape" line per table
loaded_tables = (
    ("churn_labels", churn_labels),
    ("app_usage", app_usage),
    ("web_visits", web_visits),
    ("claims", claims),
    ("test_members", test_members),
    ("test_app_usage", test_app_usage),
    ("test_web_visits", test_web_visits),
    ("test_claims", test_claims),
)
for name, df in loaded_tables:
    print(f"{name}: {df.shape}")

## **3. EDA**

Exploratory data analysis: table structure, missingness, treatment balance, leakage checks, and uplift by engagement/claims/recency.

---

### **3.1 Raw data overview**
Summarize structure, dtypes, and sample rows for all 8 tables.


In [None]:
# Registry of all eight tables (4 train + 4 test), keyed by display name.
all_tables = dict(
    churn_labels=churn_labels,
    app_usage=app_usage,
    web_visits=web_visits,
    claims=claims,
    test_members=test_members,
    test_app_usage=test_app_usage,
    test_web_visits=test_web_visits,
    test_claims=test_claims,
)

# Print the overview produced by utils.print_table_overview for each table in turn.
for name, df in all_tables.items():
    print_table_overview(name, df)


### 3.2 Column-specific checks
event_type, url, title, icd_code for feature-engineering decisions.


In [None]:
# ----------
# 3.2 Column-specific checks
# Goal: inspect event_type, url, title and icd_code before designing features.
# Method: value_counts / nunique on each of those columns.
# How the findings are used:
#   - a constant event_type carries no signal and can be dropped;
#   - URL/title variety points to content-categorization features;
#   - the ICD distribution guides the focus-ICD flag design.
# ----------

separator = "=" * 60
print(separator)
print("  Column-specific checks")
print(separator)

# event_type in app_usage: constant or informative?
event_types = app_usage["event_type"]
print("\napp_usage event_type value_counts:")
print(event_types.value_counts().to_string())
if event_types.nunique() == 1:
    verdict = 'CONSTANT — can drop'
else:
    verdict = 'MULTIPLE VALUES — keep'
print(f"  -> {verdict}")

# url and title in web_visits: content variety
url_values = web_visits["url"]
print(f"\nweb_visits url: {url_values.nunique()} unique values")
print("  Top-5 URLs:")
print(url_values.value_counts().head(5).to_string())

title_values = web_visits["title"]
print(f"\nweb_visits title: {title_values.nunique()} unique values")
print("  Top-5 titles:")
print(title_values.value_counts().head(5).to_string())

# icd_code in claims
icd_values = claims["icd_code"]
print(f"\nclaims icd_code: {icd_values.nunique()} unique values")
print("  Top-10 ICD codes:")
print(icd_values.value_counts().head(10).to_string())

### 3.3 Missing values and member coverage


In [None]:
# 3.3 Missing values and member coverage (logic lives in utils.missingness_and_member_coverage)
missingness_and_member_coverage(
    all_tables,
    churn_labels, web_visits, app_usage, claims,
    test_members, test_web_visits, test_app_usage, test_claims,
)

### 3.4 Missingness mechanism (Chi-square)


In [None]:
# 3.4 Missingness mechanism — delegated to utils.missingness_mechanism_analysis
train_event_tables = (web_visits, app_usage, claims)
missingness_mechanism_analysis(churn_labels, *train_event_tables)

**What it means:** Chi-square p-values and the bar chart show whether churn (or outreach) rate differs between members who have activity in a source vs those who do not. The cross-tab shows how many members are missing from each combination of sources.

**What it says about further analysis:** If the p-values are large (e.g. > 0.05), missingness is not strongly related to churn/outreach, so zero-filling missing activity is sufficient.

### 3.5 Labels & treatment balance

In [None]:
# 3.5 Labels and treatment balance
outcome = churn_labels["churn"]
arm = churn_labels["outreach"]
churn_rate = outcome.mean()
outreach_rate = arm.mean()

# Per-arm member count and churn rate (index = outreach value).
summary_labels = churn_labels.groupby("outreach")["churn"].agg(
    members="count",
    churn_rate="mean",
)
# Outreach x churn contingency table, with row/column totals.
cross_tab = pd.crosstab(arm, outcome, margins=True, margins_name="Total")

print(f"Overall churn rate: {churn_rate:.3f}")
print(f"Outreach rate: {outreach_rate:.3f}")
print("\nOutreach x Churn cross-tabulation:")
print(cross_tab.to_string())
print("\nChurn rates by group:")
print(summary_labels.to_string())

# Count plots for each label, then the churn rate per outreach arm.
plot_balance(
    churn_labels, "outreach",
    "Outreach vs. control counts", "Outreach", "Count",
)
plot_balance(
    churn_labels, "churn",
    "Churn label counts", "Churn", "Count",
)
plot_balance(
    summary_labels.reset_index(), "outreach",
    "Churn rate by outreach group", "Outreach", "Churn rate",
    y="churn_rate",
)


### 3.6 Leakage & time-window validation (utils: `count_events_before_signup`)


In [None]:
# Each train event table paired with the column holding its event date.
event_sources = {
    'web_visits': (web_visits, 'timestamp'),
    'app_usage': (app_usage, 'timestamp'),
    'claims': (claims, 'diagnosis_date'),
}

# Earliest / latest event date per table.
window_summary = pd.DataFrame([
    {'table': table, 'min': frame[date_col].min(), 'max': frame[date_col].max()}
    for table, (frame, date_col) in event_sources.items()
])
# Per-table result of utils.count_events_before_signup against the member signup dates.
leakage = pd.DataFrame([
    {'table': table, 'events_before_signup': count_events_before_signup(frame, date_col, churn_labels)}
    for table, (frame, date_col) in event_sources.items()
])
display(window_summary)
display(leakage)

**What it means:** The first table shows min/max timestamps per event table (all within July 1–14, 2025). The second table shows zero events before signup for web, app, and claims — no leakage.

**What it says about further analysis:** Observation window and signup logic are consistent. We can safely use these events for feature engineering. Next: temporal and engagement uplift (3.7, 3.8).

### 3.7 Temporal features as uplift moderators

Uplift = P(churn=1 | outreach=1, bin) − P(churn=1 | outreach=0, bin).

Each bar shows uplift among members who had **at least one event** in that bin. The same member may appear in multiple bins.


In [None]:
# Annotate each web / app event with its hour of day and day of week.
web_ev = web_visits[['member_id', 'timestamp']].assign(
    hour=lambda d: d['timestamp'].dt.hour,
    dow=lambda d: d['timestamp'].dt.dayofweek,
)
app_ev = app_usage[['member_id', 'timestamp']].assign(
    hour=lambda d: d['timestamp'].dt.hour,
    dow=lambda d: d['timestamp'].dt.dayofweek,
)

# Stack both sources into a single event frame and derive the temporal bins.
time_cols = ['member_id', 'hour', 'dow']
events = pd.concat([web_ev[time_cols], app_ev[time_cols]], ignore_index=True).assign(
    time_of_day=lambda d: d['hour'].apply(time_bin),
    dow_name=lambda d: d['dow'].map(DOW_NAMES),
    is_weekend=lambda d: d['dow'].isin([5, 6]),  # dayofweek 5/6 = Saturday/Sunday
)
labels = churn_labels[['member_id', 'churn', 'outreach']]
print('Events:', len(events), 'rows,', events['member_id'].nunique(), 'members')

In [None]:
# Uplift by time of day, bins listed in chronological order
time_of_day_order = ["Early Morning", "Morning", "Afternoon", "Evening"]
uplift_by_groups(
    events, labels, "time_of_day", time_of_day_order,
    title="Uplift by time of day",
    xlabel="Time of day",
)


In [None]:
# Uplift by day of week, Monday first
dow_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
uplift_by_groups(
    events, labels, "dow_name", dow_order,
    title="Uplift by day of week",
    xlabel="Day of week",
)


In [None]:
# Uplift by weekend vs weekday (is_weekend False -> "Weekday", True -> "Weekend")
uplift_by_groups(
    events, labels, "is_weekend", [False, True],
    plot_labels=["Weekday", "Weekend"],
    title="Uplift by weekend vs weekday",
    xlabel="Day type",
)


**What it means:** Uplift by time of day and by weekday vs weekend is similar across bins (all slightly negative). Outreach reduces churn a bit regardless of when members are active.

**What it says about further analysis:** Temporal features (time of day, day of week, weekend) do not strongly moderate uplift. Next: engagement and claims (3.8, 3.9).

### 3.8 Engagement features as uplift moderators

**(a)** Distribution sanity checks — log-scaled histograms and quantile summaries.  
**(b)** Uplift by engagement quartile for each feature.

In [None]:
# Distribution sanity checks (engagement)

# Per-member aggregates from the event tables.
web_per = web_visits.groupby("member_id").size().reset_index(name="web_visits_count")
app_per = app_usage.groupby("member_id").size().reset_index(name="app_sessions_count")
url_div = web_visits.groupby("member_id")["url"].nunique().reset_index(name="url_nunique")

# Left-join every aggregate onto the label frame so members without events are kept.
engagement_cols = ["web_visits_count", "app_sessions_count", "url_nunique"]
eng = churn_labels[["member_id", "churn", "outreach"]]
for per_member in (web_per, app_per, url_div):
    eng = eng.merge(per_member, on="member_id", how="left")

# Members absent from a source have no events there: fill with 0.
eng[engagement_cols] = eng[engagement_cols].fillna(0)

for col in engagement_cols:
    feat_distribution_summary(eng, col)


In [None]:
# Uplift by web_visits_count quartile (duplicate bin edges are dropped, so there may be fewer than 4 bins)
eng["web_q"] = pd.qcut(eng["web_visits_count"], q=4, duplicates="drop")
web_bins = sorted(eng["web_q"].dropna().unique())
uplift_by_groups(
    eng, labels, "web_q", web_bins,
    title="Uplift by web visits quartile",
    xlabel="Web visits (quartile)",
)


In [None]:
# Uplift by app_sessions_count quartile (duplicate bin edges are dropped, so there may be fewer than 4 bins)
eng["app_q"] = pd.qcut(eng["app_sessions_count"], q=4, duplicates="drop")
app_bins = sorted(eng["app_q"].dropna().unique())
uplift_by_groups(
    eng, labels, "app_q", app_bins,
    title="Uplift by app sessions quartile",
    xlabel="App sessions (quartile)",
)


In [None]:
# Uplift by URL diversity quartile (duplicate bin edges are dropped, so there may be fewer than 4 bins)
eng["url_q"] = pd.qcut(eng["url_nunique"], q=4, duplicates="drop")
url_bins = sorted(eng["url_q"].dropna().unique())
uplift_by_groups(
    eng, labels, "url_q", url_bins,
    title="Uplift by URL diversity quartile",
    xlabel="Unique URLs (quartile)",
)


**What it means:** Engagement (event counts, sessions, URL diversity) shows uplift varying by quartile; some bins have near-zero or slightly positive uplift.

**What it says about further analysis:** Engagement level can moderate uplift — useful as features for the model. 

### 3.9 Claims features as uplift moderators

**(a)** Distribution sanity checks — log-scaled histograms, quantile summaries, focus-ICD prevalence.  
**(b)** Uplift by claims strata: quartile bins for counts, binary for has-focus-ICD, and 0/1/2/3 for count of focus ICDs.


In [None]:
# Distribution sanity checks (claims): summaries for both count features, then focus-ICD stats
cl = build_claims_labels(claims, churn_labels)
for claims_feature in ("claims_count", "icd_nunique"):
    feat_distribution_summary(cl, claims_feature, color="green")
print_focus_icd_stats(cl)


In [None]:
# Uplift by claims_count quartile (duplicate bin edges are dropped, so there may be fewer than 4 bins)
cl["claims_q"] = pd.qcut(cl["claims_count"], q=4, duplicates="drop")
claims_bins = sorted(cl["claims_q"].dropna().unique())
uplift_by_groups(
    cl, labels, "claims_q", claims_bins,
    title="Uplift by claims count quartile",
    xlabel="Claims count (quartile)",
)


In [None]:
# Uplift by icd_nunique quartile (duplicate bin edges are dropped, so there may be fewer than 4 bins)
cl["icd_q"] = pd.qcut(cl["icd_nunique"], q=4, duplicates="drop")
icd_bins = sorted(cl["icd_q"].dropna().unique())
uplift_by_groups(
    cl, labels, "icd_q", icd_bins,
    title="Uplift by distinct ICD codes quartile",
    xlabel="Distinct ICD codes (quartile)",
)


In [None]:
# Uplift by focus-ICD flag (0 -> "No focus ICD", 1 -> "Has focus ICD")
uplift_by_groups(
    cl, labels, "has_focus_icd", [0, 1],
    plot_labels=["No focus ICD", "Has focus ICD"],
    title="Uplift by has focus ICD",
    xlabel="Focus ICD status",
)


In [None]:
# Uplift by number of distinct focus ICD codes per member (groups 0, 1, 2, 3)
focus_counts = [0, 1, 2, 3]
focus_count_labels = ["0 focus ICD", "1 focus ICD", "2 focus ICD", "3 focus ICD"]
uplift_by_groups(
    cl, labels, "focus_icd_count", focus_counts,
    plot_labels=focus_count_labels,
    title="Uplift by count of focus ICDs",
    xlabel="Number of distinct focus ICD codes",
)


**What it means:** Uplift by claims (count, focus ICD, quartiles) varies across groups; some segments show stronger or weaker outreach effects.

**What it says about further analysis:** Claims-based features are useful for targeting. 

### 3.10 Recency & tenure (utils: `build_recency_tenure`)


In [None]:
# Build recency/tenure features and attach them to the labels.
# The merge matches member_id against recency_df's index.
recency_df, ref_date = build_recency_tenure(churn_labels, web_visits, app_usage, claims)
rec = pd.merge(
    churn_labels[['member_id', 'churn', 'outreach']],
    recency_df,
    left_on='member_id',
    right_index=True,
)
print('Ref date:', ref_date)
print(f"Members: {len(rec)}")

described_cols = ['days_since_last_web', 'days_since_last_app', 'days_since_last_activity', 'tenure_days']
print(rec[described_cols].describe().round(1).to_string())

In [None]:
# Uplift by days_since_last_web quartile; members with a missing value are left out
feat = "days_since_last_web"
valid = rec.loc[rec[feat].notna()].copy()
excluded = len(rec) - len(valid)

quartiles = pd.qcut(valid[feat], q=4, duplicates="drop")
if quartiles.nunique() < 2:
    # Quantile binning collapsed to fewer than two bins: fall back to equal-width bins.
    quartiles = pd.cut(valid[feat], bins=min(4, valid[feat].nunique()))
valid["quartile_bin"] = quartiles

bin_order = sorted(valid["quartile_bin"].dropna().unique())
uplift_by_groups(
    valid, labels, "quartile_bin", bin_order,
    title="Uplift by days since last web activity",
    xlabel="Days-since-last-web bin",
)
print(f"Excluded from plot (no web activity): {excluded} members (n too small for stable uplift).")


In [None]:
# Uplift by days_since_last_app quartile; members with a missing value are left out
feat = "days_since_last_app"
valid = rec.loc[rec[feat].notna()].copy()
excluded = len(rec) - len(valid)

quartiles = pd.qcut(valid[feat], q=4, duplicates="drop")
if quartiles.nunique() < 2:
    # Quantile binning collapsed to fewer than two bins: fall back to equal-width bins.
    quartiles = pd.cut(valid[feat], bins=min(4, valid[feat].nunique()))
valid["quartile_bin"] = quartiles

bin_order = sorted(valid["quartile_bin"].dropna().unique())
uplift_by_groups(
    valid, labels, "quartile_bin", bin_order,
    title="Uplift by days since last app activity",
    xlabel="Days-since-last-app bin",
)
print(f"Excluded from plot (no app activity): {excluded} members (n too small for stable uplift).")


In [None]:
# Uplift by days_since_last_claim quartile; members with a missing value are left out
feat = "days_since_last_claim"
valid = rec.loc[rec[feat].notna()].copy()
excluded = len(rec) - len(valid)

quartiles = pd.qcut(valid[feat], q=4, duplicates="drop")
if quartiles.nunique() < 2:
    # Quantile binning collapsed to fewer than two bins: fall back to equal-width bins.
    quartiles = pd.cut(valid[feat], bins=min(4, valid[feat].nunique()))
valid["quartile_bin"] = quartiles

bin_order = sorted(valid["quartile_bin"].dropna().unique())
uplift_by_groups(
    valid, labels, "quartile_bin", bin_order,
    title="Uplift by days since last claim",
    xlabel="Days-since-last-claim bin",
)
print(f"Excluded from plot (no claims): {excluded} members (n too small for stable uplift).")


In [None]:
# Uplift by days_since_last_activity quartile; members with a missing value are left out
feat = "days_since_last_activity"
valid = rec.loc[rec[feat].notna()].copy()
excluded = len(rec) - len(valid)

quartiles = pd.qcut(valid[feat], q=4, duplicates="drop")
if quartiles.nunique() < 2:
    # Quantile binning collapsed to fewer than two bins: fall back to equal-width bins.
    quartiles = pd.cut(valid[feat], bins=min(4, valid[feat].nunique()))
valid["quartile_bin"] = quartiles

bin_order = sorted(valid["quartile_bin"].dropna().unique())
uplift_by_groups(
    valid, labels, "quartile_bin", bin_order,
    title="Uplift by days since last activity (any source)",
    xlabel="Days-since-last-activity bin",
)
print(f"Excluded from plot (no activity at all): {excluded} members (n too small for stable uplift).")


In [None]:
# Uplift by tenure_days quartile; members with a missing value are left out
feat = "tenure_days"
valid = rec.loc[rec[feat].notna()].copy()
excluded = len(rec) - len(valid)

quartiles = pd.qcut(valid[feat], q=4, duplicates="drop")
if quartiles.nunique() < 2:
    # Quantile binning collapsed to fewer than two bins: fall back to equal-width bins.
    quartiles = pd.cut(valid[feat], bins=min(4, valid[feat].nunique()))
valid["quartile_bin"] = quartiles

bin_order = sorted(valid["quartile_bin"].dropna().unique())
uplift_by_groups(
    valid, labels, "quartile_bin", bin_order,
    title="Uplift by tenure (days since signup)",
    xlabel="Tenure bin (days)",
)
print(f"Excluded from plot (missing signup_date): {excluded} members (n too small for stable uplift).")


**What it means:** Uplift by recency and tenure bins shows how outreach effect varies with how recently members were active and how long they have been members.

**What it says about further analysis:** Recency and tenure are strong candidates for the uplift model. EDA is complete.

## 4. Feature Engineering
Config; load WellCo brief and embedding model once; then build train/test feature matrices (utils).


In [None]:
# Location of the WellCo client brief, plus a printout of the utils-level settings
# (similarity threshold, embedding model name, focus ICD codes).
WELLCO_BRIEF_PATH = BASE_DIR.joinpath('wellco_client_brief.txt')
print(
    f"Similarity threshold – {SIMILARITY_THRESHOLD}",
    f"Embedding model      – {EMBED_MODEL_NAME}",
    f"Focus ICD codes      – {FOCUS_ICD_CODES}",
    sep="\n",
)

In [None]:
# ── RUN ONCE: load embedding model & embed WellCo brief ────────────────────
# Kept in its own cell so the model is loaded a single time per session;
# later cells reuse `embed_model` and `wellco_embedding`.

brief_text = load_wellco_brief()
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
wellco_embedding = embed_wellco_brief(brief_text, embed_model)  # shape (1, dim)

print(
    f"WellCo brief loaded     – {len(brief_text):,} characters",
    f"Embedding model loaded  – {EMBED_MODEL_NAME}",
    f"WellCo embedding shape  – {wellco_embedding.shape}",
    sep="\n",
)

In [None]:
# Reference dates are derived separately from the train and the test event tables.
ref_date_train = ref_date_from_tables(web_visits, app_usage, claims)
ref_date_test = ref_date_from_tables(test_web_visits, test_app_usage, test_claims)
print('ref_date_train:', ref_date_train, '| ref_date_test:', ref_date_test)

# Both matrices share the already-loaded brief embedding and embedding model.
embedding_kwargs = dict(wellco_embedding=wellco_embedding, embed_model=embed_model)

print('Building TRAIN feature matrix...')
train_features = build_feature_matrix(
    churn_labels, web_visits, app_usage, claims, ref_date_train,
    include_labels=True, **embedding_kwargs,
)

print('Building TEST feature matrix...')
test_features = build_feature_matrix(
    test_members, test_web_visits, test_app_usage, test_claims, ref_date_test,
    include_labels=False, **embedding_kwargs,
)

# --- Quick summary: shapes, column lists, first train rows ------------------
print(
    f"\nTrain features shape: {train_features.shape}",
    f"Test  features shape: {test_features.shape}",
    f"\nTrain columns: {list(train_features.columns)}",
    f"Test  columns: {list(test_features.columns)}",
    f"\nTrain head:\n{train_features.head()}",
    sep="\n",
)


### 4.1 Feature diagnostics (informational only)
Inspect distributions and multicollinearity of the engineered features **on the training set**. This section is for review only — no features are automatically dropped or transformed.

In [None]:
# ── 4.1a  Per-feature distribution diagnostics ──────────────────────────────
# Only feature columns are inspected; member_id and the label columns are left out.
_WEB_FEATURE_COLS = [
    "wellco_web_visits_count",
    # TOGGLE: uncomment next line to include URL feature in diagnostics
    # "wellco_web_unique_urls",
    "days_since_last_wellco_web",
]
_APP_CLAIMS_TENURE_COLS = [
    "app_sessions_count",
    "icd_distinct_count",
    "has_focus_icd",
    "days_since_last_claim",
    "tenure_days",
]
FEATURE_COLS = _WEB_FEATURE_COLS + _APP_CLAIMS_TENURE_COLS
feature_diagnostics(train_features, FEATURE_COLS, title_suffix="(train)")


In [None]:
# ── 4.1b  Histograms ────────────────────────────────────────────────────────
# One histogram per feature: x-axis = feature value, y-axis = number of members.
# FEATURE_XLABELS maps each feature column to its human-readable x-axis label.
FEATURE_XLABELS = {
    "wellco_web_visits_count": "Relevant web visits per member",
    "wellco_web_unique_urls": "Unique URLs (WellCo-relevant visits)",
    "days_since_last_wellco_web": "Days since last relevant web visit",
    "app_sessions_count": "App sessions per member",
    "icd_distinct_count": "Distinct ICD codes per member",
    "has_focus_icd": "Has focus ICD (0 = no, 1 = yes)",
    "days_since_last_claim": "Days since last claim",
    "tenure_days": "Tenure (days since signup)",
}
histogram_title = "Feature distributions: how many members have each value (train set)"
plot_feature_histograms(
    train_features,
    FEATURE_COLS,
    xlabels=FEATURE_XLABELS,
    suptitle=histogram_title,
)


**What we see in these distribution plots (train set):** Each histogram shows how many members have each value for one feature. **wellco_web_visits_count:** Strong right skew; most members in the 0–5 or 5–14 range, long tail up to 62 visits — a few heavy engagers. **wellco_web_unique_urls:** Almost identical shape to visit count (same data, one row per URL), so redundant; we drop it from the matrix. **days_since_last_wellco_web:** Right-skewed; many at 0–1 days, tail out to 13; ~2% missing (no relevant visit). **app_sessions_count:** More symmetric, roughly bell-shaped; median ~10, less skew than web/claims. **icd_distinct_count:** Multi-modal (several peaks at 4, 5, 6 codes). **has_focus_icd:** Almost all 1 (92%+); near-binary. **days_since_last_claim:** Right-skewed; peak at 1–2 days. **tenure_days:** Right-skewed; spread from 45 to 561 days. For tree-based uplift models we use these as-is; for linear models we'd log1p skewed counts and keep binary 0/1.

In [None]:
# ── 4.1c  Multicollinearity diagnostic ──────────────────────────────────────
# Correlation diagnostics over FEATURE_COLS, using 0.8 as the threshold passed to the helper.
plot_correlation_diagnostics(
    train_features,
    FEATURE_COLS,
    threshold=0.8,
    title_suffix="(train set)",
)


**What we see in this heatmap:** The plot above is the correlation matrix of the 8 feature columns (including `wellco_web_unique_urls`). The only pair with |r| ≥ 0.8 is **wellco_web_visits_count** and **wellco_web_unique_urls** — they are **perfectly correlated** (r = 1.0). That is because both were computed from the same WellCo-relevant rows: in that subset each visit is one row and one URL, so the two counts are identical. All other pairs in this matrix are moderate or weak (e.g. tenure vs. recency, app vs. web); no other near-perfect correlation appears. **Decision:** We drop `wellco_web_unique_urls` from the feature matrix for modeling (7 features) to avoid multicollinearity; the diagnostics above still show the 8-feature version so it is clear we tried the URL feature and why we dropped it.

In [None]:
# ── 4.1d  Uplift by WellCo-relevant web features ───────────────────────────
# One quartile-binned uplift plot per web feature, via uplift_by_groups with `labels`.
# `wellco_web_unique_urls` is treated as optional: it is described as dropped from the
# modelling matrix, so selecting it unconditionally raises KeyError whenever
# build_feature_matrix does not emit it. Its plot is skipped when the column is absent.
optional_url_col = "wellco_web_unique_urls"
has_url_feature = optional_url_col in train_features.columns

web_cols = ["member_id", "wellco_web_visits_count", "days_since_last_wellco_web"]
if has_url_feature:
    # Keep the original column order when the URL feature is available.
    web_cols.insert(2, optional_url_col)
web_uplift_df = train_features[web_cols].copy()

# 1. Uplift by wellco_web_visits_count (quartiles)
web_uplift_df["quartile_bin"] = pd.qcut(web_uplift_df["wellco_web_visits_count"], q=4, duplicates="drop")
uplift_by_groups(web_uplift_df, labels, "quartile_bin", sorted(web_uplift_df["quartile_bin"].dropna().unique()), title="Uplift by WellCo-relevant web visits (count)", xlabel="wellco_web_visits_count (quartile)")

# 2. Uplift by wellco_web_unique_urls (quartiles) — only when the column exists
if has_url_feature:
    web_uplift_df["quartile_bin"] = pd.qcut(web_uplift_df[optional_url_col], q=4, duplicates="drop")
    uplift_by_groups(web_uplift_df, labels, "quartile_bin", sorted(web_uplift_df["quartile_bin"].dropna().unique()), title="Uplift by WellCo-relevant unique URLs", xlabel="wellco_web_unique_urls (quartile)")
else:
    print(f"Skipping uplift plot for {optional_url_col}: column not present in train_features.")

# 3. Uplift by days_since_last_wellco_web (quartiles); exclude members with no relevant visits
valid_web = web_uplift_df.dropna(subset=["days_since_last_wellco_web"]).copy()
n_excluded_web = len(web_uplift_df) - len(valid_web)
valid_web["quartile_bin"] = pd.qcut(valid_web["days_since_last_wellco_web"], q=4, duplicates="drop")
uplift_by_groups(valid_web, labels, "quartile_bin", sorted(valid_web["quartile_bin"].dropna().unique()), title="Uplift by days since last WellCo-relevant web visit", xlabel="days_since_last_wellco_web (quartile)")
print(f"Excluded from plot (no relevant web visits): {n_excluded_web} members.")


**What we see in these uplift plots:** Each bar chart bins members by quartiles of one WellCo-relevant web feature and shows the **uplift** (churn rate difference: treated − control, as defined in section 3.7) in that bin. **Plot 1 (wellco_web_visits_count):** Uplift is small and slightly negative across quartiles; more visits do not show a clearly stronger or weaker outreach effect here. **Plot 2 (wellco_web_unique_urls):** Mirrors the visit-count plot (same redundancy as in the correlation heatmap); we kept this plot to show we tried the URL feature. **Plot 3 (days_since_last_wellco_web):** Members with no relevant visit are excluded; across quartiles uplift is again modest. Overall, these filtered web features show some variation in uplift by segment but no single strong moderator; they remain useful as inputs to the uplift model rather than as standalone targeting rules.

### 4.2 Embedding + relevance filter sanity test
The 26 unique (title, description) pairs in the web data split into two groups based on the WellCo brief (nutrition, exercise, sleep, stress, diabetes, hypertension, cardiometabolic health). If the test passes, the threshold is separating relevant from non-relevant correctly on these unseen examples.

In [None]:
# 4.2 Relevance filter sanity test — implemented in utils.run_relevance_filter_sanity_check;
# reuses the brief embedding and the embedding model loaded earlier.
run_relevance_filter_sanity_check(
    wellco_embedding,
    embed_model,
)

## 5. Model Selection — Uplift CV
Stratified K-fold CV; compare S/T/X-learner × LGBM/XGB with AUUC, Qini, uplift@k (utils: `_build_model`, metric helpers).


In [None]:
# Modelling feature set (7 columns) and CV settings.
FEATURE_COLS = [
    'wellco_web_visits_count',
    'days_since_last_wellco_web',
    'app_sessions_count',
    'icd_distinct_count',
    'has_focus_icd',
    'days_since_last_claim',
    'tenure_days',
]
N_SPLITS = 5
N_CURVE_POINTS = 100
# (display name, meta-learner key, base-learner key) for every candidate.
CANDIDATE_DEFS = [
    ('S+LGBM', 'S', 'LGBM'),
    ('S+XGB', 'S', 'XGB'),
    ('T+LGBM', 'T', 'LGBM'),
    ('T+XGB', 'T', 'XGB'),
    ('X+LGBM', 'X', 'LGBM'),
    ('X+XGB', 'X', 'XGB'),
]

X = train_features[FEATURE_COLS].copy()
y = train_features['churn'].astype(int).values
treatment = train_features['outreach'].astype(int).values
# Joint stratification key: 0..3 encodes each (treatment, churn) combination.
stratify_col = 2 * treatment + y
# Negative-to-positive ratio over the full training set (denominator floored at 1).
n_negative = (y == 0).sum()
n_positive = (y == 1).sum()
SCALE_POS_WEIGHT = n_negative / max(n_positive, 1)
print('X shape:', X.shape, '| Churn rate:', y.mean(), '| Treatment rate:', treatment.mean())


In [None]:
# Cross-validate every (meta-learner, base-learner) candidate and collect per-fold
# AUUC, Qini, uplift@10% / uplift@20%, uplift curves and segment shares.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
# Materialise the folds once so every candidate is scored on exactly the same
# train/validation splits, regardless of how RANDOM_STATE is specified.
folds = list(skf.split(X, stratify_col))

qini_available = qini_auc_score is not None
if not qini_available:
    # Make the degraded mode visible instead of silently reporting NaN everywhere.
    print('WARNING: qini_auc_score is unavailable (import failed in setup); Qini will be reported as NaN.')

cv_records, cv_curves, cv_segments = [], {}, {}
for name, meta_key, base_key in CANDIDATE_DEFS:
    cv_curves[name], cv_segments[name] = [], []
    for fold_i, (tr_idx, va_idx) in enumerate(folds, start=1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        t_tr, t_va = treatment[tr_idx], treatment[va_idx]
        # Class-imbalance weight computed on the training fold only (denominator floored at 1).
        spw = (y_tr == 0).sum() / max((y_tr == 1).sum(), 1)
        model = _build_model(meta_key, base_key, spw)
        model.fit(X_tr, t_tr, y_tr)
        # Flatten the predicted uplift to one score per validation member.
        tau = np.asarray(model.predict(X_va)).reshape(-1)
        ks, uvals = uplift_curve(y_va, t_va, tau, n_points=N_CURVE_POINTS)
        auuc_val = approx_auuc(ks, uvals)
        qini_val = float(qini_auc_score(y_va, tau, t_va)) if qini_available else np.nan
        u10, u20 = uplift_at_k(y_va, t_va, tau, 0.10), uplift_at_k(y_va, t_va, tau, 0.20)
        seg = assign_segments(tau)
        seg_share = pd.Series(seg).value_counts(normalize=True)
        cv_records.append({'model': name, 'fold': fold_i, 'auuc': auuc_val, 'qini': qini_val, 'uplift@10%': u10, 'uplift@20%': u20,
                          'persuadables_pct': seg_share.get('Persuadables', 0)})
        cv_curves[name].append((ks, uvals))
        cv_segments[name].append(seg_share)
        print(f'[{name}] Fold {fold_i}: AUUC={auuc_val:+.5f} Qini={qini_val:+.5f} u@10%={u10:+.4f} u@20%={u20:+.4f}')
print('CV complete.')


### 5.5 Results summary


In [None]:
# Aggregate the per-fold records into one row per model, best mean AUUC first.
cv_df = pd.DataFrame(cv_records)
metric_spec = {
    'auuc_mean': ('auuc', 'mean'),
    'auuc_std': ('auuc', 'std'),
    'qini_mean': ('qini', 'mean'),
    'u10_mean': ('uplift@10%', 'mean'),
    'u20_mean': ('uplift@20%', 'mean'),
}
summary = (
    cv_df
    .groupby('model')
    .agg(**metric_spec)
    .sort_values('auuc_mean', ascending=False)
)
print('CV results (mean ± std):')
display(summary.round(5))


### 5.6 Diagnostic plots


In [None]:
# Side-by-side bar charts of mean AUUC and mean Qini per model, with fold std as error bars.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
models = summary.index.tolist()
x = np.arange(len(models))
# Qini std is not part of `summary`; compute it in the same model order, 0 where undefined.
qini_std = cv_df.groupby('model')['qini'].std().reindex(models).fillna(0).values

panels = [
    (ax1, summary['auuc_mean'], summary['auuc_std'], 'AUUC', 'AUUC by model'),
    (ax2, summary['qini_mean'], qini_std, 'Qini', 'Qini by model'),
]
for ax, means, errors, ylabel, panel_title in panels:
    ax.bar(x, means, yerr=errors, capsize=5)
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=30, ha='right')
    ax.set_ylabel(ylabel)
    ax.set_title(panel_title)
    ax.axhline(0, color='grey', ls='--')  # zero reference line

plt.tight_layout()
plt.show()
