# Uplift Modeling for Churn Prediction

Short notebook: same workflow as the full notebook, using **utils** for all logic.
Run cells in order. Data paths: `train/` and `test/` under the project root, which must be the working directory the notebook is run from.

## 1. Setup


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold
try:
    from causalml.metrics import qini_auc_score
except Exception:
    qini_auc_score = None

from utils import (
    print_table_overview, count_events_before_signup, time_bin, compute_uplift, plot_uplift_bars,
    build_recency_tenure, load_wellco_brief, ref_date_from_tables, embed_wellco_brief, embed_visit_texts,
    filter_wellco_relevant_visits, agg_web_features, agg_app_features, agg_claims_features,
    agg_lifecycle_tenure, build_feature_matrix, make_lgbm, make_xgb,
    uplift_at_k, uplift_curve, approx_auuc, assign_segments, _build_model,
    DOW_NAMES, SIMILARITY_THRESHOLD, EMBED_MODEL_NAME, FOCUS_ICD_CODES, RANDOM_STATE,
)

# Let pandas print up to 200 columns so wide feature tables are shown in full.
pd.set_option('display.max_columns', 200)

# Every data location is derived from the directory the notebook runs in.
BASE_DIR = Path('.').resolve()
TRAIN_DIR = BASE_DIR.joinpath('train')
TEST_DIR = BASE_DIR.joinpath('test')


## 2. Load data
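Train/test CSVs; train events are restricted to the observation window that starts July 1, 2025 and has an exclusive upper bound of July 15, 2025 (events dated July 15 itself are dropped). Test tables are not filtered.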


In [None]:
def _read_dated_csv(folder, filename, date_col):
    """Load `folder / filename`, parsing `date_col` as datetimes."""
    return pd.read_csv(folder / filename, parse_dates=[date_col])


def _clip_to_window(frame, date_col):
    """Keep rows with OBS_START <= frame[date_col] < OBS_END (upper bound exclusive)."""
    stamps = frame[date_col]
    return frame[stamps.ge(OBS_START) & stamps.lt(OBS_END)]


# Training tables.
churn_labels = _read_dated_csv(TRAIN_DIR, 'churn_labels.csv', 'signup_date')
app_usage = _read_dated_csv(TRAIN_DIR, 'app_usage.csv', 'timestamp')
web_visits = _read_dated_csv(TRAIN_DIR, 'web_visits.csv', 'timestamp')
claims = _read_dated_csv(TRAIN_DIR, 'claims.csv', 'diagnosis_date')

# Test tables (same layout, `test_` prefix; members file instead of labels).
test_members = _read_dated_csv(TEST_DIR, 'test_members.csv', 'signup_date')
test_app_usage = _read_dated_csv(TEST_DIR, 'test_app_usage.csv', 'timestamp')
test_web_visits = _read_dated_csv(TEST_DIR, 'test_web_visits.csv', 'timestamp')
test_claims = _read_dated_csv(TEST_DIR, 'test_claims.csv', 'diagnosis_date')

# Observation window applied to the *training* event tables only; the test
# tables are left exactly as loaded.
# NOTE(review): the upper bound is exclusive, so anything stamped on
# 2025-07-15 is dropped -- confirm this matches the intended "July 1-15" window.
OBS_START = pd.Timestamp('2025-07-01')
OBS_END = pd.Timestamp('2025-07-15')
web_visits = _clip_to_window(web_visits, 'timestamp')
app_usage = _clip_to_window(app_usage, 'timestamp')
claims = _clip_to_window(claims, 'diagnosis_date')

# Quick shape report for all eight tables (train tables are post-filter).
loaded_tables = (
    ('churn_labels', churn_labels),
    ('app_usage', app_usage),
    ('web_visits', web_visits),
    ('claims', claims),
    ('test_members', test_members),
    ('test_app_usage', test_app_usage),
    ('test_web_visits', test_web_visits),
    ('test_claims', test_claims),
)
for name, df in loaded_tables:
    print(f'{name}: {df.shape}')


## 3. EDA
### 3.1 Raw data overview
Structure, dtypes, and sample for all 8 tables (utils: `print_table_overview`).


In [None]:
# Registry of every loaded table, keyed by its display name; train tables
# first, then their test counterparts. Reused by later EDA cells.
_table_names = [
    'churn_labels', 'app_usage', 'web_visits', 'claims',
    'test_members', 'test_app_usage', 'test_web_visits', 'test_claims',
]
_table_frames = [
    churn_labels, app_usage, web_visits, claims,
    test_members, test_app_usage, test_web_visits, test_claims,
]
all_tables = dict(zip(_table_names, _table_frames))

# Print the overview produced by the utils helper for each table in turn.
for name, df in all_tables.items():
    print_table_overview(name, df)


### 3.2 Column-specific checks
Inspect `event_type`, `url`, `title`, and `icd_code` to inform feature-engineering decisions.


In [None]:
# Frequency of each app event type.
event_type_counts = app_usage['event_type'].value_counts()
print('app_usage event_type:', event_type_counts.to_string())

# Cardinality of the two free-text web columns.
n_urls = web_visits['url'].nunique()
n_titles = web_visits['title'].nunique()
print('web_visits url unique:', n_urls, '| title unique:', n_titles)

# Cardinality of ICD codes, followed by the ten most frequent ones.
icd_codes = claims['icd_code']
print('claims icd_code unique:', icd_codes.nunique())
print(icd_codes.value_counts().head(10).to_string())


### 3.3 Missing values and member coverage


In [None]:
# One record per (table, column) pair that contains at least one null.
null_rows = [
    {'table': name, 'column': col, 'null_count': cnt}
    for name, df in all_tables.items()
    for col, cnt in df.isnull().sum().items()
    if cnt > 0
]
null_report = pd.DataFrame(null_rows).to_string(index=False) if null_rows else 'None'
print('Column nulls:', null_report)

# Member coverage: how many labelled members appear in each event source.
base_ids = set(churn_labels['member_id'])
n_base = len(base_ids)
coverage_sources = (('web_visits', web_visits), ('app_usage', app_usage), ('claims', claims))
for src_name, src_df in coverage_sources:
    present = base_ids.intersection(src_df['member_id'])
    n_present = len(present)
    n_absent = n_base - n_present
    print(f'{src_name}: {n_present} present, {n_absent} absent ({100*n_absent/n_base:.2f}%)')


### 3.4 Missingness mechanism (Chi-square) and 3.5 Labels & treatment balance


In [None]:
# One row per training member: label, treatment flag, and a 0/1 indicator for
# whether the member has any row in each event source.
train_ids = churn_labels[['member_id', 'churn', 'outreach']].copy()
presence_sources = {'has_web': web_visits, 'has_app': app_usage, 'has_claims': claims}
for source, src_events in presence_sources.items():
    train_ids[source] = train_ids['member_id'].isin(src_events['member_id']).astype(int)

# Chi-square test of independence between each presence flag and each of
# churn / outreach.
for source in presence_sources:
    for target in ('churn', 'outreach'):
        ct = pd.crosstab(train_ids[source], train_ids[target])
        test_result = chi2_contingency(ct)
        chi2, p = test_result[0], test_result[1]
        print(f'{source} vs {target}: chi2={chi2:.2f} p={p:.4g}')

# Label / treatment balance, overall and split by outreach arm.
churn_rate = churn_labels['churn'].mean()
outreach_rate = churn_labels['outreach'].mean()
print('Churn rate:', churn_rate, '| Outreach rate:', outreach_rate)
churn_by_arm = churn_labels.groupby('outreach')['churn'].agg(['count', 'mean'])
print(churn_by_arm.to_string())


### 3.6 Leakage & time-window validation (utils: `count_events_before_signup`)


In [None]:
# (display name, table, date column) for each training event source.
event_sources = [
    ('web_visits', web_visits, 'timestamp'),
    ('app_usage', app_usage, 'timestamp'),
    ('claims', claims, 'diagnosis_date'),
]

# Earliest and latest event date actually present in each source.
window_summary = pd.DataFrame([
    {'table': label, 'min': frame[date_col].min(), 'max': frame[date_col].max()}
    for label, frame, date_col in event_sources
])

# Per-source result of the utils leakage check against member signup dates.
leakage = pd.DataFrame([
    {'table': label,
     'events_before_signup': count_events_before_signup(frame, date_col, churn_labels)}
    for label, frame, date_col in event_sources
])

display(window_summary)
display(leakage)


### 3.7 Temporal & uplift helpers
Prepare events; define `labels` for uplift; utils: `time_bin`, `compute_uplift`, `plot_uplift_bars`.


In [None]:
def _with_hour_and_dow(frame):
    """Return a copy of member_id/timestamp with derived `hour` and `dow` columns."""
    out = frame[['member_id', 'timestamp']].copy()
    out['hour'] = out['timestamp'].dt.hour
    out['dow'] = out['timestamp'].dt.dayofweek
    return out


web_ev = _with_hour_and_dow(web_visits)
app_ev = _with_hour_and_dow(app_usage)

# Stack web and app events into one table; the raw timestamp is not carried over.
event_cols = ['member_id', 'hour', 'dow']
events = pd.concat([web_ev[event_cols], app_ev[event_cols]], ignore_index=True)
events = events.assign(
    time_of_day=events['hour'].apply(time_bin),
    dow_name=events['dow'].map(DOW_NAMES),
    # pandas numbers days Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    is_weekend=events['dow'].isin([5, 6]),
)

# Label/treatment columns used by the uplift helpers in the following cells.
labels = churn_labels[['member_id', 'churn', 'outreach']]
print('Events:', len(events), 'rows,', events['member_id'].nunique(), 'members')


### 3.8 Uplift by time of day and day of week


In [None]:
def _uplift_per_level(column, levels):
    """For each level, take element 0 of `compute_uplift` over members with an event at that level."""
    values = []
    for level in levels:
        active_members = events.loc[events[column] == level, 'member_id'].unique()
        values.append(compute_uplift(labels, active_members)[0])
    return values


# Time-of-day buckets, in display order.
tod_order = ['Early Morning', 'Morning', 'Afternoon', 'Evening']
tod_uplift = _uplift_per_level('time_of_day', tod_order)
plot_uplift_bars(tod_order, tod_uplift, 'Uplift by time of day', 'Time of day')

# Days of the week, in display order.
dow_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
dow_uplift = _uplift_per_level('dow_name', dow_order)
plot_uplift_bars(dow_order, dow_uplift, 'Uplift by day of week', 'Day of week')


### 3.9 Recency & tenure (utils: `build_recency_tenure`)


In [None]:
# Recency/tenure table (joined below on its index = member_id) plus the
# reference date returned by the utils helper.
recency_df, ref_date = build_recency_tenure(churn_labels, web_visits, app_usage, claims)

# Attach label and treatment to each member's recency/tenure row (inner join).
member_outcomes = churn_labels[['member_id', 'churn', 'outreach']]
rec = pd.merge(member_outcomes, recency_df, left_on='member_id', right_index=True)

print('Ref date:', ref_date)
recency_cols = ['days_since_last_web', 'days_since_last_app', 'days_since_last_activity', 'tenure_days']
recency_stats = rec[recency_cols].describe().round(1)
print(recency_stats.to_string())


## 4. Feature Engineering
Config; load WellCo brief and embedding model once; then build train/test feature matrices (utils).


In [None]:
# Load the client brief text and the sentence-embedding model once; both the
# model and the brief's embedding are reused when building feature matrices.
WELLCO_BRIEF_PATH = BASE_DIR.joinpath('wellco_client_brief.txt')
brief_text = load_wellco_brief(WELLCO_BRIEF_PATH)

embed_model = SentenceTransformer(EMBED_MODEL_NAME)
wellco_embedding = embed_wellco_brief(brief_text, embed_model)

brief_len = len(brief_text)
print('WellCo brief chars:', brief_len, '| Embedding shape:', wellco_embedding.shape)


In [None]:
# Each split gets its own reference date, derived from that split's event tables.
ref_date_train = ref_date_from_tables(web_visits, app_usage, claims)
ref_date_test = ref_date_from_tables(test_web_visits, test_app_usage, test_claims)
print('ref_date_train:', ref_date_train, '| ref_date_test:', ref_date_test)

# The brief embedding and the embedding model are shared by both builds.
embedding_kwargs = {'wellco_embedding': wellco_embedding, 'embed_model': embed_model}

print('Building TRAIN feature matrix...')
train_features = build_feature_matrix(
    churn_labels, web_visits, app_usage, claims, ref_date_train,
    include_labels=True, **embedding_kwargs,
)

print('Building TEST feature matrix...')
test_features = build_feature_matrix(
    test_members, test_web_visits, test_app_usage, test_claims, ref_date_test,
    include_labels=False, **embedding_kwargs,
)

print('Train shape:', train_features.shape, '| Test shape:', test_features.shape)
train_columns = list(train_features.columns)
print('Columns:', train_columns)


## 5. Model Selection — Uplift CV
Stratified K-fold CV; compare S/T/X-learner × LGBM/XGB with AUUC, Qini, uplift@k (utils: `_build_model`, metric helpers).


In [None]:
# Model inputs: the feature-matrix columns used for uplift modelling.
FEATURE_COLS = [
    'wellco_web_visits_count',
    'days_since_last_wellco_web',
    'app_sessions_count',
    'icd_distinct_count',
    'has_focus_icd',
    'days_since_last_claim',
    'tenure_days',
]
N_SPLITS = 5
N_CURVE_POINTS = 100

# Candidate grid as (display name, meta-learner key, base-learner key):
# every meta-learner paired with every base learner, meta-learner varying slowest.
CANDIDATE_DEFS = [
    (f'{meta}+{base}', meta, base)
    for meta in ('S', 'T', 'X')
    for base in ('LGBM', 'XGB')
]

X = train_features[FEATURE_COLS].copy()
y = train_features['churn'].astype(int).values
treatment = train_features['outreach'].astype(int).values

# Joint stratification key; distinguishes the four (treatment, churn)
# combinations assuming both columns are 0/1.
stratify_col = 2 * treatment + y

# Negative-to-positive class ratio on the full training set (denominator
# floored at 1 so an all-negative label vector does not divide by zero).
_n_negative, _n_positive = (y == 0).sum(), (y == 1).sum()
SCALE_POS_WEIGHT = _n_negative / max(_n_positive, 1)
print('X shape:', X.shape, '| Churn rate:', y.mean(), '| Treatment rate:', treatment.mean())


In [None]:
# Cross-validated comparison of every candidate in CANDIDATE_DEFS.
# Outputs (all keyed by candidate display name where applicable):
#   cv_records  - list of per-(model, fold) metric dicts
#   cv_curves   - {model: [(ks, uplift_values), ...]} one uplift curve per fold
#   cv_segments - {model: [segment-share Series, ...]} one per fold
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Materialise the folds once. Re-calling `skf.split` per candidate only yields
# identical folds when RANDOM_STATE is an int; with None or a RandomState
# instance each candidate would be scored on different splits, making the
# comparison unfair. Reusing one list guarantees identical folds and avoids
# redundant re-splitting.
folds = list(skf.split(X, stratify_col))

if qini_auc_score is None:
    # The optional import in the setup cell fell back to None; surface that
    # instead of silently filling the Qini column with NaN.
    # NOTE(review): verify that `causalml.metrics` really exports
    # `qini_auc_score` -- the (y_true, uplift, treatment) call shape used below
    # matches scikit-uplift's function of that name.
    print('qini_auc_score unavailable: Qini will be reported as NaN.')

cv_records, cv_curves, cv_segments = [], {}, {}
for name, meta_key, base_key in CANDIDATE_DEFS:
    cv_curves[name], cv_segments[name] = [], []
    for fold_i, (tr_idx, va_idx) in enumerate(folds, start=1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        t_tr, t_va = treatment[tr_idx], treatment[va_idx]

        # Class-imbalance weight from the training fold only, so no
        # validation information reaches the model.
        spw = (y_tr == 0).sum() / max((y_tr == 1).sum(), 1)
        model = _build_model(meta_key, base_key, spw)
        model.fit(X_tr, t_tr, y_tr)

        # Predicted per-member uplift on the held-out fold, flattened to 1-D.
        tau = np.asarray(model.predict(X_va)).reshape(-1)

        ks, uvals = uplift_curve(y_va, t_va, tau, n_points=N_CURVE_POINTS)
        auuc_val = approx_auuc(ks, uvals)
        qini_val = float(qini_auc_score(y_va, tau, t_va)) if qini_auc_score is not None else np.nan
        u10, u20 = uplift_at_k(y_va, t_va, tau, 0.10), uplift_at_k(y_va, t_va, tau, 0.20)

        # Share of validation members falling into each predicted segment.
        seg = assign_segments(tau)
        seg_share = pd.Series(seg).value_counts(normalize=True)

        cv_records.append({'model': name, 'fold': fold_i, 'auuc': auuc_val, 'qini': qini_val, 'uplift@10%': u10, 'uplift@20%': u20,
                          'persuadables_pct': seg_share.get('Persuadables', 0)})
        cv_curves[name].append((ks, uvals))
        cv_segments[name].append(seg_share)
        print(f'[{name}] Fold {fold_i}: AUUC={auuc_val:+.5f} Qini={qini_val:+.5f} u@10%={u10:+.4f} u@20%={u20:+.4f}')
print('CV complete.')


### 5.5 Results summary


In [None]:
# Collapse per-fold records into one row per model, best mean AUUC first.
cv_df = pd.DataFrame(cv_records)

# output column -> (source column, aggregation)
summary_spec = {
    'auuc_mean': ('auuc', 'mean'),
    'auuc_std': ('auuc', 'std'),
    'qini_mean': ('qini', 'mean'),
    'u10_mean': ('uplift@10%', 'mean'),
    'u20_mean': ('uplift@20%', 'mean'),
}
per_model = cv_df.groupby('model').agg(**summary_spec)
summary = per_model.sort_values('auuc_mean', ascending=False)

print('CV results (mean ± std):')
display(summary.round(5))


### 5.6 Diagnostic plots


In [None]:
# Side-by-side bar charts of mean AUUC and mean Qini per model, with the
# across-fold standard deviation as error bars. Models keep the order of
# `summary` (sorted by mean AUUC).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
models = summary.index.tolist()
x = np.arange(len(models))

# `summary` carries no Qini std column, so compute it here from the per-fold
# records, aligned to the plotted model order; missing values become 0.
qini_std = cv_df.groupby('model')['qini'].std().reindex(models).fillna(0).values

panels = (
    (ax1, 'AUUC', summary['auuc_mean'], summary['auuc_std']),
    (ax2, 'Qini', summary['qini_mean'], qini_std),
)
for ax, metric, heights, errors in panels:
    ax.bar(x, heights, yerr=errors, capsize=5)
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=30, ha='right')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} by model')
    ax.axhline(0, color='grey', ls='--')  # zero reference line

plt.tight_layout()
plt.show()
