In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_recall_curve, confusion_matrix,
    precision_score, recall_score,
    average_precision_score, roc_auc_score
)

In [31]:
# Let pandas render up to 60 columns before it starts truncating wide frames.
pd.options.display.max_columns = 60

In [32]:
# Load the raw dataset; the path is relative, so "churn.csv" must sit in the
# kernel's working directory.
df = pd.read_csv("churn.csv")

In [33]:
# (rows, columns) of the loaded frame.
df.shape

(14000, 8)

In [34]:
# Dtypes and non-null counts per column; a count below the row total means
# that column contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tenure_months     13730 non-null  float64
 1   monthly_spend     13711 non-null  float64
 2   support_calls     13726 non-null  float64
 3   promo_eligible    13730 non-null  float64
 4   debt_ratio        13728 non-null  float64
 5   age               13710 non-null  float64
 6   is_international  13713 non-null  float64
 7   churn             14000 non-null  int64  
dtypes: float64(7), int64(1)
memory usage: 875.1 KB


In [35]:
# Preview the first 7 rows.
df.head(7)

Unnamed: 0,tenure_months,monthly_spend,support_calls,promo_eligible,debt_ratio,age,is_international,churn
0,32.0,68.231799,8.0,,0.290672,21.0,1.0,1
1,71.0,90.902968,0.0,0.0,0.165351,52.0,1.0,0
2,71.0,61.131705,6.0,1.0,0.353166,55.0,1.0,0
3,27.0,129.992275,2.0,,0.347002,65.0,0.0,0
4,68.0,109.181253,1.0,1.0,0.168505,46.0,1.0,0
5,59.0,,6.0,1.0,0.341591,57.0,1.0,0
6,46.0,145.048771,11.0,0.0,0.363906,35.0,1.0,1


In [36]:
# Summary statistics for the numeric columns, transposed so each column of
# the frame becomes one row of the table.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tenure_months,13730.0,35.73343,20.650465,0.0,18.0,36.0,54.0,71.0
monthly_spend,13711.0,85.284376,29.782121,10.0,65.187485,85.448481,105.423118,196.463627
support_calls,13726.0,5.488708,3.459574,0.0,3.0,5.0,9.0,11.0
promo_eligible,13730.0,0.493955,0.499982,0.0,0.0,0.0,1.0,1.0
debt_ratio,13728.0,0.285614,0.160425,0.000678,0.16086,0.262753,0.391601,0.889832
age,13710.0,49.147411,18.166699,18.0,33.25,49.0,65.0,80.0
is_international,13713.0,0.498432,0.500016,0.0,0.0,0.0,1.0,1.0
churn,14000.0,0.401214,0.490162,0.0,0.0,0.0,1.0,1.0


In [38]:
# Absolute number of rows per target class. These are raw counts
# (normalize=False), so the series is labelled "count" rather than "share".
df["churn"].value_counts(normalize=False).rename("count")

churn
0    8383
1    5617
Name: share, dtype: int64

In [39]:
# Relative frequency of each target class (fractions that sum to 1).
df["churn"].value_counts(normalize=True).rename("share")

churn
0    0.598786
1    0.401214
Name: share, dtype: float64

In [42]:
# Target vector first, then every remaining column as the feature matrix.
y = df["churn"]
X = df.loc[:, df.columns != "churn"]

In [43]:
# First cut: 80%-20% (train vs test)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20, stratify=y, random_state=13)

In [45]:
# Second cut: From the training set, make a validation set (60% train, 20% validation, 20% test)
X_tr, X_va, y_tr, y_va = train_test_split(X_tr, y_tr, test_size=0.25, stratify=y_tr, random_state=13)

In [46]:
# Class balance of the training split; the neg/pos ratio is passed to XGBoost
# as scale_pos_weight further down.
pos = int(y_tr.eq(1).sum())
neg = int(y_tr.eq(0).sum())
# Floor the denominator at 1 so a split with no positives cannot divide by zero.
spw = neg / (pos if pos > 0 else 1)

In [61]:
# Report the shape of each split and the class balance of the training split.
print("Train / Val / Test", *(part.shape for part in (X_tr, X_va, X_te)))
print(f"neg: {neg}, pos:{pos}, scale_positive_weight (spw):{round(spw, 1)}")

Train / Val / Test (6300, 7) (2100, 7) (2800, 7)
neg: 3772, pos:2528, scale_positive_weight (spw):1.5


In [63]:
feat_names = X_tr.columns.tolist()

# Wrap every split in a DMatrix, all sharing the same feature names.
dtr, dva, dte = (
    xgb.DMatrix(features, label=labels, feature_names=feat_names)
    for features, labels in ((X_tr, y_tr), (X_va, y_va), (X_te, y_te))
)

In [64]:
# Baseline booster configuration; scale_pos_weight uses the neg/pos ratio
# computed from the training split.
params = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "tree_method": "hist",
    "max_depth": 5,
    "eta": 0.03,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "scale_pos_weight": spw,
}

# Generous round budget: early stopping monitors the validation set and halts
# once 200 consecutive rounds bring no improvement.
bst = xgb.train(
    params,
    dtr,
    num_boost_round=4000,
    evals=[(dva, "val")],
    early_stopping_rounds=200,
    verbose_eval=False,
)

best_iter = bst.best_iteration
print("Best number of trees (early stopping chose this):", best_iter)

Best number of trees (early stopping chose this): 137


### Practical Decision Threshold on Validation

In [65]:
# Validation-set probabilities, using trees only up to the early-stopped iteration.
p_va = bst.predict(dva, iteration_range=(0, best_iter + 1))
pre, rec, thr = precision_recall_curve(y_va, p_va)
# F1 at every point of the PR curve; the denominator is floored to avoid 0/0.
f1 = (2 * pre * rec) / np.maximum(pre + rec, 1e-9)
# pre/rec have one more entry than thr (the final point has no threshold),
# so the last F1 value is dropped before locating the maximum.
t_best = float(thr[f1[:-1].argmax()])
print("Chosen decision threshold t_best (max F1 on validation):", round(t_best, 3))

Chosen decision threshold t_best (max F1 on validation): 0.483


## Confusion Matrix

In [66]:
# Score the test split and binarise with the threshold chosen on validation.
p_te = bst.predict(dte, iteration_range=(0, best_iter + 1))
pred = (p_te >= t_best).astype(int)

# Row-wise unpacking of the 2x2 matrix: first row (tn, fp), second row (fn, tp).
cm = confusion_matrix(y_te, pred)
(tn, fp), (fn, tp) = cm

print("Confusion matrix:\n", cm)
print("tn:", tn, "fp:", fp, "fn:", fn, "tp:", tp)
print("Precision:", round(precision_score(y_te, pred), 3))
print("Recall:", round(recall_score(y_te, pred), 3))
# The ranking metrics use the raw probabilities, not the thresholded labels.
print(
    "ROC AUC:", round(roc_auc_score(y_te, p_te), 3),
    "| PR AUC:", round(average_precision_score(y_te, p_te), 3),
)

Confusion matrix:
 [[1303  374]
 [ 239  884]]
tn: 1303 fp: 374 fn: 239 tp: 884
Precision: 0.703
Recall: 0.787
ROC AUC: 0.86 | PR AUC: 0.799


In [67]:
# Gain-based importance scores reported by the booster, largest first.
imp = (
    pd.Series(bst.get_score(importance_type="gain"))
    .sort_values(ascending=False)
)

In [68]:
# Ten highest-gain features as a one-column table.
imp.iloc[:10].to_frame(name="gain")

Unnamed: 0,gain
support_calls,32.823067
promo_eligible,8.211391
age,6.784165
is_international,5.141154
debt_ratio,4.812185
tenure_months,4.321021
monthly_spend,3.857061


In [70]:
# Sweep the highest-gain feature across its 5th-95th percentile range while
# every other feature is pinned to its test-set median.
top_feat = imp.index[0]
vals = X_te[top_feat].dropna()
grid = np.linspace(*vals.quantile([0.05, 0.95]), num=12)

# One identical median row per grid point; only the swept column is overwritten.
Xg = pd.DataFrame(
    np.tile(X_te.median(numeric_only=True).values, (len(grid), 1)),
    columns=X_te.columns,
)
Xg[top_feat] = grid

pred = bst.predict(
    xgb.DMatrix(Xg, feature_names=Xg.columns.tolist()),
    iteration_range=(0, bst.best_iteration + 1),
)

pd.DataFrame({top_feat: grid, "predicted_churn": pred})

Unnamed: 0,support_calls,predicted_churn
0,0.0,0.071968
1,1.0,0.085467
2,2.0,0.150604
3,3.0,0.197384
4,4.0,0.222535
5,5.0,0.299495
6,6.0,0.329948
7,7.0,0.5021
8,8.0,0.56758
9,9.0,0.68819


In [72]:
# Sign of the average step between consecutive sweep predictions:
# +1 -> increasing, -1 -> decreasing, anything else (0 or NaN) -> flat/mixed.
trend = np.sign(np.nanmean(np.diff(pred)))
print(
    f"Trend hint for {top_feat}:",
    {1.0: "increasing", -1.0: "decreasing"}.get(float(trend), "flat/mixed"),
)

Trend hint for support_calls: increasing


In [74]:
# Persist the baseline booster to disk (written into the working directory).
bst.save_model("easy_xgb_base.ubj")

In [76]:
feat_names = list(X_tr.columns)
# One constraint per feature, in column order:
# +1 = non-decreasing effect, -1 = non-increasing effect, 0 = unconstrained.
cons = [{"debt_ratio": 1, "tenure_months": -1}.get(name, 0) for name in feat_names]
mono = f"({','.join(str(c) for c in cons)})"
print("Monotone constraints:", mono)

# Same configuration as the baseline, plus the constraints and a larger max_bin.
params_cons = {**params, "monotone_constraints": mono, "max_bin": 512}

bst_cons = xgb.train(
    params_cons,
    dtr,
    num_boost_round=4000,
    evals=[(dva, "val")],
    early_stopping_rounds=200,
    verbose_eval=False,
)
print("Constrained model best trees:", bst_cons.best_iteration)

Monotone constraints: (-1,0,0,0,1,0,0)
Constrained model best trees: 158


In [77]:
# Test-set probabilities from both boosters, each cut at its own early-stopped iteration.
p_base, p_cons = (
    model.predict(dte, iteration_range=(0, model.best_iteration + 1))
    for model in (bst, bst_cons)
)

# Side-by-side ranking metrics: baseline vs monotone-constrained model.
print("PR AUC  baseline     :", round(average_precision_score(y_te, p_base), 3))
print("PR AUC  constrained  :", round(average_precision_score(y_te, p_cons), 3))
print("ROC AUC baseline     :", round(roc_auc_score(y_te, p_base), 3))
print("ROC AUC constrained  :", round(roc_auc_score(y_te, p_cons), 3))


PR AUC  baseline     : 0.799
PR AUC  constrained  : 0.801
ROC AUC baseline     : 0.86
ROC AUC constrained  : 0.861


In [78]:
def rule_check(booster, feature, direction, df_like, n_points=25):
    """Count monotonicity violations of a booster along one feature.

    Builds ``n_points`` synthetic rows in which every column is pinned to its
    median in ``df_like`` and only ``feature`` sweeps an evenly spaced grid
    from its 5th to its 95th percentile. It then counts the adjacent prediction
    steps that move against ``direction``. Only this single median slice is
    probed, so zero violations here is a spot check, not a proof.

    Parameters
    ----------
    booster : object
        Trained model exposing ``predict(DMatrix, iteration_range=...)``. If it
        has a ``best_iteration`` (set by early stopping), prediction uses trees
        up to and including that iteration; otherwise all trees are used.
    feature : str
        Column of ``df_like`` to sweep.
    direction : {"nondecreasing", "nonincreasing"}
        Expected direction of the effect.
    df_like : pandas.DataFrame
        Reference frame that supplies the medians, the sweep range, and the
        feature names passed to the DMatrix.
    n_points : int, default 25
        Number of grid points.

    Returns
    -------
    tuple
        ``(violations, grid, sample)`` where ``violations`` is an int, ``grid``
        is the swept values, and ``sample`` is a DataFrame with the first six
        grid points and their predictions. Returns ``(None, None, None)`` when
        ``feature`` is not a column of ``df_like``.

    Raises
    ------
    ValueError
        If ``direction`` is not one of the two accepted strings.
    """
    # Reject typos such as "non-decreasing" up front. Without this check any
    # unknown string would silently be tested as "nonincreasing".
    if direction not in ("nondecreasing", "nonincreasing"):
        raise ValueError(
            f"direction must be 'nondecreasing' or 'nonincreasing', got {direction!r}"
        )
    if feature not in df_like.columns:
        return None, None, None

    vals = df_like[feature].dropna()
    grid = np.linspace(vals.quantile(0.05), vals.quantile(0.95), n_points)
    # One identical median row per grid point; only the swept column varies.
    Xg = pd.DataFrame([df_like.median(numeric_only=True).values] * n_points, columns=df_like.columns)
    Xg[feature] = grid

    # Without an early-stopping result, use every tree: (0, 0) is XGBoost's
    # "all trees" range. A fallback of (0, 1) would score with the first tree only.
    best = getattr(booster, "best_iteration", None)
    iteration_range = (0, best + 1) if best is not None else (0, 0)
    pred = booster.predict(xgb.DMatrix(Xg, feature_names=list(df_like.columns)),
                           iteration_range=iteration_range)

    # The small tolerance keeps floating-point noise from counting as a violation.
    diffs = np.diff(pred)
    if direction == "nondecreasing":
        violations = int((diffs < -1e-9).sum())
    else:  # nonincreasing
        violations = int((diffs > 1e-9).sum())
    sample = pd.DataFrame({feature: grid[:6], "predicted_churn": pred[:6]})
    return violations, grid, sample

# Probe both constrained features of the constrained model on the test split.
viol_dr, grid_dr, sample_dr = rule_check(bst_cons, "debt_ratio", "nondecreasing", X_te)
viol_tn, grid_tn, sample_tn = rule_check(bst_cons, "tenure_months", "nonincreasing", X_te)

print("Debt ratio monotonic violations (should be 0):", "feature missing" if viol_dr is None else viol_dr)
print("Tenure months monotonic violations (should be 0):", "feature missing" if viol_tn is None else viol_tn)

# Small preview tables for eyeballing the trend; skipped when the feature was absent.
for header, preview in (
    ("\ndebt_ratio preview (first 6 rows):", sample_dr),
    ("\ntenure_months preview (first 6 rows):", sample_tn),
):
    if preview is not None:
        print(header)
        display(preview)

Debt ratio monotonic violations (should be 0): 0
Tenure months monotonic violations (should be 0): 0

debt_ratio preview (first 6 rows):


Unnamed: 0,debt_ratio,predicted_churn
0,0.059572,0.20693
1,0.081137,0.248736
2,0.102701,0.277456
3,0.124266,0.281545
4,0.14583,0.293937
5,0.167394,0.293937



tenure_months preview (first 6 rows):


Unnamed: 0,tenure_months,predicted_churn
0,3.0,0.429317
1,5.708333,0.429317
2,8.416667,0.420339
3,11.125,0.413353
4,13.833333,0.410043
5,16.541667,0.377445


In [79]:
# Persist the monotone-constrained booster to disk (written into the working directory).
bst_cons.save_model("easy_xgb_cons.ubj")