In [6]:
# assumes `df` is already loaded (from your notebook)
assert 'churn_flag' in df.columns, "Create churn_flag first (Step 1)."

# Optional: remove identifier column(s)
if 'customerID' in df.columns:
    df = df.drop(columns=['customerID'])

# Check class balance
print(df['churn_flag'].value_counts())
print(df['churn_flag'].value_counts(normalize=True))


churn_flag
0    1879
1    1331
Name: count, dtype: int64
churn_flag
0    0.585358
1    0.414642
Name: proportion, dtype: float64


Create Train / Validation / Test Splits

We will use 70% train, 15% validation, 15% test, with stratify=df['churn_flag'] so the churn ratio stays the same in each split.

In [7]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

# 1) Train (70%) and Temp (30%)
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df['churn_flag'],
    random_state=RANDOM_STATE
)

# 2) Validation (15%) and Test (15%) from Temp
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df['churn_flag'],
    random_state=RANDOM_STATE
)

print("Train size:", train_df.shape)
print("Validation size:", val_df.shape)
print("Test size:", test_df.shape)


Train size: (2247, 31)
Validation size: (481, 31)
Test size: (482, 31)


Verify Class Ratios in Each Split

Purpose:
To check whether stratification worked correctly and the churn percentage is similar in Train, Validation, and Test.

In [8]:
for name, d in [("Train", train_df), ("Validation", val_df), ("Test", test_df)]:
    print("\n", name)
    print("Counts:")
    print(d['churn_flag'].value_counts())
    print("Proportions:")
    print((d['churn_flag'].value_counts(normalize=True) * 100).round(2))



 Train
Counts:
churn_flag
0    1315
1     932
Name: count, dtype: int64
Proportions:
churn_flag
0    58.52
1    41.48
Name: proportion, dtype: float64

 Validation
Counts:
churn_flag
0    282
1    199
Name: count, dtype: int64
Proportions:
churn_flag
0    58.63
1    41.37
Name: proportion, dtype: float64

 Test
Counts:
churn_flag
0    282
1    200
Name: count, dtype: int64
Proportions:
churn_flag
0    58.51
1    41.49
Name: proportion, dtype: float64


Save the Train / Validation / Test Splits

Purpose:
To ensure reproducibility so that the same data is always used for model training, tuning, and final evaluation.
Saving the files also lets you reload them in later notebooks (EDA, modeling, evaluation).

In [9]:
train_df.to_csv("train_telco.csv", index=False)
val_df.to_csv("val_telco.csv", index=False)
test_df.to_csv("test_telco.csv", index=False)

print("Files saved successfully.")


Files saved successfully.


Basic Leakage Checks

Purpose:
To ensure no information from the future or the target leaks into model inputs, which would artificially inflate model performance and make the model useless in real deployment.

In [10]:
for name, subset in [('Train', train_df), ('Validation', val_df), ('Test', test_df)]:
    print(name, subset['churn_flag'].isnull().sum())


Train 0
Validation 0
Test 0


In [11]:
train_df.columns.tolist()


['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn',
 'churn_flag',
 'monthly_high_flag',
 'tenure_group',
 'internet_flag',
 'total_services',
 'many_services_flag',
 'paperless_flag',
 'payment_echeck',
 'payment_auto',
 'contract_short',
 'internet_short']

Identify Numerical Columns



In [12]:
num_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_cols


['SeniorCitizen',
 'tenure',
 'MonthlyCharges',
 'TotalCharges',
 'churn_flag',
 'monthly_high_flag',
 'internet_flag',
 'total_services',
 'many_services_flag',
 'paperless_flag',
 'payment_echeck',
 'payment_auto']

In [13]:
cat_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
cat_cols


['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn',
 'tenure_group',
 'contract_short',
 'internet_short']

In [14]:
train_df[num_cols].isnull().sum(), train_df[cat_cols].isnull().sum()


(SeniorCitizen         0
 tenure                0
 MonthlyCharges        0
 TotalCharges          0
 churn_flag            0
 monthly_high_flag     0
 internet_flag         0
 total_services        0
 many_services_flag    0
 paperless_flag        0
 payment_echeck        0
 payment_auto          0
 dtype: int64,
 gender              0
 Partner             0
 Dependents          0
 PhoneService        0
 MultipleLines       0
 InternetService     0
 OnlineSecurity      0
 OnlineBackup        0
 DeviceProtection    0
 TechSupport         0
 StreamingTV         0
 StreamingMovies     0
 Contract            0
 PaperlessBilling    0
 PaymentMethod       0
 Churn               0
 tenure_group        0
 contract_short      0
 internet_short      0
 dtype: int64)

Encoding Categorical Variables

In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-Hot Encoding for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
    remainder='passthrough'  # keep numerical columns as they are
)


In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Updated preprocessing: scale numerical + encode categorical
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)


Build the full modeling pipeline (preprocessor + model)

In [19]:
!pip install xgboost


Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   - -------------------------------------- 1.8/72.0 MB 9.4 MB/s eta 0:00:08
   - -------------------------------------- 2.9/72.0 MB 7.8 MB/s eta 0:00:09
   -- ------------------------------------- 4.5/72.0 MB 7.0 MB/s eta 0:00:10
   --- ------------------------------------ 5.8/72.0 MB 6.9 MB/s eta 0:00:10
   --- ------------------------------------ 7.1/72.0 MB 6.8 MB/s eta 0:00:10
   ---- ----------------------------------- 8.4/72.0 MB 6.7 MB/s eta 0:00:10
   ----- ---------------------------------- 9.7/72.0 MB 6.6 MB/s eta 0:00:10
   ------ --------------------------------- 11.0/72.0 MB 6.4 MB/s eta 0:00:10
   ------ --------------------------------- 12.3/72.0 MB 6.5 MB/s eta 0:00:10
   -------


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# full pipeline + training (run this cell)
import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier   # install xgboost if not present

# features / target
FEATURE_COLS = num_cols + cat_cols
TARGET = 'churn_flag'

X_train = train_df[FEATURE_COLS]
y_train = train_df[TARGET]
X_val = val_df[FEATURE_COLS]
y_val = val_df[TARGET]

# Define model candidates
models = {
    'logreg': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'rf': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    'xgb': XGBClassifier(n_estimators=200, use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}

for name, model in models.items():
    pipe = Pipeline([
        ('preproc', preprocessor),    # preprocessor defined earlier (scaling + OHE)
        ('clf', model)
    ])

    print(f"\nTraining: {name}")
    pipe.fit(X_train, y_train)

    # Predictions & scores on validation set
    y_pred = pipe.predict(X_val)
    y_prob = pipe.predict_proba(X_val)[:, 1] if hasattr(pipe, "predict_proba") else None

    print("Classification report:")
    print(classification_report(y_val, y_pred, digits=4))
    if y_prob is not None:
        print("ROC AUC:", round(roc_auc_score(y_val, y_prob), 4))

    cm = confusion_matrix(y_val, y_pred)
    print("Confusion matrix:\n", cm)

    # save pipeline and store metrics
    joblib.dump(pipe, f"{name}_pipeline.joblib")
    results[name] = {
        'pipeline_file': f"{name}_pipeline.joblib",
        'clf': model
    }

print("\nDone. Pipelines saved: ", [v['pipeline_file'] for v in results.values()])



Training: logreg
Classification report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       282
           1     1.0000    1.0000    1.0000       199

    accuracy                         1.0000       481
   macro avg     1.0000    1.0000    1.0000       481
weighted avg     1.0000    1.0000    1.0000       481

ROC AUC: 1.0
Confusion matrix:
 [[282   0]
 [  0 199]]

Training: rf
Classification report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       282
           1     1.0000    1.0000    1.0000       199

    accuracy                         1.0000       481
   macro avg     1.0000    1.0000    1.0000       481
weighted avg     1.0000    1.0000    1.0000       481

ROC AUC: 1.0
Confusion matrix:
 [[282   0]
 [  0 199]]

Training: xgb
Classification report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       282
           

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [21]:
import numpy as np
import pandas as pd

FEATURE_COLS = num_cols + cat_cols   # your feature lists
TARGET = 'churn_flag'

X_val = val_df[FEATURE_COLS].copy()
y_val = val_df[TARGET].copy()

print("1) Basic shapes")
print(" X_val shape:", X_val.shape)
print(" y_val shape:", y_val.shape)
print()

# 2) Is target accidentally included in features?
print("2) Is 'churn_flag' in FEATURE_COLS?", 'churn_flag' in FEATURE_COLS)
print()

# 3) Any feature name equal to target or duplicates?
print("3) Any feature exactly named 'churn_flag' in X_val.columns?", 'churn_flag' in X_val.columns)
print()

# 4) Any feature identical to the target (exact match)?
identical = []
for c in FEATURE_COLS:
    try:
        if X_val[c].dtype.kind in 'bifc':  # numeric-like
            if X_val[c].equals(y_val.reset_index(drop=True)):
                identical.append(c)
        else:
            # for non-numeric compare elementwise after mapping to strings
            if X_val[c].astype(str).equals(y_val.astype(str).reset_index(drop=True)):
                identical.append(c)
    except Exception:
        pass

print("4) Features identical to target (exact match):", identical)
print()

# 5) Numeric correlations with target
print("5) Numeric correlations (pearson) with target:")
for c in [c for c in FEATURE_COLS if c in X_val.select_dtypes(include=['int','float']).columns]:
    try:
        corr = np.corrcoef(X_val[c].astype(float), y_val.astype(float))[0,1]
        print(f"  {c:25s} corr={corr:.4f}")
    except Exception:
        print(f"  {c:25s} corr=NA")
print()

# 6) Categorical value -> churn mapping (show categories that map perfectly to one class)
print("6) Categorical churn mapping (rows where category -> churn proportion 0.0 or 1.0 may indicate leakage):")
for c in [c for c in FEATURE_COLS if c in X_val.select_dtypes(include=['object','category']).columns]:
    ct = pd.crosstab(X_val[c], y_val, normalize='index')
    perfect = ct[(ct[0].round(6)==1.0) | (ct[1].round(6)==1.0)]
    if not perfect.empty:
        print(f"\n  Column: {c}")
        display(perfect)
print()

# 7) Are there any constant columns or columns with single unique value?
const_cols = [c for c in FEATURE_COLS if X_val[c].nunique(dropna=False) <= 1]
print("7) Constant or single-unique-value columns:", const_cols)
print()

# 8) Show first 5 rows of X_val with y for visual spot check
print("8) Head example (X_val.head with target):")
display(pd.concat([X_val.head(), y_val.reset_index(drop=True).head()], axis=1))


1) Basic shapes
 X_val shape: (481, 31)
 y_val shape: (481,)

2) Is 'churn_flag' in FEATURE_COLS? True

3) Any feature exactly named 'churn_flag' in X_val.columns? True

4) Features identical to target (exact match): []

5) Numeric correlations (pearson) with target:
  SeniorCitizen             corr=0.1749
  tenure                    corr=-0.2288
  MonthlyCharges            corr=0.3762
  TotalCharges              corr=0.0004
  churn_flag                corr=1.0000
  monthly_high_flag         corr=0.3528
  internet_flag             corr=0.3245
  total_services            corr=0.1980
  many_services_flag        corr=0.2052
  paperless_flag            corr=0.1812
  payment_echeck            corr=0.2452
  payment_auto              corr=-0.0570

6) Categorical churn mapping (rows where category -> churn proportion 0.0 or 1.0 may indicate leakage):

  Column: Contract


churn_flag,0,1
Contract,Unnamed: 1_level_1,Unnamed: 2_level_1
Two year,1.0,0.0



  Column: Churn


churn_flag,0,1
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1.0,0.0
Yes,0.0,1.0



  Column: contract_short


churn_flag,0,1
contract_short,Unnamed: 1_level_1,Unnamed: 2_level_1
Two_year,1.0,0.0



7) Constant or single-unique-value columns: []

8) Head example (X_val.head with target):


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,churn_flag,monthly_high_flag,internet_flag,total_services,many_services_flag,paperless_flag,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn,tenure_group,contract_short,internet_short,churn_flag.1
1159,0.0,12.0,78.85,876.75,0.0,1.0,1.0,7.0,1.0,0.0,...,Yes,Yes,One year,No,Electronic check,No,4-12,One_year,DSL,
2837,0.0,10.0,70.1,659.65,1.0,1.0,1.0,2.0,0.0,1.0,...,No,No,Month-to-month,Yes,Electronic check,Yes,4-12,Month-to-month,Fiber_optic,
359,0.0,2.0,45.35,89.5,1.0,0.0,1.0,2.0,0.0,0.0,...,No,No,Month-to-month,No,Electronic check,Yes,0-3,Month-to-month,DSL,
1949,0.0,5.0,70.05,346.4,1.0,1.0,1.0,5.0,1.0,1.0,...,Yes,Yes,Month-to-month,Yes,Electronic check,Yes,4-12,Month-to-month,DSL,
2298,0.0,19.0,55.0,1046.5,1.0,0.0,1.0,4.0,1.0,1.0,...,No,No,Month-to-month,Yes,Credit card (automatic),Yes,13-24,Month-to-month,DSL,
0,,,,,,,,,,,...,,,,,,,,,,0.0
1,,,,,,,,,,,...,,,,,,,,,,1.0
2,,,,,,,,,,,...,,,,,,,,,,1.0
3,,,,,,,,,,,...,,,,,,,,,,1.0
4,,,,,,,,,,,...,,,,,,,,,,1.0


In [22]:
FEATURE_COLS


['SeniorCitizen',
 'tenure',
 'MonthlyCharges',
 'TotalCharges',
 'churn_flag',
 'monthly_high_flag',
 'internet_flag',
 'total_services',
 'many_services_flag',
 'paperless_flag',
 'payment_echeck',
 'payment_auto',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn',
 'tenure_group',
 'contract_short',
 'internet_short']

In [23]:
X_train = train_df[FEATURE_COLS]
y_train = train_df['churn_flag']

X_val = val_df[FEATURE_COLS]
y_val = val_df['churn_flag']


In [24]:
'churn_flag' in FEATURE_COLS


True

In [25]:

FEATURE_COLS = []

for col in num_cols + cat_cols:
    if col not in ['churn_flag', 'Churn']:   # remove both target columns
        FEATURE_COLS.append(col)

FEATURE_COLS


['SeniorCitizen',
 'tenure',
 'MonthlyCharges',
 'TotalCharges',
 'monthly_high_flag',
 'internet_flag',
 'total_services',
 'many_services_flag',
 'paperless_flag',
 'payment_echeck',
 'payment_auto',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'tenure_group',
 'contract_short',
 'internet_short']

In [26]:
# ensure targets removed
for c in ['churn_flag', 'Churn']:
    if c in num_cols: num_cols.remove(c)
    if c in cat_cols: cat_cols.remove(c)

# rebuild feature list
FEATURE_COLS = num_cols + cat_cols
FEATURE_COLS  # print to verify


['SeniorCitizen',
 'tenure',
 'MonthlyCharges',
 'TotalCharges',
 'monthly_high_flag',
 'internet_flag',
 'total_services',
 'many_services_flag',
 'paperless_flag',
 'payment_echeck',
 'payment_auto',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'tenure_group',
 'contract_short',
 'internet_short']

In [27]:
print("'churn_flag' in FEATURE_COLS ->", 'churn_flag' in FEATURE_COLS)
print("'Churn' in FEATURE_COLS ->", 'Churn' in FEATURE_COLS)


'churn_flag' in FEATURE_COLS -> False
'Churn' in FEATURE_COLS -> False


In [28]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# rebuild X/y with clean features
X_train = train_df[FEATURE_COLS]
y_train = train_df['churn_flag']
X_val = val_df[FEATURE_COLS]
y_val = val_df['churn_flag']

models = {
    'logreg': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'rf': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
}

for name, model in models.items():
    pipe = Pipeline([('preproc', preprocessor), ('clf', model)])
    print(f"\nTraining: {name}")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)
    y_prob = pipe.predict_proba(X_val)[:,1] if hasattr(pipe, "predict_proba") else None
    print(classification_report(y_val, y_pred, digits=4))
    if y_prob is not None:
        print("ROC AUC:", round(roc_auc_score(y_val, y_prob),4))
    print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))
    joblib.dump(pipe, f"{name}_pipeline.joblib")



Training: logreg
              precision    recall  f1-score   support

           0     0.7984    0.7305    0.7630       282
           1     0.6592    0.7387    0.6967       199

    accuracy                         0.7339       481
   macro avg     0.7288    0.7346    0.7298       481
weighted avg     0.7408    0.7339    0.7355       481

ROC AUC: 0.805
Confusion matrix:
 [[206  76]
 [ 52 147]]

Training: rf
              precision    recall  f1-score   support

           0     0.7458    0.7908    0.7676       282
           1     0.6758    0.6181    0.6457       199

    accuracy                         0.7193       481
   macro avg     0.7108    0.7044    0.7067       481
weighted avg     0.7169    0.7193    0.7172       481

ROC AUC: 0.7882
Confusion matrix:
 [[223  59]
 [ 76 123]]


Short Summary of Model Results

The models are now functioning correctly after removing data leakage, and the performance metrics are realistic for churn prediction.

Logistic Regression is currently the best-performing model.

Accuracy: 73%

AUC: 0.805

Churn Recall: ~74%
This model successfully identifies most churners and provides strong, stable performance.

Random Forest performs slightly weaker at this stage.

Accuracy: ~72%

AUC: 0.788

Churn Recall: ~62%
It misses more churners than Logistic Regression.

Why Logistic Regression is better right now:

Higher recall (detects more high-risk customers).

Higher AUC (better probability ranking).

Good performance even without tuning.

Next Step: Hyperparameter tuning to improve the models.
With tuning, Random Forest will improve significantly, and XGBoost (if used later) often becomes the strongest performer.

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Define parameter grid
param_grid_lr = {
    'clf__C': [0.01, 0.1, 1, 5, 10],
}

# Logistic Regression pipeline
lr_pipe = Pipeline([
    ('preproc', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
])

# Grid Search
lr_grid = GridSearchCV(
    estimator=lr_pipe,
    param_grid=param_grid_lr,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

lr_grid.fit(X_train, y_train)

print("Best C:", lr_grid.best_params_)
print("Best AUC (training CV):", lr_grid.best_score_)

# Evaluate on validation set
from sklearn.metrics import classification_report, roc_auc_score

y_val_pred = lr_grid.predict(X_val)
y_val_prob = lr_grid.predict_proba(X_val)[:, 1]

print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred, digits=4))

print("Validation AUC:", roc_auc_score(y_val, y_val_prob))


Best C: {'clf__C': 0.1}
Best AUC (training CV): 0.8051688048076964

Validation Classification Report:
              precision    recall  f1-score   support

           0     0.8008    0.7411    0.7698       282
           1     0.6682    0.7387    0.7017       199

    accuracy                         0.7401       481
   macro avg     0.7345    0.7399    0.7357       481
weighted avg     0.7459    0.7401    0.7416       481

Validation AUC: 0.8061851812252754


Tuned Logistic Regression — Summary

Logistic Regression tuned with C=0.1 (stronger regularization) achieved a cross-validated AUC of 0.8052. On the validation set the model scored Accuracy = 74.01%, AUC = 0.8062, Churn precision = 0.6682, Churn recall = 0.7387, and F1 (churn) = 0.7017. This model provides a stable, well-regularized baseline for detecting churn and producing reliable probability scores for downstream interventions.

Hyperparameter Tuning for Random Forest

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define parameter grid for Random Forest
param_grid_rf = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 5, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# RF pipeline
rf_pipe = Pipeline([
    ('preproc', preprocessor),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Grid Search
rf_grid = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)

rf_grid.fit(X_train, y_train)

print("Best Parameters:", rf_grid.best_params_)
print("Best Training AUC:", rf_grid.best_score_)

# Validation Evaluation
y_val_pred = rf_grid.predict(X_val)
y_val_prob = rf_grid.predict_proba(X_val)[:, 1]

print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred, digits=4))

print("Validation AUC:", roc_auc_score(y_val, y_val_prob))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'clf__max_depth': 5, 'clf__min_samples_leaf': 4, 'clf__min_samples_split': 10, 'clf__n_estimators': 200}
Best Training AUC: 0.7974170628108892

Validation Classification Report:
              precision    recall  f1-score   support

           0     0.7645    0.7943    0.7791       282
           1     0.6915    0.6533    0.6718       199

    accuracy                         0.7360       481
   macro avg     0.7280    0.7238    0.7255       481
weighted avg     0.7343    0.7360    0.7347       481

Validation AUC: 0.8122349335329129
Confusion Matrix:
 [[224  58]
 [ 69 130]]


Tuned Random Forest — Summary

Random Forest tuned with the parameters
max_depth=5, min_samples_leaf=4, min_samples_split=10, and n_estimators=200
achieved a cross-validated AUC of 0.7974.
On the validation set, the tuned model reached Accuracy = 73.60%, AUC = 0.8122, Churn precision = 0.6915, Churn recall = 0.6533, and Churn F1 = 0.6718.
Compared to the untuned version, this model provides a higher AUC and more stable performance, though Logistic Regression still offers better recall for churn detection.

Final Model Training on (Train + Validation) and Evaluation on Test Set

Combine Train + Validation

In [31]:
full_train_df = pd.concat([train_df, val_df], axis=0)

X_full_train = full_train_df[FEATURE_COLS]
y_full_train = full_train_df['churn_flag']

X_test = test_df[FEATURE_COLS]
y_test = test_df['churn_flag']


Train the Final Logistic Regression Model

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

final_lr = Pipeline([
    ('preproc', preprocessor),
    ('clf', LogisticRegression(
        C=0.1,
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ))
])

final_lr.fit(X_full_train, y_full_train)


0,1,2
,steps,"[('preproc', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


Evaluate on Test Set

In [33]:
y_test_pred = final_lr.predict(X_test)
y_test_prob = final_lr.predict_proba(X_test)[:, 1]

print("Final Test Classification Report:")
print(classification_report(y_test, y_test_pred, digits=4))

print("Final Test AUC:", roc_auc_score(y_test, y_test_prob))

print("Test Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))


Final Test Classification Report:
              precision    recall  f1-score   support

           0     0.8017    0.6879    0.7405       282
           1     0.6333    0.7600    0.6909       200

    accuracy                         0.7178       482
   macro avg     0.7175    0.7240    0.7157       482
weighted avg     0.7318    0.7178    0.7199       482

Final Test AUC: 0.7865514184397163
Test Confusion Matrix:
[[194  88]
 [ 48 152]]


Save the Final Model

In [34]:
import joblib
joblib.dump(final_lr, "final_logreg_model.joblib")
print("Final model saved as final_logreg_model.joblib")


Final model saved as final_logreg_model.joblib
