# Module 4 — Churn Prediction: Model Development & Evaluation

**Author:** Prince Nsidibe  
**Course:** BAN6880 - Data Analytics Capstone

This notebook trains and evaluates classification models to predict customer churn using the finalized dataset from Milestone 1.


In [None]:
# 1) Imports and settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score,
                             roc_curve, precision_recall_curve, average_precision_score)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
# Suppress warnings of every category so library chatter does not clutter cell output.
warnings.simplefilter('ignore')

# Single seed shared by the split, the resampler, the estimators and the CV folds below.
RANDOM_STATE = 42


## 2) Load finalized dataset

Update `DATA_PATH` below if your finalized dataset is in a different location.

In [None]:
# Location of the finalized dataset; change if needed.
DATA_PATH = '../data/processed/finalized_churn_dataset.csv'

# Fail fast with an actionable message instead of letting read_csv raise on a missing file.
dataset_found = os.path.exists(DATA_PATH)
if not dataset_found:
    raise FileNotFoundError(f"Finalized dataset not found at {DATA_PATH}. Please run churn_eda.ipynb or update DATA_PATH.")

df = pd.read_csv(DATA_PATH)
print('Loaded dataset shape:', df.shape)

# Last expression of the cell: rendered as a preview of the first rows.
df.head()

## 3) Quick data checks

In [None]:
# Schema overview: column names, inferred dtypes and null counts per column.
column_names = df.columns.tolist()
dtype_table = df.dtypes
null_counts = df.isnull().sum()

print('Columns:', column_names)
print('\nDtypes:\n', dtype_table)
print('\nMissing values per column:\n', null_counts)

## 4) Feature selection

Drop identifier and raw date columns, and set the prediction label (`churn`).

In [None]:
# Target column, as created in Milestone 1.
label = 'churn'

# Identifier and raw date columns are excluded from the feature matrix. Only the
# ones actually present are kept in the list, so the drop below cannot fail on a
# column that is missing from this version of the dataset.
drop_cols = [
    name
    for name in ('customer_id', 'signup_date', 'last_order_date', 'last_login')
    if name in df.columns
]

# Features are everything except the label and the excluded columns.
X = df.drop(columns=[label, *drop_cols])
y = df[label].astype(int)
print('X shape:', X.shape, 'y distribution:\n', y.value_counts())

## 5) Train-test split (stratified)

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': RANDOM_STATE, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print('Train:', X_train.shape, 'Test:', X_test.shape)

## 6) Identify numeric and categorical columns

In [None]:
# Numeric features: select every numeric dtype rather than only int64/float64.
# Columns stored as int32, float32, uint8 or pandas nullable integers would
# otherwise land in neither list and be silently discarded, because the
# preprocessing ColumnTransformer is built with remainder='drop'.
numeric_cols = X.select_dtypes(include='number').columns.tolist()

# Categorical features: strings, pandas categoricals and booleans (one-hot encoded later).
cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()

# Any column in neither list (e.g. a parsed datetime) never reaches the models;
# report it instead of losing it without a trace.
unassigned_cols = [c for c in X.columns if c not in numeric_cols and c not in cat_cols]
if unassigned_cols:
    print('WARNING: columns with unsupported dtypes will be ignored by the preprocessor:', unassigned_cols)

print('Numeric cols:', numeric_cols)
print('Categorical cols:', cat_cols)

## 7) Preprocessing pipelines

StandardScaler for numeric features and OneHotEncoder for categoricals.

In [None]:
# Numeric branch: standardize each numeric feature.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: dense one-hot encoding that ignores categories unseen during fit.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4 removed
# the old name, so try the current spelling first and fall back to the legacy one
# on older installations. An unknown keyword raises TypeError at construction time.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

# Route each column group to its branch; columns in neither group are dropped.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', cat_transformer, cat_cols)
], remainder='drop')
print('Preprocessor ready')

## 8) Baseline model — Logistic Regression

Use class_weight='balanced' to mitigate imbalance.

In [None]:
# Baseline classifier; class_weight='balanced' is used to mitigate class imbalance.
lr_clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_STATE)
pipe_lr = Pipeline([
    ('preproc', preprocessor),
    ('clf', lr_clf),
])
pipe_lr.fit(X_train, y_train)

# Hard labels feed the classification report; the second probability column
# (presumably churn = 1, assuming 0/1 labels) feeds ROC-AUC.
y_pred_lr = pipe_lr.predict(X_test)
y_proba_lr = pipe_lr.predict_proba(X_test)[:, 1]

print('Logistic Regression')
print(classification_report(y_test, y_pred_lr))
print('ROC-AUC:', roc_auc_score(y_test, y_proba_lr))

## 9) Option: SMOTE + Logistic Regression

Compare the SMOTE resampling approach against the `class_weight='balanced'` baseline above.

In [None]:
# Alternative imbalance treatment: SMOTE sits after preprocessing inside an
# imbalanced-learn pipeline, followed by an unweighted logistic regression.
smote_step = SMOTE(random_state=RANDOM_STATE)
plain_lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
pipe_sm_lr = ImbPipeline([
    ('preproc', preprocessor),
    ('smote', smote_step),
    ('clf', plain_lr),
])
pipe_sm_lr.fit(X_train, y_train)

# Evaluate on the untouched test split.
y_pred_sm = pipe_sm_lr.predict(X_test)
y_proba_sm = pipe_sm_lr.predict_proba(X_test)[:, 1]

print('SMOTE + LR')
print(classification_report(y_test, y_pred_sm))
print('ROC-AUC:', roc_auc_score(y_test, y_proba_sm))

## 10) Random Forest

In [None]:
# Random forest of 200 trees with balanced class weights, behind the shared preprocessor.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=RANDOM_STATE,
)
pipe_rf = Pipeline([
    ('preproc', preprocessor),
    ('clf', rf_clf),
])
pipe_rf.fit(X_train, y_train)

# Test-set predictions: labels for the report, second-column probabilities for ROC-AUC.
y_pred_rf = pipe_rf.predict(X_test)
y_proba_rf = pipe_rf.predict_proba(X_test)[:, 1]

print('Random Forest')
print(classification_report(y_test, y_pred_rf))
print('ROC-AUC:', roc_auc_score(y_test, y_proba_rf))

## 11) XGBoost

In [None]:
# Gradient-boosted trees with library-default hyperparameters (tuned in a later
# section), evaluated during training with log-loss.
xgb_clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=RANDOM_STATE,
)
pipe_xgb = Pipeline([
    ('preproc', preprocessor),
    ('clf', xgb_clf),
])
pipe_xgb.fit(X_train, y_train)

# Test-set predictions: labels for the report, second-column probabilities for ROC-AUC.
y_pred_xgb = pipe_xgb.predict(X_test)
y_proba_xgb = pipe_xgb.predict_proba(X_test)[:, 1]

print('XGBoost')
print(classification_report(y_test, y_pred_xgb))
print('ROC-AUC:', roc_auc_score(y_test, y_proba_xgb))

## 12) ROC Curves Comparison

In [None]:
# Overlay the ROC curve of each model on one axes, with its AUC in the legend.
fig, ax = plt.subplots(figsize=(8,6))
for name, proba in (('LogReg', y_proba_lr), ('RF', y_proba_rf), ('XGB', y_proba_xgb)):
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc_value = roc_auc_score(y_test, proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc_value:.3f})")

# Dashed diagonal: the line a random classifier would follow.
ax.plot([0,1],[0,1],'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend()
plt.show()

## 13) Precision-Recall Curves

In [None]:
# Overlay the precision-recall curve of each model, with average precision in the legend.
fig, ax = plt.subplots(figsize=(8,6))
for name, proba in (('LogReg', y_proba_lr), ('RF', y_proba_rf), ('XGB', y_proba_xgb)):
    ap = average_precision_score(y_test, proba)
    prec, rec, _ = precision_recall_curve(y_test, proba)
    ax.plot(rec, prec, label=f"{name} (AP={ap:.3f})")

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curves')
ax.legend()
plt.show()

## 14) Hyperparameter Tuning (XGBoost)

GridSearchCV optimizing ROC-AUC.

In [None]:
# Search space for the XGBoost step; the 'clf__' prefix routes each key to the
# pipeline step named 'clf'. 2 x 2 x 2 = 8 candidate settings.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5],
    'clf__learning_rate': [0.05, 0.1],
}

# Five shuffled, stratified folds with a fixed seed so the search is repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Exhaustive search scored by ROC-AUC, using all available cores.
gs = GridSearchCV(
    estimator=pipe_xgb,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)
print('Best params:', gs.best_params_)

# Pipeline refit on the full training split with the winning parameters.
best_model = gs.best_estimator_

# Evaluate best model
y_pred_best = best_model.predict(X_test)
y_proba_best = best_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_best))
print('ROC-AUC (best):', roc_auc_score(y_test, y_proba_best))

## 15) Feature Importance

In [None]:
# Rebuild the names of the columns the classifier actually saw. The
# ColumnTransformer emits the numeric columns first, followed by the one-hot
# expanded categorical columns, so the names are assembled in that order.
feature_names = []
try:
    num_names = numeric_cols
    if cat_cols:
        ohe = best_model.named_steps['preproc'].named_transformers_['cat'].named_steps['onehot']
        cat_names = list(ohe.get_feature_names_out(cat_cols))
    else:
        cat_names = []
    feature_names = list(num_names) + cat_names
except Exception as e:
    # Best effort: the importances can still be shown with positional names below.
    print('Could not build feature names:', e)

try:
    importances = best_model.named_steps['clf'].feature_importances_

    # If name building failed or drifted out of sync with the model input, fall back
    # to positional names instead of letting the DataFrame constructor raise a
    # length-mismatch error that would hide the importances entirely.
    if len(feature_names) != len(importances):
        print(f'Feature name count ({len(feature_names)}) does not match importance count '
              f'({len(importances)}); using positional names instead.')
        feature_names = [f'feature_{i}' for i in range(len(importances))]

    fi = pd.DataFrame({'feature': feature_names, 'importance': importances})
    fi = fi.sort_values('importance', ascending=False).head(20)
    print(fi)

    # Reverse the order so the most important feature is drawn at the top of the chart.
    plt.figure(figsize=(8,6))
    plt.barh(fi['feature'][::-1], fi['importance'][::-1])
    plt.xlabel('Importance')
    plt.title('Top Feature Importances')
    plt.show()
except Exception as e:
    print('Could not extract feature importances:', e)

## 16) Save best model and results

In [None]:
# Make sure the output folder exists, then persist the tuned pipeline
# (preprocessing and classifier together).
os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/best_xgb_model.pkl')

# Test-set ROC-AUC for each model, in the order the models were trained.
scored_models = [
    ('LogisticRegression', y_proba_lr),
    ('RandomForest', y_proba_rf),
    ('XGBoost', y_proba_xgb),
    ('XGBoost_Tuned', y_proba_best),
]
results = {
    'model': [model_name for model_name, _scores in scored_models],
    'roc_auc': [roc_auc_score(y_test, scores) for _model_name, scores in scored_models],
}

# One row per model, written without the index column.
results_df = pd.DataFrame(results)
results_df.to_csv('../models/model_results_summary.csv', index=False)
print('Saved model and results to ../models/')

## 17) Conclusion & Next Steps

- Best model (tuned XGBoost) saved to `../models/best_xgb_model.pkl`.
- Key metrics: ROC-AUC, Precision/Recall reported above.
- Next steps: productionize model, set monitoring, A/B test retention campaigns.
