
# Customer Churn Prediction — End-to-End ML Pipeline (Module 6 — Task 2)

This notebook targets the Kaggle Credit Card Customer Attrition/Churn dataset and similar churn datasets.
It includes:

1. **EDA**
2. **Imbalance check** and resolution (simple random **over-sampling** on train only)
3. **Logistic Regression**
4. **Naive Bayes**
5. **K-Nearest Neighbors** with **GridSearchCV**
6. **SVC** with **GridSearchCV**
7. **Decision Tree** with **GridSearchCV**
8. **Random Forest** with **RandomizedSearchCV**
9. **Model Selection**: compares accuracy, precision, recall, F1, and ROC-AUC, prints each confusion matrix, and saves every model separately.

> **Instructions**
> - Place your churn CSV (e.g., `BankChurners.csv`) in the **same folder** as this notebook.
> - Update `DATA_PATH` if your file name differs, and `TARGET_CANDIDATES` if your target column name differs.
> - The notebook auto-detects the target label among common names like `Attrition_Flag`, `Churn`, `Exited`.


In [None]:

# ====== Config ======
import os

# --- Input ---
DATA_PATH = "BankChurners.csv"  # change if needed
# Column names probed, in this order, when locating the target label
TARGET_CANDIDATES = [
    "Attrition_Flag",
    "Churn",
    "Exited",
    "churn",
    "attrition_flag",
]

# --- Reproducibility / resources ---
RANDOM_STATE = 42  # shared seed for the split, the over-sampling and seeded estimators
TEST_SIZE = 0.2    # fraction of rows held out for testing
N_JOBS = -1        # use all cores

# --- Output ---
MODELS_DIR = "models_churn"
RESULTS_PATH = "churn_model_results.csv"

# exist_ok=True keeps this cell safe to re-run
os.makedirs(MODELS_DIR, exist_ok=True)
print("Models will be saved to:", MODELS_DIR)


In [None]:

# ====== Imports ======
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, RocCurveDisplay)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import joblib
import os


In [None]:

# ====== Load Data ======
# Read the CSV named by DATA_PATH (config cell) into the working frame `df`.
df = pd.read_csv(DATA_PATH)
# (rows, columns) as a quick sanity check on the load.
print(df.shape)
# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()


## Target detection & label cleaning

In [None]:

# Try to detect the churn/attrition column: the first candidate present in the frame wins
target_col = next((c for c in TARGET_CANDIDATES if c in df.columns), None)

if target_col is None:
    raise ValueError(f"Could not find target column in {TARGET_CANDIDATES}. Please update TARGET_CANDIDATES or rename your column.")

print("Detected target column:", target_col)

# Normalize binary labels to 0/1 (compare on stripped strings so ints, bools and text all work)
y_raw = df[target_col].astype(str).str.strip()

# Common mappings
mapping_yes_no = {"yes":1,"no":0,"true":1,"false":0,"1":1,"0":0}
mapping_attrition = {"Attrited Customer":1, "Existing Customer":0,
                     "Churned":1, "Not Churned":0,
                     "Exited":1, "Stayed":0}

# `y` must stay a pandas Series indexed like `df`: the over-sampling helper later
# calls `.sample` on the training labels, which a NumPy array does not provide.
y = y_raw.str.lower().map(mapping_yes_no).astype(float)
if y.isna().any():
    # Fill only the still-unmapped rows from the attrition mapping (case-sensitive original)
    y = y.fillna(y_raw.map(mapping_attrition).astype(float))

# If still NaN, fall back to encoding the two most frequent raw values
if y.isna().any():
    top_vals = y_raw.value_counts().index.tolist()
    if len(top_vals) >= 2:
        majority, minority = top_vals[0], top_vals[1]
        # 1 is treated as "churned" everywhere downstream (churn rate, minority
        # over-sampling, precision/recall), so the rarer label becomes the positive class.
        print(f"Fallback label encoding: {minority!r} -> 1, {majority!r} -> 0")
        # Any third value maps to NaN and is rejected by the final check below.
        y = y_raw.map({majority: 0, minority: 1})

# Final check
if y.isna().any():
    raise ValueError("Unable to normalize target to 0/1. Please adjust mappings.")

y = y.astype(int)

# Define X
# NOTE(review): per the Kaggle "Credit Card customers" dataset card, BankChurners.csv ships a
# row identifier (CLIENTNUM) and two "Naive_Bayes_Classifier_..." columns derived from the
# target, which the card advises deleting. Keeping them leaks the label into the features.
# This is a no-op for files without these columns -- confirm against your copy.
leak_cols = [
    c for c in df.columns
    if c != target_col
    and (str(c) == "CLIENTNUM" or str(c).startswith("Naive_Bayes_Classifier_"))
]
if leak_cols:
    print("Dropping identifier/leakage columns:", leak_cols)
X = df.drop(columns=[target_col] + leak_cols)
print("Class balance (raw):")
print(pd.Series(y).value_counts())


## 1) EDA

In [None]:

# --- Feature overview: names, dtype mix, and where values are missing ---
print("Columns:", X.columns.tolist())
print("\nDtypes summary:")
print(X.dtypes.value_counts())

print("\nMissing values (top 30):")
missing = X.isnull().sum().sort_values(ascending=False)
display(missing.iloc[:30])

# Basic numeric EDA: summary statistics with extra tail percentiles
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
    desc = X[num_cols].describe(percentiles=[0.05, 0.25, 0.50, 0.75, 0.95])
    display(desc)

# Churn rate: the mean of the 0/1 target is the share of positives
churn_rate = y.mean()
print(f"Churn rate: {churn_rate:.3f}")

# Plot churn distribution (one bar per class, heights are row counts)
plt.figure(figsize=(5, 4))
plt.bar(
    ["Not Churned (0)", "Churned (1)"],
    [(y == cls).sum() for cls in (0, 1)],
)
plt.ylabel("Count")
plt.title("Target Distribution")
plt.show()


## 2) Imbalance check & resolution

In [None]:

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

print("Train distribution:")
print(pd.Series(y_train).value_counts(normalize=True))

def simple_random_oversample(Xt, yt, random_state=RANDOM_STATE):
    """Randomly duplicate minority-class rows until both classes have equal counts.

    Intended for the training split only.

    Parameters
    ----------
    Xt : pandas.DataFrame
        Feature rows.
    yt : pandas.Series or array-like
        Binary labels, positionally aligned with ``Xt``. A NumPy array or list
        is accepted and is given ``Xt``'s index internally.
    random_state : int, RandomState or None
        Seed forwarded to ``Series.sample`` for the resampling draw.

    Returns
    -------
    (X_bal, y_bal)
        ``Xt``/``yt`` with the resampled minority rows appended at the end.
        The inputs are returned unchanged when there are not exactly two
        classes or the classes are already balanced.
    """
    # Work on a Series view of the labels so array inputs do not crash on `.sample`/concat.
    labels = yt if isinstance(yt, pd.Series) else pd.Series(np.asarray(yt), index=Xt.index)

    counts = labels.value_counts()
    if len(counts) != 2:
        return Xt, yt
    n_maj, n_min = counts.max(), counts.min()
    if n_min == 0 or n_min == n_maj:
        return Xt, yt

    min_class = counts.idxmin()
    # Draw row *positions* once and reuse them for X and y, so features and labels
    # cannot drift apart and index alignment between Xt and yt is irrelevant.
    minority_pos = np.flatnonzero((labels == min_class).to_numpy())
    extra_pos = (
        pd.Series(minority_pos)
        .sample(n=n_maj - n_min, replace=True, random_state=random_state)
        .to_numpy()
    )

    X_bal = pd.concat([Xt, Xt.iloc[extra_pos]], axis=0)
    y_bal = pd.concat([labels, labels.iloc[extra_pos]], axis=0)
    return X_bal, y_bal

# Balance the training split only; X_test / y_test are not touched here.
# Copies are passed so the helper never receives the original train objects.
X_train_bal, y_train_bal = simple_random_oversample(X_train.copy(), y_train.copy())

# Class counts after balancing.
print("After over-sampling (train):")
print(pd.Series(y_train_bal).value_counts())


## Preprocessing (impute, scale, one-hot)

In [None]:

# Column groups come from the balanced training frame: numeric dtypes vs. everything else.
_train_cols = X_train_bal.select_dtypes
num_features = _train_cols(include=[np.number]).columns.tolist()
cat_features = _train_cols(exclude=[np.number]).columns.tolist()
del _train_cols  # helper alias only; keep the notebook namespace unchanged

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode;
# handle_unknown="ignore" lets categories unseen during fit pass through transform.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
])


In [None]:

def evaluate_and_save(model, name, X_test, y_test, save_dir=MODELS_DIR):
    """Score a fitted classifier on held-out data, persist it, and return its metrics.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must implement ``predict``; ``predict_proba`` or ``decision_function``
        is used for ROC-AUC when available.
    name : str
        Label used in the result row, the printed confusion matrix and the
        saved file name (``<name>.pkl``).
    X_test, y_test
        Held-out features and 0/1 labels.
    save_dir : str
        Directory the model is dumped into; created if it does not exist.

    Returns
    -------
    dict
        Keys: ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` (NaN when it cannot be computed) and ``path``.
    """
    preds = model.predict(X_test)

    # Pick a continuous score for ROC-AUC, best source first.
    if hasattr(model, "predict_proba"):
        # Column 1 is taken as the positive class (assumes labels are 0/1).
        probas = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        s = model.decision_function(X_test)
        # Min-max scale to [0, 1]; the epsilon guards against a constant score vector.
        probas = (s - s.min()) / (s.max() - s.min() + 1e-9)
    else:
        probas = preds  # not ideal, but keeps pipeline consistent

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)
    try:
        auc = roc_auc_score(y_test, probas)
    except ValueError as exc:
        # Only the metric's own input errors are tolerated; say why instead of
        # silently recording NaN, and let unrelated errors surface.
        print(f"ROC-AUC unavailable for {name}: {exc}")
        auc = np.nan

    # A caller-supplied save_dir may not exist yet.
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, f"{name}.pkl")
    joblib.dump(model, path)

    # Confusion matrix
    cm = confusion_matrix(y_test, preds)
    print(f"Confusion Matrix — {name}:\n", cm)

    return {"model": name, "accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "roc_auc": auc, "path": path}


## 3) Logistic Regression

In [None]:

# Shared preprocessing followed by a class-weighted logistic regression.
pipe_lr = Pipeline([
    ("preprocess", preprocessor),
    (
        "model",
        LogisticRegression(
            max_iter=2000,
            class_weight="balanced",
            n_jobs=N_JOBS,
        ),
    ),
])

# Fit on the over-sampled training data, score on the untouched test split.
pipe_lr.fit(X_train_bal, y_train_bal)
res_lr = evaluate_and_save(
    pipe_lr,
    "churn_logistic_regression",
    X_test,
    y_test,
)
res_lr


## 4) Naive Bayes

In [None]:

# We'll use GaussianNB over the transformed feature space; for categorical-heavy datasets,
# CategoricalNB would require integer-encoded categories.

# GaussianNB needs a dense array. Density is requested on the ColumnTransformer
# (sparse_threshold=0 always returns dense) rather than on OneHotEncoder: the encoder's
# `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so neither spelling works across versions.
numeric_transformer_nb = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer_nb = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor_nb = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_nb, num_features),
        ("cat", categorical_transformer_nb, cat_features)
    ],
    remainder="drop",
    sparse_threshold=0
)

pipe_nb = Pipeline(steps=[
    ("preprocess", preprocessor_nb),
    ("model", GaussianNB())
])
pipe_nb.fit(X_train_bal, y_train_bal)
res_nb = evaluate_and_save(pipe_nb, "churn_naive_bayes", X_test, y_test)
res_nb


## 5) K-Nearest Neighbors

In [None]:

pipe_knn = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", KNeighborsClassifier())
])

# Plain literals only: the previous `nine := 9` entry created a stray global and
# made the cell a SyntaxError on Python < 3.8.
param_grid_knn = {
    "model__n_neighbors": [3, 5, 7, 9, 11, 15],
    "model__weights": ["uniform", "distance"],
    "model__p": [1, 2]  # 1 = Manhattan, 2 = Euclidean (Minkowski power)
}

# NOTE(review): X_train_bal contains duplicated minority rows appended by the over-sampling
# step, so the 5-fold CV below can place copies of one row in both the fitting and the
# validation fold. Treat the CV score as optimistic; the test-set metrics are the reference.
grid_knn = GridSearchCV(pipe_knn, param_grid=param_grid_knn, cv=5, n_jobs=N_JOBS)
grid_knn.fit(X_train_bal, y_train_bal)
best_knn = grid_knn.best_estimator_
res_knn = evaluate_and_save(best_knn, "churn_knn", X_test, y_test)
grid_knn.best_params_, res_knn


## 6) SVC with GridSearchCV

In [None]:

# probability=True makes SVC expose predict_proba, which the evaluation helper prefers for ROC-AUC.
pipe_svc = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", SVC(class_weight="balanced", probability=True))
])

# One sub-grid per kernel. `gamma` is a kernel coefficient that the linear kernel does not
# use, so crossing it with kernel="linear" only repeated identical fits
# (5 of the 20 candidates). The set of distinct models searched is unchanged.
_svc_C = [0.1, 1, 10, 30, 100]
param_grid_svc = [
    {
        "model__kernel": ["rbf"],
        "model__C": _svc_C,
        "model__gamma": ["scale", "auto"],
    },
    {
        "model__kernel": ["linear"],
        "model__C": _svc_C,
    },
]

grid_svc = GridSearchCV(pipe_svc, param_grid=param_grid_svc, cv=5, n_jobs=N_JOBS)
grid_svc.fit(X_train_bal, y_train_bal)
best_svc = grid_svc.best_estimator_
res_svc = evaluate_and_save(best_svc, "churn_svc", X_test, y_test)
grid_svc.best_params_, res_svc


## 7) Decision Tree with GridSearchCV

In [None]:

# Shared preprocessing followed by a seeded, class-weighted decision tree.
pipe_dt = Pipeline([
    ("preprocess", preprocessor),
    (
        "model",
        DecisionTreeClassifier(
            class_weight="balanced",
            random_state=RANDOM_STATE,
        ),
    ),
])

# Depth and minimum-sample limits to search; max_depth=None leaves the depth unbounded.
param_grid_dt = {
    "model__max_depth": [None, 5, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 4, 8],
}

# Exhaustive 5-fold search over the grid, then evaluate the refit best pipeline.
grid_dt = GridSearchCV(
    estimator=pipe_dt,
    param_grid=param_grid_dt,
    cv=5,
    n_jobs=N_JOBS,
)
grid_dt.fit(X_train_bal, y_train_bal)
best_dt = grid_dt.best_estimator_
res_dt = evaluate_and_save(best_dt, "churn_decision_tree", X_test, y_test)
grid_dt.best_params_, res_dt


## 8) Random Forest with RandomizedSearchCV

In [None]:

# Shared preprocessing followed by a seeded, class-weighted random forest.
pipe_rf = Pipeline([
    ("preprocess", preprocessor),
    (
        "model",
        RandomForestClassifier(
            class_weight="balanced",
            n_jobs=N_JOBS,
            random_state=RANDOM_STATE,
        ),
    ),
])

# Candidate values to sample from; max_features mixes named rules and fractions.
param_dist_rf = {
    "model__n_estimators": [100, 200, 300, 500, 800],
    "model__max_depth": [None, 10, 20, 30, 50],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2", 0.3, 0.5, 0.7],
}

# Randomized search: 25 sampled combinations, 5-fold CV each, seeded so the
# same combinations are drawn on every run.
rand_rf = RandomizedSearchCV(
    estimator=pipe_rf,
    param_distributions=param_dist_rf,
    n_iter=25,
    cv=5,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)
rand_rf.fit(X_train_bal, y_train_bal)
best_rf = rand_rf.best_estimator_
res_rf = evaluate_and_save(best_rf, "churn_random_forest", X_test, y_test)
rand_rf.best_params_, res_rf


## 9) Model Selection & Comparison

In [None]:

# One metrics dict per model, in the order the models were trained.
results = [
    res_lr,
    res_nb,
    res_knn,
    res_svc,
    res_dt,
    res_rf,
]

# Rank by F1 (best first), persist the table, then show it.
results_df = pd.DataFrame(results).sort_values("f1", ascending=False)
results_df.to_csv(RESULTS_PATH, index=False)
display(results_df)

# Plot F1: bars follow the ranked order of results_df
plt.figure(figsize=(9, 5))
plt.bar(results_df["model"], results_df["f1"])
plt.title("Model F1 Score (higher is better)")
plt.ylabel("F1")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("Results saved to:", RESULTS_PATH)
