# Customer Churn Prediction with Machine Learning Pipelines

In [None]:
# - @FCS11424

# Repo: https://github.com/ThaddeusTeh2/psw_customer_churn_challenge

# Problem Statement

- Customer churn prediction is critical for business retention. 
- Current machine learning approaches often suffer from data leakage, 
 leading to inflated performance metrics and unreliable predictions in a real-world setting. 
- This poses a challenge in developing an effective, deployable model.

# Goal

- The goal is to design, implement, and evaluate a machine learning pipeline that accurately predicts customer churn. 

# Load the dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw churn CSV from the working directory and preview the first 10 rows.
raw_csv_path = './customer_churn.csv'
df = pd.read_csv(raw_csv_path)
df.head(10)


# Lookie at the dataframe

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape 

In [None]:
# NOTE(review): repeats the df.shape call of the previous cell; one of the two can be removed.
df.shape

In [None]:
# Column labels of the raw frame.
df.columns

In [None]:
# Per-column dtypes; the object/number distinction drives the categorical/numerical split later on.
df.dtypes

In [None]:
# Compact overview: non-null counts and dtype for every column.
df.info()

In [None]:
# Summary statistics for the columns pandas treats as numeric.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

# dayum clean dataset

### the good stuff

In [None]:
# Columns whose distinct values we want to eyeball before any cleaning.
cols_to_check = [
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "PhoneService", "MultipleLines", "InternetService",
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies",
    "Contract", "PaperlessBilling", "PaymentMethod",
    "Churn",
]

# Only report on columns that actually exist in the frame.
present_cols = [name for name in cols_to_check if name in df.columns]
for col in present_cols:
    print(f"Unique vals in '{col}':")
    print(df[col].unique())
    print("-" * 20)

# slight DF cleanup

In [None]:
# Columns excluded from modelling. customerID is presumably a per-row identifier
# (confirm against the data); TotalCharges is removed for the multicollinearity
# reasons given in the notes that follow this cell.
columns_to_drop = ["customerID", "TotalCharges"]
# errors="ignore" keeps the cell re-runnable after the columns are already gone.
df = df.drop(columns_to_drop, axis=1, errors="ignore")



- TotalCharges is highly correlated with both MonthlyCharges and tenure.

- TotalCharges is approximately tenure multiplied by MonthlyCharges, so it adds little information beyond those two columns.

- The presence of these three highly correlated variables can introduce multicollinearity into a machine learning model, which can make a model's coefficients unstable and difficult to interpret.

- The decision to drop TotalCharges is a common practice in analyses of this specific dataset to mitigate multicollinearity while retaining the core information in the other two columns.

In [None]:
# Partition column names by dtype: object columns are treated as categorical,
# anything numeric as numerical.
categorical_cols = list(df.select_dtypes(include='object').columns)
numerical_cols = list(df.select_dtypes(include='number').columns)

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# EDA

## any cols directly affect our target variable 'Churn'?

### categorical cols

In [None]:
# All categorical features vs Churn: one 100%-stacked bar chart per feature,
# showing the share of churned / retained customers within each category.
for feature in categorical_cols:
    if feature == 'Churn':  # skip target itself
        continue
    # Row-normalised crosstab -> percentages that sum to 100 per category.
    churn_counts = pd.crosstab(df[feature], df['Churn'], normalize='index') * 100
    # Create the axes explicitly and hand them to pandas. DataFrame.plot opens
    # its own figure when no `ax` is given, so a preceding plt.figure() call
    # would be left behind as an empty figure for every feature.
    fig, ax = plt.subplots(figsize=(10, 6))
    churn_counts.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], ax=ax)
    ax.set_title(f'{feature} vs Churn (%)')
    ax.set_ylabel('Percentage')
    ax.legend(title='Churn', loc='upper right')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()

#### sanity check plot: churn distribution

In [None]:
# Pie chart of the target's class shares.
churn_totals = df['Churn'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
churn_totals.plot(
    kind='pie',
    autopct='%1.1f%%',
    colors=['skyblue', 'salmon'],
    startangle=90,
    ax=ax,
)
ax.set_title('Churn Distribution')
ax.set_ylabel('')  # drop the default series-name label next to the pie
plt.show()

#### binary cols

In [None]:
# Two-valued features to compare against the churn rate.
binary_cats = [
    'Partner', 'Dependents', 'PhoneService',
    'PaperlessBilling', 'SeniorCitizen'
]

# Churn as 0/1 so that the bar height (a mean) reads as a churn rate.
churn_flag = df['Churn'].map({'No': 0, 'Yes': 1})

# One bar chart per binary feature.
for feature in binary_cats:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=df, x=feature, y=churn_flag, palette='Set2', ax=ax)
    ax.set_title(f'Churn Rate by {feature}')
    ax.set_ylabel('Churn Rate')
    ax.set_xlabel(feature)
    fig.tight_layout()
    plt.show()

### numerical cols

In [None]:
# For every numerical column: a box plot per churn class with the raw
# observations overlaid as jittered points.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=df, x='Churn', y=col, palette='Set2', ax=ax)
    sns.stripplot(data=df, x='Churn', y=col, color='black', alpha=0.3, jitter=True, ax=ax)
    ax.set_title(f'{col} distribution by Churn')
    fig.tight_layout()
    plt.show()

#### heatmap of numerical features

In [None]:
# describe() transposed: one row per numerical feature, one column per
# statistic (count, mean, std, min, 25%, 50%, 75%, max), drawn as a heatmap.
num_summary = df[numerical_cols].describe().transpose()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(num_summary, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
ax.set_title("Summary Statistics of Numerical Features")
plt.show()

# Feature Engineering

In [None]:
# Family indicator: 1 when Partner and/or Dependents is 'Yes', otherwise 0.
has_family = df[['Partner', 'Dependents']].eq('Yes').any(axis=1)
df['Family'] = has_family.astype(int)
# Captures household/family support as a stabilizing factor against churn.

In [None]:
# Services count: number of optional add-on services marked 'Yes' per customer.
service_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]
# Column-wise comparison followed by a row sum.
df['ServicesCount'] = df[service_cols].eq('Yes').sum(axis=1)
# Higher service engagement usually correlates with lower churn.

In [None]:
# Phone service merged into a single scale
def phone_lines(row):
    """Collapse PhoneService and MultipleLines into one ordinal value.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the
            'PhoneService' and 'MultipleLines' keys.

    Returns:
        0 when PhoneService is 'No', 2 when MultipleLines is 'Yes',
        and 1 for every other case.
    """
    # No phone service takes precedence over whatever MultipleLines says.
    if row['PhoneService'] == 'No':
        return 0
    return 2 if row['MultipleLines'] == 'Yes' else 1
# axis=1 passes each row to phone_lines; the returned 0/1/2 codes become the new column.
df['PhoneLines'] = df.apply(phone_lines, axis=1)
# Simplifies PhoneService + MultipleLines into one ordinal feature.

In [None]:
# LongTermContract flag: 1 for 'One year' / 'Two year' contracts, 0 for anything else.
long_term_plans = ['One year', 'Two year']
df['LongTermContract'] = df['Contract'].isin(long_term_plans).astype(int)
# Longer commitments generally reduce churn likelihood.

In [None]:
# AutomaticPayment flag: 1 when the payment method text contains "automatic"
# (case-insensitive), otherwise 0.
df['AutoPayment'] = [
    int('automatic' in method.lower()) for method in df['PaymentMethod']
]
# Automatic payments usually indicate stability and lower churn risk.

In [None]:
# Tenure group buckets. include_lowest=True makes the first bucket include
# its left edge, so a tenure of 0 falls into '0-12'.
tenure_edges = [0, 12, 24, 48, 72, float('inf')]
tenure_labels = ['0-12', '12-24', '24-48', '48-72', '72+']
df['TenureGroup'] = pd.cut(
    df['tenure'],
    bins=tenure_edges,
    labels=tenure_labels,
    include_lowest=True,
)
# Groups customers by service length to capture lifecycle effects.

In [None]:
# High spender binary feature: 1 when MonthlyCharges is strictly above the
# 75th percentile of the column, otherwise 0.
threshold = df['MonthlyCharges'].quantile(0.75)
df['HighSpender'] = df['MonthlyCharges'].gt(threshold).astype(int)
# Flags customers paying in the top quartile of charges.

In [None]:
# Monthly charge per subscribed optional service. Customers with no optional
# services get 0 instead of a division-by-zero result.
has_services = df['ServicesCount'] > 0
charge_per_service = df['MonthlyCharges'] / df['ServicesCount']
df['SpendPerService'] = charge_per_service.where(has_services, 0)
# Normalizes spending intensity per subscribed service.

In [None]:
# Interaction: Contract x PaymentMethod, joined as "<contract>_<payment method>".
df['Contract_Payment'] = df['Contract'].str.cat(df['PaymentMethod'], sep='_')
# Captures risky combinations like month-to-month + electronic check.

# save and switch to new dataset with feature engineered columns

In [None]:
# Persist the engineered frame to CSV; index=False leaves the row index out of the file.
df.to_csv('customer_churn_w_fe_cols.csv', index=False)

# use the new df with fe cols as df2

In [None]:
# Reload the engineered dataset written by the previous cell and preview 10 rows.
# NOTE(review): dtypes are re-inferred on read, so TenureGroup (built with pd.cut)
# presumably comes back as plain strings rather than a categorical - confirm via df2.dtypes.
df2 = pd.read_csv('./customer_churn_w_fe_cols.csv')
df2.head(10)

## what df look like after FE

In [None]:
# (rows, columns) after feature engineering.
df2.shape

In [None]:
# Column labels, now including the engineered features.
df2.columns

In [None]:
# Dtypes as inferred when reading the CSV back in.
df2.dtypes

In [None]:
# Missing values per column after the save/reload round trip.
df2.isna().sum()

In [None]:
# Same unique-value report as before, extended with the engineered columns.
cols_to_check = [
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "PhoneService", "MultipleLines", "InternetService",
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies",
    "Contract", "PaperlessBilling", "PaymentMethod",
    "Churn",
    "Family", "PhoneLines", "LongTermContract", "AutoPayment",
    "TenureGroup", "HighSpender", "Contract_Payment",
]

for col in cols_to_check:
    # Silently skip names that are not present in df2.
    if col not in df2.columns:
        continue
    print(f"Unique vals in '{col}':", df2[col].unique(), "-" * 20, sep="\n")

# encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode only the text (object) columns. Numeric columns are already
# model-ready; encoding them too would add rank-coded duplicates of the same
# information (e.g. tenure next to tenure_encoded) to the feature matrix.
# Selecting by dtype also keeps the cell idempotent: the integer *_encoded
# columns it creates are never picked up again on a re-run.
object_cols = df2.select_dtypes(include='object').columns.tolist()

# One fitted encoder per column, kept so that codes can be mapped back to the
# original labels later via label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in object_cols:
    encoder = LabelEncoder()
    # Cast explicitly to int64 so the downstream select_dtypes(['int64', 'float64'])
    # picks the column up regardless of the platform's default integer width.
    df2[col + '_encoded'] = encoder.fit_transform(df2[col]).astype('int64')
    label_encoders[col] = encoder

# Preview instead of dumping the whole frame.
df2.head()

In [None]:
# Keep only the int64/float64 columns as the modelling frame.
numeric_dtypes = ['int64', 'float64']
final_df = df2.select_dtypes(include=numeric_dtypes)
final_df.head()

In [None]:
# Columns that survived the numeric-dtype filter.
final_df.columns

In [None]:
# NOTE(review): the frame returned by drop() is only displayed here - it is not
# assigned back, so final_df itself still contains every column listed below.
final_df.drop(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'Family', 'ServicesCount', 'PhoneLines', 'LongTermContract', 'AutoPayment', 'HighSpender', 'SpendPerService'], axis=1)

### what the data look like after encoding

In [None]:
# (rows, columns) of the modelling frame.
final_df.shape

# preprocessing

In [None]:
# data preprocessing imports
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV


# evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline


### X,Y train test split

#### Churn_encoded as target = y

In [None]:
# Features = every column except the encoded target; target = Churn_encoded.
x = final_df.drop(['Churn_encoded'], axis=1)
y = final_df.Churn_encoded

# 70/30 hold-out split. stratify=y keeps the churn / non-churn ratio the same in
# both partitions, so test metrics are not skewed by an unlucky draw on the
# minority class.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training partition only, then apply the same
# transformation to the test partition (no test statistics leak into training).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Shapes of the scaled train and test matrices.
X_train_scaled.shape, X_test_scaled.shape

### check the class balance (how many ppl churn(1) vs not(0) )

In [None]:
# Bar chart of how many rows fall into each encoded churn class.
ax = sns.countplot(x=y)
ax.set_title('Class Balance of Churn')
ax.set_xlabel('Churn (encoded)')
ax.set_ylabel('Count')
plt.show()

# Same information as numbers: absolute counts, then class proportions.
class_counts = y.value_counts()
class_shares = y.value_counts(normalize=True)
print(class_counts)
print(class_shares)

# modeling

In [None]:
# Baseline sweep: fit a range of classifiers on the scaled training data and
# compare them on the held-out test set.
# Not all of these need to be kept - the point is to pick the best performer.

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import recall_score, f1_score
from scipy.special import expit

import numpy as np

# Only used by XGBClassifier to handle class imbalance:
# scale_pos_weight = (number of negative samples) / (number of positive samples)
class_counts = np.bincount(y_train)
neg, pos = class_counts
scale_pos_weight = neg / pos

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Ridge Classifier': RidgeClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=scale_pos_weight),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}

results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    if not hasattr(model, "predict_proba"):
        # Models without predict_proba (e.g. RidgeClassifier): squash the
        # decision_function margin through a sigmoid to get a score for ROC-AUC.
        y_proba = expit(model.decision_function(X_test_scaled))
    else:
        # Score of the positive class (column 1).
        y_proba = model.predict_proba(X_test_scaled)[:, 1]

    metrics_row = {
        'model': name,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba)
    }
    results.append(metrics_row)

# One row per model, one column per metric.
results_df = pd.DataFrame(results)
results_df

# cross validation

In [None]:
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced"),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Define CV and scoring metrics
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Run CV for each model
cv_results = []
for name, model in models.items():
    # Scale inside a pipeline and cross-validate on the *unscaled* training
    # data: the scaler is then re-fit on the training part of every fold.
    # Cross-validating a matrix that was scaled up front would let each
    # validation fold's statistics leak into preprocessing.
    fold_safe_model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model),
    ])
    scores = cross_validate(fold_safe_model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    # Average each metric over the five validation folds.
    cv_results.append({
        "model": name,
        "accuracy": scores['test_accuracy'].mean(),
        "precision": scores['test_precision'].mean(),
        "recall": scores['test_recall'].mean(),
        "f1_score": scores['test_f1'].mean(),
        "roc_auc": scores['test_roc_auc'].mean()
    })

cv_results_df = pd.DataFrame(cv_results)
cv_results_df

# model justification

- Logistic Regression: Lower accuracy, but recall is high (≈ 0.79) >> better at catching churners, though at the cost of more false alarms
- Gradient Boosting: Higher accuracy, precision, and ROC-AUC >> overall stronger balanced model, but recall is lower (≈ 0.51)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Base classifier: liblinear supports both l1 and l2 penalties, and
# class_weight='balanced' compensates for the class imbalance.
logreg_estimator = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
)

# Scaling lives inside the pipeline, so it is re-fit within every CV split.
logreg_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', logreg_estimator),
])

# Search space: regularization strength and penalty type.
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10],
    'logreg__penalty': ['l1', 'l2'],
}

# 5-fold grid search that selects the configuration with the best recall,
# i.e. the one that catches the most churners.
grid = GridSearchCV(
    estimator=logreg_pipe,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fit on the raw (unscaled) training features; the pipeline handles scaling.
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best mean recall (CV):", grid.best_score_)

# Hold-out predictions from the refit best pipeline: hard labels and
# positive-class probabilities.
y_pred_logreg = grid.predict(X_test)
y_proba_logreg = grid.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the tuned model on the test set.
test_confusion = confusion_matrix(y_test, y_pred_logreg)
fig_cm, ax_cm = plt.subplots(figsize=(5, 4))
sns.heatmap(test_confusion, annot=True, fmt="d", cmap="Blues", ax=ax_cm)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
ax_cm.set_title("Confusion Matrix - Logistic Regression")
plt.show()

# ROC curve with the diagonal drawn as the random-guess reference.
fpr, tpr, thresholds = roc_curve(y_test, y_proba_logreg)
auc_score = roc_auc_score(y_test, y_proba_logreg)
fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr, tpr, label=f'ROC curve (AUC = {auc_score:.2f})')
ax_roc.plot([0, 1], [0, 1], 'r--', label='Random guess')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve - Logistic Regression')
ax_roc.legend(loc='lower right')
plt.show()

# Per-class precision / recall / F1 summary.
print(classification_report(y_test, y_pred_logreg))

# Customer groups at highest churn risk
Based on feature engineering + literature on this dataset, the high-risk groups are:
- Contract = Month-to-month > highest churn.
- PaymentMethod = Electronic check > disproportionately churn-prone.
- Low tenure (0–12 months) > customers who haven’t built loyalty yet.
- Few or no extra services (low ServicesCount) > less “stickiness” to the company.

# extract coefficients as odds ratios from tuned Logistic Regression

In [None]:
# Pull the fitted logistic-regression step out of the best pipeline found by the grid search.
tuned_pipeline = grid.best_estimator_
best_logreg = tuned_pipeline.named_steps['logreg']

# The pipeline was fit on X_train, so its columns label the coefficients.
feature_names = X_train.columns

# exp(coefficient) = odds ratio. The model sits behind a StandardScaler, so
# each ratio refers to a one-standard-deviation change in that feature.
coefficients = best_logreg.coef_[0]
odds_ratios = pd.Series(np.exp(coefficients), index=feature_names)
odds_ratios = odds_ratios.sort_values(ascending=False)

# Show the 15 largest odds ratios.
odds_ratios.head(15)

-	Values > 1 → increase churn odds.
-	Values < 1 → reduce churn odds.

# Actionable Insights
1.	Month-to-Month Contracts (highest churn risk)
	-	Customers on short-term contracts are unstable.
	-	Action: Offer bundled discounts for upgrading to annual contracts; implement loyalty rewards after 6 months to encourage long-term retention.
2.	Electronic Check Payments (high churn group)
	-	This group shows poor retention compared to auto-payment customers.
	-	Action: Provide small bill credits or convenience perks for switching to automatic payment methods (bank transfer, credit card).
3.	Low Tenure (0–12 months)
	-	New customers are far more likely to churn before establishing habits.
	-	Action: Implement a structured onboarding program (welcome discounts, check-ins, tutorials) to increase stickiness in the first year.
4.	Low ServicesCount (few subscribed services)
	-	Customers with only 1–2 services are less tied to the provider.
	-	Action: Run cross-sell campaigns — offer attractive bundle pricing (e.g., Internet + Streaming) to deepen engagement and make churn less likely.