# Customer Churn — Model Experiments

## Objective
This notebook performs baseline model experiments to identify the best-performing
model for customer churn prediction.

We will:
- Load feature-engineered data
- Split data into train & test sets
- Apply encoding & scaling
- Train multiple models
- Evaluate models using ROC-AUC
- Compare performance

Final model selection happens here.


In [46]:
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score,classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [47]:
# Location of the feature-engineered dataset, relative to this notebook.
DATA_PATH = "../data/processed/featured_telco_churn.csv"

# Read the CSV, report its dimensions, then preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)
df.head()


Dataset Shape: (7032, 27)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,TenureGroup,MonthlyChargeLevel,TotalServices,HasInternet,SupportRisk,ContractRisk,AvgMonthlySpend
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0-1 Year,Low,1,Yes,HighRisk,High,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,2-4 Years,Medium,3,Yes,LowRisk,Medium,55.573529
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0-1 Year,Medium,3,Yes,LowRisk,High,54.075
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,2-4 Years,Low,3,Yes,LowRisk,Medium,40.905556
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,0-1 Year,Medium,1,Yes,HighRisk,High,75.825


In [48]:
# Separate features and target
TARGET = "Churn"

X = df.drop(columns=[TARGET])
y = df[TARGET].map({"Yes": 1, "No": 0})

# Series.map() turns every label that is not a key of the mapping into NaN.
# Fail loudly here instead of handing a partially-NaN target to the models.
unmapped_labels = sorted(df.loc[y.isna(), TARGET].astype(str).unique())
assert not unmapped_labels, f"Unexpected {TARGET} labels: {unmapped_labels}"

X.shape, y.shape



((7032, 26), (7032,))

In [49]:
# Identify the feature types.
# Select by explicit dtype family instead of "object" vs. "everything else":
# once the categorical columns are cast to `category`, an "exclude object"
# rule would classify them as numerical if this cell were re-run.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()

categorical_cols, numerical_cols


(['gender',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod',
  'TenureGroup',
  'MonthlyChargeLevel',
  'HasInternet',
  'SupportRisk',
  'ContractRisk'],
 ['SeniorCitizen',
  'tenure',
  'MonthlyCharges',
  'TotalCharges',
  'TotalServices',
  'AvgMonthlySpend'])

In [50]:
# Cast every categorical feature to pandas' `category` dtype in a single pass.
X = X.astype({name: "category" for name in categorical_cols})


In [51]:
# Print the dtype and non-null count of every column in X.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   gender              7032 non-null   category
 1   SeniorCitizen       7032 non-null   int64   
 2   Partner             7032 non-null   category
 3   Dependents          7032 non-null   category
 4   tenure              7032 non-null   int64   
 5   PhoneService        7032 non-null   category
 6   MultipleLines       7032 non-null   category
 7   InternetService     7032 non-null   category
 8   OnlineSecurity      7032 non-null   category
 9   OnlineBackup        7032 non-null   category
 10  DeviceProtection    7032 non-null   category
 11  TechSupport         7032 non-null   category
 12  StreamingTV         7032 non-null   category
 13  StreamingMovies     7032 non-null   category
 14  Contract            7032 non-null   category
 15  PaperlessBilling    7032 non-null   ca

In [52]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

X_train.shape, X_test.shape

((5625, 26), (1407, 26))

In [53]:
# 1. Group the categorical features by how they are encoded.
nominal_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
                'PaymentMethod', 'HasInternet']

ordinal_cols = ['Contract', 'TenureGroup', 'MonthlyChargeLevel', 'SupportRisk', 'ContractRisk']

# 2. Explicit category orders, each listed from "lowest" to "highest".
contract_order = ['Month-to-month', 'One year', 'Two year']
ordinal_orders = {'Contract': contract_order}
# NOTE(review): ordinal columns without an entry in `ordinal_orders` are still
# ranked alphabetically by OrdinalEncoder (so e.g. 'High' < 'Low' < 'Medium').
# Add their label orders here once the full label sets are confirmed against the data.

# Columns with a declared order come first so that, with 'Contract' leading
# `ordinal_cols`, the encoded feature order is the same as with a single encoder.
explicit_ordinal_cols = [col for col in ordinal_cols if col in ordinal_orders]
auto_ordinal_cols = [col for col in ordinal_cols if col not in ordinal_orders]

# 3. Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        # With drop='first', a category unseen during fit encodes as all zeros,
        # i.e. identically to the dropped baseline category.
        ("nominal", OneHotEncoder(drop='first', handle_unknown='ignore'), nominal_cols),
        (
            "ordinal_explicit",
            OrdinalEncoder(categories=[ordinal_orders[col] for col in explicit_ordinal_cols]),
            explicit_ordinal_cols,
        ),
        ("ordinal", OrdinalEncoder(), auto_ordinal_cols),
    ]
)

In [61]:
# Create the function to evaluate a model
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` behind the shared preprocessing recipe and report its test ROC-AUC.

    Parameters
    ----------
    name : str
        Label printed above the score.
    model : estimator
        Classifier exposing ``fit`` and ``predict_proba``. It is placed in the
        pipeline as-is, so it is fitted in place.
    X_train, X_test : pandas.DataFrame
        Feature frames containing the columns the module-level ``preprocessor``
        was configured with.
    y_train, y_test : array-like
        Binary targets aligned with the corresponding feature frame.

    Returns
    -------
    tuple
        ``(roc_auc, pipeline)``: the ROC-AUC on the test set and the fitted pipeline.
    """
    pipeline = Pipeline(
        steps=[
            # Pipeline.fit fits its steps in place. Cloning gives each returned
            # pipeline its own preprocessor instead of every pipeline sharing
            # (and refitting) the single module-level instance.
            ("preprocessor", clone(preprocessor)),
            ("model", model)
        ]
    )

    pipeline.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class (churn = 1).
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print(f"\n{name}")
    print("-" * 40)
    print(f"ROC-AUC: {roc_auc:.4f}")

    return roc_auc, pipeline


In [63]:
# Logistic-regression baseline: balanced class weights, with a raised
# iteration cap for the lbfgs solver.
lr = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=1000)

In [64]:
# Fit the logistic-regression pipeline and score it on the held-out set.
lr_score, lr_pipeline = evaluate_model(
    name="Logistic Regression",
    model=lr,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)


Logistic Regression
----------------------------------------
ROC-AUC: 0.8339


In [57]:

# CatBoost: 300 boosting rounds of depth-6 trees, trained silently with a fixed seed.
cb = CatBoostClassifier(
    loss_function='Logloss',
    iterations=300,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    verbose=False,
)

# Fit the CatBoost pipeline and score it on the held-out set.
cb_score, cb_pipeline = evaluate_model(
    name="CatBoost",
    model=cb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)



CatBoost
----------------------------------------
ROC-AUC: 0.8240


In [59]:
# Random forest: 300 depth-limited trees with balanced class weights and a fixed seed.
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)

# Fit the random-forest pipeline and score it on the held-out set.
rf_score, rf_pipeline = evaluate_model(
    name="Random Forest",
    model=rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)


Random Forest
----------------------------------------
ROC-AUC: 0.8325


In [60]:
# XGBoost: 300 depth-6 trees with row and column subsampling and a fixed seed.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter, so it only produced a warning.
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

# Fit the XGBoost pipeline and score it on the held-out set.
xgb_score, xgb_pipeline = evaluate_model(
    "XGBoost", xgb, X_train, X_test, y_train, y_test
)

Parameters: { "use_label_encoder" } are not used.




XGBoost
----------------------------------------
ROC-AUC: 0.8075
