# Customer Churn Prediction - Rigorous Data Preprocessing

**Objective**: Prepare a flawless dataset for machine learning.
**Steps**:
1. **Data Cleaning**: Strict type verification and missing value handling.
2. **Feature Engineering**: Creating domain-relevant features.
3. **Quality Assurance**: Verifying data distribution and class balance.
4. **Pipeline Construction**: Robust One-Hot Encoding and Standard Scaling.
5. **Validation**: Asserting no leakage and no missing values in processed data.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Load dataset
df = pd.read_csv('../data/raw/Telco-Customer-Churn.csv')

## 1. Cleaning & Engineering

In [2]:
# Force numeric types
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Handle Nulls: 11 missing TotalCharges are likely new customers (tenure=0). Fill with 0.
df['TotalCharges'] = df['TotalCharges'].fillna(0)

# Drop IDs
df.drop(columns=['customerID'], inplace=True)

# Target Encoding
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# --- Feature Engineering ---
df['TenureGroup'] = pd.cut(df['tenure'], bins=[0, 12, 24, 48, 60, 72], labels=['0-1 Yr', '1-2 Yrs', '2-4 Yrs', '4-5 Yrs', '5+ Yrs'], right=False)
df['Monthly_Tenure_Interaction'] = df['MonthlyCharges'] * df['tenure']
df['Ratio_Total_Monthly'] = df['TotalCharges'] / (df['MonthlyCharges'] + 1)

services = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
df['ServiceCount'] = df[services].apply(lambda x: x.isin(['Yes', 'Fiber optic', 'DSL']).sum(), axis=1)

print("Feature Engineering Complete.")

Feature Engineering Complete.


## 2. Data Quality Check (Pre-Split)

In [3]:
# Assert no Nulls (except potentially in categorical which pipeline handles)
assert df['TotalCharges'].isnull().sum() == 0, "TotalCharges still has nulls!"
assert df['Churn'].isnull().sum() == 0, "Target variable has nulls!"

# Check Class Imbalance
churn_rate = df['Churn'].mean()
print(f"Churn Rate: {churn_rate:.2%}")
if churn_rate < 0.2:
    print("WARNING: Severe Class Imbalance. Models must use class_weight='balanced' or SMOTE.")

Churn Rate: 26.54%


## 3. Pipeline Construction & Splitting

In [4]:
X = df.drop('Churn', axis=1)
y = df['Churn']

# Stratified Split to maintain Churn Rate in Train/Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Monthly_Tenure_Interaction', 'Ratio_Total_Monthly', 'ServiceCount']
categorical_cols = [c for c in X.columns if c not in numerical_cols]

# Cast categorical to string
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# Build Optimized Pipeline
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())  # Crucial for SVM/Linear models
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ]
)

# Fit & Transform
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Recover Feature Names
ohe_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
all_features = numerical_cols + list(ohe_features)

X_train_df = pd.DataFrame(X_train_processed, columns=all_features)
X_test_df = pd.DataFrame(X_test_processed, columns=all_features)

print(f"Final Feature Count: {len(all_features)}")
X_train_df.head()

Final Feature Count: 49


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Monthly_Tenure_Interaction,Ratio_Total_Monthly,ServiceCount,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,TenureGroup_0-1 Yr,TenureGroup_1-2 Yrs,TenureGroup_2-4 Yrs,TenureGroup_4-5 Yrs,TenureGroup_5+ Yrs,TenureGroup_nan
0,0.102371,-0.521976,-0.262257,-0.253464,0.084111,-0.072199,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.711743,0.337478,-0.503635,-0.51492,-0.692027,-0.503095,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.793155,-0.809013,-0.749883,-0.778067,-0.730239,-0.072199,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.26398,0.284384,-0.172722,-0.170483,-0.259904,0.789595,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-1.281624,-0.676279,-0.989374,-0.989954,-1.27744,-0.933992,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## 4. Final Quality Check on Processed Data

In [5]:
# 1. Check for NaNs
assert X_train_df.isnull().sum().sum() == 0, "Processed Train data has NaNs!"
assert X_test_df.isnull().sum().sum() == 0, "Processed Test data has NaNs!"

# 2. Check Scaling (Mean should be ~0, Std ~1 for numerical)
print("Numerical Feature Stats (Expect Mean~0, Std~1):")
print(X_train_df[numerical_cols].describe().loc[['mean', 'std']])

Numerical Feature Stats (Expect Mean~0, Std~1):
            tenure  MonthlyCharges  TotalCharges  Monthly_Tenure_Interaction  \
mean -1.008935e-17   -2.402527e-16  2.522338e-17                1.135052e-17   
std   1.000089e+00    1.000089e+00  1.000089e+00                1.000089e+00   

      Ratio_Total_Monthly  ServiceCount  
mean         7.567015e-17 -1.034159e-16  
std          1.000089e+00  1.000089e+00  


In [6]:
# Save
X_train_df.to_csv('../data/processed/X_train.csv', index=False)
X_test_df.to_csv('../data/processed/X_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)
joblib.dump(preprocessor, '../models/preprocessor.joblib')
print("Data Saved.")

Data Saved.
