# Customer Churn Prediction Model Training

This notebook demonstrates the training and evaluation of machine learning models for predicting customer churn using XGBoost and SHAP interpretability.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, roc_curve
)
import xgboost as xgb
import shap
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings that may matter.
warnings.filterwarnings('ignore')

# Set style for better plots
# NOTE(review): the palette is set after the style sheet; presumably the order
# matters because a style sheet can reset the colour cycle — confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Generate Synthetic Training Data

In [None]:
def generate_synthetic_data(n_samples=5000):
    """Build a reproducible synthetic telecom-churn dataset.

    The global NumPy RNG is re-seeded with 42 on every call, so a given
    ``n_samples`` always produces the same frame. Random draws happen in a
    fixed per-customer order; reordering them would change the generated data.

    Args:
        n_samples: Number of customer rows to generate.

    Returns:
        pandas.DataFrame with one row per customer holding ``customerID``,
        demographic / service / account columns, ``MonthlyCharges``,
        ``TotalCharges`` and the ``Churn`` label ('Yes' or 'No').
    """
    np.random.seed(42)

    yes_no = ['Yes', 'No']

    # Internet add-ons in draw order, each with its [P(Yes), P(No)] weights.
    addon_weights = (
        ('OnlineSecurity', [0.3, 0.7]),
        ('OnlineBackup', [0.4, 0.6]),
        ('DeviceProtection', [0.3, 0.7]),
        ('TechSupport', [0.3, 0.7]),
        ('StreamingTV', [0.4, 0.6]),
        ('StreamingMovies', [0.4, 0.6]),
    )
    # Extra monthly charge ranges, applied in this order when the service is 'Yes'.
    service_surcharges = (
        ('StreamingTV', (5, 15)),
        ('StreamingMovies', (5, 15)),
        ('MultipleLines', (10, 20)),
    )
    internet_surcharge = {'Fiber optic': (20, 40), 'DSL': (10, 25)}

    # Churn-probability adjustments.
    tenure_risk = ((12, 0.3), (24, 0.15), (36, 0.05))  # first matching upper bound wins
    contract_risk = {'Month-to-month': 0.25, 'One year': 0.1}  # anything else: -0.05
    payment_risk = {'Electronic check': 0.15, 'Mailed check': 0.05}

    rows = []
    for idx in range(n_samples):
        row = {'customerID': f"CUST_{idx+1:06d}"}

        # Demographics
        row['gender'] = np.random.choice(['Male', 'Female'])
        row['SeniorCitizen'] = np.random.choice([0, 1], p=[0.85, 0.15])
        row['Partner'] = np.random.choice(yes_no, p=[0.6, 0.4])
        row['Dependents'] = np.random.choice(yes_no, p=[0.3, 0.7])

        # Service information (tenure is 1 to 72 months)
        tenure = np.random.randint(1, 73)
        row['tenure'] = tenure
        phone = np.random.choice(yes_no, p=[0.9, 0.1])
        row['PhoneService'] = phone
        if phone == 'Yes':
            row['MultipleLines'] = np.random.choice(yes_no, p=[0.4, 0.6])
        else:
            row['MultipleLines'] = 'No phone service'

        internet = np.random.choice(['DSL', 'Fiber optic', 'No'], p=[0.4, 0.3, 0.3])
        row['InternetService'] = internet
        has_internet = internet != 'No'
        for addon, weights in addon_weights:
            # Add-ons are only drawn for customers that have internet service.
            if has_internet:
                row[addon] = np.random.choice(yes_no, p=weights)
            else:
                row[addon] = 'No internet service'

        # Account information
        contract = np.random.choice(['Month-to-month', 'One year', 'Two year'], p=[0.5, 0.3, 0.2])
        row['Contract'] = contract
        row['PaperlessBilling'] = np.random.choice(yes_no, p=[0.6, 0.4])
        payment = np.random.choice(
            ['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'],
            p=[0.3, 0.2, 0.25, 0.25],
        )
        row['PaymentMethod'] = payment

        # Financial information: base charge plus per-service surcharges
        charges = np.random.uniform(20, 120)
        if internet in internet_surcharge:
            charges += np.random.uniform(*internet_surcharge[internet])
        for service, (low, high) in service_surcharges:
            if row[service] == 'Yes':
                charges += np.random.uniform(low, high)
        row['MonthlyCharges'] = charges
        row['TotalCharges'] = charges * tenure

        # Churn probability: base rate plus additive risk factors
        risk = 0.2

        # Newer customers are more likely to churn
        for max_tenure, bump in tenure_risk:
            if tenure <= max_tenure:
                risk += bump
                break

        risk += contract_risk.get(contract, -0.05)
        risk += payment_risk.get(payment, 0.0)

        # Fiber customers without support / security carry extra risk
        if internet == 'Fiber optic':
            if row['TechSupport'] == 'No':
                risk += 0.1
            if row['OnlineSecurity'] == 'No':
                risk += 0.08

        if charges > 80:
            risk += 0.1
        elif charges < 35:
            risk -= 0.05

        if row['SeniorCitizen'] == 1:
            risk += 0.05

        # Clamp to a valid probability before the final Bernoulli draw
        risk = max(0, min(1, risk))
        row['Churn'] = 'Yes' if np.random.random() < risk else 'No'

        rows.append(row)

    return pd.DataFrame(rows)

# Build the synthetic dataset used by every later cell and preview it
print("Generating synthetic training data...")
df = generate_synthetic_data(n_samples=5000)
print(f"Generated {len(df)} samples")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() prints the summary itself and returns None, so it is not
# wrapped in print() (which would emit a stray "None" line).
df.info()
print("\nTarget Distribution:")
print(df['Churn'].value_counts())
# A boolean mean gives the churn rate and does not raise if there are no 'Yes' rows.
churn_rate = (df['Churn'] == 'Yes').mean() * 100
print(f"Churn Rate: {churn_rate:.2f}%")

In [None]:
# Visualize target distribution
# value_counts() orders by frequency, so the bar order (and therefore which bar
# gets which colour) would flip whenever churners are the majority. Pin the
# order so green is always 'No' and red is always 'Yes'.
churn_counts = df['Churn'].value_counts().reindex(['No', 'Yes'], fill_value=0)

plt.figure(figsize=(8, 6))
churn_counts.plot(kind='bar', color=['lightgreen', 'lightcoral'])
plt.title('Customer Churn Distribution')
plt.xlabel('Churn Status')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Churn breakdown for six categorical features, one panel each
categorical_cols = ['Contract', 'PaymentMethod', 'InternetService', 'SeniorCitizen', 'Partner', 'Dependents']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, col in zip(axes, categorical_cols):
    if col != 'SeniorCitizen':
        # Row-normalised crosstab: share of churned vs. retained per category
        proportions = pd.crosstab(df[col], df['Churn'], normalize='index')
        proportions.plot(kind='bar', ax=ax)
        ax.set_ylabel('Proportion')
        ax.legend(['No Churn', 'Churn'])
        ax.tick_params(axis='x', rotation=45)
    else:
        # SeniorCitizen is a 0/1 numeric flag, so plot the churn rate per value
        churn_by_col = df.groupby(col)['Churn'].apply(lambda x: (x == 'Yes').mean())
        churn_by_col.plot(kind='bar', ax=ax, color=['skyblue', 'orange'])
        ax.set_ylabel('Churn Rate')
        ax.tick_params(axis='x', rotation=0)
    ax.set_title(f'Churn Rate by {col}')
    ax.set_xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of the numeric features, split by churn status
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

is_churned = df['Churn'] == 'Yes'
is_retained = df['Churn'] == 'No'

# Top row: overlaid histograms
churned = df.loc[is_churned, 'tenure']
retained = df.loc[is_retained, 'tenure']
churned_charges = df.loc[is_churned, 'MonthlyCharges']
retained_charges = df.loc[is_retained, 'MonthlyCharges']

histogram_specs = (
    (axes[0, 0], [retained, churned], 'Tenure Distribution', 'Tenure (months)'),
    (axes[0, 1], [retained_charges, churned_charges], 'Monthly Charges Distribution', 'Monthly Charges'),
)
for ax, groups, title, xlabel in histogram_specs:
    ax.hist(groups, bins=20, alpha=0.7, label=['Retained', 'Churned'], color=['green', 'red'])
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()

# Bottom row: box plots grouped by churn status
boxplot_specs = (
    (axes[1, 0], 'tenure', 'Tenure by Churn Status', 'Tenure (months)'),
    (axes[1, 1], 'MonthlyCharges', 'Monthly Charges by Churn Status', 'Monthly Charges'),
)
for ax, column, title, ylabel in boxplot_specs:
    df.boxplot(column=column, by='Churn', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Churn Status')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
def create_features(df):
    """Derive additional model features from the raw customer columns.

    The input frame is not modified; a copy with extra columns is returned.

    Args:
        df: DataFrame containing at least ``tenure``, ``MonthlyCharges``,
            ``TotalCharges``, ``SeniorCitizen``, ``Contract``,
            ``PaymentMethod`` and the eight service columns listed in
            ``service_cols`` below.

    Returns:
        A new DataFrame with these columns appended: ``AvgMonthlyCharges``,
        ``ServiceCount``, ``PricePerService``, ``TenureGroup``,
        ``MonthlyChargesGroup``, ``ContractNumeric``, ``PaymentMethodRisk``,
        ``SeniorHighCharges`` and ``TenureToChargesRatio``.
    """
    df = df.copy()

    # Average monthly charges (total charges / tenure); tenure 0 is treated as 1
    # to avoid division by zero.
    df['AvgMonthlyCharges'] = df['TotalCharges'] / df['tenure'].replace(0, 1)

    # Service count (number of services whose value is exactly 'Yes').
    service_cols = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
    # Vectorised comparison instead of a per-row Python function.
    df['ServiceCount'] = (df[service_cols] == 'Yes').sum(axis=1)

    # Price per service (+1 keeps the denominator non-zero).
    df['PricePerService'] = df['MonthlyCharges'] / (df['ServiceCount'] + 1)

    # Tenure categories. include_lowest puts tenure 0 into the first bucket and
    # the open-ended top edge keeps unusually long tenures from becoming NaN
    # (they fall under the '49-72' label).
    df['TenureGroup'] = pd.cut(df['tenure'],
                               bins=[0, 12, 24, 48, np.inf],
                               labels=['0-12', '13-24', '25-48', '49-72'],
                               include_lowest=True)

    # Monthly charges categories. The top bin is open-ended: charges above 120
    # (which the synthetic generator produces once surcharges are added) would
    # otherwise be assigned NaN instead of 'Premium'.
    df['MonthlyChargesGroup'] = pd.cut(df['MonthlyCharges'],
                                       bins=[0, 35, 65, 95, np.inf],
                                       labels=['Low', 'Medium', 'High', 'Premium'],
                                       include_lowest=True)

    # Contract length as an ordinal number.
    contract_mapping = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
    df['ContractNumeric'] = df['Contract'].map(contract_mapping)

    # Payment method risk score (higher = riskier).
    payment_risk = {'Electronic check': 3, 'Mailed check': 2, 'Bank transfer': 1, 'Credit card': 1}
    df['PaymentMethodRisk'] = df['PaymentMethod'].map(payment_risk)

    # Senior citizen interaction: boolean flag for seniors paying more than 80.
    df['SeniorHighCharges'] = (df['SeniorCitizen'] == 1) & (df['MonthlyCharges'] > 80)

    # Tenure to charges ratio; a zero charge is treated as 1 to avoid division by zero.
    df['TenureToChargesRatio'] = df['tenure'] / df['MonthlyCharges'].replace(0, 1)

    return df

# Derive the engineered columns and list everything the frame now contains
df_features = create_features(df)
print("New features created:")
print(list(df_features.columns))
df_features.head()

## 4. Data Preprocessing

In [None]:
def preprocess_data(df):
    """Turn the feature frame into numeric arrays for model training.

    Args:
        df: DataFrame that contains a ``Churn`` column with 'Yes'/'No' values,
            a ``customerID`` column, and the feature columns.

    Returns:
        Tuple ``(X, y, label_encoders)``:
            X: 2-D float NumPy array of features, columns in the order of
               ``df`` with ``Churn`` and ``customerID`` removed.
            y: 1-D NumPy array of the target (1 for 'Yes', 0 for 'No').
            label_encoders: dict mapping each encoded column name to its
               fitted ``LabelEncoder``.
    """
    # Separate features and target
    X = df.drop(['Churn', 'customerID'], axis=1)
    y = df['Churn'].map({'Yes': 1, 'No': 0})

    # Encode every non-numeric column. Selecting only 'object' would skip the
    # pandas 'category' columns produced by pd.cut, leaving raw strings in X
    # and making any numeric transformer (e.g. StandardScaler) fail.
    categorical_columns = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        # astype(str) gives the encoder uniformly-typed input; a missing
        # category becomes the literal label 'nan'. Apply the same cast before
        # calling transform() at inference time.
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le

    # Cast to float so bool / int / float columns do not yield an object array.
    return X.to_numpy(dtype=float), y.values, label_encoders

# Encode the engineered frame into model-ready arrays
X, y, encoders = preprocess_data(df_features)

feature_columns = [c for c in df_features.columns if c not in ('Churn', 'customerID')]
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Features: {feature_columns}")

## 5. Model Training and Evaluation

In [None]:
# Hold out 20% of the rows for testing, keeping the class balance in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_name, split in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set size: {split.shape[0]}")

In [None]:
# Fit the XGBoost classifier and score it on the held-out split
print("Training XGBoost model...")

xgb_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'eval_metric': 'logloss',
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

# Hard labels and positive-class probabilities for the test set
y_pred = xgb_model.predict(X_test_scaled)
y_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Label-based metrics use y_pred; AUC needs the probabilities
label_scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1_score': f1_score,
}
metrics = {name: scorer(y_test, y_pred) for name, scorer in label_scorers.items()}
metrics['auc_roc'] = roc_auc_score(y_test, y_pred_proba)

print("Model training completed!")
report_rows = (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1-Score', 'f1_score'),
    ('AUC-ROC', 'auc_roc'),
)
for display_name, key in report_rows:
    print(f"{display_name}: {metrics[key]:.4f}")

## 6. Model Performance Visualization

In [None]:
# Confusion Matrix
class_labels = ['No Churn', 'Churn']

plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
# pyplot has no xticklabels()/yticklabels() functions, so the tick labels are
# handed to seaborn directly instead of being set afterwards.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix - XGBoost Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Print detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_labels))

In [None]:
# ROC curve of the model against the chance diagonal
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc_score = metrics['auc_roc']

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, label=f'XGBoost (AUC = {auc_score:.3f})', linewidth=2)
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=1)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - XGBoost Model')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

# Rank features by the importances reported by the fitted XGBoost model
feature_names = df_features.drop(['Churn', 'customerID'], axis=1).columns.tolist()
feature_importance = xgb_model.feature_importances_

importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features, largest on top
top_features = importance_df.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Feature Importances (XGBoost)')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

print("Top 10 Most Important Features:")
print(importance_df.head(10))

## 7. SHAP Analysis and Interpretability

In [None]:
# Build a tree explainer for the fitted model and explain the test set
print("Initializing SHAP explainer...")
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test_scaled)

# If a list of per-class arrays comes back, keep the class-1 (churn) entry;
# otherwise the returned array is used as is.
shap_values_churn = shap_values[1] if isinstance(shap_values, list) else shap_values

print("SHAP explainer initialized successfully!")
print(f"SHAP values shape: {shap_values_churn.shape}")

In [None]:
# Two global SHAP views: the default summary plot, then the bar variant
summary_variants = (
    ('SHAP Summary Plot', {}),
    ('SHAP Feature Importance (Bar Plot)', {'plot_type': "bar"}),
)

for plot_title, extra_kwargs in summary_variants:
    plt.figure(figsize=(12, 8))
    shap.summary_plot(
        shap_values_churn,
        X_test_scaled,
        feature_names=feature_names,
        show=False,
        **extra_kwargs,
    )
    plt.title(plot_title)
    plt.tight_layout()
    plt.show()

In [None]:
# Analyze a specific customer
sample_idx = 0  # First test sample

print(f"Customer {sample_idx} prediction: {'Churn' if y_pred[sample_idx] == 1 else 'No Churn'}")
# y_test is a NumPy array (preprocess_data returns y.values), so it is indexed
# positionally; it has no .iloc accessor.
print(f"Actual: {'Churn' if y_test[sample_idx] == 1 else 'No Churn'}")
print(f"Prediction probability: {y_pred_proba[sample_idx]:.4f}")

# The explainer's expected value may be a scalar or a per-class list/array;
# for the per-class form take the last entry (class 1 in the binary case).
base_value = explainer.expected_value
if np.ndim(base_value) > 0:
    base_value = np.ravel(base_value)[-1]

# Force plot for this customer
plt.figure(figsize=(12, 6))
shap.force_plot(
    base_value,
    shap_values_churn[sample_idx],
    X_test_scaled[sample_idx],
    feature_names=feature_names,
    matplotlib=True,
    show=False
)
plt.title(f'SHAP Force Plot for Customer {sample_idx}')
plt.tight_layout()
plt.show()

## 8. Save the Model

In [None]:
import joblib
import os

# Make sure the output directory exists before writing anything
os.makedirs('../models', exist_ok=True)

# One bundle with everything needed to reproduce a prediction and its explanation
model_data = dict(
    model=xgb_model,
    scaler=scaler,
    label_encoders=encoders,
    feature_names=feature_names,
    metrics=metrics,
    explainer=explainer,
)
joblib.dump(model_data, '../models/churn_prediction_model.joblib')
print("Model saved successfully!")

# Individual artifacts, written in a fixed order
individual_artifacts = (
    ('model.joblib', xgb_model),
    ('scaler.joblib', scaler),
    ('label_encoders.joblib', encoders),
    ('shap_explainer.joblib', explainer),
)
for file_name, artifact in individual_artifacts:
    joblib.dump(artifact, f'../models/{file_name}')
print("Individual components saved!")

## 9. Test Model with Sample Customers

In [None]:
# Test with sample customers
# Each dict uses the same raw column names as the generated training frame,
# except that there is no 'Churn' key (that is what gets predicted).
test_customers = [
    # TEST_001: 2-month tenure, month-to-month contract, electronic check,
    # fiber optic without online security or tech support. These are all
    # factors that raise churn probability in generate_synthetic_data.
    {
        'customerID': 'TEST_001',
        'gender': 'Male',
        'SeniorCitizen': 0,
        'Partner': 'No',
        'Dependents': 'No',
        'tenure': 2,
        'PhoneService': 'Yes',
        'MultipleLines': 'No',
        'InternetService': 'Fiber optic',
        'OnlineSecurity': 'No',
        'OnlineBackup': 'No',
        'DeviceProtection': 'No',
        'TechSupport': 'No',
        'StreamingTV': 'Yes',
        'StreamingMovies': 'No',
        'Contract': 'Month-to-month',
        'PaperlessBilling': 'Yes',
        'PaymentMethod': 'Electronic check',
        'MonthlyCharges': 95.5,
        'TotalCharges': 191.0
    },
    # TEST_002: 60-month tenure, two-year contract, credit card payment, DSL
    # with most add-on services enabled.
    {
        'customerID': 'TEST_002',
        'gender': 'Female',
        'SeniorCitizen': 0,
        'Partner': 'Yes',
        'Dependents': 'Yes',
        'tenure': 60,
        'PhoneService': 'Yes',
        'MultipleLines': 'Yes',
        'InternetService': 'DSL',
        'OnlineSecurity': 'Yes',
        'OnlineBackup': 'Yes',
        'DeviceProtection': 'Yes',
        'TechSupport': 'Yes',
        'StreamingTV': 'Yes',
        'StreamingMovies': 'No',
        'Contract': 'Two year',
        'PaperlessBilling': 'No',
        'PaymentMethod': 'Credit card',
        'MonthlyCharges': 65.0,
        'TotalCharges': 3900.0
    }
]

# Score each sample customer with the same pipeline used for training:
# feature engineering -> label encoding -> scaling -> model.
for i, customer in enumerate(test_customers):
    print(f"\nTest Customer {i+1} ({customer['customerID']}):")
    print(f"Tenure: {customer['tenure']} months")
    print(f"Monthly Charges: ${customer['MonthlyCharges']}")
    print(f"Contract: {customer['Contract']}")
    print(f"Payment Method: {customer['PaymentMethod']}")

    # Reuse create_features so every engineered column in feature_names is
    # present. A hand-copied subset would omit TenureGroup,
    # MonthlyChargesGroup, SeniorHighCharges and TenureToChargesRatio, and
    # the feature selection below would raise KeyError.
    df_test = create_features(pd.DataFrame([customer]))

    # Apply every encoder fitted during training, whatever the column dtype.
    # astype(str) leaves plain string columns unchanged and turns categorical
    # values into their string labels before the lookup.
    for col, encoder in encoders.items():
        if col in df_test.columns:
            df_test[col] = encoder.transform(df_test[col].astype(str))

    # Select features in training order and hand the scaler a plain float
    # array, matching what it was fitted on.
    X_test_sample = df_test[feature_names].to_numpy(dtype=float)
    X_test_scaled_sample = scaler.transform(X_test_sample)

    # Make prediction
    prediction = xgb_model.predict(X_test_scaled_sample)[0]
    probability = xgb_model.predict_proba(X_test_scaled_sample)[0, 1]

    print(f"Prediction: {'Churn' if prediction == 1 else 'No Churn'}")
    print(f"Churn Probability: {probability:.4f}")
    print(f"Risk Level: {'High' if probability > 0.7 else 'Medium' if probability > 0.4 else 'Low'}")

## Summary

This notebook demonstrated:

1. **Data Generation**: Created synthetic customer churn data with realistic patterns
2. **Exploratory Analysis**: Analyzed feature distributions and their relationship with churn
3. **Feature Engineering**: Created additional features like service count, price per service, and risk scores
4. **Model Training**: Trained an XGBoost model with fixed, manually chosen hyperparameters (no hyperparameter search was performed)
5. **Model Evaluation**: Used multiple metrics including accuracy, precision, recall, F1-score, and AUC-ROC
6. **Feature Importance**: Identified the most important features for churn prediction
7. **SHAP Analysis**: Provided model interpretability using SHAP values
8. **Model Saving**: Saved the trained model for deployment

The XGBoost model with SHAP interpretability will be used in the API deployment for real-time churn prediction with explanations.