#### Importing Libraries

In [1]:
# Model Development - Customer Churn Prediction
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Model Development Started")

Model Development Started


#### Load Processed Data

In [None]:
# Load processed data
df = pd.read_csv('../data/processed/telco_churn_final_processed.csv')
print(f"Dataset shape: {df.shape}")

# Prepare features and target
X = df.drop('Churn_binary', axis=1) # Feature 
y = df['Churn_binary'] # Target

print(f"Features: {X.shape[1]}")
print(f"Target distribution:")
print(y.value_counts())

Dataset shape: (7043, 30)
Features: 29
Target distribution:
Churn_binary
0    5174
1    1869
Name: count, dtype: int64


#### Train-Test Split

In [3]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Train target distribution: {y_train.value_counts()}")

Training set: (5634, 29)
Test set: (1409, 29)
Train target distribution: Churn_binary
0    4139
1    1495
Name: count, dtype: int64


#### Handle Class Imbalance

In [4]:
# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"Original training shape: {X_train.shape}")
print(f"Balanced training shape: {X_train_balanced.shape}")
print(f"Balanced target distribution:")
print(pd.Series(y_train_balanced).value_counts())

Original training shape: (5634, 29)
Balanced training shape: (8278, 29)
Balanced target distribution:
Churn_binary
0    4139
1    4139
Name: count, dtype: int64


#### Initialize Models


In [5]:
# Define models to test
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1)
}

print(f"Models to train: {list(models.keys())}")

Models to train: ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM']


#### Train and Evaluate Models

In [6]:
# Train models and collect results
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    
    # Train on balanced data
    model.fit(X_train_balanced, y_train_balanced)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    
    # Store results
    results[name] = {
        'model': model,
        'roc_auc': roc_auc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }
    
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"CV Score: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print("-" * 30)

Training Logistic Regression...
ROC AUC: 0.8426
CV Score: 0.8485 ± 0.0133
------------------------------
Training Random Forest...
ROC AUC: 0.8222
CV Score: 0.8229 ± 0.0121
------------------------------
Training XGBoost...
ROC AUC: 0.8099
CV Score: 0.8213 ± 0.0104
------------------------------
Training LightGBM...
ROC AUC: 0.8273
CV Score: 0.8312 ± 0.0138
------------------------------


#### Compare Model Performance

In [7]:
# Create comparison table
comparison_df = pd.DataFrame({
    'Model': list(results.keys()),
    'ROC_AUC': [results[name]['roc_auc'] for name in results.keys()],
    'CV_Mean': [results[name]['cv_mean'] for name in results.keys()],
    'CV_Std': [results[name]['cv_std'] for name in results.keys()]
})

comparison_df = comparison_df.sort_values('ROC_AUC', ascending=False)
print("Model Performance Comparison:")
print(comparison_df)

# Identify best model
best_model_name = comparison_df.iloc[0]['Model']
best_model = results[best_model_name]['model']
print(f"\nBest Model: {best_model_name}")

Model Performance Comparison:
                 Model   ROC_AUC   CV_Mean    CV_Std
0  Logistic Regression  0.842639  0.848497  0.013337
3             LightGBM  0.827278  0.831208  0.013766
1        Random Forest  0.822187  0.822893  0.012126
2              XGBoost  0.809940  0.821297  0.010363

Best Model: Logistic Regression


####  Detailed Best Model Analysis

In [8]:
# Detailed evaluation of best model
print(f"Detailed Results for {best_model_name}:")
print("=" * 40)

# Classification report
print(classification_report(y_test, results[best_model_name]['y_pred']))

# Confusion matrix
cm = confusion_matrix(y_test, results[best_model_name]['y_pred'])
print(f"\nConfusion Matrix:")
print(cm)

Detailed Results for Logistic Regression:
              precision    recall  f1-score   support

           0       0.88      0.82      0.85      1035
           1       0.57      0.69      0.63       374

    accuracy                           0.78      1409
   macro avg       0.73      0.75      0.74      1409
weighted avg       0.80      0.78      0.79      1409


Confusion Matrix:
[[844 191]
 [117 257]]


#### Feature Importance Analysis

In [9]:
# Feature importance (for tree-based models)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("Top 10 Most Important Features:")
    print(feature_importance.head(10))
else:
    print("Feature importance not available for this model type")

Feature importance not available for this model type


####  Save Models and Results

In [10]:
# Create directories
import os
os.makedirs('../models', exist_ok=True)
os.makedirs('../reports', exist_ok=True)

# Save best model
model_filename = f"../models/best_model_{best_model_name.lower().replace(' ', '_')}.joblib"
joblib.dump(best_model, model_filename)

# Save comparison results
comparison_df.to_csv('../reports/model_comparison.csv', index=False)

print("Model Development Complete")
print(f"Best model saved: {model_filename}")
print(f"Results saved: ../reports/model_comparison.csv")

Model Development Complete
Best model saved: ../models/best_model_logistic_regression.joblib
Results saved: ../reports/model_comparison.csv
