In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib
import os

# For preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

# For modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# For evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, precision_recall_curve

# Set plot style for every figure created below.
# NOTE(review): sns.set(...) runs after plt.style.use('ggplot'), so it
# presumably overrides any settings the two have in common — confirm that
# both calls are really wanted.
plt.style.use('ggplot')
sns.set(style="whitegrid")
# Default figure size (width, height) for figures that pass no figsize
plt.rcParams['figure.figsize'] = (12, 8)

# Create the models directory if it doesn't exist. `parents=True` also creates
# missing intermediate directories should the location ever become nested;
# `exist_ok=True` keeps re-runs from failing.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Load the processed data
print("Loading data...")
data_path = Path('../data/processed/telco_churn_for_modeling.csv')
df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it explicitly so the preview is actually shown.
print(df.head())

# Build the feature matrix X and the target vector y
print("\nPreparing features and target...")
# Columns removed from the feature matrix (the target 'Churn' is removed too).
# NOTE(review): confirm that no remaining column is derived from the churn
# outcome, otherwise it would leak the target into the features.
drop_cols = [
    'CustomerID', 'Count', 'Quarter', 'Churn Label', 'Churn Value',
    'Churn Score Category', 'CLTV Category', 'Churn Category', 'Churn Reason',
    'Satisfaction Score Label', 'Customer Status', 'Lat Long',
]

X = df.drop(columns=[*drop_cols, 'Churn'])
y = df['Churn']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the two partitions; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Partition the feature columns by dtype: object columns are handled as
# categorical, int64/float64 columns as numerical.
# NOTE(review): ColumnTransformer drops columns that no transformer lists
# (its default `remainder`), so a column of any other dtype (e.g. bool) would
# presumably be left out of the model — confirm X contains none.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

print(f"\nCategorical features: {len(categorical_features)}")
print(f"Numerical features: {len(numerical_features)}")

# Numerical columns: fill missing values with the median, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform
# time instead of raising
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline. The order here
# (numerical first, categorical second) fixes the output column order.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Define model evaluation function
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on held-out data, print and plot the results.

    Prints accuracy, precision, recall, F1 and ROC AUC, shows a confusion
    matrix heatmap, prints the classification report and shows the ROC curve.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Features to score.
        y_test: True labels for ``X_test``.
            NOTE(review): the metric calls use their defaults, which presumably
            expect a binary target — confirm against the data.
        model_name: Label used in the printed output and the plot titles.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` (the computed scores) and ``model`` (the estimator that
        was passed in).
    """
    predicted_labels = model.predict(X_test)
    # Second column of predict_proba is used as the score for ROC purposes
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Compute every metric up front so printing and returning share the values
    scores = {
        'accuracy': accuracy_score(y_test, predicted_labels),
        'precision': precision_score(y_test, predicted_labels),
        'recall': recall_score(y_test, predicted_labels),
        'f1': f1_score(y_test, predicted_labels),
        'roc_auc': roc_auc_score(y_test, positive_scores),
    }

    print(f"\n{model_name} Performance:")
    display_labels = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1'),
        ('ROC AUC', 'roc_auc'),
    )
    for label, key in display_labels:
        print(f"{label}: {scores[key]:.4f}")

    # Confusion matrix heatmap (y axis: true labels, x axis: predicted labels)
    cm = confusion_matrix(y_test, predicted_labels)
    _, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
    cm_ax.set_title(f'Confusion Matrix - {model_name}')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_xlabel('Predicted Label')
    plt.show()

    # Per-class precision / recall / F1 table
    print("\nClassification Report:")
    print(classification_report(y_test, predicted_labels))

    # ROC curve, with the diagonal drawn as a dashed reference line
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    _, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {scores['roc_auc']:.4f})")
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'ROC Curve - {model_name}')
    roc_ax.legend()
    plt.show()

    return {**scores, 'model': model}

# 1. Logistic Regression Model
print("\nTraining Logistic Regression model...")
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
])

# Search space: regularization strength, penalty type and solver.
# Keys use the "<step>__<parameter>" form to reach into the pipeline.
lr_param_grid = dict(
    classifier__C=[0.01, 0.1, 1.0, 10.0],
    classifier__penalty=['l1', 'l2'],
    classifier__solver=['liblinear', 'saga'],
)

# Exhaustive 5-fold cross-validated search scored by ROC AUC;
# n_jobs=-1 lets the candidate fits run in parallel.
lr_grid_search = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
lr_grid_search.fit(X_train, y_train)

# Keep the best pipeline found by the search
lr_best_model = lr_grid_search.best_estimator_
print(f"Best parameters: {lr_grid_search.best_params_}")

# Score the tuned logistic regression on the held-out test set
lr_results = evaluate_model(
    lr_best_model,
    X_test,
    y_test,
    "Logistic Regression",
)

# 2. Random Forest Model
print("\nTraining Random Forest model...")
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Search space: forest size, tree depth and the split / leaf size limits
rf_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
}

# Build the 5-fold, ROC-AUC-scored search and fit it in one expression
# (fit returns the search object itself)
rf_grid_search = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

rf_best_model = rf_grid_search.best_estimator_
print(f"Best parameters: {rf_grid_search.best_params_}")

# Score the tuned random forest on the held-out test set
rf_results = evaluate_model(rf_best_model, X_test, y_test, model_name="Random Forest")

# 3. XGBoost Model
print("\nTraining XGBoost model...")
xgb_steps = [
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42)),
]
xgb_pipeline = Pipeline(steps=xgb_steps)

# Search space: number of boosting rounds, tree depth, learning rate and
# row subsampling ratio
xgb_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__subsample': [0.8, 1.0],
}

# Same search protocol as the other models: 5 folds, ROC AUC, parallel fits
xgb_search_settings = {'cv': 5, 'scoring': 'roc_auc', 'n_jobs': -1}
xgb_grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid=xgb_param_grid,
    **xgb_search_settings,
)

xgb_grid_search.fit(X_train, y_train)
print(f"Best parameters: {xgb_grid_search.best_params_}")
xgb_best_model = xgb_grid_search.best_estimator_

# Score the tuned XGBoost model on the held-out test set
xgb_results = evaluate_model(
    model=xgb_best_model,
    X_test=X_test,
    y_test=y_test,
    model_name="XGBoost",
)

# Compare models: collect the per-model result dicts under display names
models = {
    'Logistic Regression': lr_results,
    'Random Forest': rf_results,
    'XGBoost': xgb_results
}

# Map each comparison-table column to the matching key of a result dict
metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1',
    'ROC AUC': 'roc_auc',
}

# Create comparison dataframe: one row per model, one column per metric
comparison_df = pd.DataFrame({
    'Model': list(models),
    **{
        column: [result[key] for result in models.values()]
        for column, key in metric_keys.items()
    },
})

print("\nModel Comparison:")
# A bare expression is discarded here (it is not the last line of the cell),
# so the table has to be printed explicitly to appear under the header.
print(comparison_df.set_index('Model'))

# Visualize model comparison: one bar chart per metric on a 2x3 grid
# (five metrics, so the last grid slot stays empty)
plt.figure(figsize=(12, 8))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']
for i, metric in enumerate(metrics):
    plt.subplot(2, 3, i+1)
    sns.barplot(x='Model', y=metric, data=comparison_df, palette='viridis')
    plt.title(f'Model Comparison - {metric}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Identify the best model based on ROC AUC.
# NOTE(review): the ROC AUC values compared here were computed on the test
# set, so the winner's reported test metrics are optimistically biased.
# Consider selecting on each grid search's cross-validated `best_score_`.
best_model_name = comparison_df.loc[comparison_df['ROC AUC'].idxmax(), 'Model']
best_model = models[best_model_name]['model']
print(f"\nBest performing model: {best_model_name} (ROC AUC: {models[best_model_name]['roc_auc']:.4f})")

# Feature importance for the best model (only the tree-based models are
# handled here)
if best_model_name in ['Random Forest', 'XGBoost']:
    # Use the fitted preprocessor from inside the winning pipeline. It gets its
    # own name so the module-level `preprocessor` is not overwritten.
    fitted_preprocessor = best_model.named_steps['preprocessor']

    # Rebuild the post-preprocessing column names in the ColumnTransformer's
    # transformer order: numerical columns (names unchanged) come first,
    # followed by the one-hot encoded columns.
    feature_names = list(numerical_features)

    # Only query the encoder when there are categorical columns to expand
    if categorical_features:
        ohe = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
        categorical_features_encoded = ohe.get_feature_names_out(categorical_features)
        feature_names.extend(categorical_features_encoded)

    # Both supported classifiers expose `feature_importances_`, so no
    # per-model branching is needed
    importances = best_model.named_steps['classifier'].feature_importances_

    # Create a dataframe of feature importances
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })

    # Keep the 20 most important features, highest first
    feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False).head(20)

    # Plot feature importances as a horizontal bar chart
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
    plt.title(f'Top 20 Feature Importances - {best_model_name}')
    plt.tight_layout()
    plt.show()

# Persist the winning pipeline (preprocessing + classifier) with joblib
model_path = models_dir / 'best_churn_model.joblib'
print(f"\nSaving the best model ({best_model_name})...")
joblib.dump(best_model, model_path)
print(f"Model saved to {model_path}")

# Create a simple model card (Markdown) describing the selected model.
# The metrics are the ones evaluate_model computed for the best model.
best_results = models[best_model_name]
model_card = f"""
# Churn Prediction Model Card

## Model Details
- **Model Type:** {best_model_name}
- **Version:** 1.0
- **Date Created:** {pd.Timestamp.now().strftime('%Y-%m-%d')}

## Performance Metrics
- **Accuracy:** {best_results['accuracy']:.4f}
- **Precision:** {best_results['precision']:.4f}
- **Recall:** {best_results['recall']:.4f}
- **F1 Score:** {best_results['f1']:.4f}
- **ROC AUC:** {best_results['roc_auc']:.4f}

## Intended Use
- **Primary Use Case:** Predict customer churn for telecom services
- **Intended Users:** Business analysts, customer retention teams

## Training Data
- **Source:** Telco Customer Churn dataset
- **Size:** {X_train.shape[0]} training samples, {X_test.shape[0]} test samples
- **Features:** {X.shape[1]} input features (before preprocessing)

## Ethical Considerations
- The model should be used as a decision support tool, not as the sole basis for customer interventions.
- Regular monitoring is required to ensure the model remains fair and unbiased across different customer segments.

## Limitations
- The model is trained on historical data and may not capture new or emerging churn patterns.
- Performance may vary across different customer segments.

## Recommendations
- Deploy model in a monitoring framework to track performance over time.
- Retrain periodically with fresh data to maintain accuracy.
- Use model predictions alongside domain expertise for retention strategies.
"""

# Save the model card next to the model. The encoding is set explicitly so the
# file does not depend on the platform's default locale encoding.
with open(models_dir / 'model_card.md', 'w', encoding='utf-8') as f:
    f.write(model_card)

print("Model card created and saved.")

# Closing recap of the run: winner, its ROC AUC, and where artifacts went
summary_lines = [
    "\nModel Training Summary:",
    f"1. Best performing model: {best_model_name}",
    f"2. ROC AUC score: {models[best_model_name]['roc_auc']:.4f}",
    "3. Model and documentation saved to the models directory",
    "4. Next steps: Implement the model in the Flask API for real-time predictions",
]
print("\n".join(summary_lines))