# Customer Churn prediction

In [None]:
!pip install pandas jinja2 --quiet

In [None]:
import pandas as pd

In [None]:
cc_df = pd.read_csv('data/synth_customer_churn.csv')

In [None]:
def style_dataframe(df: pd.DataFrame):
    return df.style.set_table_styles([
        # Header styling
        {"selector": "thead th", "props": [
            ("background-color", "#f2f2f2"),  
            ("color", "black"),              
            ("font-weight", "bold"),         
            ("border", "1px solid #ddd"),    
            ("text-align", "center")        
        ]},
        # Body styling
        {"selector": "tbody td", "props": [
            ("background-color", "white"),   
            ("color", "black"),              
            ("border", "1px solid #ddd"),    
            ("text-align", "center")         
        ]}
    ]).set_properties(**{
        "border-collapse": "collapse",      
        "font-size": "12px",                
        "font-family": "Arial, sans-serif" 
    })

## EDA (Exploratory data analysis)

In [None]:
!pip install modelviz seaborn pandas --quiet

In [None]:
from modelviz.histogram import plot_feature_histograms 

In [None]:
plot_feature_histograms(cc_df)

## Identify missing values

In [None]:
# Custom module in supporting GitHub repository, using functions built
# In previous chapters
from multiclass.class_utils.missing_values import missing_values_summarizer

In [None]:
props_df, _ = missing_values_summarizer(df=cc_df)
style_dataframe(props_df)

## Class Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_class_imbalance(df, column, 
                              palette="Set2",
                              plt_title='Class imbalance',
                              xlabel='Class', 
                              ylabel='Frequency',
                              label_fontsize=12,
                              title_fontsize=14,
                              tick_fontsize=12,
                              text_fontsize=12,
                              text_halignment='center',
                              figsize=(8,6)):
    
    class_counts = df[column].value_counts()

    plt.figure(figsize=figsize)
    bar_plot = sns.barplot(
        x=class_counts.index,
        y=class_counts.values,
        hue=class_counts.values,
        palette=palette
    )

    # Add the value labels to the bars
    for i, value in enumerate(class_counts.values):
        bar_plot.text(i, value + 0.5, str(value), 
                      ha=text_halignment, 
                      fontsize=text_fontsize)
    
    plt.title(plt_title, fontsize=title_fontsize)
    plt.xlabel(xlabel, fontsize=label_fontsize)
    plt.ylabel(ylabel, fontsize=label_fontsize)
    plt.xticks(fontsize=tick_fontsize)
    plt.yticks(fontsize=tick_fontsize)
    sns.despine()
    bar_plot.get_legend().remove() if bar_plot.get_legend() else None
    plt.show()

In [None]:
visualize_class_imbalance(df=cc_df, 
                          column='ChurnCategory')

## Split the data

In [None]:
from sklearn.model_selection import train_test_split
X = cc_df.drop(columns=['ChurnCategory', 'CustomerID'])  
X['CustomerSupportCalls'].astype(float)
X_cols = X.columns

In [None]:
y = cc_df['ChurnCategory']

In [None]:
label_mapping = {
    'Low Risk': 0,
    'Medium Risk': 1,
    'High Risk': 2
}
y_mapped = y.map(label_mapping)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y_mapped, 
                                                    test_size=0.2,
                                                    random_state=42, 
                                                    stratify=y_mapped)

## Missing value imputation

In [None]:
X_train_df = pd.DataFrame(X_train, columns=X_cols)
X_test_df = pd.DataFrame(X_test, columns=X_cols)
len(X_train_df)

In [None]:
len(X_test_df)

### Modal Impute

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd

def modal_impute(train_df, test_df, columns):
    imputer = SimpleImputer(strategy="most_frequent")
    train_df[columns] = imputer.fit_transform(train_df[columns])
    test_df[columns] = imputer.transform(test_df[columns])
    return train_df, test_df

In [None]:
cols_to_impute = list(X_train_df.select_dtypes(include=["object"]).columns)
cols_to_impute

In [None]:
modal_X_train, modal_X_test = modal_impute(X_train_df, 
                                           X_test_df, 
                                           cols_to_impute)

In [None]:
props_df, _ = missing_values_summarizer(df=modal_X_train)

In [None]:
style_dataframe(props_df)

### Impute and scale with `ColumnTransformer`

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (MinMaxScaler, 
                                   StandardScaler, 
                                   RobustScaler)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('age_pipeline', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', MinMaxScaler())
        ]), ['Age']),
        ('tenure_pipeline', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ]), ['Tenure']),
        ('monthlycharges_pipeline', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), ['MonthlyCharges']),
        ('serviceusage_pipeline', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', MinMaxScaler())
        ]), ['ServiceUsage']),
        ('customer_service_pipeline', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), ['CustomerSupportCalls'])
    ],
    remainder='passthrough'  
)

In [None]:
X_train_transformed = preprocessor.fit_transform(modal_X_train)
X_test_transformed = preprocessor.transform(modal_X_test)

In [None]:
pipeline_cols = ['Age', 'Tenure', 'MonthlyCharges', 
                 'ServiceUsage', 'CustomerSupportCalls',
                 'Gender', 'ContractType', 'PaymentMethod']

In [None]:
type(X_train_transformed)

In [None]:
X_train_transformed_df = pd.DataFrame(X_train_transformed, 
                                      columns=pipeline_cols, 
                                      index=modal_X_train.index)

X_test_transformed_df = pd.DataFrame(X_test_transformed, 
                                     columns=pipeline_cols, 
                                     index=modal_X_test.index)


In [None]:
style_dataframe(X_train_transformed_df.head(10))

## One Hot encoding

In [None]:
cat_cols = ['Gender', 'ContractType', 'PaymentMethod']

In [None]:
train_dummies = pd.get_dummies(X_train_transformed_df[cat_cols], drop_first=True).astype(int)
test_dummies = pd.get_dummies(X_test_transformed_df[cat_cols], drop_first=True).astype(int)

In [None]:
if len(train_dummies.columns) != len(test_dummies.columns):
    assert ValueError('Expected the columns to match when encoding')

In [None]:
X_train_final = pd.concat(
    [X_train_transformed_df.drop(columns=cat_cols, axis=1), 
     train_dummies], axis=1
     )

In [None]:
X_test_final = pd.concat(
    [X_test_transformed_df.drop(columns=cat_cols, axis=1),
    test_dummies], axis=1)

In [None]:
if len(X_test_final.columns) != len(X_train_final.columns):
    assert ValueError("The number of columns of the train and test data frame should match")
else: 
    print(f'The number of columns in the training set is {len(X_train_final.columns)} and in the testing set is: {len(X_test_final.columns)}')

In [None]:
import numpy as np
X_train = np.array(X_train_final)
X_test = np.array(X_test_final)

## Class rebalancing

In [None]:
!pip install imbalanced-learn --quiet

### Oversample

In [None]:
from imblearn.combine import SMOTEENN
from collections import Counter
increase_class_1 = 1.4
increase_class_2 = 2.5
original_counts = Counter(y_train)
target_counts = {
    1: int(original_counts[1] * increase_class_1),   
    2: int(original_counts[2] * increase_class_2)    
}

In [None]:
smote_enn = SMOTEENN(sampling_strategy=target_counts, random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)

### Undersample

In [None]:
from imblearn.under_sampling import RandomUnderSampler
reduction_proportion = 0.5
original_counts = Counter(y_train)
target_majority_size = int(original_counts[0] * reduction_proportion)
undersampler = RandomUnderSampler(sampling_strategy={0: target_majority_size}, random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_resampled, y_train_resampled)

## Evaluation metrics

In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
)

def compute_metrics(y_true, y_pred):
    """Compute metrics for multiclass classification."""
    class_report = classification_report(y_true, y_pred, output_dict=True)

    # Extract precision and recall per class
    per_clss_metrics = {
        f"Precision per (Class {cls})": class_report[cls]["precision"]
        for cls in class_report if cls not in ["accuracy", "macro avg", "weighted avg"]
    }
    per_clss_metrics.update({
        f"Recall per (Class {cls})": class_report[cls]["recall"]
        for cls in class_report if cls not in ["accuracy", "macro avg", "weighted avg"]
    })

    # Combine all metrics
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision (Micro)": precision_score(y_true, y_pred, average='micro'),
        "Precision (Macro)": precision_score(y_true, y_pred, average='macro'),
        "Recall (Micro)": recall_score(y_true, y_pred, average='micro'),
        "Recall (Macro)": recall_score(y_true, y_pred, average='macro'),
        "F1 Score (Micro)": f1_score(y_true, y_pred, average='micro'),
        "F1 Score (Macro)": f1_score(y_true, y_pred, average='macro'),
        "Classification Report": class_report,
        "Confusion Matrix": confusion_matrix(y_true, y_pred),
        **per_clss_metrics  # Add per-class precision and recall
    }
    return metrics


## Classifier instantiate

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000, 
                                             class_weight='balanced'),
    "SVM Balanced": SVC(probability=True, 
                        class_weight='balanced'),
    "RandomForest":RandomForestClassifier(n_estimators=100, 
                                          class_weight='balanced', 
                                          random_state=42),
    "Balanced CatBoost": CatBoostClassifier(auto_class_weights='Balanced', 
                                            random_state=42)
}

## Determine strategies

### Create custom Threshold Classifier strategy

In [None]:
class ThresholdClassifier:
    def __init__(self, base_classifier, thresholds):
        self.base_classifier = base_classifier
        self.thresholds = thresholds
    
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.base_classifier.fit(X, y)
    
    def predict(self, X):
        probabilities = self.base_classifier.predict_proba(X) 
        predictions = []
        for prob in probabilities:
            for i, threshold in enumerate(self.thresholds):
                if prob[i] >= threshold:
                    predictions.append(self.classes_[i])
                    break
            else:
                predictions.append(self.classes_[np.argmax(prob)])  
        return np.array(predictions)

In [None]:
from sklearn.multiclass import (OneVsRestClassifier, OneVsOneClassifier, 
                                OutputCodeClassifier)
results = [] 
cms = []
strategies = [
    ("OvR", OneVsRestClassifier),
    ("OvO", OneVsOneClassifier),
    ("ECOC", lambda clf: OutputCodeClassifier(clf, 
                                              code_size=2, 
                                              random_state=42)),
    ("CustomThreshold", lambda clf: ThresholdClassifier(clf, 
                                                        thresholds=[0.6, 0.5, 0.2])) 
]

## Train and evaluation loop

In [None]:
def train_and_eval_class(classifiers, strategies, 
                         X_train, y_train, 
                         X_test, 
                         y_test):
    results = []
    cms = []

    for classifier_name, clf in classifiers.items():
        for strategy_name, strategy in strategies:
            print(f'Applying strategy: {strategy_name} to Classifier: {classifier_name}')
            model = strategy(clf)
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

            # Compute metrics
            metrics = compute_metrics(y_test, preds)
            result = {
                'Classifier': classifier_name,
                'Strategy': strategy_name,
                "Accuracy": metrics['Accuracy'],
                'Precision (micro)': metrics['Precision (Micro)'],
                "Precision (macro)": metrics['Precision (Macro)'],
                'Recall (micro)': metrics['Recall (Micro)'],
                "Recall (macro)": metrics['Recall (Macro)'],
                "F1 Score (micro)": metrics['F1 Score (Micro)'],
                "F1 Score (macro)": metrics['F1 Score (Macro)']
            }

            # Add precision per class
            for key in metrics:
                if key.startswith("Precision per (Class"):
                    result[key] = metrics[key]
            # Add recall per class
            for key in metrics:
                if key.startswith("Recall per (Class"):
                    result[key] = metrics[key]

            results.append(result)

            cms.append({
                "Classifier": f'{classifier_name} - {strategy_name}',
                "ConfusionMatrix": metrics['Confusion Matrix']
            })

            print(f"Classification Report for {classifier_name} ({strategy_name}):\n")
            print(pd.DataFrame(metrics["Classification Report"]).transpose())
            print("\n")

    results_df = pd.DataFrame(results)
    return results_df, cms


In [None]:
results_df, cms = train_and_eval_class(classifiers=classifiers, 
                                       strategies=strategies,
                                       X_train=X_train_resampled, 
                                       y_train=y_train_resampled,
                                       y_test=y_test, 
                                       X_test=X_test)

In [None]:
style_dataframe(results_df)

## Confusion Matrix 

In [None]:
from modelviz.confusion_matrix import plot_confusion_matrix

In [None]:
best_model = 'RandomForest - OvO'

In [None]:
classes = ['Low risk', 'Medium risk', 'High risk']
for idx, cm in enumerate(cms):
    if cm['Classifier'] == best_model:
        print(f"Model: {cm['Classifier']} found at index: {idx}")
        plot_confusion_matrix(cm['ConfusionMatrix'], 
                              model_name=cm['Classifier'], 
                              classes=classes)