# Objetivo

## Reduzir churn

A empresa de telecomunicações contratou a A3Data para avaliar o cenário de
churn elevado dos seus clientes e, uma vez que estamos falando de um produto
com custo elevado de setup (instalação), a empresa gostaria de uma estratégia
para reduzir esse churn.

# Imports iniciais

In [None]:
import pandas as pd
import numpy as np
from typing import Dict, Any, List


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix,
)

In [2]:
# Load the customer churn dataset; the path is relative to the working directory.
churn = pd.read_csv("data/customer_churn_verified.csv")

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
churn.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_bucket,charge_quantile,addon_count,ChurnFlag,high_value_customer
0,7569-NMZYQ,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,Yes,...,Yes,Bank transfer (automatic),118.75,8672.45,No,61m+,Q4,6,0,1
1,8984-HPEMB,Female,0,No,No,71,Yes,Yes,Fiber optic,Yes,...,Yes,Electronic check,118.65,8477.6,No,61m+,Q4,6,0,1
2,5989-AXPUC,Female,0,Yes,No,68,Yes,Yes,Fiber optic,Yes,...,No,Mailed check,118.6,7990.05,No,61m+,Q4,6,0,1
3,5734-EJKXG,Female,0,No,No,61,Yes,Yes,Fiber optic,Yes,...,Yes,Electronic check,118.6,7365.7,No,61m+,Q4,6,0,1
4,8199-ZLLSA,Male,0,No,No,67,Yes,Yes,Fiber optic,Yes,...,Yes,Bank transfer (automatic),118.35,7804.15,Yes,61m+,Q4,6,1,1


# Preparação

In [4]:
# Numeric features are declared by hand.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges", "addon_count"]

# Categorical features are inferred: every object-dtype column that is not
# already numeric, the customer identifier, or one of the target columns.
cat_cols = [
    col
    for col, dtype in churn.dtypes.items()
    if dtype == "object"
    and col not in [*num_cols, "customerID", "Churn", "ChurnFlag"]
]

In [5]:
# Encode the target as 0 ("No") / 1 ("Yes"), overwriting any existing
# ChurnFlag column. Values outside the mapping become NaN.
churn["ChurnFlag"] = churn["Churn"].map({"No": 0, "Yes": 1})

In [6]:
# SeniorCitizen and high_value_customer are treated as categorical even though
# the object-dtype filter does not select them (presumably stored as 0/1
# integers -- confirm against the data).
# Only add names that are missing, so re-running this cell does not feed
# duplicated columns to the ColumnTransformer.
cat_cols += [
    flag
    for flag in ("SeniorCitizen", "high_value_customer")
    if flag not in cat_cols
]

In [7]:
# Feature matrix: numeric columns first, followed by the categorical ones.
X = churn.loc[:, num_cols + cat_cols]
# Binary target (0 = "No", 1 = "Yes").
y = churn.loc[:, "ChurnFlag"]

In [8]:
# Seed passed as random_state to the train/test split for reproducibility.
random_number = 42

In [9]:
# Hold out 25% of the rows for evaluation; stratifying on y preserves the
# class proportions of the target in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=random_number
)

In [10]:
# Numeric columns: fill missing values with the median, then standardise.
numeric_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are encoded as all zeros.
categorical_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own pipeline. Columns in neither list
# are dropped (ColumnTransformer's default remainder behaviour).
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
)

# Modelagem

In [None]:
def train_eval(
    name: str,
    estimator,
    *,
    preprocess: ColumnTransformer,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    num_cols: List[str],
    cat_cols: List[str],
) -> Dict[str, Any]:
    """
    Fit ``estimator`` behind a preprocessing step and evaluate it on the hold-out.

    A fresh, unfitted copy of ``preprocess`` is used for every call, so the
    pipelines returned by successive calls never share (and never overwrite)
    each other's fitted transformers. The ``preprocess`` object passed in is
    left untouched.

    Parameters
    ----------
    name : str
        Label for the model (e.g. 'RandomForest').
    estimator : scikit-learn estimator
        Already-instantiated classifier. It is fitted in place and must
        implement ``predict_proba``.
    preprocess : ColumnTransformer
        Numeric/categorical transformations. Must contain a transformer named
        ``"cat"`` that is a Pipeline with an ``"encode"`` step exposing
        ``get_feature_names_out``.
    X_train, X_test, y_train, y_test
        Training and hold-out data.
    num_cols, cat_cols : list[str]
        Original numeric and categorical column names.

    Returns
    -------
    dict
        Keys: ``name``, ``model`` (fitted Pipeline), ``auc``, ``cm``
        (confusion matrix), ``report`` (classification report as a dict),
        ``feature_names`` (expanded names), ``importances`` (array or None),
        ``importance_df`` (DataFrame sorted by descending importance, or
        None), ``pred`` and ``proba`` (hold-out predictions and
        second-column class probabilities).
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of the file.
    from sklearn.base import clone

    # Pipeline does not copy its steps: without clone() every call would refit
    # the caller's single ColumnTransformer in place, silently changing the
    # "prep" step of pipelines returned by earlier calls.
    pipe = Pipeline([("prep", clone(preprocess)), ("clf", estimator)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    # Second column of predict_proba, i.e. the probability of the class
    # listed second in classes_.
    proba = pipe.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, proba)
    cm = confusion_matrix(y_test, pred)
    report = classification_report(y_test, pred, digits=3, output_dict=True)

    # Rebuild the expanded feature names in the order the ColumnTransformer
    # emits them: numeric block first, then the one-hot encoded block.
    # NOTE(review): assumes the numeric transformer outputs exactly one column
    # per entry of num_cols.
    encoder = (
        pipe.named_steps["prep"].named_transformers_["cat"].named_steps["encode"]
    )
    cat_feats = encoder.get_feature_names_out(cat_cols).tolist()
    feature_names = list(num_cols) + cat_feats

    clf = pipe.named_steps["clf"]
    importances = None

    if hasattr(clf, "feature_importances_"):
        # Tree ensembles expose importances directly.
        importances = clf.feature_importances_
    elif hasattr(clf, "coef_"):
        # Linear models (LogisticRegression): use coefficient magnitude.
        importances = np.abs(clf.coef_).ravel()

    if importances is not None:
        importance_df = (
            pd.DataFrame({"feature": feature_names, "importance": importances})
            .sort_values("importance", ascending=False)
            .reset_index(drop=True)
        )
    else:
        importance_df = None  # e.g. KNN exposes no importances

    return {
        "name": name,
        "model": pipe,
        "auc": auc,
        "cm": cm,
        "report": report,
        "feature_names": feature_names,
        "importances": importances,
        "importance_df": importance_df,
        "pred": pred,
        "proba": proba,
    }

In [12]:
# One result dict per trained model, in training order; later cells index
# this list by position.
models_info = []

In [None]:
# Logistic regression baseline. class_weight="balanced" reweights the classes
# inversely to their frequency. The liblinear solver accepts a random_state,
# so the shared seed is passed to keep the run reproducible.
models_info.append(
    train_eval(
        "LogReg",
        LogisticRegression(
            max_iter=3000,
            class_weight="balanced",
            solver="liblinear",
            random_state=random_number,
        ),
        preprocess=preprocess,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        num_cols=num_cols,
        cat_cols=cat_cols,
    )
)

In [14]:
# Logistic regression feature ranking (absolute coefficients, descending).
# Index 0 relies on LogReg being the first model appended.
models_info[0]["importance_df"]

Unnamed: 0,feature,importance
0,tenure_bucket_≤6m,0.932911
1,Contract_Two year,0.827167
2,Contract_Month-to-month,0.732737
3,tenure_bucket_61m+,0.687112
4,tenure,0.543585
5,InternetService_Fiber optic,0.539928
6,InternetService_DSL,0.488058
7,MonthlyCharges,0.425754
8,charge_quantile_Q1,0.272953
9,tenure_bucket_49-60m,0.258452


In [15]:
# Random forest with 1000 trees. It reuses the seed of the train/test split
# instead of a second hard-coded 42, and n_jobs=-1 fits the trees on all
# cores, as the XGBoost model already does.
models_info.append(
    train_eval(
        "RandomForest",
        RandomForestClassifier(
            n_estimators=1000,
            random_state=random_number,
            n_jobs=-1,
        ),
        preprocess=preprocess,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        num_cols=num_cols,
        cat_cols=cat_cols,
    )
)

In [16]:
# Random forest feature ranking (feature_importances_, descending).
# Index 1 relies on RandomForest being the second model appended.
models_info[1]["importance_df"]

Unnamed: 0,feature,importance
0,TotalCharges,0.133855
1,MonthlyCharges,0.117079
2,tenure,0.110197
3,Contract_Month-to-month,0.046068
4,addon_count,0.02746
5,PaymentMethod_Electronic check,0.027411
6,OnlineSecurity_No,0.026989
7,tenure_bucket_≤6m,0.026306
8,TechSupport_No,0.023605
9,InternetService_Fiber optic,0.021682


In [17]:
# Gradient-boosted trees (1000 boosting rounds, all cores). The seed comes
# from the shared random_number rather than a second hard-coded 42, so
# changing the seed in one place affects the split and the model together.
models_info.append(
    train_eval(
        "XGBoost",
        XGBClassifier(
            n_estimators=1000,
            random_state=random_number,
            n_jobs=-1,
        ),
        preprocess=preprocess,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        num_cols=num_cols,
        cat_cols=cat_cols,
    )
)

In [18]:
# XGBoost feature ranking (feature_importances_, descending).
# Index 2 relies on XGBoost being the third model appended.
models_info[2]["importance_df"]

Unnamed: 0,feature,importance
0,Contract_Month-to-month,0.312433
1,InternetService_Fiber optic,0.216978
2,OnlineSecurity_No,0.032429
3,Contract_Two year,0.029008
4,InternetService_DSL,0.025713
5,PhoneService_No,0.020268
6,TechSupport_No,0.016871
7,Contract_One year,0.015967
8,tenure,0.014778
9,OnlineBackup_No,0.013714


Há muitas variáveis no modelo; vamos precisar utilizar algum método de seleção de variáveis.

# Métricas

In [26]:
# One row per model: hold-out AUC plus the support-weighted average of
# precision, recall and F1 from the classification report, rounded to
# three decimals.
summary_rows = [
    {
        "Model": m["name"],
        "AUC": round(m["auc"], 3),
        "Precision": round(m["report"]["weighted avg"]["precision"], 3),
        "Recall": round(m["report"]["weighted avg"]["recall"], 3),
        "F1": round(m["report"]["weighted avg"]["f1-score"], 3),
    }
    for m in models_info
]
summary_churn = pd.DataFrame(summary_rows)

In [27]:
# Display the model comparison table.
summary_churn

Unnamed: 0,Model,AUC,Precision,Recall,F1
0,LogReg,0.846,0.807,0.763,0.774
1,RandomForest,0.819,0.784,0.796,0.786
2,XGBoost,0.8,0.759,0.767,0.762


Random Forest é o melhor modelo pela F1 (média ponderada: 0,786), embora a regressão logística tenha a maior AUC (0,846).

In [21]:
# avg_monthly = churn["MonthlyCharges"].mean()
# scenarios = {"3x": 3 * avg_monthly, "6x": 6 * avg_monthly, "12x": 12 * avg_monthly}

In [22]:
# def cost_sim(model_info, cost_install):
#     TP = ((model_info["pred"] == 1) & (y_test == 1)).sum()
#     FP_idx = (model_info["pred"] == 1) & (y_test == 0)
#     FP = FP_idx.sum()
#     retention_cost = 0.1 * avg_monthly

#     savings = TP * cost_install - FP * retention_cost
#     return savings

In [23]:
# savings_rows = []
# for m in models_info:
#     for label, cost in scenarios.items():
#         savings_rows.append(
#             {
#                 "Model": m["name"],
#                 "Scenario": label,
#                 "InstallCost": round(cost, 2),
#                 "NetSavings": round(cost_sim(m, cost), 2),
#             }
#         )

In [24]:
# savings_churn = pd.DataFrame(savings_rows)

In [25]:
# savings_churn