In [0]:
# Catalogue of churn personas. Every entry has a numeric id, a machine-readable
# code, a short label and a longer description. Ids are assigned from the
# position of each tuple in the list below (0 through 6).
PERSONA_DEFINITIONS = [
    {"id": position, "code": code, "label": label, "desc": desc}
    for position, (code, label, desc) in enumerate(
        [
            (
                "price_sensitive",
                "Better deal / pricing",
                "Cliente sensível a preço, percebe que paga mais do que recebe ou encontra oferta melhor.",
            ),
            (
                "tech_speed",
                "Technical issues / speed",
                "Problemas de velocidade ou performance prejudicam a experiência, comum em fibra e clientes novos.",
            ),
            (
                "reliability_outages",
                "Service reliability / outages",
                "Quedas frequentes ou instabilidade tornam o serviço pouco confiável.",
            ),
            (
                "billing_issues",
                "Payment / billing issues",
                "Erros de cobrança, valores inesperados ou dificuldade para pagar.",
            ),
            (
                "personal_reasons",
                "Personal reasons",
                "Motivos externos ao serviço (mudança, corte de despesas, eventos pessoais).",
            ),
            (
                "product_mismatch",
                "Product / plan mismatch",
                "Plano não atende à necessidade real ou expectativa de uso.",
            ),
            (
                "onboarding_early_tenure",
                "Tenure baixo / onboarding",
                "Clientes 0–9 meses com problemas de onboarding ou expectativas não atendidas.",
            ),
        ]
    )
]

In [0]:
def classify_persona(row):
    """Assign a churn persona code to one customer record using ordered rules.

    Rules are evaluated top to bottom and the first match wins, so their
    order is part of the behaviour.

    Args:
        row: Mapping-like object exposing ``.get`` (e.g. a dict or a pandas
            Series). Keys read: ``MonthlyCharges``, ``TechSupport``,
            ``tenure``, ``InternetService``, ``PaymentMethod`` and
            ``CustomerFeedback``. Missing keys fall back to 0 / empty string.

    Returns:
        str: one of ``"price_sensitive"``, ``"tech_speed"``,
        ``"reliability_outages"``, ``"billing_issues"``,
        ``"onboarding_early_tenure"``, ``"product_mismatch"`` or
        ``"personal_reasons"``. ``"product_mismatch"`` is also the fallback
        when no rule matches.

    Raises:
        ValueError: if ``MonthlyCharges`` or ``tenure`` holds a non-empty
            value that cannot be converted to ``float``.
    """
    # Coerce to float so numeric strings compare correctly; None, 0 and ""
    # all become 0.0 (same convention used for tenure below).
    monthly = float(row.get("MonthlyCharges", 0) or 0)
    tenure = float(row.get("tenure", 0) or 0)
    tech_support = row.get("TechSupport", "")
    internet = row.get("InternetService", "")
    payment = row.get("PaymentMethod", "")

    # Missing text can arrive as None or as a float NaN (which is truthy), so
    # only genuine strings are searched; anything else counts as no feedback.
    raw_feedback = row.get("CustomerFeedback", "")
    feedback = raw_feedback.lower() if isinstance(raw_feedback, str) else ""

    # 1) Price sensitive
    if monthly > 100 or "caro" in feedback or "preço" in feedback:
        return "price_sensitive"

    # 2) Performance / speed
    if "lento" in feedback or "internet" in feedback or tech_support == "No":
        return "tech_speed"

    # 3) Outages / instability
    if "cair" in feedback or "queda" in feedback or "oscilação" in feedback:
        return "reliability_outages"

    # 4) Billing issues
    if "cobrança" in feedback or "boleto" in feedback or payment == "Electronic check":
        return "billing_issues"

    # 5) Low tenure (onboarding problem)
    if tenure <= 9:
        return "onboarding_early_tenure"

    # 6) Product mismatch
    if internet == "DSL" or "plano" in feedback or "não atende" in feedback:
        return "product_mismatch"

    # 7) Personal reasons
    if "mudança" in feedback or "me mudar" in feedback:
        return "personal_reasons"

    # Fallback when nothing above matched.
    return "product_mismatch"

In [0]:
# Retention playbook: persona code -> ordered list of suggested actions.
# NOTE: a later cell in this notebook assigns ACTIONS again with a shorter
# list per persona; once that cell runs, this mapping is replaced.
ACTIONS = dict(
    price_sensitive=[
        "Oferecer desconto temporário",
        "Migrar para plano mais barato",
        "Reforçar benefícios inclusos no plano",
    ],
    tech_speed=[
        "Enviar análise técnica remotamente",
        "Agendar visita técnica",
        "Recomendar upgrade de velocidade",
    ],
    reliability_outages=[
        "Revisar estabilidade da rede local",
        "Trocar roteador ou cabo",
        "Ativar monitoramento proativo",
    ],
    billing_issues=[
        "Revisar fatura",
        "Regularizar método de pagamento",
        "Oferecer orientação de cobrança",
    ],
    personal_reasons=[
        "Oferecer suspensão temporária",
        "Facilitar portabilidade / mudança de endereço",
    ],
    product_mismatch=[
        "Recomendar plano mais adequado ao uso",
        "Ajustar recomendação inicial",
    ],
    onboarding_early_tenure=[
        "Contato ativo para entender problemas",
        "Reforçar benefícios e tirar dúvidas técnicas",
        "Garantir acompanhamento nos primeiros meses",
    ],
)

In [0]:
import mlflow.pyfunc
import pandas as pd
import json


# ----- PERSONAS WITH DESCRIPTIONS -----
# Persona code -> one-sentence description returned alongside each prediction.
PERSONA_DEFS = dict(
    [
        ("price_sensitive", "Cliente sensível a preço; percebe que paga mais do que recebe."),
        ("tech_speed", "Problemas de velocidade ou performance prejudicam a experiência."),
        ("reliability_outages", "Quedas frequentes tornam o serviço instável."),
        ("billing_issues", "Erros de cobrança ou dificuldade de pagamento."),
        ("personal_reasons", "Motivos externos ao serviço (mudança, despesas)."),
        ("product_mismatch", "Plano não atende às expectativas ou uso real."),
        ("onboarding_early_tenure", "Cliente novo (0–9 meses) com onboarding ruim."),
    ]
)

# Persona code -> suggested retention actions, serialised to JSON by the
# pyfunc model defined further down in this cell.
ACTIONS = {
    "price_sensitive": [
        "Oferecer desconto",
        "Migrar para plano adequado",
    ],
    "tech_speed": [
        "Checar velocidade",
        "Enviar técnico",
        "Revisar modem",
    ],
    "reliability_outages": [
        "Revisar rede",
        "Trocar equipamento",
    ],
    "billing_issues": [
        "Revisar fatura",
        "Orientar pagamento",
    ],
    "personal_reasons": [
        "Oferecer suspensão",
        "Facilitar mudança de endereço",
    ],
    "product_mismatch": [
        "Recomendar plano mais adequado",
    ],
    "onboarding_early_tenure": [
        "Contato ativo",
        "Reforçar benefícios",
    ],
}


# ----- RULE-BASED CLASSIFIER -----
def classify(row):
    """Return the persona code for one customer record.

    Rules are checked in order and the first match wins.

    Args:
        row: Mapping-like object exposing ``.get`` (e.g. a dict or a pandas
            Series). Keys read: ``CustomerFeedback``, ``tenure``,
            ``MonthlyCharges``, ``Contract``, ``PaymentMethod``,
            ``InternetService`` and ``TechSupport``. Missing keys fall back
            to 0 / empty string.

    Returns:
        str: one of ``"billing_issues"``, ``"personal_reasons"``,
        ``"tech_speed"``, ``"reliability_outages"``, ``"price_sensitive"``,
        ``"onboarding_early_tenure"`` or ``"product_mismatch"`` (also the
        fallback).

    Raises:
        ValueError: if ``tenure`` or ``MonthlyCharges`` holds a non-empty
            value that cannot be converted to ``float``.
    """
    # Missing text can arrive as None or as a float NaN; NaN is truthy, so
    # `(value or "")` does not neutralise it. Only real strings are searched.
    raw_feedback = row.get("CustomerFeedback")
    feedback = raw_feedback.lower() if isinstance(raw_feedback, str) else ""

    tenure = float(row.get("tenure", 0) or 0)
    monthly = float(row.get("MonthlyCharges", 0) or 0)
    contract = row.get("Contract", "")
    payment = row.get("PaymentMethod", "")
    internet = row.get("InternetService", "")
    tech_support = row.get("TechSupport", "")

    # billing
    if payment == "Electronic check" or "cobrança" in feedback:
        return "billing_issues"

    # personal
    if "mudança" in feedback or "despesa" in feedback:
        return "personal_reasons"

    # speed / technical
    if "lento" in feedback or "velocidade" in feedback or (internet == "Fiber optic" and tech_support == "No"):
        return "tech_speed"

    # outages
    if "queda" in feedback or "oscilação" in feedback or "cair" in feedback:
        return "reliability_outages"

    # price sensitive
    if monthly > 90 or "caro" in feedback or contract == "Month-to-month":
        return "price_sensitive"

    # onboarding
    if tenure <= 9:
        return "onboarding_early_tenure"

    # mismatch: feedback mentioning "plano" / "não atende" and the fallback
    # both map to the same persona, so no separate keyword check is needed.
    return "product_mismatch"


# ----- MLflow MODEL -----
class PersonaRuleModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around the rule-based ``classify`` function.

    For every input row it returns the persona code, its description from
    ``PERSONA_DEFS`` and the matching ``ACTIONS`` list encoded as a JSON string.
    """

    # Fixed output schema, so an empty input still yields the expected columns
    # instead of a column-less DataFrame.
    OUTPUT_COLUMNS = ["persona", "description", "actions"]

    def predict(self, context, model_input):
        """Classify each row of ``model_input``.

        Args:
            context: Passed in by the caller; not used by this model.
            model_input: Object exposing ``iterrows()`` (a pandas DataFrame)
                whose rows are handed to ``classify``.

        Returns:
            pandas.DataFrame with columns ``persona``, ``description`` and
            ``actions`` (a JSON-encoded list), one row per input row, on a
            fresh default index.
        """
        records = []

        for _, row in model_input.iterrows():
            persona = classify(row)

            records.append({
                "persona": persona,
                "description": PERSONA_DEFS[persona],
                # ensure_ascii=False keeps accented characters readable.
                "actions": json.dumps(ACTIONS[persona], ensure_ascii=False)
            })

        return pd.DataFrame(records, columns=self.OUTPUT_COLUMNS)

In [0]:
# =====================================================
# 1) Imports
# =====================================================
import pandas as pd
import numpy as np
import mlflow
import mlflow.pyfunc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier


# =====================================================
# 2) Load datasets
# =====================================================

df_hist = pd.read_csv("./history_clean.csv")

# Churn reasons: align the key name with df_hist for the merge, and expose the
# churn category explicitly under the name used as the modelling target.
df_reason = pd.read_csv("./churn_reason_final.csv")
df_reason = df_reason.rename(columns={"customer_id": "customerID"})
df_reason = df_reason.assign(persona=df_reason["churn_category"])


# =====================================================
# 3) Merge
# =====================================================
# Inner join: only customers present in both files are kept.
df = pd.merge(
    df_hist,
    df_reason[["customerID", "persona", "CustomerFeedback_clean"]],
    how="inner",
    on="customerID",
)

print("Merged dataset:", df.shape)
df.head()

In [0]:
# Name of the label column for the supervised persona model.
TARGET = "persona"
# Rows without a label are discarded.
df = df.dropna(subset=[TARGET])

In [0]:
# Missing-value count per column (displayed as the cell output).
df.isnull().sum()

In [0]:
# Columns removed before modelling; names absent from df are skipped.
cols_to_drop = ["Churn", "CustomerFeedback", "CustomerFeedback_clean_x"]

df = df.drop(columns=cols_to_drop, errors="ignore")

In [0]:
# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
import numpy as np
import mlflow
import mlflow.pyfunc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# ============================================================
# LOAD DATA (nothing is loaded here: `df` comes from the merge cell above)
# ============================================================


# ============================================================
# TARGET
# ============================================================
TARGET = "persona"


# ============================================================
# DROP COLUMNS THAT MUST NOT REACH THE MODEL
# ============================================================
# An earlier cell already removes these columns from `df`, so an unconditional
# drop raises KeyError when the notebook is run top to bottom.
# errors="ignore" skips names that are already gone, making this step safe to
# run after that cell and safe to re-run.
df = df.drop(
    columns=[
        "Churn",
        "CustomerFeedback",
        "CustomerFeedback_clean_x",    # the *_y text column is the one used as a feature
    ],
    errors="ignore",
)


# ============================================================
# FEATURES
# ============================================================
# Columns that get standardised.
numeric_features = [
    "SeniorCitizen",
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
    "MonthlyIncome",
]

# Columns that go through the one-hot encoder.
binary_features = [
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "PaperlessBilling",
]

# Columns passed through to the model unchanged.
dummy_features = [
    "gender_Male",
    "InternetService_Fiber optic",
    "InternetService_No",
    "Contract_One year",
    "Contract_Two year",
    "PaymentMethod_Credit card (automatic)",
    "PaymentMethod_Electronic check",
    "PaymentMethod_Mailed check",
]

# Free-text column fed to the TF-IDF vectoriser.
text_feature = "CustomerFeedback_clean_y"

# Column order: numeric, binary, dummy, then the text column.
all_features = [*numeric_features, *binary_features, *dummy_features, text_feature]

y = df[TARGET]
X = df[all_features]


# ============================================================
# PREPROCESSING
# ============================================================
# One transformer per feature group; any column not listed is dropped.
preprocess = ColumnTransformer(
    [
        # Standardise the numeric columns.
        ("num", StandardScaler(), numeric_features),
        # One-hot encode; two-category columns collapse to a single indicator.
        ("cat_bin", OneHotEncoder(drop="if_binary"), binary_features),
        # Forward these columns unchanged.
        ("dummy", "passthrough", dummy_features),
        # The column is given as a plain string (not a list) so the
        # vectoriser receives a 1-D sequence of documents.
        ("text", TfidfVectorizer(max_features=5000), text_feature),
    ],
    remainder="drop",
)


# ============================================================
# MODEL
# ============================================================
# Random forest with balanced class weights and a fixed seed.
model = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    max_depth=16,
    n_estimators=350,
)

# Preprocessing and classifier chained so both are fitted and applied together.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("clf", model),
    ]
)


# ============================================================
# TRAIN / TEST SPLIT
# ============================================================
# 75/25 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.25,
)

# Fit on the training portion only.
pipeline.fit(X_train, y_train)

# Evaluate on the held-out portion.
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))


# ============================================================
# SAVE MODEL TO MLFLOW
# ============================================================
from mlflow.models import infer_signature

with mlflow.start_run():
    # Signature inferred from the training frame and the pipeline's own
    # predictions on it.
    signature = infer_signature(X_train, pipeline.predict(X_train))

    # Log the fitted pipeline with its signature and a three-row input example.
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="persona_classifier",
        input_example=X_train.head(3),
        signature=signature,
    )